mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git
synced 2025-01-09 15:29:16 +00:00
Merge remote-tracking branch 'origin/master' into drm-misc-fixes
Pick up 4.14-rc1 Signed-off-by: Sean Paul <seanpaul@chromium.org>
This commit is contained in:
commit
1ebfc603d0
10
CREDITS
10
CREDITS
@ -2090,7 +2090,7 @@ S: Kuala Lumpur, Malaysia
|
||||
|
||||
N: Mohit Kumar
|
||||
D: ST Microelectronics SPEAr13xx PCI host bridge driver
|
||||
D: Synopsys Designware PCI host bridge driver
|
||||
D: Synopsys DesignWare PCI host bridge driver
|
||||
|
||||
N: Gabor Kuti
|
||||
E: seasons@falcon.sch.bme.hu
|
||||
@ -2606,11 +2606,9 @@ E: tmolina@cablespeed.com
|
||||
D: bug fixes, documentation, minor hackery
|
||||
|
||||
N: Paul Moore
|
||||
E: paul.moore@hp.com
|
||||
D: NetLabel author
|
||||
S: Hewlett-Packard
|
||||
S: 110 Spit Brook Road
|
||||
S: Nashua, NH 03062
|
||||
E: paul@paul-moore.com
|
||||
W: http://www.paul-moore.com
|
||||
D: NetLabel, SELinux, audit
|
||||
|
||||
N: James Morris
|
||||
E: jmorris@namei.org
|
||||
|
19
Documentation/ABI/stable/sysfs-bus-nvmem
Normal file
19
Documentation/ABI/stable/sysfs-bus-nvmem
Normal file
@ -0,0 +1,19 @@
|
||||
What: /sys/bus/nvmem/devices/.../nvmem
|
||||
Date: July 2015
|
||||
KernelVersion: 4.2
|
||||
Contact: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
|
||||
Description:
|
||||
This file allows user to read/write the raw NVMEM contents.
|
||||
Permissions for write to this file depends on the nvmem
|
||||
provider configuration.
|
||||
|
||||
ex:
|
||||
hexdump /sys/bus/nvmem/devices/qfprom0/nvmem
|
||||
|
||||
0000000 0000 0000 0000 0000 0000 0000 0000 0000
|
||||
*
|
||||
00000a0 db10 2240 0000 e000 0c00 0c00 0000 0c00
|
||||
0000000 0000 0000 0000 0000 0000 0000 0000 0000
|
||||
...
|
||||
*
|
||||
0001000
|
30
Documentation/ABI/stable/sysfs-driver-dma-ioatdma
Normal file
30
Documentation/ABI/stable/sysfs-driver-dma-ioatdma
Normal file
@ -0,0 +1,30 @@
|
||||
What: sys/devices/pciXXXX:XX/0000:XX:XX.X/dma/dma<n>chan<n>/quickdata/cap
|
||||
Date: December 3, 2009
|
||||
KernelVersion: 2.6.32
|
||||
Contact: dmaengine@vger.kernel.org
|
||||
Description: Capabilities the DMA supports. Currently there are DMA_PQ, DMA_PQ_VAL,
|
||||
DMA_XOR, DMA_XOR_VAL, DMA_INTERRUPT.
|
||||
|
||||
What: sys/devices/pciXXXX:XX/0000:XX:XX.X/dma/dma<n>chan<n>/quickdata/ring_active
|
||||
Date: December 3, 2009
|
||||
KernelVersion: 2.6.32
|
||||
Contact: dmaengine@vger.kernel.org
|
||||
Description: The number of descriptors active in the ring.
|
||||
|
||||
What: sys/devices/pciXXXX:XX/0000:XX:XX.X/dma/dma<n>chan<n>/quickdata/ring_size
|
||||
Date: December 3, 2009
|
||||
KernelVersion: 2.6.32
|
||||
Contact: dmaengine@vger.kernel.org
|
||||
Description: Descriptor ring size, total number of descriptors available.
|
||||
|
||||
What: sys/devices/pciXXXX:XX/0000:XX:XX.X/dma/dma<n>chan<n>/quickdata/version
|
||||
Date: December 3, 2009
|
||||
KernelVersion: 2.6.32
|
||||
Contact: dmaengine@vger.kernel.org
|
||||
Description: Version of ioatdma device.
|
||||
|
||||
What: sys/devices/pciXXXX:XX/0000:XX:XX.X/dma/dma<n>chan<n>/quickdata/intr_coalesce
|
||||
Date: August 8, 2017
|
||||
KernelVersion: 4.14
|
||||
Contact: dmaengine@vger.kernel.org
|
||||
Description: Tunable interrupt delay value on a per-channel basis.
|
@ -12,3 +12,6 @@ Description:
|
||||
Ethernet over USB link
|
||||
dev_addr - MAC address of device's end of this
|
||||
Ethernet over USB link
|
||||
class - USB interface class, default is 02 (hex)
|
||||
subclass - USB interface subclass, default is 06 (hex)
|
||||
protocol - USB interface protocol, default is 00 (hex)
|
||||
|
45
Documentation/ABI/testing/ppc-memtrace
Normal file
45
Documentation/ABI/testing/ppc-memtrace
Normal file
@ -0,0 +1,45 @@
|
||||
What: /sys/kernel/debug/powerpc/memtrace
|
||||
Date: Aug 2017
|
||||
KernelVersion: 4.14
|
||||
Contact: linuxppc-dev@lists.ozlabs.org
|
||||
Description: This folder contains the relevant debugfs files for the
|
||||
hardware trace macro to use. CONFIG_PPC64_HARDWARE_TRACING
|
||||
must be set.
|
||||
|
||||
What: /sys/kernel/debug/powerpc/memtrace/enable
|
||||
Date: Aug 2017
|
||||
KernelVersion: 4.14
|
||||
Contact: linuxppc-dev@lists.ozlabs.org
|
||||
Description: Write an integer containing the size in bytes of the memory
|
||||
you want removed from each NUMA node to this file - it must be
|
||||
aligned to the memblock size. This amount of RAM will be removed
|
||||
from the kernel mappings and the following debugfs files will be
|
||||
created. This can only be successfully done once per boot. Once
|
||||
memory is successfully removed from each node, the following
|
||||
files are created.
|
||||
|
||||
What: /sys/kernel/debug/powerpc/memtrace/<node-id>
|
||||
Date: Aug 2017
|
||||
KernelVersion: 4.14
|
||||
Contact: linuxppc-dev@lists.ozlabs.org
|
||||
Description: This directory contains information about the removed memory
|
||||
from the specific NUMA node.
|
||||
|
||||
What: /sys/kernel/debug/powerpc/memtrace/<node-id>/size
|
||||
Date: Aug 2017
|
||||
KernelVersion: 4.14
|
||||
Contact: linuxppc-dev@lists.ozlabs.org
|
||||
Description: This contains the size of the memory removed from the node.
|
||||
|
||||
What: /sys/kernel/debug/powerpc/memtrace/<node-id>/start
|
||||
Date: Aug 2017
|
||||
KernelVersion: 4.14
|
||||
Contact: linuxppc-dev@lists.ozlabs.org
|
||||
Description: This contains the start address of the removed memory.
|
||||
|
||||
What: /sys/kernel/debug/powerpc/memtrace/<node-id>/trace
|
||||
Date: Aug 2017
|
||||
KernelVersion: 4.14
|
||||
Contact: linuxppc-dev@lists.ozlabs.org
|
||||
Description: This is where the hardware trace macro will output the trace
|
||||
it generates.
|
31
Documentation/ABI/testing/procfs-smaps_rollup
Normal file
31
Documentation/ABI/testing/procfs-smaps_rollup
Normal file
@ -0,0 +1,31 @@
|
||||
What: /proc/pid/smaps_rollup
|
||||
Date: August 2017
|
||||
Contact: Daniel Colascione <dancol@google.com>
|
||||
Description:
|
||||
This file provides pre-summed memory information for a
|
||||
process. The format is identical to /proc/pid/smaps,
|
||||
except instead of an entry for each VMA in a process,
|
||||
smaps_rollup has a single entry (tagged "[rollup]")
|
||||
for which each field is the sum of the corresponding
|
||||
fields from all the maps in /proc/pid/smaps.
|
||||
For more details, see the procfs man page.
|
||||
|
||||
Typical output looks like this:
|
||||
|
||||
00100000-ff709000 ---p 00000000 00:00 0 [rollup]
|
||||
Rss: 884 kB
|
||||
Pss: 385 kB
|
||||
Shared_Clean: 696 kB
|
||||
Shared_Dirty: 0 kB
|
||||
Private_Clean: 120 kB
|
||||
Private_Dirty: 68 kB
|
||||
Referenced: 884 kB
|
||||
Anonymous: 68 kB
|
||||
LazyFree: 0 kB
|
||||
AnonHugePages: 0 kB
|
||||
ShmemPmdMapped: 0 kB
|
||||
Shared_Hugetlb: 0 kB
|
||||
Private_Hugetlb: 0 kB
|
||||
Swap: 0 kB
|
||||
SwapPss: 0 kB
|
||||
Locked: 385 kB
|
@ -90,3 +90,11 @@ Description:
|
||||
device's debugging info useful for kernel developers. Its
|
||||
format is not documented intentionally and may change
|
||||
anytime without any notice.
|
||||
|
||||
What: /sys/block/zram<id>/backing_dev
|
||||
Date: June 2017
|
||||
Contact: Minchan Kim <minchan@kernel.org>
|
||||
Description:
|
||||
The backing_dev file is read-write and sets up the backing
|
||||
device for zram to write incompressible pages.
|
||||
To use it, the user should enable CONFIG_ZRAM_WRITEBACK.
|
||||
|
@ -119,6 +119,15 @@ Description:
|
||||
unique to allow association with event codes. Units after
|
||||
application of scale and offset are milliamps.
|
||||
|
||||
What: /sys/bus/iio/devices/iio:deviceX/in_powerY_raw
|
||||
KernelVersion: 4.5
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
Raw (unscaled no bias removal etc.) power measurement from
|
||||
channel Y. The number must always be specified and
|
||||
unique to allow association with event codes. Units after
|
||||
application of scale and offset are milliwatts.
|
||||
|
||||
What: /sys/bus/iio/devices/iio:deviceX/in_capacitanceY_raw
|
||||
KernelVersion: 3.2
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
|
57
Documentation/ABI/testing/sysfs-bus-iio-lptimer-stm32
Normal file
57
Documentation/ABI/testing/sysfs-bus-iio-lptimer-stm32
Normal file
@ -0,0 +1,57 @@
|
||||
What: /sys/bus/iio/devices/iio:deviceX/in_count0_preset
|
||||
KernelVersion: 4.13
|
||||
Contact: fabrice.gasnier@st.com
|
||||
Description:
|
||||
Reading returns the current preset value. Writing sets the
|
||||
preset value. Encoder counts continuously from 0 to preset
|
||||
value, depending on direction (up/down).
|
||||
|
||||
What: /sys/bus/iio/devices/iio:deviceX/in_count_quadrature_mode_available
|
||||
KernelVersion: 4.13
|
||||
Contact: fabrice.gasnier@st.com
|
||||
Description:
|
||||
Reading returns the list of possible quadrature modes.
|
||||
|
||||
What: /sys/bus/iio/devices/iio:deviceX/in_count0_quadrature_mode
|
||||
KernelVersion: 4.13
|
||||
Contact: fabrice.gasnier@st.com
|
||||
Description:
|
||||
Configure the device counter quadrature modes:
|
||||
- non-quadrature:
|
||||
Encoder IN1 input serves as the count input (up
|
||||
direction).
|
||||
- quadrature:
|
||||
Encoder IN1 and IN2 inputs are mixed to get direction
|
||||
and count.
|
||||
|
||||
What: /sys/bus/iio/devices/iio:deviceX/in_count_polarity_available
|
||||
KernelVersion: 4.13
|
||||
Contact: fabrice.gasnier@st.com
|
||||
Description:
|
||||
Reading returns the list of possible active edges.
|
||||
|
||||
What: /sys/bus/iio/devices/iio:deviceX/in_count0_polarity
|
||||
KernelVersion: 4.13
|
||||
Contact: fabrice.gasnier@st.com
|
||||
Description:
|
||||
Configure the device encoder/counter active edge:
|
||||
- rising-edge
|
||||
- falling-edge
|
||||
- both-edges
|
||||
|
||||
In non-quadrature mode, device counts up on active edge.
|
||||
In quadrature mode, encoder counting scenarios are as follows:
|
||||
----------------------------------------------------------------
|
||||
| Active | Level on | IN1 signal | IN2 signal |
|
||||
| edge | opposite |------------------------------------------
|
||||
| | signal | Rising | Falling | Rising | Falling |
|
||||
----------------------------------------------------------------
|
||||
| Rising | High -> | Down | - | Up | - |
|
||||
| edge | Low -> | Up | - | Down | - |
|
||||
----------------------------------------------------------------
|
||||
| Falling | High -> | - | Up | - | Down |
|
||||
| edge | Low -> | - | Down | - | Up |
|
||||
----------------------------------------------------------------
|
||||
| Both | High -> | Down | Up | Up | Down |
|
||||
| edges | Low -> | Up | Down | Down | Up |
|
||||
----------------------------------------------------------------
|
@ -45,6 +45,8 @@ Contact: thunderbolt-software@lists.01.org
|
||||
Description: When a device supports Thunderbolt secure connect it will
|
||||
have this attribute. Writing 32 byte hex string changes
|
||||
authorization to use the secure connection method instead.
|
||||
Writing an empty string clears the key and regular connection
|
||||
method can be used again.
|
||||
|
||||
What: /sys/bus/thunderbolt/devices/.../device
|
||||
Date: Sep 2017
|
||||
|
@ -45,3 +45,16 @@ Contact: Pratyush Anand <pratyush.anand@gmail.com>
|
||||
Description:
|
||||
Write to this node to issue "U3 exit" for Link Layer
|
||||
Validation device. It is needed for TD.7.36.
|
||||
|
||||
What: /sys/bus/usb/devices/.../enable_compliance
|
||||
Date: July 2017
|
||||
Description:
|
||||
Write to this node to set the port to compliance mode to test
|
||||
with Link Layer Validation device. It is needed for TD.7.34.
|
||||
|
||||
What: /sys/bus/usb/devices/.../warm_reset
|
||||
Date: July 2017
|
||||
Description:
|
||||
Write to this node to issue "Warm Reset" for Link Layer Validation
|
||||
device. It may be needed to properly reset an xHCI 1.1 host port if
|
||||
compliance mode needed to be explicitly enabled.
|
||||
|
8
Documentation/ABI/testing/sysfs-driver-altera-cvp
Normal file
8
Documentation/ABI/testing/sysfs-driver-altera-cvp
Normal file
@ -0,0 +1,8 @@
|
||||
What: /sys/bus/pci/drivers/altera-cvp/chkcfg
|
||||
Date: May 2017
|
||||
KernelVersion: 4.13
|
||||
Contact: Anatolij Gustschin <agust@denx.de>
|
||||
Description:
|
||||
Contains either 1 or 0 and controls if configuration
|
||||
error checking in altera-cvp driver is turned on or
|
||||
off.
|
31
Documentation/ABI/testing/sysfs-firmware-opal-powercap
Normal file
31
Documentation/ABI/testing/sysfs-firmware-opal-powercap
Normal file
@ -0,0 +1,31 @@
|
||||
What: /sys/firmware/opal/powercap
|
||||
Date: August 2017
|
||||
Contact: Linux for PowerPC mailing list <linuxppc-dev@ozlabs.org>
|
||||
Description: Powercap directory for Powernv (P8, P9) servers
|
||||
|
||||
Each folder in this directory contains a
|
||||
power-cappable component.
|
||||
|
||||
What: /sys/firmware/opal/powercap/system-powercap
|
||||
/sys/firmware/opal/powercap/system-powercap/powercap-min
|
||||
/sys/firmware/opal/powercap/system-powercap/powercap-max
|
||||
/sys/firmware/opal/powercap/system-powercap/powercap-current
|
||||
Date: August 2017
|
||||
Contact: Linux for PowerPC mailing list <linuxppc-dev@ozlabs.org>
|
||||
Description: System powercap directory and attributes applicable for
|
||||
Powernv (P8, P9) servers
|
||||
|
||||
This directory provides powercap information. It
|
||||
contains below sysfs attributes:
|
||||
|
||||
- powercap-min : This file provides the minimum
|
||||
possible powercap in Watt units
|
||||
|
||||
- powercap-max : This file provides the maximum
|
||||
possible powercap in Watt units
|
||||
|
||||
- powercap-current : This file provides the current
|
||||
powercap set on the system. Writing to this file
|
||||
creates a request for setting a new-powercap. The
|
||||
powercap requested must be between powercap-min
|
||||
and powercap-max.
|
18
Documentation/ABI/testing/sysfs-firmware-opal-psr
Normal file
18
Documentation/ABI/testing/sysfs-firmware-opal-psr
Normal file
@ -0,0 +1,18 @@
|
||||
What: /sys/firmware/opal/psr
|
||||
Date: August 2017
|
||||
Contact: Linux for PowerPC mailing list <linuxppc-dev@ozlabs.org>
|
||||
Description: Power-Shift-Ratio directory for Powernv P9 servers
|
||||
|
||||
Power-Shift-Ratio allows providing hints to the firmware
|
||||
to shift/throttle power between different entities in
|
||||
the system. Each attribute in this directory indicates
|
||||
a settable PSR.
|
||||
|
||||
What: /sys/firmware/opal/psr/cpu_to_gpu_X
|
||||
Date: August 2017
|
||||
Contact: Linux for PowerPC mailing list <linuxppc-dev@ozlabs.org>
|
||||
Description: PSR sysfs attributes for Powernv P9 servers
|
||||
|
||||
Power-Shift-Ratio between CPU and GPU for a given chip
|
||||
with chip-id X. This file gives the ratio (0-100)
|
||||
which is used by OCC for power-capping.
|
@ -57,6 +57,15 @@ Contact: "Jaegeuk Kim" <jaegeuk.kim@samsung.com>
|
||||
Description:
|
||||
Controls the issue rate of small discard commands.
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/discard_granularity
|
||||
Date: July 2017
|
||||
Contact: "Chao Yu" <yuchao0@huawei.com>
|
||||
Description:
|
||||
Controls discard granularity of inner discard thread, inner thread
|
||||
will not issue discards with size that is smaller than granularity.
|
||||
The unit size is one block, now only support configuring in range
|
||||
of [1, 512].
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/max_victim_search
|
||||
Date: January 2014
|
||||
Contact: "Jaegeuk Kim" <jaegeuk.kim@samsung.com>
|
||||
@ -130,3 +139,15 @@ Date: June 2017
|
||||
Contact: "Chao Yu" <yuchao0@huawei.com>
|
||||
Description:
|
||||
Controls current reserved blocks in system.
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/gc_urgent
|
||||
Date: August 2017
|
||||
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
|
||||
Description:
|
||||
Do background GC aggressively
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/gc_urgent_sleep_time
|
||||
Date: August 2017
|
||||
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
|
||||
Description:
|
||||
Controls sleep time of GC urgent mode
|
||||
|
26
Documentation/ABI/testing/sysfs-kernel-mm-swap
Normal file
26
Documentation/ABI/testing/sysfs-kernel-mm-swap
Normal file
@ -0,0 +1,26 @@
|
||||
What: /sys/kernel/mm/swap/
|
||||
Date: August 2017
|
||||
Contact: Linux memory management mailing list <linux-mm@kvack.org>
|
||||
Description: Interface for swapping
|
||||
|
||||
What: /sys/kernel/mm/swap/vma_ra_enabled
|
||||
Date: August 2017
|
||||
Contact: Linux memory management mailing list <linux-mm@kvack.org>
|
||||
Description: Enable/disable VMA based swap readahead.
|
||||
|
||||
If set to true, the VMA based swap readahead algorithm
|
||||
will be used for swappable anonymous pages mapped in a
|
||||
VMA, and the global swap readahead algorithm will be
|
||||
still used for tmpfs etc. other users. If set to
|
||||
false, the global swap readahead algorithm will be
|
||||
used for all swappable pages.
|
||||
|
||||
What: /sys/kernel/mm/swap/vma_ra_max_order
|
||||
Date: August 2017
|
||||
Contact: Linux memory management mailing list <linux-mm@kvack.org>
|
||||
Description: The max readahead size in order for VMA based swap readahead
|
||||
|
||||
VMA based swap readahead algorithm will readahead at
|
||||
most 1 << max_order pages for each readahead. The
|
||||
real readahead size for each readahead will be scaled
|
||||
according to the estimation algorithm.
|
@ -273,3 +273,15 @@ Description:
|
||||
|
||||
This output is useful for system wakeup diagnostics of spurious
|
||||
wakeup interrupts.
|
||||
|
||||
What: /sys/power/pm_debug_messages
|
||||
Date: July 2017
|
||||
Contact: Rafael J. Wysocki <rjw@rjwysocki.net>
|
||||
Description:
|
||||
The /sys/power/pm_debug_messages file controls the printing
|
||||
of debug messages from the system suspend/hibernation
|
||||
infrastructure to the kernel log.
|
||||
|
||||
Writing a "1" to this file enables the debug messages and
|
||||
writing a "0" (default) to it disables them. Reads from
|
||||
this file return the current value.
|
||||
|
@ -515,14 +515,15 @@ API at all.
|
||||
::
|
||||
|
||||
void *
|
||||
dma_alloc_noncoherent(struct device *dev, size_t size,
|
||||
dma_addr_t *dma_handle, gfp_t flag)
|
||||
dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle,
|
||||
gfp_t flag, unsigned long attrs)
|
||||
|
||||
Identical to dma_alloc_coherent() except that the platform will
|
||||
choose to return either consistent or non-consistent memory as it sees
|
||||
fit. By using this API, you are guaranteeing to the platform that you
|
||||
have all the correct and necessary sync points for this memory in the
|
||||
driver should it choose to return non-consistent memory.
|
||||
Identical to dma_alloc_coherent() except that when the
|
||||
DMA_ATTR_NON_CONSISTENT flag is passed in the attrs argument, the
|
||||
platform will choose to return either consistent or non-consistent memory
|
||||
as it sees fit. By using this API, you are guaranteeing to the platform
|
||||
that you have all the correct and necessary sync points for this memory
|
||||
in the driver should it choose to return non-consistent memory.
|
||||
|
||||
Note: where the platform can return consistent memory, it will
|
||||
guarantee that the sync points become nops.
|
||||
@ -535,12 +536,13 @@ that simply cannot make consistent memory.
|
||||
::
|
||||
|
||||
void
|
||||
dma_free_noncoherent(struct device *dev, size_t size, void *cpu_addr,
|
||||
dma_addr_t dma_handle)
|
||||
dma_free_attrs(struct device *dev, size_t size, void *cpu_addr,
|
||||
dma_addr_t dma_handle, unsigned long attrs)
|
||||
|
||||
Free memory allocated by the nonconsistent API. All parameters must
|
||||
be identical to those passed in (and returned by
|
||||
dma_alloc_noncoherent()).
|
||||
Free memory allocated by dma_alloc_attrs(). All common
|
||||
parameters must be identical to those otherwise passed to dma_free_coherent(),
|
||||
and the attrs argument must be identical to the attrs passed to
|
||||
dma_alloc_attrs().
|
||||
|
||||
::
|
||||
|
||||
@ -564,8 +566,8 @@ memory or doing partial flushes.
|
||||
dma_cache_sync(struct device *dev, void *vaddr, size_t size,
|
||||
enum dma_data_direction direction)
|
||||
|
||||
Do a partial sync of memory that was allocated by
|
||||
dma_alloc_noncoherent(), starting at virtual address vaddr and
|
||||
Do a partial sync of memory that was allocated by dma_alloc_attrs() with
|
||||
the DMA_ATTR_NON_CONSISTENT flag starting at virtual address vaddr and
|
||||
continuing on for size. Again, you *must* observe the cache line
|
||||
boundaries when doing this.
|
||||
|
||||
@ -590,34 +592,11 @@ size is the size of the area (must be multiples of PAGE_SIZE).
|
||||
|
||||
flags can be ORed together and are:
|
||||
|
||||
- DMA_MEMORY_MAP - request that the memory returned from
|
||||
dma_alloc_coherent() be directly writable.
|
||||
|
||||
- DMA_MEMORY_IO - request that the memory returned from
|
||||
dma_alloc_coherent() be addressable using read()/write()/memcpy_toio() etc.
|
||||
|
||||
One or both of these flags must be present.
|
||||
|
||||
- DMA_MEMORY_INCLUDES_CHILDREN - make the declared memory be allocated by
|
||||
dma_alloc_coherent of any child devices of this one (for memory residing
|
||||
on a bridge).
|
||||
|
||||
- DMA_MEMORY_EXCLUSIVE - only allocate memory from the declared regions.
|
||||
Do not allow dma_alloc_coherent() to fall back to system memory when
|
||||
it's out of memory in the declared region.
|
||||
|
||||
The return value will be either DMA_MEMORY_MAP or DMA_MEMORY_IO and
|
||||
must correspond to a passed in flag (i.e. no returning DMA_MEMORY_IO
|
||||
if only DMA_MEMORY_MAP were passed in) for success or zero for
|
||||
failure.
|
||||
|
||||
Note, for DMA_MEMORY_IO returns, all subsequent memory returned by
|
||||
dma_alloc_coherent() may no longer be accessed directly, but instead
|
||||
must be accessed using the correct bus functions. If your driver
|
||||
isn't prepared to handle this contingency, it should not specify
|
||||
DMA_MEMORY_IO in the input flags.
|
||||
|
||||
As a simplification for the platforms, only **one** such region of
|
||||
As a simplification for the platforms, only *one* such region of
|
||||
memory may be declared per device.
|
||||
|
||||
For reasons of efficiency, most platforms choose to track the declared
|
||||
|
@ -22,6 +22,8 @@ ifeq ($(HAVE_SPHINX),0)
|
||||
|
||||
.DEFAULT:
|
||||
$(warning The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed and in PATH, or set the SPHINXBUILD make variable to point to the full path of the '$(SPHINXBUILD)' executable.)
|
||||
@echo
|
||||
@./scripts/sphinx-pre-install
|
||||
@echo " SKIP Sphinx $@ target."
|
||||
|
||||
else # HAVE_SPHINX
|
||||
@ -95,16 +97,6 @@ endif # HAVE_SPHINX
|
||||
# The following targets are independent of HAVE_SPHINX, and the rules should
|
||||
# work or silently pass without Sphinx.
|
||||
|
||||
# no-ops for the Sphinx toolchain
|
||||
sgmldocs:
|
||||
@:
|
||||
psdocs:
|
||||
@:
|
||||
mandocs:
|
||||
@:
|
||||
installmandocs:
|
||||
@:
|
||||
|
||||
cleandocs:
|
||||
$(Q)rm -rf $(BUILDDIR)
|
||||
$(Q)$(MAKE) BUILDDIR=$(abspath $(BUILDDIR)) $(build)=Documentation/media clean
|
||||
|
@ -2080,6 +2080,8 @@ Some of the relevant points of interest are as follows:
|
||||
<li> <a href="#Scheduler and RCU">Scheduler and RCU</a>.
|
||||
<li> <a href="#Tracing and RCU">Tracing and RCU</a>.
|
||||
<li> <a href="#Energy Efficiency">Energy Efficiency</a>.
|
||||
<li> <a href="#Scheduling-Clock Interrupts and RCU">
|
||||
Scheduling-Clock Interrupts and RCU</a>.
|
||||
<li> <a href="#Memory Efficiency">Memory Efficiency</a>.
|
||||
<li> <a href="#Performance, Scalability, Response Time, and Reliability">
|
||||
Performance, Scalability, Response Time, and Reliability</a>.
|
||||
@ -2532,6 +2534,134 @@ I learned of many of these requirements via angry phone calls:
|
||||
Flaming me on the Linux-kernel mailing list was apparently not
|
||||
sufficient to fully vent their ire at RCU's energy-efficiency bugs!
|
||||
|
||||
<h3><a name="Scheduling-Clock Interrupts and RCU">
|
||||
Scheduling-Clock Interrupts and RCU</a></h3>
|
||||
|
||||
<p>
|
||||
The kernel transitions between in-kernel non-idle execution, userspace
|
||||
execution, and the idle loop.
|
||||
Depending on kernel configuration, RCU handles these states differently:
|
||||
|
||||
<table border=3>
|
||||
<tr><th><tt>HZ</tt> Kconfig</th>
|
||||
<th>In-Kernel</th>
|
||||
<th>Usermode</th>
|
||||
<th>Idle</th></tr>
|
||||
<tr><th align="left"><tt>HZ_PERIODIC</tt></th>
|
||||
<td>Can rely on scheduling-clock interrupt.</td>
|
||||
<td>Can rely on scheduling-clock interrupt and its
|
||||
detection of interrupt from usermode.</td>
|
||||
<td>Can rely on RCU's dyntick-idle detection.</td></tr>
|
||||
<tr><th align="left"><tt>NO_HZ_IDLE</tt></th>
|
||||
<td>Can rely on scheduling-clock interrupt.</td>
|
||||
<td>Can rely on scheduling-clock interrupt and its
|
||||
detection of interrupt from usermode.</td>
|
||||
<td>Can rely on RCU's dyntick-idle detection.</td></tr>
|
||||
<tr><th align="left"><tt>NO_HZ_FULL</tt></th>
|
||||
<td>Can only sometimes rely on scheduling-clock interrupt.
|
||||
In other cases, it is necessary to bound kernel execution
|
||||
times and/or use IPIs.</td>
|
||||
<td>Can rely on RCU's dyntick-idle detection.</td>
|
||||
<td>Can rely on RCU's dyntick-idle detection.</td></tr>
|
||||
</table>
|
||||
|
||||
<table>
|
||||
<tr><th> </th></tr>
|
||||
<tr><th align="left">Quick Quiz:</th></tr>
|
||||
<tr><td>
|
||||
Why can't <tt>NO_HZ_FULL</tt> in-kernel execution rely on the
|
||||
scheduling-clock interrupt, just like <tt>HZ_PERIODIC</tt>
|
||||
and <tt>NO_HZ_IDLE</tt> do?
|
||||
</td></tr>
|
||||
<tr><th align="left">Answer:</th></tr>
|
||||
<tr><td bgcolor="#ffffff"><font color="ffffff">
|
||||
Because, as a performance optimization, <tt>NO_HZ_FULL</tt>
|
||||
does not necessarily re-enable the scheduling-clock interrupt
|
||||
on entry to each and every system call.
|
||||
</font></td></tr>
|
||||
<tr><td> </td></tr>
|
||||
</table>
|
||||
|
||||
<p>
|
||||
However, RCU must be reliably informed as to whether any given
|
||||
CPU is currently in the idle loop, and, for <tt>NO_HZ_FULL</tt>,
|
||||
also whether that CPU is executing in usermode, as discussed
|
||||
<a href="#Energy Efficiency">earlier</a>.
|
||||
It also requires that the scheduling-clock interrupt be enabled when
|
||||
RCU needs it to be:
|
||||
|
||||
<ol>
|
||||
<li> If a CPU is either idle or executing in usermode, and RCU believes
|
||||
it is non-idle, the scheduling-clock tick had better be running.
|
||||
Otherwise, you will get RCU CPU stall warnings. Or at best,
|
||||
very long (11-second) grace periods, with a pointless IPI waking
|
||||
the CPU from time to time.
|
||||
<li> If a CPU is in a portion of the kernel that executes RCU read-side
|
||||
critical sections, and RCU believes this CPU to be idle, you will get
|
||||
random memory corruption. <b>DON'T DO THIS!!!</b>
|
||||
|
||||
<br>This is one reason to test with lockdep, which will complain
|
||||
about this sort of thing.
|
||||
<li> If a CPU is in a portion of the kernel that is absolutely
|
||||
positively no-joking guaranteed to never execute any RCU read-side
|
||||
critical sections, and RCU believes this CPU to be idle,
|
||||
no problem. This sort of thing is used by some architectures
|
||||
for light-weight exception handlers, which can then avoid the
|
||||
overhead of <tt>rcu_irq_enter()</tt> and <tt>rcu_irq_exit()</tt>
|
||||
at exception entry and exit, respectively.
|
||||
Some go further and avoid the entireties of <tt>irq_enter()</tt>
|
||||
and <tt>irq_exit()</tt>.
|
||||
|
||||
<br>Just make very sure you are running some of your tests with
|
||||
<tt>CONFIG_PROVE_RCU=y</tt>, just in case one of your code paths
|
||||
was in fact joking about not doing RCU read-side critical sections.
|
||||
<li> If a CPU is executing in the kernel with the scheduling-clock
|
||||
interrupt disabled and RCU believes this CPU to be non-idle,
|
||||
and if the CPU goes idle (from an RCU perspective) every few
|
||||
jiffies, no problem. It is usually OK for there to be the
|
||||
occasional gap between idle periods of up to a second or so.
|
||||
|
||||
<br>If the gap grows too long, you get RCU CPU stall warnings.
|
||||
<li> If a CPU is either idle or executing in usermode, and RCU believes
|
||||
it to be idle, of course no problem.
|
||||
<li> If a CPU is executing in the kernel, the kernel code
|
||||
path is passing through quiescent states at a reasonable
|
||||
frequency (preferably about once per few jiffies, but the
|
||||
occasional excursion to a second or so is usually OK) and the
|
||||
scheduling-clock interrupt is enabled, of course no problem.
|
||||
|
||||
<br>If the gap between a successive pair of quiescent states grows
|
||||
too long, you get RCU CPU stall warnings.
|
||||
</ol>
|
||||
|
||||
<table>
|
||||
<tr><th> </th></tr>
|
||||
<tr><th align="left">Quick Quiz:</th></tr>
|
||||
<tr><td>
|
||||
But what if my driver has a hardware interrupt handler
|
||||
that can run for many seconds?
|
||||
I cannot invoke <tt>schedule()</tt> from a hardware
|
||||
interrupt handler, after all!
|
||||
</td></tr>
|
||||
<tr><th align="left">Answer:</th></tr>
|
||||
<tr><td bgcolor="#ffffff"><font color="ffffff">
|
||||
One approach is to do <tt>rcu_irq_exit();rcu_irq_enter();</tt>
|
||||
every so often.
|
||||
But given that long-running interrupt handlers can cause
|
||||
other problems, not least for response time, shouldn't you
|
||||
work to keep your interrupt handler's runtime within reasonable
|
||||
bounds?
|
||||
</font></td></tr>
|
||||
<tr><td> </td></tr>
|
||||
</table>
|
||||
|
||||
<p>
|
||||
But as long as RCU is properly informed of kernel state transitions between
|
||||
in-kernel execution, usermode execution, and idle, and as long as the
|
||||
scheduling-clock interrupt is enabled when RCU needs it to be, you
|
||||
can rest assured that the bugs you encounter will be in some other
|
||||
part of RCU or some other part of the kernel!
|
||||
|
||||
<h3><a name="Memory Efficiency">Memory Efficiency</a></h3>
|
||||
|
||||
<p>
|
||||
|
@ -23,6 +23,14 @@ over a rather long period of time, but improvements are always welcome!
|
||||
Yet another exception is where the low real-time latency of RCU's
|
||||
read-side primitives is critically important.
|
||||
|
||||
One final exception is where RCU readers are used to prevent
|
||||
the ABA problem (https://en.wikipedia.org/wiki/ABA_problem)
|
||||
for lockless updates. This does result in the mildly
|
||||
counter-intuitive situation where rcu_read_lock() and
|
||||
rcu_read_unlock() are used to protect updates, however, this
|
||||
approach provides the same potential simplifications that garbage
|
||||
collectors do.
|
||||
|
||||
1. Does the update code have proper mutual exclusion?
|
||||
|
||||
RCU does allow -readers- to run (almost) naked, but -writers- must
|
||||
@ -40,7 +48,9 @@ over a rather long period of time, but improvements are always welcome!
|
||||
explain how this single task does not become a major bottleneck on
|
||||
big multiprocessor machines (for example, if the task is updating
|
||||
information relating to itself that other tasks can read, there
|
||||
by definition can be no bottleneck).
|
||||
by definition can be no bottleneck). Note that the definition
|
||||
of "large" has changed significantly: Eight CPUs was "large"
|
||||
in the year 2000, but a hundred CPUs was unremarkable in 2017.
|
||||
|
||||
2. Do the RCU read-side critical sections make proper use of
|
||||
rcu_read_lock() and friends? These primitives are needed
|
||||
@ -55,6 +65,12 @@ over a rather long period of time, but improvements are always welcome!
|
||||
Disabling of preemption can serve as rcu_read_lock_sched(), but
|
||||
is less readable.
|
||||
|
||||
Letting RCU-protected pointers "leak" out of an RCU read-side
|
||||
critical section is every bit as bad as letting them leak out
|
||||
from under a lock. Unless, of course, you have arranged some
|
||||
other means of protection, such as a lock or a reference count
|
||||
-before- letting them out of the RCU read-side critical section.
|
||||
|
||||
3. Does the update code tolerate concurrent accesses?
|
||||
|
||||
The whole point of RCU is to permit readers to run without
|
||||
@ -78,10 +94,10 @@ over a rather long period of time, but improvements are always welcome!
|
||||
|
||||
This works quite well, also.
|
||||
|
||||
c. Make updates appear atomic to readers. For example,
|
||||
c. Make updates appear atomic to readers. For example,
|
||||
pointer updates to properly aligned fields will
|
||||
appear atomic, as will individual atomic primitives.
|
||||
Sequences of perations performed under a lock will -not-
|
||||
Sequences of operations performed under a lock will -not-
|
||||
appear to be atomic to RCU readers, nor will sequences
|
||||
of multiple atomic primitives.
|
||||
|
||||
@ -168,8 +184,8 @@ over a rather long period of time, but improvements are always welcome!
|
||||
|
||||
5. If call_rcu(), or a related primitive such as call_rcu_bh(),
|
||||
call_rcu_sched(), or call_srcu() is used, the callback function
|
||||
must be written to be called from softirq context. In particular,
|
||||
it cannot block.
|
||||
will be called from softirq context. In particular, it cannot
|
||||
block.
|
||||
|
||||
6. Since synchronize_rcu() can block, it cannot be called from
|
||||
any sort of irq context. The same rule applies for
|
||||
@ -178,11 +194,14 @@ over a rather long period of time, but improvements are always welcome!
|
||||
synchronize_sched_expedited(), and synchronize_srcu_expedited().
|
||||
|
||||
The expedited forms of these primitives have the same semantics
|
||||
as the non-expedited forms, but expediting is both expensive
|
||||
and unfriendly to real-time workloads. Use of the expedited
|
||||
primitives should be restricted to rare configuration-change
|
||||
operations that would not normally be undertaken while a real-time
|
||||
workload is running.
|
||||
as the non-expedited forms, but expediting is both expensive and
|
||||
(with the exception of synchronize_srcu_expedited()) unfriendly
|
||||
to real-time workloads. Use of the expedited primitives should
|
||||
be restricted to rare configuration-change operations that would
|
||||
not normally be undertaken while a real-time workload is running.
|
||||
However, real-time workloads can use rcupdate.rcu_normal kernel
|
||||
boot parameter to completely disable expedited grace periods,
|
||||
though this might have performance implications.
|
||||
|
||||
In particular, if you find yourself invoking one of the expedited
|
||||
primitives repeatedly in a loop, please do everyone a favor:
|
||||
@ -193,11 +212,6 @@ over a rather long period of time, but improvements are always welcome!
|
||||
of the system, especially to real-time workloads running on
|
||||
the rest of the system.
|
||||
|
||||
In addition, it is illegal to call the expedited forms from
|
||||
a CPU-hotplug notifier, or while holding a lock that is acquired
|
||||
by a CPU-hotplug notifier. Failing to observe this restriction
|
||||
will result in deadlock.
|
||||
|
||||
7. If the updater uses call_rcu() or synchronize_rcu(), then the
|
||||
corresponding readers must use rcu_read_lock() and
|
||||
rcu_read_unlock(). If the updater uses call_rcu_bh() or
|
||||
@ -321,7 +335,7 @@ over a rather long period of time, but improvements are always welcome!
|
||||
Similarly, disabling preemption is not an acceptable substitute
|
||||
for rcu_read_lock(). Code that attempts to use preemption
|
||||
disabling where it should be using rcu_read_lock() will break
|
||||
in real-time kernel builds.
|
||||
in CONFIG_PREEMPT=y kernel builds.
|
||||
|
||||
If you want to wait for interrupt handlers, NMI handlers, and
|
||||
code under the influence of preempt_disable(), you instead
|
||||
@ -356,23 +370,22 @@ over a rather long period of time, but improvements are always welcome!
|
||||
not the case, a self-spawning RCU callback would prevent the
|
||||
victim CPU from ever going offline.)
|
||||
|
||||
14. SRCU (srcu_read_lock(), srcu_read_unlock(), srcu_dereference(),
|
||||
synchronize_srcu(), synchronize_srcu_expedited(), and call_srcu())
|
||||
may only be invoked from process context. Unlike other forms of
|
||||
RCU, it -is- permissible to block in an SRCU read-side critical
|
||||
section (demarked by srcu_read_lock() and srcu_read_unlock()),
|
||||
hence the "SRCU": "sleepable RCU". Please note that if you
|
||||
don't need to sleep in read-side critical sections, you should be
|
||||
using RCU rather than SRCU, because RCU is almost always faster
|
||||
and easier to use than is SRCU.
|
||||
14. Unlike other forms of RCU, it -is- permissible to block in an
|
||||
SRCU read-side critical section (demarked by srcu_read_lock()
|
||||
and srcu_read_unlock()), hence the "SRCU": "sleepable RCU".
|
||||
Please note that if you don't need to sleep in read-side critical
|
||||
sections, you should be using RCU rather than SRCU, because RCU
|
||||
is almost always faster and easier to use than is SRCU.
|
||||
|
||||
Also unlike other forms of RCU, explicit initialization
|
||||
and cleanup is required via init_srcu_struct() and
|
||||
cleanup_srcu_struct(). These are passed a "struct srcu_struct"
|
||||
that defines the scope of a given SRCU domain. Once initialized,
|
||||
the srcu_struct is passed to srcu_read_lock(), srcu_read_unlock()
|
||||
synchronize_srcu(), synchronize_srcu_expedited(), and call_srcu().
|
||||
A given synchronize_srcu() waits only for SRCU read-side critical
|
||||
Also unlike other forms of RCU, explicit initialization and
|
||||
cleanup is required either at build time via DEFINE_SRCU()
|
||||
or DEFINE_STATIC_SRCU() or at runtime via init_srcu_struct()
|
||||
and cleanup_srcu_struct(). These last two are passed a
|
||||
"struct srcu_struct" that defines the scope of a given
|
||||
SRCU domain. Once initialized, the srcu_struct is passed
|
||||
to srcu_read_lock(), srcu_read_unlock(), synchronize_srcu(),
|
||||
synchronize_srcu_expedited(), and call_srcu(). A given
|
||||
synchronize_srcu() waits only for SRCU read-side critical
|
||||
sections governed by srcu_read_lock() and srcu_read_unlock()
|
||||
calls that have been passed the same srcu_struct. This property
|
||||
is what makes sleeping read-side critical sections tolerable --
|
||||
@ -390,10 +403,16 @@ over a rather long period of time, but improvements are always welcome!
|
||||
Therefore, SRCU should be used in preference to rw_semaphore
|
||||
only in extremely read-intensive situations, or in situations
|
||||
requiring SRCU's read-side deadlock immunity or low read-side
|
||||
realtime latency.
|
||||
realtime latency. You should also consider percpu_rw_semaphore
|
||||
when you need lightweight readers.
|
||||
|
||||
Note that, rcu_assign_pointer() relates to SRCU just as it does
|
||||
to other forms of RCU.
|
||||
SRCU's expedited primitive (synchronize_srcu_expedited())
|
||||
never sends IPIs to other CPUs, so it is easier on
|
||||
real-time workloads than is synchronize_rcu_expedited(),
|
||||
synchronize_rcu_bh_expedited() or synchronize_sched_expedited().
|
||||
|
||||
Note that rcu_dereference() and rcu_assign_pointer() relate to
|
||||
SRCU just as they do to other forms of RCU.
|
||||
|
||||
15. The whole point of call_rcu(), synchronize_rcu(), and friends
|
||||
is to wait until all pre-existing readers have finished before
|
||||
@ -435,3 +454,33 @@ over a rather long period of time, but improvements are always welcome!
|
||||
|
||||
These debugging aids can help you find problems that are
|
||||
otherwise extremely difficult to spot.
|
||||
|
||||
18. If you register a callback using call_rcu(), call_rcu_bh(),
|
||||
call_rcu_sched(), or call_srcu(), and pass in a function defined
|
||||
within a loadable module, then it is necessary to wait for
|
||||
all pending callbacks to be invoked after the last invocation
|
||||
and before unloading that module. Note that it is absolutely
|
||||
-not- sufficient to wait for a grace period! The current (say)
|
||||
synchronize_rcu() implementation waits only for all previous
|
||||
callbacks registered on the CPU that synchronize_rcu() is running
|
||||
on, but it is -not- guaranteed to wait for callbacks registered
|
||||
on other CPUs.
|
||||
|
||||
You instead need to use one of the barrier functions:
|
||||
|
||||
o call_rcu() -> rcu_barrier()
|
||||
o call_rcu_bh() -> rcu_barrier_bh()
|
||||
o call_rcu_sched() -> rcu_barrier_sched()
|
||||
o call_srcu() -> srcu_barrier()
|
||||
|
||||
However, these barrier functions are absolutely -not- guaranteed
|
||||
to wait for a grace period. In fact, if there are no call_rcu()
|
||||
callbacks waiting anywhere in the system, rcu_barrier() is within
|
||||
its rights to return immediately.
|
||||
|
||||
So if you need to wait for both an RCU grace period and for
|
||||
all pre-existing call_rcu() callbacks, you will need to execute
|
||||
both rcu_barrier() and synchronize_rcu(), if necessary, using
|
||||
something like workqueues to execute them concurrently.
|
||||
|
||||
See rcubarrier.txt for more information.
|
||||
|
@ -76,15 +76,12 @@ o I hear that RCU is patented? What is with that?
|
||||
Of these, one was allowed to lapse by the assignee, and the
|
||||
others have been contributed to the Linux kernel under GPL.
|
||||
There are now also LGPL implementations of user-level RCU
|
||||
available (http://lttng.org/?q=node/18).
|
||||
available (http://liburcu.org/).
|
||||
|
||||
o I hear that RCU needs work in order to support realtime kernels?
|
||||
|
||||
This work is largely completed. Realtime-friendly RCU can be
|
||||
enabled via the CONFIG_PREEMPT_RCU kernel configuration
|
||||
parameter. However, work is in progress for enabling priority
|
||||
boosting of preempted RCU read-side critical sections. This is
|
||||
needed if you have CPU-bound realtime threads.
|
||||
Realtime-friendly RCU can be enabled via the CONFIG_PREEMPT_RCU
|
||||
kernel configuration parameter.
|
||||
|
||||
o Where can I find more information on RCU?
|
||||
|
||||
|
@ -25,35 +25,35 @@ o You must use one of the rcu_dereference() family of primitives
|
||||
for an example where the compiler can in fact deduce the exact
|
||||
value of the pointer, and thus cause misordering.
|
||||
|
||||
o You are only permitted to use rcu_dereference on pointer values.
|
||||
The compiler simply knows too much about integral values to
|
||||
trust it to carry dependencies through integer operations.
|
||||
There are a very few exceptions, namely that you can temporarily
|
||||
cast the pointer to uintptr_t in order to:
|
||||
|
||||
o Set bits and clear bits down in the must-be-zero low-order
|
||||
bits of that pointer. This clearly means that the pointer
|
||||
must have alignment constraints, for example, this does
|
||||
-not- work in general for char* pointers.
|
||||
|
||||
o XOR bits to translate pointers, as is done in some
|
||||
classic buddy-allocator algorithms.
|
||||
|
||||
It is important to cast the value back to pointer before
|
||||
doing much of anything else with it.
|
||||
|
||||
o Avoid cancellation when using the "+" and "-" infix arithmetic
|
||||
operators. For example, for a given variable "x", avoid
|
||||
"(x-x)". There are similar arithmetic pitfalls from other
|
||||
arithmetic operators, such as "(x*0)", "(x/(x+1))" or "(x%1)".
|
||||
The compiler is within its rights to substitute zero for all of
|
||||
these expressions, so that subsequent accesses no longer depend
|
||||
on the rcu_dereference(), again possibly resulting in bugs due
|
||||
to misordering.
|
||||
"(x-(uintptr_t)x)" for char* pointers. The compiler is within its
|
||||
rights to substitute zero for this sort of expression, so that
|
||||
subsequent accesses no longer depend on the rcu_dereference(),
|
||||
again possibly resulting in bugs due to misordering.
|
||||
|
||||
Of course, if "p" is a pointer from rcu_dereference(), and "a"
|
||||
and "b" are integers that happen to be equal, the expression
|
||||
"p+a-b" is safe because its value still necessarily depends on
|
||||
the rcu_dereference(), thus maintaining proper ordering.
|
||||
|
||||
o Avoid all-zero operands to the bitwise "&" operator, and
|
||||
similarly avoid all-ones operands to the bitwise "|" operator.
|
||||
If the compiler is able to deduce the value of such operands,
|
||||
it is within its rights to substitute the corresponding constant
|
||||
for the bitwise operation. Once again, this causes subsequent
|
||||
accesses to no longer depend on the rcu_dereference(), causing
|
||||
bugs due to misordering.
|
||||
|
||||
Please note that single-bit operands to bitwise "&" can also
|
||||
be dangerous. At this point, the compiler knows that the
|
||||
resulting value can only take on one of two possible values.
|
||||
Therefore, a very small amount of additional information will
|
||||
allow the compiler to deduce the exact value, which again can
|
||||
result in misordering.
|
||||
|
||||
o If you are using RCU to protect JITed functions, so that the
|
||||
"()" function-invocation operator is applied to a value obtained
|
||||
(directly or indirectly) from rcu_dereference(), you may need to
|
||||
@ -61,25 +61,6 @@ o If you are using RCU to protect JITed functions, so that the
|
||||
This issue arises on some systems when a newly JITed function is
|
||||
using the same memory that was used by an earlier JITed function.
|
||||
|
||||
o Do not use the results from the boolean "&&" and "||" when
|
||||
dereferencing. For example, the following (rather improbable)
|
||||
code is buggy:
|
||||
|
||||
int *p;
|
||||
int *q;
|
||||
|
||||
...
|
||||
|
||||
p = rcu_dereference(gp)
|
||||
q = &global_q;
|
||||
q += p != &oom_p1 && p != &oom_p2;
|
||||
r1 = *q; /* BUGGY!!! */
|
||||
|
||||
The reason this is buggy is that "&&" and "||" are often compiled
|
||||
using branches. While weak-memory machines such as ARM or PowerPC
|
||||
do order stores after such branches, they can speculate loads,
|
||||
which can result in misordering bugs.
|
||||
|
||||
o Do not use the results from relational operators ("==", "!=",
|
||||
">", ">=", "<", or "<=") when dereferencing. For example,
|
||||
the following (quite strange) code is buggy:
|
||||
|
@ -263,6 +263,11 @@ Quick Quiz #2: What happens if CPU 0's rcu_barrier_func() executes
|
||||
are delayed for a full grace period? Couldn't this result in
|
||||
rcu_barrier() returning prematurely?
|
||||
|
||||
The current rcu_barrier() implementation is more complex, due to the need
|
||||
to avoid disturbing idle CPUs (especially on battery-powered systems)
|
||||
and the need to minimally disturb non-idle CPUs in real-time systems.
|
||||
However, the code above illustrates the concepts.
|
||||
|
||||
|
||||
rcu_barrier() Summary
|
||||
|
||||
|
@ -276,15 +276,17 @@ o "Free-Block Circulation": Shows the number of torture structures
|
||||
somehow gets incremented farther than it should.
|
||||
|
||||
Different implementations of RCU can provide implementation-specific
|
||||
additional information. For example, SRCU provides the following
|
||||
additional information. For example, Tree SRCU provides the following
|
||||
additional line:
|
||||
|
||||
srcu-torture: per-CPU(idx=1): 0(0,1) 1(0,1) 2(0,0) 3(0,1)
|
||||
srcud-torture: Tree SRCU per-CPU(idx=0): 0(35,-21) 1(-4,24) 2(1,1) 3(-26,20) 4(28,-47) 5(-9,4) 6(-10,14) 7(-14,11) T(1,6)
|
||||
|
||||
This line shows the per-CPU counter state. The numbers in parentheses are
|
||||
the values of the "old" and "current" counters for the corresponding CPU.
|
||||
The "idx" value maps the "old" and "current" values to the underlying
|
||||
array, and is useful for debugging.
|
||||
This line shows the per-CPU counter state, in this case for Tree SRCU
|
||||
using a dynamically allocated srcu_struct (hence "srcud-" rather than
|
||||
"srcu-"). The numbers in parentheses are the values of the "old" and
|
||||
"current" counters for the corresponding CPU. The "idx" value maps the
|
||||
"old" and "current" values to the underlying array, and is useful for
|
||||
debugging. The final "T" entry contains the totals of the counters.
|
||||
|
||||
|
||||
USAGE
|
||||
@ -304,3 +306,9 @@ checked for such errors. The "rmmod" command forces a "SUCCESS",
|
||||
"FAILURE", or "RCU_HOTPLUG" indication to be printk()ed. The first
|
||||
two are self-explanatory, while the last indicates that while there
|
||||
were no RCU failures, CPU-hotplug problems were detected.
|
||||
|
||||
However, the tools/testing/selftests/rcutorture/bin/kvm.sh script
|
||||
provides better automation, including automatic failure analysis.
|
||||
It assumes a qemu/kvm-enabled platform, and runs guest OSes out of initrd.
|
||||
See tools/testing/selftests/rcutorture/doc/initrd.txt for instructions
|
||||
on setting up such an initrd.
|
||||
|
@ -890,6 +890,8 @@ SRCU: Critical sections Grace period Barrier
|
||||
srcu_read_lock_held
|
||||
|
||||
SRCU: Initialization/cleanup
|
||||
DEFINE_SRCU
|
||||
DEFINE_STATIC_SRCU
|
||||
init_srcu_struct
|
||||
cleanup_srcu_struct
|
||||
|
||||
@ -913,7 +915,8 @@ a. Will readers need to block? If so, you need SRCU.
|
||||
b. What about the -rt patchset? If readers would need to block
|
||||
in a non-rt kernel, you need SRCU. If readers would block
|
||||
in a -rt kernel, but not in a non-rt kernel, SRCU is not
|
||||
necessary.
|
||||
necessary. (The -rt patchset turns spinlocks into sleeplocks,
|
||||
hence this distinction.)
|
||||
|
||||
c. Do you need to treat NMI handlers, hardirq handlers,
|
||||
and code segments with preemption disabled (whether
|
||||
|
@ -9,8 +9,8 @@ TOMOYO is a name-based MAC extension (LSM module) for the Linux kernel.
|
||||
|
||||
LiveCD-based tutorials are available at
|
||||
|
||||
http://tomoyo.sourceforge.jp/1.7/1st-step/ubuntu10.04-live/
|
||||
http://tomoyo.sourceforge.jp/1.7/1st-step/centos5-live/
|
||||
http://tomoyo.sourceforge.jp/1.8/ubuntu12.04-live.html
|
||||
http://tomoyo.sourceforge.jp/1.8/centos6-live.html
|
||||
|
||||
Though these tutorials use non-LSM version of TOMOYO, they are useful for you
|
||||
to know what TOMOYO is.
|
||||
@ -21,35 +21,35 @@ How to enable TOMOYO?
|
||||
Build the kernel with ``CONFIG_SECURITY_TOMOYO=y`` and pass ``security=tomoyo`` on
|
||||
kernel's command line.
|
||||
|
||||
Please see http://tomoyo.sourceforge.jp/2.3/ for details.
|
||||
Please see http://tomoyo.osdn.jp/2.5/ for details.
|
||||
|
||||
Where is documentation?
|
||||
=======================
|
||||
|
||||
User <-> Kernel interface documentation is available at
|
||||
http://tomoyo.sourceforge.jp/2.3/policy-reference.html .
|
||||
http://tomoyo.osdn.jp/2.5/policy-specification/index.html .
|
||||
|
||||
Materials we prepared for seminars and symposiums are available at
|
||||
http://sourceforge.jp/projects/tomoyo/docs/?category_id=532&language_id=1 .
|
||||
http://osdn.jp/projects/tomoyo/docs/?category_id=532&language_id=1 .
|
||||
Below lists are chosen from three aspects.
|
||||
|
||||
What is TOMOYO?
|
||||
TOMOYO Linux Overview
|
||||
http://sourceforge.jp/projects/tomoyo/docs/lca2009-takeda.pdf
|
||||
http://osdn.jp/projects/tomoyo/docs/lca2009-takeda.pdf
|
||||
TOMOYO Linux: pragmatic and manageable security for Linux
|
||||
http://sourceforge.jp/projects/tomoyo/docs/freedomhectaipei-tomoyo.pdf
|
||||
http://osdn.jp/projects/tomoyo/docs/freedomhectaipei-tomoyo.pdf
|
||||
TOMOYO Linux: A Practical Method to Understand and Protect Your Own Linux Box
|
||||
http://sourceforge.jp/projects/tomoyo/docs/PacSec2007-en-no-demo.pdf
|
||||
http://osdn.jp/projects/tomoyo/docs/PacSec2007-en-no-demo.pdf
|
||||
|
||||
What can TOMOYO do?
|
||||
Deep inside TOMOYO Linux
|
||||
http://sourceforge.jp/projects/tomoyo/docs/lca2009-kumaneko.pdf
|
||||
http://osdn.jp/projects/tomoyo/docs/lca2009-kumaneko.pdf
|
||||
The role of "pathname based access control" in security.
|
||||
http://sourceforge.jp/projects/tomoyo/docs/lfj2008-bof.pdf
|
||||
http://osdn.jp/projects/tomoyo/docs/lfj2008-bof.pdf
|
||||
|
||||
History of TOMOYO?
|
||||
Realities of Mainlining
|
||||
http://sourceforge.jp/projects/tomoyo/docs/lfj2008.pdf
|
||||
http://osdn.jp/projects/tomoyo/docs/lfj2008.pdf
|
||||
|
||||
What is future plan?
|
||||
====================
|
||||
@ -60,6 +60,6 @@ multiple LSM modules at the same time. We feel sorry that you have to give up
|
||||
SELinux/SMACK/AppArmor etc. when you want to use TOMOYO.
|
||||
|
||||
We hope that LSM becomes stackable in future. Meanwhile, you can use non-LSM
|
||||
version of TOMOYO, available at http://tomoyo.sourceforge.jp/1.7/ .
|
||||
version of TOMOYO, available at http://tomoyo.osdn.jp/1.8/ .
|
||||
LSM version of TOMOYO is a subset of non-LSM version of TOMOYO. We are planning
|
||||
to port non-LSM version's functionalities to LSM versions.
|
||||
|
@ -3081,3 +3081,8 @@
|
||||
1 = /dev/osd1 Second OSD Device
|
||||
...
|
||||
255 = /dev/osd255 256th OSD Device
|
||||
|
||||
384-511 char RESERVED FOR DYNAMIC ASSIGNMENT
|
||||
Character devices that request a dynamic allocation of major
|
||||
number will take numbers starting from 511 and downward,
|
||||
once the 234-254 range is full.
|
||||
|
@ -138,6 +138,7 @@ parameter is applicable::
|
||||
PPT Parallel port support is enabled.
|
||||
PS2 Appropriate PS/2 support is enabled.
|
||||
RAM RAM disk support is enabled.
|
||||
RDT Intel Resource Director Technology.
|
||||
S390 S390 architecture is enabled.
|
||||
SCSI Appropriate SCSI support is enabled.
|
||||
A lot of drivers have their options described inside
|
||||
|
@ -2233,6 +2233,17 @@
|
||||
memory contents and reserves bad memory
|
||||
regions that are detected.
|
||||
|
||||
mem_encrypt= [X86-64] AMD Secure Memory Encryption (SME) control
|
||||
Valid arguments: on, off
|
||||
Default (depends on kernel configuration option):
|
||||
on (CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT=y)
|
||||
off (CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT=n)
|
||||
mem_encrypt=on: Activate SME
|
||||
mem_encrypt=off: Do not activate SME
|
||||
|
||||
Refer to Documentation/x86/amd-memory-encryption.txt
|
||||
for details on when memory encryption can be activated.
|
||||
|
||||
mem_sleep_default= [SUSPEND] Default system suspend mode:
|
||||
s2idle - Suspend-To-Idle
|
||||
shallow - Power-On Suspend or equivalent (if supported)
|
||||
@ -2633,9 +2644,10 @@
|
||||
In kernels built with CONFIG_NO_HZ_FULL=y, set
|
||||
the specified list of CPUs whose tick will be stopped
|
||||
whenever possible. The boot CPU will be forced outside
|
||||
the range to maintain the timekeeping.
|
||||
The CPUs in this range must also be included in the
|
||||
rcu_nocbs= set.
|
||||
the range to maintain the timekeeping. Any CPUs
|
||||
in this list will have their RCU callbacks offloaded,
|
||||
just as if they had also been called out in the
|
||||
rcu_nocbs= boot parameter.
|
||||
|
||||
noiotrap [SH] Disables trapped I/O port accesses.
|
||||
|
||||
@ -2696,6 +2708,8 @@
|
||||
nopat [X86] Disable PAT (page attribute table extension of
|
||||
pagetables) support.
|
||||
|
||||
nopcid [X86-64] Disable the PCID cpu feature.
|
||||
|
||||
norandmaps Don't use address space randomization. Equivalent to
|
||||
echo 0 > /proc/sys/kernel/randomize_va_space
|
||||
|
||||
@ -2750,6 +2764,15 @@
|
||||
If the dependencies are under your control, you can
|
||||
turn on cpu0_hotplug.
|
||||
|
||||
nps_mtm_hs_ctr= [KNL,ARC]
|
||||
This parameter sets the maximum duration, in
|
||||
cycles, each HW thread of the CTOP can run
|
||||
without interruptions, before HW switches it.
|
||||
The actual maximum duration is 16 times this
|
||||
parameter's value.
|
||||
Format: integer between 1 and 255
|
||||
Default: 255
|
||||
|
||||
nptcg= [IA-64] Override max number of concurrent global TLB
|
||||
purges which is reported from either PAL_VM_SUMMARY or
|
||||
SAL PALO.
|
||||
@ -2769,7 +2792,7 @@
|
||||
Allowed values are enable and disable
|
||||
|
||||
numa_zonelist_order= [KNL, BOOT] Select zonelist order for NUMA.
|
||||
one of ['zone', 'node', 'default'] can be specified
|
||||
'node', 'default' can be specified
|
||||
This can be set from sysctl after boot.
|
||||
See Documentation/sysctl/vm.txt for details.
|
||||
|
||||
@ -3598,6 +3621,12 @@
|
||||
Run specified binary instead of /init from the ramdisk,
|
||||
used for early userspace startup. See initrd.
|
||||
|
||||
rdt= [HW,X86,RDT]
|
||||
Turn on/off individual RDT features. List is:
|
||||
cmt, mbmtotal, mbmlocal, l3cat, l3cdp, l2cat, mba.
|
||||
E.g. to turn on cmt and turn off mba use:
|
||||
rdt=cmt,!mba
|
||||
|
||||
reboot= [KNL]
|
||||
Format (x86 or x86_64):
|
||||
[w[arm] | c[old] | h[ard] | s[oft] | g[pio]] \
|
||||
@ -4375,6 +4404,10 @@
|
||||
decrease the size and leave more room for directly
|
||||
mapped kernel RAM.
|
||||
|
||||
vmcp_cma=nn[MG] [KNL,S390]
|
||||
Sets the memory size reserved for contiguous memory
|
||||
allocations for the vmcp device driver.
|
||||
|
||||
vmhalt= [KNL,S390] Perform z/VM CP command after system halt.
|
||||
Format: <command>
|
||||
|
||||
|
@ -479,14 +479,6 @@ This governor exposes the following tunables:
|
||||
|
||||
# echo $(($(cat cpuinfo_transition_latency) * 750 / 1000)) > ondemand/sampling_rate
|
||||
|
||||
|
||||
``min_sampling_rate``
|
||||
The minimum value of ``sampling_rate``.
|
||||
|
||||
Equal to 10000 (10 ms) if :c:macro:`CONFIG_NO_HZ_COMMON` and
|
||||
:c:data:`tick_nohz_active` are both set or to 20 times the value of
|
||||
:c:data:`jiffies` in microseconds otherwise.
|
||||
|
||||
``up_threshold``
|
||||
If the estimated CPU load is above this value (in percent), the governor
|
||||
will set the frequency to the maximum value allowed for the policy.
|
||||
|
@ -5,12 +5,6 @@ Power Management
|
||||
.. toctree::
|
||||
:maxdepth: 2
|
||||
|
||||
cpufreq
|
||||
intel_pstate
|
||||
|
||||
.. only:: subproject and html
|
||||
|
||||
Indices
|
||||
=======
|
||||
|
||||
* :ref:`genindex`
|
||||
strategies
|
||||
system-wide
|
||||
working-state
|
||||
|
@ -167,35 +167,17 @@ is set.
|
||||
``powersave``
|
||||
.............
|
||||
|
||||
Without HWP, this P-state selection algorithm generally depends on the
|
||||
processor model and/or the system profile setting in the ACPI tables and there
|
||||
are two variants of it.
|
||||
|
||||
One of them is used with processors from the Atom line and (regardless of the
|
||||
processor model) on platforms with the system profile in the ACPI tables set to
|
||||
"mobile" (laptops mostly), "tablet", "appliance PC", "desktop", or
|
||||
"workstation". It is also used with processors supporting the HWP feature if
|
||||
that feature has not been enabled (that is, with the ``intel_pstate=no_hwp``
|
||||
argument in the kernel command line). It is similar to the algorithm
|
||||
Without HWP, this P-state selection algorithm is similar to the algorithm
|
||||
implemented by the generic ``schedutil`` scaling governor except that the
|
||||
utilization metric used by it is based on numbers coming from feedback
|
||||
registers of the CPU. It generally selects P-states proportional to the
|
||||
current CPU utilization, so it is referred to as the "proportional" algorithm.
|
||||
current CPU utilization.
|
||||
|
||||
The second variant of the ``powersave`` P-state selection algorithm, used in all
|
||||
of the other cases (generally, on processors from the Core line, so it is
|
||||
referred to as the "Core" algorithm), is based on the values read from the APERF
|
||||
and MPERF feedback registers and the previously requested target P-state.
|
||||
It does not really take CPU utilization into account explicitly, but as a rule
|
||||
it causes the CPU P-state to ramp up very quickly in response to increased
|
||||
utilization which is generally desirable in server environments.
|
||||
|
||||
Regardless of the variant, this algorithm is run by the driver's utilization
|
||||
update callback for the given CPU when it is invoked by the CPU scheduler, but
|
||||
not more often than every 10 ms (that can be tweaked via ``debugfs`` in `this
|
||||
particular case <Tuning Interface in debugfs_>`_). Like in the ``performance``
|
||||
case, the hardware configuration is not touched if the new P-state turns out to
|
||||
be the same as the current one.
|
||||
This algorithm is run by the driver's utilization update callback for the
|
||||
given CPU when it is invoked by the CPU scheduler, but not more often than
|
||||
every 10 ms. Like in the ``performance`` case, the hardware configuration
|
||||
is not touched if the new P-state turns out to be the same as the current
|
||||
one.
|
||||
|
||||
This is the default P-state selection algorithm if the
|
||||
:c:macro:`CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE` kernel configuration option
|
||||
@ -720,34 +702,7 @@ P-state is called, the ``ftrace`` filter can be set to to
|
||||
gnome-shell-3409 [001] ..s. 2537.650850: intel_pstate_set_pstate <-intel_pstate_timer_func
|
||||
<idle>-0 [000] ..s. 2537.654843: intel_pstate_set_pstate <-intel_pstate_timer_func
|
||||
|
||||
Tuning Interface in ``debugfs``
|
||||
-------------------------------
|
||||
|
||||
The ``powersave`` algorithm provided by ``intel_pstate`` for `the Core line of
|
||||
processors in the active mode <powersave_>`_ is based on a `PID controller`_
|
||||
whose parameters were chosen to address a number of different use cases at the
|
||||
same time. However, it still is possible to fine-tune it to a specific workload
|
||||
and the ``debugfs`` interface under ``/sys/kernel/debug/pstate_snb/`` is
|
||||
provided for this purpose. [Note that the ``pstate_snb`` directory will be
|
||||
present only if the specific P-state selection algorithm matching the interface
|
||||
in it actually is in use.]
|
||||
|
||||
The following files present in that directory can be used to modify the PID
|
||||
controller parameters at run time:
|
||||
|
||||
| ``deadband``
|
||||
| ``d_gain_pct``
|
||||
| ``i_gain_pct``
|
||||
| ``p_gain_pct``
|
||||
| ``sample_rate_ms``
|
||||
| ``setpoint``
|
||||
|
||||
Note, however, that achieving desirable results this way generally requires
|
||||
expert-level understanding of the power vs performance tradeoff, so extra care
|
||||
is recommended when attempting to do that.
|
||||
|
||||
|
||||
.. _LCEU2015: http://events.linuxfoundation.org/sites/events/files/slides/LinuxConEurope_2015.pdf
|
||||
.. _SDM: http://www.intel.com/content/www/us/en/architecture-and-technology/64-ia-32-architectures-software-developer-system-programming-manual-325384.html
|
||||
.. _ACPI specification: http://www.uefi.org/sites/default/files/resources/ACPI_6_1.pdf
|
||||
.. _PID controller: https://en.wikipedia.org/wiki/PID_controller
|
||||
|
245
Documentation/admin-guide/pm/sleep-states.rst
Normal file
245
Documentation/admin-guide/pm/sleep-states.rst
Normal file
@ -0,0 +1,245 @@
|
||||
===================
|
||||
System Sleep States
|
||||
===================
|
||||
|
||||
::
|
||||
|
||||
Copyright (c) 2017 Intel Corp., Rafael J. Wysocki <rafael.j.wysocki@intel.com>
|
||||
|
||||
Sleep states are global low-power states of the entire system in which user
|
||||
space code cannot be executed and the overall system activity is significantly
|
||||
reduced.
|
||||
|
||||
|
||||
Sleep States That Can Be Supported
|
||||
==================================
|
||||
|
||||
Depending on its configuration and the capabilities of the platform it runs on,
|
||||
the Linux kernel can support up to four system sleep states, including
|
||||
hibernation and up to three variants of system suspend. The sleep states that
|
||||
can be supported by the kernel are listed below.
|
||||
|
||||
.. _s2idle:
|
||||
|
||||
Suspend-to-Idle
|
||||
---------------
|
||||
|
||||
This is a generic, pure software, light-weight variant of system suspend (also
|
||||
referred to as S2I or S2Idle). It allows more energy to be saved relative to
|
||||
runtime idle by freezing user space, suspending the timekeeping and putting all
|
||||
I/O devices into low-power states (possibly lower-power than available in the
|
||||
working state), such that the processors can spend time in their deepest idle
|
||||
states while the system is suspended.
|
||||
|
||||
The system is woken up from this state by in-band interrupts, so theoretically
|
||||
any devices that can cause interrupts to be generated in the working state can
|
||||
also be set up as wakeup devices for S2Idle.
|
||||
|
||||
This state can be used on platforms without support for :ref:`standby <standby>`
|
||||
or :ref:`suspend-to-RAM <s2ram>`, or it can be used in addition to any of the
|
||||
deeper system suspend variants to provide reduced resume latency. It is always
|
||||
supported if the :c:macro:`CONFIG_SUSPEND` kernel configuration option is set.
|
||||
|
||||
.. _standby:
|
||||
|
||||
Standby
|
||||
-------
|
||||
|
||||
This state, if supported, offers moderate, but real, energy savings, while
|
||||
providing a relatively straightforward transition back to the working state. No
|
||||
operating state is lost (the system core logic retains power), so the system can
|
||||
go back to where it left off easily enough.
|
||||
|
||||
In addition to freezing user space, suspending the timekeeping and putting all
|
||||
I/O devices into low-power states, which is done for :ref:`suspend-to-idle
|
||||
<s2idle>` too, nonboot CPUs are taken offline and all low-level system functions
|
||||
are suspended during transitions into this state. For this reason, it should
|
||||
allow more energy to be saved relative to :ref:`suspend-to-idle <s2idle>`, but
|
||||
the resume latency will generally be greater than for that state.
|
||||
|
||||
The set of devices that can wake up the system from this state usually is
|
||||
reduced relative to :ref:`suspend-to-idle <s2idle>` and it may be necessary to
|
||||
rely on the platform for setting up the wakeup functionality as appropriate.
|
||||
|
||||
This state is supported if the :c:macro:`CONFIG_SUSPEND` kernel configuration
|
||||
option is set and the support for it is registered by the platform with the
|
||||
core system suspend subsystem. On ACPI-based systems this state is mapped to
|
||||
the S1 system state defined by ACPI.
|
||||
|
||||
.. _s2ram:
|
||||
|
||||
Suspend-to-RAM
|
||||
--------------
|
||||
|
||||
This state (also referred to as STR or S2RAM), if supported, offers significant
|
||||
energy savings as everything in the system is put into a low-power state, except
|
||||
for memory, which should be placed into the self-refresh mode to retain its
|
||||
contents. All of the steps carried out when entering :ref:`standby <standby>`
|
||||
are also carried out during transitions to S2RAM. Additional operations may
|
||||
take place depending on the platform capabilities. In particular, on ACPI-based
|
||||
systems the kernel passes control to the platform firmware (BIOS) as the last
|
||||
step during S2RAM transitions and that usually results in powering down some
|
||||
more low-level components that are not directly controlled by the kernel.
|
||||
|
||||
The state of devices and CPUs is saved and held in memory. All devices are
|
||||
suspended and put into low-power states. In many cases, all peripheral buses
|
||||
lose power when entering S2RAM, so devices must be able to handle the transition
|
||||
back to the "on" state.
|
||||
|
||||
On ACPI-based systems S2RAM requires some minimal boot-strapping code in the
|
||||
platform firmware to resume the system from it. This may be the case on other
|
||||
platforms too.
|
||||
|
||||
The set of devices that can wake up the system from S2RAM usually is reduced
|
||||
relative to :ref:`suspend-to-idle <s2idle>` and :ref:`standby <standby>` and it
|
||||
may be necessary to rely on the platform for setting up the wakeup functionality
|
||||
as appropriate.
|
||||
|
||||
S2RAM is supported if the :c:macro:`CONFIG_SUSPEND` kernel configuration option
|
||||
is set and the support for it is registered by the platform with the core system
|
||||
suspend subsystem. On ACPI-based systems it is mapped to the S3 system state
|
||||
defined by ACPI.
|
||||
|
||||
.. _hibernation:
|
||||
|
||||
Hibernation
|
||||
-----------
|
||||
|
||||
This state (also referred to as Suspend-to-Disk or STD) offers the greatest
|
||||
energy savings and can be used even in the absence of low-level platform support
|
||||
for system suspend. However, it requires some low-level code for resuming the
|
||||
system to be present for the underlying CPU architecture.
|
||||
|
||||
Hibernation is significantly different from any of the system suspend variants.
|
||||
It takes three system state changes to put it into hibernation and two system
|
||||
state changes to resume it.
|
||||
|
||||
First, when hibernation is triggered, the kernel stops all system activity and
|
||||
creates a snapshot image of memory to be written into persistent storage. Next,
|
||||
the system goes into a state in which the snapshot image can be saved, the image
|
||||
is written out and finally the system goes into the target low-power state in
|
||||
which power is cut from almost all of its hardware components, including memory,
|
||||
except for a limited set of wakeup devices.
|
||||
|
||||
Once the snapshot image has been written out, the system may either enter a
|
||||
special low-power state (like ACPI S4), or it may simply power down itself.
|
||||
Powering down means minimum power draw and it allows this mechanism to work on
|
||||
any system. However, entering a special low-power state may allow additional
|
||||
means of system wakeup to be used (e.g. pressing a key on the keyboard or
|
||||
opening a laptop lid).
|
||||
|
||||
After wakeup, control goes to the platform firmware that runs a boot loader
|
||||
which boots a fresh instance of the kernel (control may also go directly to
|
||||
the boot loader, depending on the system configuration, but anyway it causes
|
||||
a fresh instance of the kernel to be booted). That new instance of the kernel
|
||||
(referred to as the ``restore kernel``) looks for a hibernation image in
|
||||
persistent storage and if one is found, it is loaded into memory. Next, all
|
||||
activity in the system is stopped and the restore kernel overwrites itself with
|
||||
the image contents and jumps into a special trampoline area in the original
|
||||
kernel stored in the image (referred to as the ``image kernel``), which is where
|
||||
the special architecture-specific low-level code is needed. Finally, the
|
||||
image kernel restores the system to the pre-hibernation state and allows user
|
||||
space to run again.
|
||||
|
||||
Hibernation is supported if the :c:macro:`CONFIG_HIBERNATION` kernel
|
||||
configuration option is set. However, this option can only be set if support
|
||||
for the given CPU architecture includes the low-level code for system resume.
|
||||
|
||||
|
||||
Basic ``sysfs`` Interfaces for System Suspend and Hibernation
|
||||
=============================================================
|
||||
|
||||
The following files located in the :file:`/sys/power/` directory can be used by
|
||||
user space for sleep states control.
|
||||
|
||||
``state``
|
||||
This file contains a list of strings representing sleep states supported
|
||||
by the kernel. Writing one of these strings into it causes the kernel
|
||||
to start a transition of the system into the sleep state represented by
|
||||
that string.
|
||||
|
||||
In particular, the strings "disk", "freeze" and "standby" represent the
|
||||
:ref:`hibernation <hibernation>`, :ref:`suspend-to-idle <s2idle>` and
|
||||
:ref:`standby <standby>` sleep states, respectively. The string "mem"
|
||||
is interpreted in accordance with the contents of the ``mem_sleep`` file
|
||||
described below.
|
||||
|
||||
If the kernel does not support any system sleep states, this file is
|
||||
not present.
|
||||
|
||||
``mem_sleep``
|
||||
This file contains a list of strings representing supported system
|
||||
suspend variants and allows user space to select the variant to be
|
||||
associated with the "mem" string in the ``state`` file described above.
|
||||
|
||||
The strings that may be present in this file are "s2idle", "shallow"
|
||||
and "deep". The string "s2idle" always represents :ref:`suspend-to-idle
|
||||
<s2idle>` and, by convention, "shallow" and "deep" represent
|
||||
:ref:`standby <standby>` and :ref:`suspend-to-RAM <s2ram>`,
|
||||
respectively.
|
||||
|
||||
Writing one of the listed strings into this file causes the system
|
||||
suspend variant represented by it to be associated with the "mem" string
|
||||
in the ``state`` file. The string representing the suspend variant
|
||||
currently associated with the "mem" string in the ``state`` file
|
||||
is listed in square brackets.
|
||||
|
||||
If the kernel does not support system suspend, this file is not present.
|
||||
|
||||
``disk``
|
||||
This file contains a list of strings representing different operations
|
||||
that can be carried out after the hibernation image has been saved. The
|
||||
possible options are as follows:
|
||||
|
||||
``platform``
|
||||
Put the system into a special low-power state (e.g. ACPI S4) to
|
||||
make additional wakeup options available and possibly allow the
|
||||
platform firmware to take a simplified initialization path after
|
||||
wakeup.
|
||||
|
||||
``shutdown``
|
||||
Power off the system.
|
||||
|
||||
``reboot``
|
||||
Reboot the system (useful for diagnostics mostly).
|
||||
|
||||
``suspend``
|
||||
Hybrid system suspend. Put the system into the suspend sleep
|
||||
state selected through the ``mem_sleep`` file described above.
|
||||
If the system is successfully woken up from that state, discard
|
||||
the hibernation image and continue. Otherwise, use the image
|
||||
to restore the previous state of the system.
|
||||
|
||||
``test_resume``
|
||||
Diagnostic operation. Load the image as though the system had
|
||||
just woken up from hibernation and the currently running kernel
|
||||
instance was a restore kernel and follow up with full system
|
||||
resume.
|
||||
|
||||
Writing one of the listed strings into this file causes the option
|
||||
represented by it to be selected.
|
||||
|
||||
The currently selected option is shown in square brackets which means
|
||||
that the operation represented by it will be carried out after creating
|
||||
and saving the image next time hibernation is triggered by writing
|
||||
``disk`` to :file:`/sys/power/state`.
|
||||
|
||||
If the kernel does not support hibernation, this file is not present.
|
||||
|
||||
According to the above, there are two ways to make the system go into the
|
||||
:ref:`suspend-to-idle <s2idle>` state. The first one is to write "freeze"
|
||||
directly to :file:`/sys/power/state`. The second one is to write "s2idle" to
|
||||
:file:`/sys/power/mem_sleep` and then to write "mem" to
|
||||
:file:`/sys/power/state`. Likewise, there are two ways to make the system go
|
||||
into the :ref:`standby <standby>` state (the strings to write to the control
|
||||
files in that case are "standby" or "shallow" and "mem", respectively) if that
|
||||
state is supported by the platform. However, there is only one way to make the
|
||||
system go into the :ref:`suspend-to-RAM <s2ram>` state (write "deep" into
|
||||
:file:`/sys/power/mem_sleep` and "mem" into :file:`/sys/power/state`).
|
||||
|
||||
The default suspend variant (i.e. the one to be used without writing anything
|
||||
into :file:`/sys/power/mem_sleep`) is either "deep" (on the majority of systems
|
||||
supporting :ref:`suspend-to-RAM <s2ram>`) or "s2idle", but it can be overridden
|
||||
by the value of the "mem_sleep_default" parameter in the kernel command line.
|
||||
On some ACPI-based systems, depending on the information in the ACPI tables, the
|
||||
default may be "s2idle" even if :ref:`suspend-to-RAM <s2ram>` is supported.
|
52
Documentation/admin-guide/pm/strategies.rst
Normal file
52
Documentation/admin-guide/pm/strategies.rst
Normal file
@ -0,0 +1,52 @@
|
||||
===========================
|
||||
Power Management Strategies
|
||||
===========================
|
||||
|
||||
::
|
||||
|
||||
Copyright (c) 2017 Intel Corp., Rafael J. Wysocki <rafael.j.wysocki@intel.com>
|
||||
|
||||
The Linux kernel supports two major high-level power management strategies.
|
||||
|
||||
One of them is based on using global low-power states of the whole system in
|
||||
which user space code cannot be executed and the overall system activity is
|
||||
significantly reduced, referred to as :doc:`sleep states <sleep-states>`. The
|
||||
kernel puts the system into one of these states when requested by user space
|
||||
and the system stays in it until a special signal is received from one of
|
||||
designated devices, triggering a transition to the ``working state`` in which
|
||||
user space code can run. Because sleep states are global and the whole system
|
||||
is affected by the state changes, this strategy is referred to as the
|
||||
:doc:`system-wide power management <system-wide>`.
|
||||
|
||||
The other strategy, referred to as the :doc:`working-state power management
|
||||
<working-state>`, is based on adjusting the power states of individual hardware
|
||||
components of the system, as needed, in the working state. In consequence, if
|
||||
this strategy is in use, the working state of the system usually does not
|
||||
correspond to any particular physical configuration of it, but can be treated as
|
||||
a metastate covering a range of different power states of the system in which
|
||||
the individual components of it can be either ``active`` (in use) or
|
||||
``inactive`` (idle). If they are active, they have to be in power states
|
||||
allowing them to process data and to be accessed by software. In turn, if they
|
||||
are inactive, ideally, they should be in low-power states in which they may not
|
||||
be accessible.
|
||||
|
||||
If all of the system components are active, the system as a whole is regarded as
|
||||
"runtime active" and that situation typically corresponds to the maximum power
|
||||
draw (or maximum energy usage) of it. If all of them are inactive, the system
|
||||
as a whole is regarded as "runtime idle" which may be very close to a sleep
|
||||
state from the physical system configuration and power draw perspective, but
|
||||
then it takes much less time and effort to start executing user space code than
|
||||
for the same system in a sleep state. However, transitions from sleep states
|
||||
back to the working state can only be started by a limited set of devices, so
|
||||
typically the system can spend much more time in a sleep state than it can be
|
||||
runtime idle in one go. For this reason, systems usually use less energy in
|
||||
sleep states than when they are runtime idle most of the time.
|
||||
|
||||
Moreover, the two power management strategies address different usage scenarios.
|
||||
Namely, if the user indicates that the system will not be in use going forward,
|
||||
for example by closing its lid (if the system is a laptop), it probably should
|
||||
go into a sleep state at that point. On the other hand, if the user simply goes
|
||||
away from the laptop keyboard, it probably should stay in the working state and
|
||||
use the working-state power management in case it becomes idle, because the user
|
||||
may come back to it at any time and then may want the system to be immediately
|
||||
accessible.
|
8
Documentation/admin-guide/pm/system-wide.rst
Normal file
8
Documentation/admin-guide/pm/system-wide.rst
Normal file
@ -0,0 +1,8 @@
|
||||
============================
|
||||
System-Wide Power Management
|
||||
============================
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 2
|
||||
|
||||
sleep-states
|
9
Documentation/admin-guide/pm/working-state.rst
Normal file
9
Documentation/admin-guide/pm/working-state.rst
Normal file
@ -0,0 +1,9 @@
|
||||
==============================
|
||||
Working-State Power Management
|
||||
==============================
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 2
|
||||
|
||||
cpufreq
|
||||
intel_pstate
|
@ -60,7 +60,7 @@ Example of using a firmware operation:
|
||||
|
||||
/* some platform code, e.g. SMP initialization */
|
||||
|
||||
__raw_writel(virt_to_phys(exynos4_secondary_startup),
|
||||
__raw_writel(__pa_symbol(exynos4_secondary_startup),
|
||||
CPU1_BOOT_REG);
|
||||
|
||||
/* Call Exynos specific smc call */
|
||||
|
@ -179,6 +179,8 @@ infrastructure:
|
||||
| FCMA | [19-16] | y |
|
||||
|--------------------------------------------------|
|
||||
| JSCVT | [15-12] | y |
|
||||
|--------------------------------------------------|
|
||||
| DPB | [3-0] | y |
|
||||
x--------------------------------------------------x
|
||||
|
||||
Appendix I: Example
|
||||
|
66
Documentation/atomic_bitops.txt
Normal file
66
Documentation/atomic_bitops.txt
Normal file
@ -0,0 +1,66 @@
|
||||
|
||||
On atomic bitops.
|
||||
|
||||
|
||||
While our bitmap_{}() functions are non-atomic, we have a number of operations
|
||||
operating on single bits in a bitmap that are atomic.
|
||||
|
||||
|
||||
API
|
||||
---
|
||||
|
||||
The single bit operations are:
|
||||
|
||||
Non-RMW ops:
|
||||
|
||||
test_bit()
|
||||
|
||||
RMW atomic operations without return value:
|
||||
|
||||
{set,clear,change}_bit()
|
||||
clear_bit_unlock()
|
||||
|
||||
RMW atomic operations with return value:
|
||||
|
||||
test_and_{set,clear,change}_bit()
|
||||
test_and_set_bit_lock()
|
||||
|
||||
Barriers:
|
||||
|
||||
smp_mb__{before,after}_atomic()
|
||||
|
||||
|
||||
All RMW atomic operations have a '__' prefixed variant which is non-atomic.
|
||||
|
||||
|
||||
SEMANTICS
|
||||
---------
|
||||
|
||||
Non-atomic ops:
|
||||
|
||||
In particular __clear_bit_unlock() suffers the same issue as atomic_set(),
|
||||
which is why the generic version maps to clear_bit_unlock(), see atomic_t.txt.
|
||||
|
||||
|
||||
RMW ops:
|
||||
|
||||
The test_and_{}_bit() operations return the original value of the bit.
|
||||
|
||||
|
||||
ORDERING
|
||||
--------
|
||||
|
||||
Like with atomic_t, the rule of thumb is:
|
||||
|
||||
- non-RMW operations are unordered;
|
||||
|
||||
- RMW operations that have no return value are unordered;
|
||||
|
||||
- RMW operations that have a return value are fully ordered.
|
||||
|
||||
Except for test_and_set_bit_lock() which has ACQUIRE semantics and
|
||||
clear_bit_unlock() which has RELEASE semantics.
|
||||
|
||||
Since a platform only has a single means of achieving atomic operations,
|
||||
the same barriers as for atomic_t are used, see atomic_t.txt.
|
||||
|
242
Documentation/atomic_t.txt
Normal file
242
Documentation/atomic_t.txt
Normal file
@ -0,0 +1,242 @@
|
||||
|
||||
On atomic types (atomic_t, atomic64_t and atomic_long_t).
|
||||
|
||||
The atomic type provides an interface to the architecture's means of atomic
|
||||
RMW operations between CPUs (atomic operations on MMIO are not supported and
|
||||
can lead to fatal traps on some platforms).
|
||||
|
||||
API
|
||||
---
|
||||
|
||||
The 'full' API consists of (atomic64_ and atomic_long_ prefixes omitted for
|
||||
brevity):
|
||||
|
||||
Non-RMW ops:
|
||||
|
||||
atomic_read(), atomic_set()
|
||||
atomic_read_acquire(), atomic_set_release()
|
||||
|
||||
|
||||
RMW atomic operations:
|
||||
|
||||
Arithmetic:
|
||||
|
||||
atomic_{add,sub,inc,dec}()
|
||||
atomic_{add,sub,inc,dec}_return{,_relaxed,_acquire,_release}()
|
||||
atomic_fetch_{add,sub,inc,dec}{,_relaxed,_acquire,_release}()
|
||||
|
||||
|
||||
Bitwise:
|
||||
|
||||
atomic_{and,or,xor,andnot}()
|
||||
atomic_fetch_{and,or,xor,andnot}{,_relaxed,_acquire,_release}()
|
||||
|
||||
|
||||
Swap:
|
||||
|
||||
atomic_xchg{,_relaxed,_acquire,_release}()
|
||||
atomic_cmpxchg{,_relaxed,_acquire,_release}()
|
||||
atomic_try_cmpxchg{,_relaxed,_acquire,_release}()
|
||||
|
||||
|
||||
Reference count (but please see refcount_t):
|
||||
|
||||
atomic_add_unless(), atomic_inc_not_zero()
|
||||
atomic_sub_and_test(), atomic_dec_and_test()
|
||||
|
||||
|
||||
Misc:
|
||||
|
||||
atomic_inc_and_test(), atomic_add_negative()
|
||||
atomic_dec_unless_positive(), atomic_inc_unless_negative()
|
||||
|
||||
|
||||
Barriers:
|
||||
|
||||
smp_mb__{before,after}_atomic()
|
||||
|
||||
|
||||
|
||||
SEMANTICS
|
||||
---------
|
||||
|
||||
Non-RMW ops:
|
||||
|
||||
The non-RMW ops are (typically) regular LOADs and STOREs and are canonically
|
||||
implemented using READ_ONCE(), WRITE_ONCE(), smp_load_acquire() and
|
||||
smp_store_release() respectively.
|
||||
|
||||
The one detail to this is that atomic_set{}() should be observable to the RMW
|
||||
ops. That is:
|
||||
|
||||
C atomic-set
|
||||
|
||||
{
|
||||
atomic_set(v, 1);
|
||||
}
|
||||
|
||||
P1(atomic_t *v)
|
||||
{
|
||||
atomic_add_unless(v, 1, 0);
|
||||
}
|
||||
|
||||
P2(atomic_t *v)
|
||||
{
|
||||
atomic_set(v, 0);
|
||||
}
|
||||
|
||||
exists
|
||||
(v=2)
|
||||
|
||||
In this case we would expect the atomic_set() from CPU1 to either happen
|
||||
before the atomic_add_unless(), in which case that latter one would no-op, or
|
||||
_after_ in which case we'd overwrite its result. In no case is "2" a valid
|
||||
outcome.
|
||||
|
||||
This is typically true on 'normal' platforms, where a regular competing STORE
|
||||
will invalidate a LL/SC or fail a CMPXCHG.
|
||||
|
||||
The obvious case where this is not so is when we need to implement atomic ops
|
||||
with a lock:
|
||||
|
||||
CPU0                                            CPU1
|
||||
|
||||
atomic_add_unless(v, 1, 0);
|
||||
  lock();
|
||||
  ret = READ_ONCE(v->counter); // == 1
|
||||
                                                atomic_set(v, 0);
|
||||
  if (ret != u)                                   WRITE_ONCE(v->counter, 0);
|
||||
    WRITE_ONCE(v->counter, ret + 1);
|
||||
  unlock();
|
||||
|
||||
the typical solution is to then implement atomic_set{}() with atomic_xchg().
|
||||
|
||||
|
||||
RMW ops:
|
||||
|
||||
These come in various forms:
|
||||
|
||||
- plain operations without return value: atomic_{}()
|
||||
|
||||
- operations which return the modified value: atomic_{}_return()
|
||||
|
||||
these are limited to the arithmetic operations because those are
|
||||
reversible. Bitops are irreversible and therefore the modified value
|
||||
is of dubious utility.
|
||||
|
||||
- operations which return the original value: atomic_fetch_{}()
|
||||
|
||||
- swap operations: xchg(), cmpxchg() and try_cmpxchg()
|
||||
|
||||
- misc; the special purpose operations that are commonly used and would,
|
||||
given the interface, normally be implemented using (try_)cmpxchg loops but
|
||||
are time critical and can, (typically) on LL/SC architectures, be more
|
||||
efficiently implemented.
|
||||
|
||||
All these operations are SMP atomic; that is, the operations (for a single
|
||||
atomic variable) can be fully ordered and no intermediate state is lost or
|
||||
visible.
|
||||
|
||||
|
||||
ORDERING (go read memory-barriers.txt first)
|
||||
--------
|
||||
|
||||
The rule of thumb:
|
||||
|
||||
- non-RMW operations are unordered;
|
||||
|
||||
- RMW operations that have no return value are unordered;
|
||||
|
||||
- RMW operations that have a return value are fully ordered;
|
||||
|
||||
- RMW operations that are conditional are unordered on FAILURE,
|
||||
otherwise the above rules apply.
|
||||
|
||||
Except of course when an operation has an explicit ordering like:
|
||||
|
||||
{}_relaxed: unordered
|
||||
{}_acquire: the R of the RMW (or atomic_read) is an ACQUIRE
|
||||
{}_release: the W of the RMW (or atomic_set) is a RELEASE
|
||||
|
||||
Where 'unordered' is against other memory locations. Address dependencies are
|
||||
not defeated.
|
||||
|
||||
Fully ordered primitives are ordered against everything prior and everything
|
||||
subsequent. Therefore a fully ordered primitive is like having an smp_mb()
|
||||
before and an smp_mb() after the primitive.
|
||||
|
||||
|
||||
The barriers:
|
||||
|
||||
smp_mb__{before,after}_atomic()
|
||||
|
||||
only apply to the RMW ops and can be used to augment/upgrade the ordering
|
||||
inherent to the used atomic op. These barriers provide a full smp_mb().
|
||||
|
||||
These helper barriers exist because architectures have varying implicit
|
||||
ordering on their SMP atomic primitives. For example our TSO architectures
|
||||
provide fully ordered atomics and these barriers are no-ops.
|
||||
|
||||
Thus:
|
||||
|
||||
atomic_fetch_add();
|
||||
|
||||
is equivalent to:
|
||||
|
||||
smp_mb__before_atomic();
|
||||
atomic_fetch_add_relaxed();
|
||||
smp_mb__after_atomic();
|
||||
|
||||
However the atomic_fetch_add() might be implemented more efficiently.
|
||||
|
||||
Further, while something like:
|
||||
|
||||
smp_mb__before_atomic();
|
||||
atomic_dec(&X);
|
||||
|
||||
is a 'typical' RELEASE pattern, the barrier is strictly stronger than
|
||||
a RELEASE. Similarly for something like:
|
||||
|
||||
atomic_inc(&X);
|
||||
smp_mb__after_atomic();
|
||||
|
||||
is an ACQUIRE pattern (though very much not typical), but again the barrier is
|
||||
strictly stronger than ACQUIRE. As illustrated:
|
||||
|
||||
C strong-acquire
|
||||
|
||||
{
|
||||
}
|
||||
|
||||
P1(int *x, atomic_t *y)
|
||||
{
|
||||
r0 = READ_ONCE(*x);
|
||||
smp_rmb();
|
||||
r1 = atomic_read(y);
|
||||
}
|
||||
|
||||
P2(int *x, atomic_t *y)
|
||||
{
|
||||
atomic_inc(y);
|
||||
smp_mb__after_atomic();
|
||||
WRITE_ONCE(*x, 1);
|
||||
}
|
||||
|
||||
exists
|
||||
(r0=1 /\ r1=0)
|
||||
|
||||
This should not happen; but a hypothetical atomic_inc_acquire() --
|
||||
(void)atomic_fetch_inc_acquire() for instance -- would allow the outcome,
|
||||
since then:
|
||||
|
||||
P1                      P2
|
||||
|
||||
                        t = LL.acq *y (0)
|
||||
                        t++;
|
||||
                        *x = 1;
|
||||
r0 = *x (1)
|
||||
RMB
|
||||
r1 = *y (0)
|
||||
                        SC *y, t;
|
||||
|
||||
is allowed.
|
@ -16,14 +16,16 @@ throughput. So, when needed for achieving a lower latency, BFQ builds
|
||||
schedules that may lead to a lower throughput. If your main or only
|
||||
goal, for a given device, is to achieve the maximum-possible
|
||||
throughput at all times, then do switch off all low-latency heuristics
|
||||
for that device, by setting low_latency to 0. Full details in Section 3.
|
||||
for that device, by setting low_latency to 0. See Section 3 for
|
||||
details on how to configure BFQ for the desired tradeoff between
|
||||
latency and throughput, or on how to maximize throughput.
|
||||
|
||||
On average CPUs, the current version of BFQ can handle devices
|
||||
performing at most ~30 KIOPS; at most ~50 KIOPS on faster CPUs. As a
|
||||
reference, 30-50 KIOPS correspond to very high bandwidths with
|
||||
sequential I/O (e.g., 8-12 GB/s if I/O requests are 256 KB large), and
|
||||
to 120-200 MB/s with 4KB random I/O. BFQ has not yet been tested on
|
||||
multi-queue devices.
|
||||
to 120-200 MB/s with 4KB random I/O. BFQ is currently being tested on
|
||||
multi-queue devices too.
|
||||
|
||||
The table of contents follows. Impatient readers can just jump to Section 3.
|
||||
|
||||
@ -33,7 +35,7 @@ CONTENTS
|
||||
1-1 Personal systems
|
||||
1-2 Server systems
|
||||
2. How does BFQ work?
|
||||
3. What are BFQ's tunable?
|
||||
3. What are BFQ's tunables and how to properly configure BFQ?
|
||||
4. BFQ group scheduling
|
||||
4-1 Service guarantees provided
|
||||
4-2 Interface
|
||||
@ -145,19 +147,28 @@ plus a lot of code, are borrowed from CFQ.
|
||||
contrast, BFQ may idle the device for a short time interval,
|
||||
giving the process the chance to go on being served if it issues
|
||||
a new request in time. Device idling typically boosts the
|
||||
throughput on rotational devices, if processes do synchronous
|
||||
and sequential I/O. In addition, under BFQ, device idling is
|
||||
also instrumental in guaranteeing the desired throughput
|
||||
fraction to processes issuing sync requests (see the description
|
||||
of the slice_idle tunable in this document, or [1, 2], for more
|
||||
details).
|
||||
throughput on rotational devices and on non-queueing flash-based
|
||||
devices, if processes do synchronous and sequential I/O. In
|
||||
addition, under BFQ, device idling is also instrumental in
|
||||
guaranteeing the desired throughput fraction to processes
|
||||
issuing sync requests (see the description of the slice_idle
|
||||
tunable in this document, or [1, 2], for more details).
|
||||
|
||||
- With respect to idling for service guarantees, if several
|
||||
processes are competing for the device at the same time, but
|
||||
all processes (and groups, after the following commit) have
|
||||
the same weight, then BFQ guarantees the expected throughput
|
||||
distribution without ever idling the device. Throughput is
|
||||
thus as high as possible in this common scenario.
|
||||
all processes and groups have the same weight, then BFQ
|
||||
guarantees the expected throughput distribution without ever
|
||||
idling the device. Throughput is thus as high as possible in
|
||||
this common scenario.
|
||||
|
||||
- On flash-based storage with internal queueing of commands
|
||||
(typically NCQ), device idling happens to be always detrimental
|
||||
for throughput. So, with these devices, BFQ performs idling
|
||||
only when strictly needed for service guarantees, i.e., for
|
||||
guaranteeing low latency or fairness. In these cases, overall
|
||||
throughput may be sub-optimal. No solution currently exists to
|
||||
provide both strong service guarantees and optimal throughput
|
||||
on devices with internal queueing.
|
||||
|
||||
- If low-latency mode is enabled (default configuration), BFQ
|
||||
executes some special heuristics to detect interactive and soft
|
||||
@ -191,10 +202,7 @@ plus a lot of code, are borrowed from CFQ.
|
||||
- Queues are scheduled according to a variant of WF2Q+, named
|
||||
B-WF2Q+, and implemented using an augmented rb-tree to preserve an
|
||||
O(log N) overall complexity. See [2] for more details. B-WF2Q+ is
|
||||
also ready for hierarchical scheduling. However, for a cleaner
|
||||
logical breakdown, the code that enables and completes
|
||||
hierarchical support is provided in the next commit, which focuses
|
||||
exactly on this feature.
|
||||
also ready for hierarchical scheduling; see Section 4 for details.
|
||||
|
||||
- B-WF2Q+ guarantees a tight deviation with respect to an ideal,
|
||||
perfectly fair, and smooth service. In particular, B-WF2Q+
|
||||
@ -249,13 +257,24 @@ plus a lot of code, are borrowed from CFQ.
|
||||
the Idle class, to prevent it from starving.
|
||||
|
||||
|
||||
3. What are BFQ's tunable?
|
||||
==========================
|
||||
3. What are BFQ's tunables and how to properly configure BFQ?
|
||||
=============================================================
|
||||
|
||||
The tunables back_seek_max, back_seek_penalty, fifo_expire_async and
|
||||
fifo_expire_sync below are the same as in CFQ. Their description is
|
||||
just copied from that for CFQ. Some considerations in the description
|
||||
of slice_idle are copied from CFQ too.
|
||||
Most BFQ tunables affect service guarantees (basically latency and
|
||||
fairness) and throughput. For full details on how to choose the
|
||||
desired tradeoff between service guarantees and throughput, see the
|
||||
parameters slice_idle, strict_guarantees and low_latency. For details
|
||||
on how to maximise throughput, see slice_idle, timeout_sync and
|
||||
max_budget. The other performance-related parameters have been
|
||||
inherited from, and have been preserved mostly for compatibility with
|
||||
CFQ. So far, no performance improvement has been reported after
|
||||
changing the latter parameters in BFQ.
|
||||
|
||||
In particular, the tunables back_seek_max, back_seek_penalty,
|
||||
fifo_expire_async and fifo_expire_sync below are the same as in
|
||||
CFQ. Their description is just copied from that for CFQ. Some
|
||||
considerations in the description of slice_idle are copied from CFQ
|
||||
too.
|
||||
|
||||
per-process ioprio and weight
|
||||
-----------------------------
|
||||
@ -285,15 +304,17 @@ number of seeks and see improved throughput.
|
||||
|
||||
Setting slice_idle to 0 will remove all the idling on queues and one
|
||||
should see an overall improved throughput on faster storage devices
|
||||
like multiple SATA/SAS disks in hardware RAID configuration.
|
||||
like multiple SATA/SAS disks in hardware RAID configuration, as well
|
||||
as flash-based storage with internal command queueing (and
|
||||
parallelism).
|
||||
|
||||
So depending on storage and workload, it might be useful to set
|
||||
slice_idle=0. In general for SATA/SAS disks and software RAID of
|
||||
SATA/SAS disks keeping slice_idle enabled should be useful. For any
|
||||
configurations where there are multiple spindles behind single LUN
|
||||
(Host based hardware RAID controller or for storage arrays), setting
|
||||
slice_idle=0 might end up in better throughput and acceptable
|
||||
latencies.
|
||||
(Host based hardware RAID controller or for storage arrays), or with
|
||||
flash-based fast storage, setting slice_idle=0 might end up in better
|
||||
throughput and acceptable latencies.
|
||||
|
||||
Idling is however necessary to have service guarantees enforced in
|
||||
case of differentiated weights or differentiated I/O-request lengths.
|
||||
@ -312,13 +333,14 @@ There is an important flipside for idling: apart from the above cases
|
||||
where it is beneficial also for throughput, idling can severely impact
|
||||
throughput. One important case is random workload. Because of this
|
||||
issue, BFQ tends to avoid idling as much as possible, when it is not
|
||||
beneficial also for throughput. As a consequence of this behavior, and
|
||||
of further issues described for the strict_guarantees tunable,
|
||||
short-term service guarantees may be occasionally violated. And, in
|
||||
some cases, these guarantees may be more important than guaranteeing
|
||||
maximum throughput. For example, in video playing/streaming, a very
|
||||
low drop rate may be more important than maximum throughput. In these
|
||||
cases, consider setting the strict_guarantees parameter.
|
||||
beneficial also for throughput (as detailed in Section 2). As a
|
||||
consequence of this behavior, and of further issues described for the
|
||||
strict_guarantees tunable, short-term service guarantees may be
|
||||
occasionally violated. And, in some cases, these guarantees may be
|
||||
more important than guaranteeing maximum throughput. For example, in
|
||||
video playing/streaming, a very low drop rate may be more important
|
||||
than maximum throughput. In these cases, consider setting the
|
||||
strict_guarantees parameter.
|
||||
|
||||
strict_guarantees
|
||||
-----------------
|
||||
@ -420,6 +442,13 @@ The default value is 0, which enables auto-tuning: BFQ sets max_budget
|
||||
to the maximum number of sectors that can be served during
|
||||
timeout_sync, according to the estimated peak rate.
|
||||
|
||||
For specific devices, some users have occasionally reported to have
|
||||
reached a higher throughput by setting max_budget explicitly, i.e., by
|
||||
setting max_budget to a higher value than 0. In particular, they have
|
||||
set max_budget to higher values than those to which BFQ would have set
|
||||
it with auto-tuning. An alternative way to achieve this goal is to
|
||||
just increase the value of timeout_sync, leaving max_budget equal to 0.
|
||||
|
||||
weights
|
||||
-------
|
||||
|
||||
@ -427,51 +456,6 @@ Read-only parameter, used to show the weights of the currently active
|
||||
BFQ queues.
|
||||
|
||||
|
||||
wr_ tunables
|
||||
------------
|
||||
|
||||
BFQ exports a few parameters to control/tune the behavior of
|
||||
low-latency heuristics.
|
||||
|
||||
wr_coeff
|
||||
|
||||
Factor by which the weight of a weight-raised queue is multiplied. If
|
||||
the queue is deemed soft real-time, then the weight is further
|
||||
multiplied by an additional, constant factor.
|
||||
|
||||
wr_max_time
|
||||
|
||||
Maximum duration of a weight-raising period for an interactive task
|
||||
(ms). If set to zero (default value), then this value is computed
|
||||
automatically, as a function of the peak rate of the device. In any
|
||||
case, when the value of this parameter is read, it always reports the
|
||||
current duration, regardless of whether it has been set manually or
|
||||
computed automatically.
|
||||
|
||||
wr_max_softrt_rate
|
||||
|
||||
Maximum service rate below which a queue is deemed to be associated
|
||||
with a soft real-time application, and is then weight-raised
|
||||
accordingly (sectors/sec).
|
||||
|
||||
wr_min_idle_time
|
||||
|
||||
Minimum idle period after which interactive weight-raising may be
|
||||
reactivated for a queue (in ms).
|
||||
|
||||
wr_rt_max_time
|
||||
|
||||
Maximum weight-raising duration for soft real-time queues (in ms). The
|
||||
start time from which this duration is considered is automatically
|
||||
moved forward if the queue is detected to be still soft real-time
|
||||
before the current soft real-time weight-raising period finishes.
|
||||
|
||||
wr_min_inter_arr_async
|
||||
|
||||
Minimum period between I/O request arrivals after which weight-raising
|
||||
may be reactivated for an already busy async queue (in ms).
|
||||
|
||||
|
||||
4. Group scheduling with BFQ
|
||||
============================
|
||||
|
||||
|
@ -1,194 +0,0 @@
|
||||
This driver is for Compaq's SMART Array Controllers.
|
||||
|
||||
Supported Cards:
|
||||
----------------
|
||||
|
||||
This driver is known to work with the following cards:
|
||||
|
||||
* SA 5300
|
||||
* SA 5i
|
||||
* SA 532
|
||||
* SA 5312
|
||||
* SA 641
|
||||
* SA 642
|
||||
* SA 6400
|
||||
* SA 6400 U320 Expansion Module
|
||||
* SA 6i
|
||||
* SA P600
|
||||
* SA P800
|
||||
* SA E400
|
||||
* SA P400i
|
||||
* SA E200
|
||||
* SA E200i
|
||||
* SA E500
|
||||
* SA P700m
|
||||
* SA P212
|
||||
* SA P410
|
||||
* SA P410i
|
||||
* SA P411
|
||||
* SA P812
|
||||
* SA P712m
|
||||
* SA P711m
|
||||
|
||||
Detecting drive failures:
|
||||
-------------------------
|
||||
|
||||
To get the status of logical volumes and to detect physical drive
|
||||
failures, you can use the cciss_vol_status program found here:
|
||||
http://cciss.sourceforge.net/#cciss_utils
|
||||
|
||||
Device Naming:
|
||||
--------------
|
||||
|
||||
If nodes are not already created in the /dev/cciss directory, run as root:
|
||||
|
||||
# cd /dev
|
||||
# ./MAKEDEV cciss
|
||||
|
||||
You need some entries in /dev for the cciss device. The MAKEDEV script
|
||||
can make device nodes for you automatically. Currently the device setup
|
||||
is as follows:
|
||||
|
||||
Major numbers:
|
||||
104 cciss0
|
||||
105 cciss1
|
||||
106 cciss2
|
||||
107 cciss3
|
||||
108 cciss4
|
||||
109 cciss5
|
||||
110 cciss6
|
||||
111 cciss7
|
||||
|
||||
Minor numbers:
|
||||
b7 b6 b5 b4 b3 b2 b1 b0
|
||||
|----+----| |----+----|
|
||||
| |
|
||||
| +-------- Partition ID (0=wholedev, 1-15 partition)
|
||||
|
|
||||
+-------------------- Logical Volume number
|
||||
|
||||
The device naming scheme is:
|
||||
/dev/cciss/c0d0 Controller 0, disk 0, whole device
|
||||
/dev/cciss/c0d0p1 Controller 0, disk 0, partition 1
|
||||
/dev/cciss/c0d0p2 Controller 0, disk 0, partition 2
|
||||
/dev/cciss/c0d0p3 Controller 0, disk 0, partition 3
|
||||
|
||||
/dev/cciss/c1d1 Controller 1, disk 1, whole device
|
||||
/dev/cciss/c1d1p1 Controller 1, disk 1, partition 1
|
||||
/dev/cciss/c1d1p2 Controller 1, disk 1, partition 2
|
||||
/dev/cciss/c1d1p3 Controller 1, disk 1, partition 3
|
||||
|
||||
CCISS simple mode support
|
||||
-------------------------
|
||||
|
||||
The "cciss_simple_mode=1" boot parameter may be used to prevent the driver
|
||||
from putting the controller into "performant" mode. The difference is that
|
||||
with simple mode, each command completion requires an interrupt, while with
|
||||
"performant mode" (the default, and ordinarily better performing) it is
|
||||
possible to have multiple command completions indicated by a single
|
||||
interrupt.
|
||||
|
||||
SCSI tape drive and medium changer support
|
||||
------------------------------------------
|
||||
|
||||
SCSI sequential access devices and medium changer devices are supported and
|
||||
appropriate device nodes are automatically created. (e.g.
|
||||
/dev/st0, /dev/st1, etc. See the "st" man page for more details.)
|
||||
You must enable "SCSI tape drive support for Smart Array 5xxx" and
|
||||
"SCSI support" in your kernel configuration to be able to use SCSI
|
||||
tape drives with your Smart Array 5xxx controller.
|
||||
|
||||
Additionally, note that the driver will engage the SCSI core at init
|
||||
time if any tape drives or medium changers are detected. The driver may
|
||||
also be directed to dynamically engage the SCSI core via the /proc filesystem
|
||||
entry which the "block" side of the driver creates as
|
||||
/proc/driver/cciss/cciss* at runtime. This is best done via a script.
|
||||
|
||||
For example:
|
||||
|
||||
for x in /proc/driver/cciss/cciss[0-9]*
|
||||
do
|
||||
echo "engage scsi" > $x
|
||||
done
|
||||
|
||||
Once the SCSI core is engaged by the driver, it cannot be disengaged
|
||||
(except by unloading the driver, if it happens to be linked as a module.)
|
||||
|
||||
Note also that if no sequential access devices or medium changers are
|
||||
detected, the SCSI core will not be engaged by the action of the above
|
||||
script.
|
||||
|
||||
Hot plug support for SCSI tape drives
|
||||
-------------------------------------
|
||||
|
||||
Hot plugging of SCSI tape drives is supported, with some caveats.
|
||||
The cciss driver must be informed that changes to the SCSI bus
|
||||
have been made. This may be done via the /proc filesystem.
|
||||
For example:
|
||||
|
||||
echo "rescan" > /proc/scsi/cciss0/1
|
||||
|
||||
This causes the driver to query the adapter about changes to the
|
||||
physical SCSI buses and/or fibre channel arbitrated loop and the
|
||||
driver to make note of any new or removed sequential access devices
|
||||
or medium changers. The driver will output messages indicating what
|
||||
devices have been added or removed and the controller, bus, target and
|
||||
lun used to address the device. It then notifies the SCSI mid layer
|
||||
of these changes.
|
||||
|
||||
Note that the naming convention of the /proc filesystem entries
|
||||
contains a number in addition to the driver name. (E.g. "cciss0"
|
||||
instead of just "cciss" which you might expect.)
|
||||
|
||||
Note: ONLY sequential access devices and medium changers are presented
|
||||
as SCSI devices to the SCSI mid layer by the cciss driver. Specifically,
|
||||
physical SCSI disk drives are NOT presented to the SCSI mid layer. The
|
||||
physical SCSI disk drives are controlled directly by the array controller
|
||||
hardware and it is important to prevent the kernel from attempting to directly
|
||||
access these devices too, as if the array controller were merely a SCSI
|
||||
controller in the same way that we are allowing it to access SCSI tape drives.
|
||||
|
||||
SCSI error handling for tape drives and medium changers
|
||||
-------------------------------------------------------
|
||||
|
||||
The linux SCSI mid layer provides an error handling protocol which
|
||||
kicks into gear whenever a SCSI command fails to complete within a
|
||||
certain amount of time (which can vary depending on the command).
|
||||
The cciss driver participates in this protocol to some extent. The
|
||||
normal protocol is a four step process. First the device is told
|
||||
to abort the command. If that doesn't work, the device is reset.
|
||||
If that doesn't work, the SCSI bus is reset. If that doesn't work
|
||||
the host bus adapter is reset. Because the cciss driver is a block
|
||||
driver as well as a SCSI driver and only the tape drives and medium
|
||||
changers are presented to the SCSI mid layer, and unlike more
|
||||
straightforward SCSI drivers, disk i/o continues through the block
|
||||
side during the SCSI error recovery process, the cciss driver only
|
||||
implements the first two of these actions, aborting the command, and
|
||||
resetting the device. Additionally, most tape drives will not oblige
|
||||
in aborting commands, and sometimes it appears they will not even
|
||||
obey a reset command, though in most circumstances they will. In
|
||||
the case that the command cannot be aborted and the device cannot be
|
||||
reset, the device will be set offline.
|
||||
|
||||
In the event the error handling code is triggered and a tape drive is
|
||||
successfully reset or the tardy command is successfully aborted, the
|
||||
tape drive may still not allow i/o to continue until some command
|
||||
is issued which positions the tape to a known position. Typically you
|
||||
must rewind the tape (by issuing "mt -f /dev/st0 rewind" for example)
|
||||
before i/o can proceed again to a tape drive which was reset.
|
||||
|
||||
There is a cciss_tape_cmds module parameter which can be used to make cciss
|
||||
allocate more commands for use by tape drives. Ordinarily only a few commands
|
||||
(6) are allocated for tape drives because tape drives are slow and
|
||||
infrequently used and the primary purpose of Smart Array controllers is to
|
||||
act as a RAID controller for disk drives, so the vast majority of commands
|
||||
are allocated for disk devices. However, if you have more than a few tape
|
||||
drives attached to a smart array, the default number of commands may not be
|
||||
enough (for example, if you have 8 tape drives, you could only rewind 6
|
||||
at one time with the default number of commands.) The cciss_tape_cmds module
|
||||
parameter allows more commands (up to 16 more) to be allocated for use by
|
||||
tape drives. For example:
|
||||
|
||||
insmod cciss.ko cciss_tape_cmds=16
|
||||
|
||||
Or, as a kernel boot parameter passed in via grub: cciss.cciss_tape_cmds=8
|
@ -168,6 +168,7 @@ max_comp_streams RW the number of possible concurrent compress operations
|
||||
comp_algorithm RW show and change the compression algorithm
|
||||
compact WO trigger memory compaction
|
||||
debug_stat RO this file is used for zram debugging purposes
|
||||
backing_dev RW set up backend storage for zram to write out
|
||||
|
||||
|
||||
User space is advised to use the following files to read the device statistics.
|
||||
@ -231,5 +232,15 @@ line of text and contains the following stats separated by whitespace:
|
||||
resets the disksize to zero. You must set the disksize again
|
||||
before reusing the device.
|
||||
|
||||
* Optional Feature
|
||||
|
||||
= writeback
|
||||
|
||||
With incompressible pages, there is no memory saving with zram.
|
||||
Instead, with CONFIG_ZRAM_WRITEBACK, zram can write an incompressible page
|
||||
to backing storage rather than keeping it in memory.
|
||||
User should set up backing device via /sys/block/zramX/backing_dev
|
||||
before disksize setting.
|
||||
|
||||
Nitin Gupta
|
||||
ngupta@vflare.org
|
||||
|
@ -18,7 +18,9 @@ v1 is available under Documentation/cgroup-v1/.
|
||||
1-2. What is cgroup?
|
||||
2. Basic Operations
|
||||
2-1. Mounting
|
||||
2-2. Organizing Processes
|
||||
2-2. Organizing Processes and Threads
|
||||
2-2-1. Processes
|
||||
2-2-2. Threads
|
||||
2-3. [Un]populated Notification
|
||||
2-4. Controlling Controllers
|
||||
2-4-1. Enabling and Disabling
|
||||
@ -167,8 +169,11 @@ cgroup v2 currently supports the following mount options.
|
||||
Delegation section for details.
|
||||
|
||||
|
||||
Organizing Processes
|
||||
--------------------
|
||||
Organizing Processes and Threads
|
||||
--------------------------------
|
||||
|
||||
Processes
|
||||
~~~~~~~~~
|
||||
|
||||
Initially, only the root cgroup exists to which all processes belong.
|
||||
A child cgroup can be created by creating a sub-directory::
|
||||
@ -219,6 +224,105 @@ is removed subsequently, " (deleted)" is appended to the path::
|
||||
0::/test-cgroup/test-cgroup-nested (deleted)
|
||||
|
||||
|
||||
Threads
|
||||
~~~~~~~
|
||||
|
||||
cgroup v2 supports thread granularity for a subset of controllers to
|
||||
support use cases requiring hierarchical resource distribution across
|
||||
the threads of a group of processes. By default, all threads of a
|
||||
process belong to the same cgroup, which also serves as the resource
|
||||
domain to host resource consumptions which are not specific to a
|
||||
process or thread. The thread mode allows threads to be spread across
|
||||
a subtree while still maintaining the common resource domain for them.
|
||||
|
||||
Controllers which support thread mode are called threaded controllers.
|
||||
The ones which don't are called domain controllers.
|
||||
|
||||
Marking a cgroup threaded makes it join the resource domain of its
|
||||
parent as a threaded cgroup. The parent may be another threaded
|
||||
cgroup whose resource domain is further up in the hierarchy. The root
|
||||
of a threaded subtree, that is, the nearest ancestor which is not
|
||||
threaded, is called threaded domain or thread root interchangeably and
|
||||
serves as the resource domain for the entire subtree.
|
||||
|
||||
Inside a threaded subtree, threads of a process can be put in
|
||||
different cgroups and are not subject to the no internal process
|
||||
constraint - threaded controllers can be enabled on non-leaf cgroups
|
||||
whether they have threads in them or not.
|
||||
|
||||
As the threaded domain cgroup hosts all the domain resource
|
||||
consumptions of the subtree, it is considered to have internal
|
||||
resource consumptions whether there are processes in it or not and
|
||||
can't have populated child cgroups which aren't threaded. Because the
|
||||
root cgroup is not subject to no internal process constraint, it can
|
||||
serve both as a threaded domain and a parent to domain cgroups.
|
||||
|
||||
The current operation mode or type of the cgroup is shown in the
|
||||
"cgroup.type" file which indicates whether the cgroup is a normal
|
||||
domain, a domain which is serving as the domain of a threaded subtree,
|
||||
or a threaded cgroup.
|
||||
|
||||
On creation, a cgroup is always a domain cgroup and can be made
|
||||
threaded by writing "threaded" to the "cgroup.type" file. The
|
||||
operation is single direction::
|
||||
|
||||
# echo threaded > cgroup.type
|
||||
|
||||
Once threaded, the cgroup can't be made a domain again. To enable the
|
||||
thread mode, the following conditions must be met.
|
||||
|
||||
- As the cgroup will join the parent's resource domain, the parent
|
||||
must either be a valid (threaded) domain or a threaded cgroup.
|
||||
|
||||
- When the parent is an unthreaded domain, it must not have any domain
|
||||
controllers enabled or populated domain children. The root is
|
||||
exempt from this requirement.
|
||||
|
||||
Topology-wise, a cgroup can be in an invalid state. Please consider
|
||||
the following topology::
|
||||
|
||||
A (threaded domain) - B (threaded) - C (domain, just created)
|
||||
|
||||
C is created as a domain but isn't connected to a parent which can
|
||||
host child domains. C can't be used until it is turned into a
|
||||
threaded cgroup. "cgroup.type" file will report "domain invalid" in
|
||||
these cases. Operations which fail due to invalid topology use
|
||||
EOPNOTSUPP as the errno.
|
||||
|
||||
A domain cgroup is turned into a threaded domain when one of its child
|
||||
cgroups becomes threaded or threaded controllers are enabled in the
|
||||
"cgroup.subtree_control" file while there are processes in the cgroup.
|
||||
A threaded domain reverts to a normal domain when the conditions
|
||||
clear.
|
||||
|
||||
When read, "cgroup.threads" contains the list of the thread IDs of all
|
||||
threads in the cgroup. Except that the operations are per-thread
|
||||
instead of per-process, "cgroup.threads" has the same format and
|
||||
behaves the same way as "cgroup.procs". While "cgroup.threads" can be
|
||||
written to in any cgroup, as it can only move threads inside the same
|
||||
threaded domain, its operations are confined inside each threaded
|
||||
subtree.
|
||||
|
||||
The threaded domain cgroup serves as the resource domain for the whole
|
||||
subtree, and, while the threads can be scattered across the subtree,
|
||||
all the processes are considered to be in the threaded domain cgroup.
|
||||
"cgroup.procs" in a threaded domain cgroup contains the PIDs of all
|
||||
processes in the subtree and is not readable in the subtree proper.
|
||||
However, "cgroup.procs" can be written to from anywhere in the subtree
|
||||
to migrate all threads of the matching process to the cgroup.
|
||||
|
||||
Only threaded controllers can be enabled in a threaded subtree. When
|
||||
a threaded controller is enabled inside a threaded subtree, it only
|
||||
accounts for and controls resource consumptions associated with the
|
||||
threads in the cgroup and its descendants. All consumptions which
|
||||
aren't tied to a specific thread belong to the threaded domain cgroup.
|
||||
|
||||
Because a threaded subtree is exempt from no internal process
|
||||
constraint, a threaded controller must be able to handle competition
|
||||
between threads in a non-leaf cgroup and its child cgroups. Each
|
||||
threaded controller defines how such competitions are handled.
|
||||
|
||||
|
||||
[Un]populated Notification
|
||||
--------------------------
|
||||
|
||||
@ -302,15 +406,15 @@ disabled if one or more children have it enabled.
|
||||
No Internal Process Constraint
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Non-root cgroups can only distribute resources to their children when
|
||||
they don't have any processes of their own. In other words, only
|
||||
cgroups which don't contain any processes can have controllers enabled
|
||||
in their "cgroup.subtree_control" files.
|
||||
Non-root cgroups can distribute domain resources to their children
|
||||
only when they don't have any processes of their own. In other words,
|
||||
only domain cgroups which don't contain any processes can have domain
|
||||
controllers enabled in their "cgroup.subtree_control" files.
|
||||
|
||||
This guarantees that, when a controller is looking at the part of the
|
||||
hierarchy which has it enabled, processes are always only on the
|
||||
leaves. This rules out situations where child cgroups compete against
|
||||
internal processes of the parent.
|
||||
This guarantees that, when a domain controller is looking at the part
|
||||
of the hierarchy which has it enabled, processes are always only on
|
||||
the leaves. This rules out situations where child cgroups compete
|
||||
against internal processes of the parent.
|
||||
|
||||
The root cgroup is exempt from this restriction. Root contains
|
||||
processes and anonymous resource consumption which can't be associated
|
||||
@ -334,10 +438,10 @@ Model of Delegation
|
||||
~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
A cgroup can be delegated in two ways. First, to a less privileged
|
||||
user by granting write access of the directory and its "cgroup.procs"
|
||||
and "cgroup.subtree_control" files to the user. Second, if the
|
||||
"nsdelegate" mount option is set, automatically to a cgroup namespace
|
||||
on namespace creation.
|
||||
user by granting write access of the directory and its "cgroup.procs",
|
||||
"cgroup.threads" and "cgroup.subtree_control" files to the user.
|
||||
Second, if the "nsdelegate" mount option is set, automatically to a
|
||||
cgroup namespace on namespace creation.
|
||||
|
||||
Because the resource control interface files in a given directory
|
||||
control the distribution of the parent's resources, the delegatee
|
||||
@ -644,6 +748,29 @@ Core Interface Files
|
||||
|
||||
All cgroup core files are prefixed with "cgroup."
|
||||
|
||||
cgroup.type
|
||||
|
||||
A read-write single value file which exists on non-root
|
||||
cgroups.
|
||||
|
||||
When read, it indicates the current type of the cgroup, which
|
||||
can be one of the following values.
|
||||
|
||||
- "domain" : A normal valid domain cgroup.
|
||||
|
||||
- "domain threaded" : A threaded domain cgroup which is
|
||||
serving as the root of a threaded subtree.
|
||||
|
||||
- "domain invalid" : A cgroup which is in an invalid state.
|
||||
It can't be populated or have controllers enabled. It may
|
||||
be allowed to become a threaded cgroup.
|
||||
|
||||
- "threaded" : A threaded cgroup which is a member of a
|
||||
threaded subtree.
|
||||
|
||||
A cgroup can be turned into a threaded cgroup by writing
|
||||
"threaded" to this file.
|
||||
|
||||
cgroup.procs
|
||||
A read-write new-line separated values file which exists on
|
||||
all cgroups.
|
||||
@ -658,9 +785,6 @@ All cgroup core files are prefixed with "cgroup."
|
||||
the PID to the cgroup. The writer should match all of the
|
||||
following conditions.
|
||||
|
||||
- Its euid is either root or must match either uid or suid of
|
||||
the target process.
|
||||
|
||||
- It must have write access to the "cgroup.procs" file.
|
||||
|
||||
- It must have write access to the "cgroup.procs" file of the
|
||||
@ -669,6 +793,35 @@ All cgroup core files are prefixed with "cgroup."
|
||||
When delegating a sub-hierarchy, write access to this file
|
||||
should be granted along with the containing directory.
|
||||
|
||||
In a threaded cgroup, reading this file fails with EOPNOTSUPP
|
||||
as all the processes belong to the thread root. Writing is
|
||||
supported and moves every thread of the process to the cgroup.
|
||||
|
||||
cgroup.threads
|
||||
A read-write new-line separated values file which exists on
|
||||
all cgroups.
|
||||
|
||||
When read, it lists the TIDs of all threads which belong to
|
||||
the cgroup one-per-line. The TIDs are not ordered and the
|
||||
same TID may show up more than once if the thread got moved to
|
||||
another cgroup and then back or the TID got recycled while
|
||||
reading.
|
||||
|
||||
A TID can be written to migrate the thread associated with the
|
||||
TID to the cgroup. The writer should match all of the
|
||||
following conditions.
|
||||
|
||||
- It must have write access to the "cgroup.threads" file.
|
||||
|
||||
- The cgroup that the thread is currently in must be in the
|
||||
same resource domain as the destination cgroup.
|
||||
|
||||
- It must have write access to the "cgroup.procs" file of the
|
||||
common ancestor of the source and destination cgroups.
|
||||
|
||||
When delegating a sub-hierarchy, write access to this file
|
||||
should be granted along with the containing directory.
|
||||
|
||||
cgroup.controllers
|
||||
A read-only space separated values file which exists on all
|
||||
cgroups.
|
||||
@ -701,6 +854,38 @@ All cgroup core files are prefixed with "cgroup."
|
||||
1 if the cgroup or its descendants contains any live
|
||||
processes; otherwise, 0.
|
||||
|
||||
cgroup.max.descendants
|
||||
A read-write single value file. The default is "max".
|
||||
|
||||
Maximum allowed number of descendant cgroups.
|
||||
If the actual number of descendants is equal or larger,
|
||||
an attempt to create a new cgroup in the hierarchy will fail.
|
||||
|
||||
cgroup.max.depth
|
||||
A read-write single value file. The default is "max".
|
||||
|
||||
Maximum allowed descent depth below the current cgroup.
|
||||
If the actual descent depth is equal or larger,
|
||||
an attempt to create a new child cgroup will fail.
|
||||
|
||||
cgroup.stat
|
||||
A read-only flat-keyed file with the following entries:
|
||||
|
||||
nr_descendants
|
||||
Total number of visible descendant cgroups.
|
||||
|
||||
nr_dying_descendants
|
||||
Total number of dying descendant cgroups. A cgroup becomes
|
||||
dying after being deleted by a user. The cgroup will remain
|
||||
in dying state for some undefined time (which can depend
|
||||
on system load) before being completely destroyed.
|
||||
|
||||
A process can't enter a dying cgroup under any circumstances,
|
||||
a dying cgroup can't revive.
|
||||
|
||||
A dying cgroup can consume system resources not exceeding
|
||||
limits, which were active at the moment of cgroup deletion.
|
||||
|
||||
|
||||
Controllers
|
||||
===========
|
||||
|
@ -29,7 +29,7 @@ from load_config import loadConfig
|
||||
# -- General configuration ------------------------------------------------
|
||||
|
||||
# If your documentation needs a minimal Sphinx version, state it here.
|
||||
needs_sphinx = '1.2'
|
||||
needs_sphinx = '1.3'
|
||||
|
||||
# Add any Sphinx extension module names here, as strings. They can be
|
||||
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
|
||||
@ -271,10 +271,29 @@ latex_elements = {
|
||||
|
||||
# Additional stuff for the LaTeX preamble.
|
||||
'preamble': '''
|
||||
\\usepackage{ifthen}
|
||||
% Use some font with UTF-8 support with XeLaTeX
|
||||
\\usepackage{fontspec}
|
||||
\\setsansfont{DejaVu Serif}
|
||||
\\setromanfont{DejaVu Sans}
|
||||
\\setmonofont{DejaVu Sans Mono}
|
||||
|
||||
% Allow generate some pages in landscape
|
||||
\\usepackage{lscape}
|
||||
'''
|
||||
}
|
||||
|
||||
# Fix reference escape troubles with Sphinx 1.4.x
|
||||
if major == 1 and minor > 3:
|
||||
latex_elements['preamble'] += '\\renewcommand*{\\DUrole}[2]{ #2 }\n'
|
||||
|
||||
if major == 1 and minor <= 4:
|
||||
latex_elements['preamble'] += '\\usepackage[margin=0.5in, top=1in, bottom=1in]{geometry}'
|
||||
elif major == 1 and (minor > 5 or (minor == 5 and patch >= 3)):
|
||||
latex_elements['sphinxsetup'] = 'hmargin=0.5in, vmargin=1in'
|
||||
latex_elements['preamble'] += '\\fvset{fontsize=auto}\n'
|
||||
|
||||
# Customize notice background colors on Sphinx < 1.6:
|
||||
if major == 1 and minor < 6:
|
||||
latex_elements['preamble'] += '''
|
||||
\\usepackage{ifthen}
|
||||
|
||||
% Put notes in color and let them be inside a table
|
||||
\\definecolor{NoteColor}{RGB}{204,255,255}
|
||||
@ -325,27 +344,26 @@ latex_elements = {
|
||||
}
|
||||
\\makeatother
|
||||
|
||||
% Use some font with UTF-8 support with XeLaTeX
|
||||
\\usepackage{fontspec}
|
||||
\\setsansfont{DejaVu Serif}
|
||||
\\setromanfont{DejaVu Sans}
|
||||
\\setmonofont{DejaVu Sans Mono}
|
||||
|
||||
% To allow adjusting table sizes
|
||||
\\usepackage{adjustbox}
|
||||
|
||||
'''
|
||||
}
|
||||
|
||||
# Fix reference escape troubles with Sphinx 1.4.x
|
||||
if major == 1 and minor > 3:
|
||||
latex_elements['preamble'] += '\\renewcommand*{\\DUrole}[2]{ #2 }\n'
|
||||
|
||||
if major == 1 and minor <= 4:
|
||||
latex_elements['preamble'] += '\\usepackage[margin=0.5in, top=1in, bottom=1in]{geometry}'
|
||||
elif major == 1 and (minor > 5 or (minor == 5 and patch >= 3)):
|
||||
latex_elements['sphinxsetup'] = 'hmargin=0.5in, vmargin=0.5in'
|
||||
|
||||
# With Sphinx 1.6, it is possible to change the Bg color directly
|
||||
# by using:
|
||||
# \definecolor{sphinxnoteBgColor}{RGB}{204,255,255}
|
||||
# \definecolor{sphinxwarningBgColor}{RGB}{255,204,204}
|
||||
# \definecolor{sphinxattentionBgColor}{RGB}{255,255,204}
|
||||
# \definecolor{sphinximportantBgColor}{RGB}{192,255,204}
|
||||
#
|
||||
# However, it requires using the sphinx heavy box with:
|
||||
#
|
||||
# \renewenvironment{sphinxlightbox} {%
|
||||
# \\begin{sphinxheavybox}
|
||||
# }
|
||||
# \\end{sphinxheavybox}
|
||||
# }
|
||||
#
|
||||
# Unfortunately, the implementation is buggy: if a note is inside a
|
||||
# table, it isn't displayed well. So, for now, let's use boring
|
||||
# black and white notes.
|
||||
|
||||
# Grouping the document tree into LaTeX files. List of tuples
|
||||
# (source start file, target name, title,
|
||||
|
144
Documentation/core-api/genalloc.rst
Normal file
144
Documentation/core-api/genalloc.rst
Normal file
@ -0,0 +1,144 @@
|
||||
The genalloc/genpool subsystem
|
||||
==============================
|
||||
|
||||
There are a number of memory-allocation subsystems in the kernel, each
|
||||
aimed at a specific need. Sometimes, however, a kernel developer needs to
|
||||
implement a new allocator for a specific range of special-purpose memory;
|
||||
often that memory is located on a device somewhere. The author of the
|
||||
driver for that device can certainly write a little allocator to get the
|
||||
job done, but that is the way to fill the kernel with dozens of poorly
|
||||
tested allocators. Back in 2005, Jes Sorensen lifted one of those
|
||||
allocators from the sym53c8xx_2 driver and posted_ it as a generic module
|
||||
for the creation of ad hoc memory allocators. This code was merged
|
||||
for the 2.6.13 release; it has been modified considerably since then.
|
||||
|
||||
.. _posted: https://lwn.net/Articles/125842/
|
||||
|
||||
Code using this allocator should include <linux/genalloc.h>. The action
|
||||
begins with the creation of a pool using one of:
|
||||
|
||||
.. kernel-doc:: lib/genalloc.c
|
||||
:functions: gen_pool_create
|
||||
|
||||
.. kernel-doc:: lib/genalloc.c
|
||||
:functions: devm_gen_pool_create
|
||||
|
||||
A call to :c:func:`gen_pool_create` will create a pool. The granularity of
|
||||
allocations is set with min_alloc_order; it is a log-base-2 number like
|
||||
those used by the page allocator, but it refers to bytes rather than pages.
|
||||
So, if min_alloc_order is passed as 3, then all allocations will be a
|
||||
multiple of eight bytes. Increasing min_alloc_order decreases the memory
|
||||
required to track the memory in the pool. The nid parameter specifies
|
||||
which NUMA node should be used for the allocation of the housekeeping
|
||||
structures; it can be -1 if the caller doesn't care.
|
||||
|
||||
The "managed" interface :c:func:`devm_gen_pool_create` ties the pool to a
|
||||
specific device. Among other things, it will automatically clean up the
|
||||
pool when the given device is destroyed.
|
||||
|
||||
A pool is shut down with:
|
||||
|
||||
.. kernel-doc:: lib/genalloc.c
|
||||
:functions: gen_pool_destroy
|
||||
|
||||
It's worth noting that, if there are still allocations outstanding from the
|
||||
given pool, this function will take the rather extreme step of invoking
|
||||
BUG(), crashing the entire system. You have been warned.
|
||||
|
||||
A freshly created pool has no memory to allocate. It is fairly useless in
|
||||
that state, so one of the first orders of business is usually to add memory
|
||||
to the pool. That can be done with one of:
|
||||
|
||||
.. kernel-doc:: include/linux/genalloc.h
|
||||
:functions: gen_pool_add
|
||||
|
||||
.. kernel-doc:: lib/genalloc.c
|
||||
:functions: gen_pool_add_virt
|
||||
|
||||
A call to :c:func:`gen_pool_add` will place the size bytes of memory
|
||||
starting at addr (in the kernel's virtual address space) into the given
|
||||
pool, once again using nid as the node ID for ancillary memory allocations.
|
||||
The :c:func:`gen_pool_add_virt` variant associates an explicit physical
|
||||
address with the memory; this is only necessary if the pool will be used
|
||||
for DMA allocations.
|
||||
|
||||
The functions for allocating memory from the pool (and putting it back)
|
||||
are:
|
||||
|
||||
.. kernel-doc:: lib/genalloc.c
|
||||
:functions: gen_pool_alloc
|
||||
|
||||
.. kernel-doc:: lib/genalloc.c
|
||||
:functions: gen_pool_dma_alloc
|
||||
|
||||
.. kernel-doc:: lib/genalloc.c
|
||||
:functions: gen_pool_free
|
||||
|
||||
As one would expect, :c:func:`gen_pool_alloc` will allocate size bytes
|
||||
from the given pool. The :c:func:`gen_pool_dma_alloc` variant allocates
|
||||
memory for use with DMA operations, returning the associated physical
|
||||
address in the space pointed to by dma. This will only work if the memory
|
||||
was added with :c:func:`gen_pool_add_virt`. Note that this function
|
||||
departs from the usual genpool pattern of using unsigned long values to
|
||||
represent kernel addresses; it returns a void * instead.
|
||||
|
||||
That all seems relatively simple; indeed, some developers clearly found it
|
||||
to be too simple. After all, the interface above provides no control over
|
||||
how the allocation functions choose which specific piece of memory to
|
||||
return. If that sort of control is needed, the following functions will be
|
||||
of interest:
|
||||
|
||||
.. kernel-doc:: lib/genalloc.c
|
||||
:functions: gen_pool_alloc_algo
|
||||
|
||||
.. kernel-doc:: lib/genalloc.c
|
||||
:functions: gen_pool_set_algo
|
||||
|
||||
Allocations with :c:func:`gen_pool_alloc_algo` specify an algorithm to be
|
||||
used to choose the memory to be allocated; the default algorithm can be set
|
||||
with :c:func:`gen_pool_set_algo`. The data value is passed to the
|
||||
algorithm; most ignore it, but it is occasionally needed. One can,
|
||||
naturally, write a special-purpose algorithm, but there is a fair set
|
||||
already available:
|
||||
|
||||
- gen_pool_first_fit is a simple first-fit allocator; this is the default
|
||||
algorithm if none other has been specified.
|
||||
|
||||
- gen_pool_first_fit_align forces the allocation to have a specific
|
||||
alignment (passed via data in a genpool_data_align structure).
|
||||
|
||||
- gen_pool_first_fit_order_align aligns the allocation to the order of the
|
||||
size. A 60-byte allocation will thus be 64-byte aligned, for example.
|
||||
|
||||
- gen_pool_best_fit, as one would expect, is a simple best-fit allocator.
|
||||
|
||||
- gen_pool_fixed_alloc allocates at a specific offset (passed in a
|
||||
genpool_data_fixed structure via the data parameter) within the pool.
|
||||
If the indicated memory is not available the allocation fails.
|
||||
|
||||
There is a handful of other functions, mostly for purposes like querying
|
||||
the space available in the pool or iterating through chunks of memory.
|
||||
Most users, however, should not need much beyond what has been described
|
||||
above. With luck, wider awareness of this module will help to prevent the
|
||||
writing of special-purpose memory allocators in the future.
|
||||
|
||||
.. kernel-doc:: lib/genalloc.c
|
||||
:functions: gen_pool_virt_to_phys
|
||||
|
||||
.. kernel-doc:: lib/genalloc.c
|
||||
:functions: gen_pool_for_each_chunk
|
||||
|
||||
.. kernel-doc:: lib/genalloc.c
|
||||
:functions: addr_in_gen_pool
|
||||
|
||||
.. kernel-doc:: lib/genalloc.c
|
||||
:functions: gen_pool_avail
|
||||
|
||||
.. kernel-doc:: lib/genalloc.c
|
||||
:functions: gen_pool_size
|
||||
|
||||
.. kernel-doc:: lib/genalloc.c
|
||||
:functions: gen_pool_get
|
||||
|
||||
.. kernel-doc:: lib/genalloc.c
|
||||
:functions: of_gen_pool_get
|
@ -20,6 +20,7 @@ Core utilities
|
||||
genericirq
|
||||
flexible-arrays
|
||||
librs
|
||||
genalloc
|
||||
|
||||
Interfaces for kernel debugging
|
||||
===============================
|
||||
|
@ -344,3 +344,52 @@ codecs, and devices with strict requirements for interface clocking.
|
||||
|
||||
.. kernel-doc:: include/linux/clk.h
|
||||
:internal:
|
||||
|
||||
Synchronization Primitives
|
||||
==========================
|
||||
|
||||
Read-Copy Update (RCU)
|
||||
----------------------
|
||||
|
||||
.. kernel-doc:: include/linux/rcupdate.h
|
||||
:external:
|
||||
|
||||
.. kernel-doc:: include/linux/rcupdate_wait.h
|
||||
:external:
|
||||
|
||||
.. kernel-doc:: include/linux/rcutree.h
|
||||
:external:
|
||||
|
||||
.. kernel-doc:: kernel/rcu/tree.c
|
||||
:external:
|
||||
|
||||
.. kernel-doc:: kernel/rcu/tree_plugin.h
|
||||
:external:
|
||||
|
||||
.. kernel-doc:: kernel/rcu/tree_exp.h
|
||||
:external:
|
||||
|
||||
.. kernel-doc:: kernel/rcu/update.c
|
||||
:external:
|
||||
|
||||
.. kernel-doc:: include/linux/srcu.h
|
||||
:external:
|
||||
|
||||
.. kernel-doc:: kernel/rcu/srcutree.c
|
||||
:external:
|
||||
|
||||
.. kernel-doc:: include/linux/rculist_bl.h
|
||||
:external:
|
||||
|
||||
.. kernel-doc:: include/linux/rculist.h
|
||||
:external:
|
||||
|
||||
.. kernel-doc:: include/linux/rculist_nulls.h
|
||||
:external:
|
||||
|
||||
.. kernel-doc:: include/linux/rcu_sync.h
|
||||
:external:
|
||||
|
||||
.. kernel-doc:: kernel/rcu/sync.c
|
||||
:external:
|
||||
|
||||
|
@ -243,11 +243,15 @@ throttling the number of active work items, specifying '0' is
|
||||
recommended.
|
||||
|
||||
Some users depend on the strict execution ordering of ST wq. The
|
||||
combination of ``@max_active`` of 1 and ``WQ_UNBOUND`` is used to
|
||||
achieve this behavior. Work items on such wq are always queued to the
|
||||
unbound worker-pools and only one work item can be active at any given
|
||||
combination of ``@max_active`` of 1 and ``WQ_UNBOUND`` used to
|
||||
achieve this behavior. Work items on such wq were always queued to the
|
||||
unbound worker-pools and only one work item could be active at any given
|
||||
time thus achieving the same ordering property as ST wq.
|
||||
|
||||
In the current implementation the above configuration only guarantees
|
||||
ST behavior within a given NUMA node. Instead alloc_ordered_workqueue should
|
||||
be used to achieve system wide ST behavior.
|
||||
|
||||
|
||||
Example Execution Scenarios
|
||||
===========================
|
||||
|
@ -31,11 +31,13 @@ Setup
|
||||
CONFIG_DEBUG_INFO_REDUCED off. If your architecture supports
|
||||
CONFIG_FRAME_POINTER, keep it enabled.
|
||||
|
||||
- Install that kernel on the guest.
|
||||
- Install that kernel on the guest, turn off KASLR if necessary by adding
|
||||
"nokaslr" to the kernel command line.
|
||||
Alternatively, QEMU allows booting the kernel directly using -kernel,
|
||||
-append, -initrd command line switches. This is generally only useful if
|
||||
you do not depend on modules. See QEMU documentation for more details on
|
||||
this mode.
|
||||
this mode. In this case, you should build the kernel with
|
||||
CONFIG_RANDOMIZE_BASE disabled if the architecture supports KASLR.
|
||||
|
||||
- Enable the gdb stub of QEMU/KVM, either
|
||||
|
||||
|
@ -348,6 +348,15 @@ default behavior is always set to 0.
|
||||
- ``echo 1 > /sys/module/debug_core/parameters/kgdbreboot``
|
||||
- Enter the debugger on reboot notify.
|
||||
|
||||
Kernel parameter: ``nokaslr``
|
||||
-----------------------------
|
||||
|
||||
If the architecture that you are using enables KASLR by default,
|
||||
you should consider turning it off. KASLR randomizes the
|
||||
virtual address where the kernel image is mapped and confuses
|
||||
gdb which resolves kernel symbol addresses from the symbol table
|
||||
of vmlinux.
|
||||
|
||||
Using kdb
|
||||
=========
|
||||
|
||||
@ -358,7 +367,7 @@ This is a quick example of how to use kdb.
|
||||
|
||||
1. Configure kgdboc at boot using kernel parameters::
|
||||
|
||||
console=ttyS0,115200 kgdboc=ttyS0,115200
|
||||
console=ttyS0,115200 kgdboc=ttyS0,115200 nokaslr
|
||||
|
||||
OR
|
||||
|
||||
|
7
Documentation/devicetree/bindings/arc/hsdk.txt
Normal file
7
Documentation/devicetree/bindings/arc/hsdk.txt
Normal file
@ -0,0 +1,7 @@
|
||||
Synopsys DesignWare ARC HS Development Kit Device Tree Bindings
|
||||
---------------------------------------------------------------------------
|
||||
|
||||
ARC HSDK Board with quad-core ARC HS38x4 in silicon.
|
||||
|
||||
Required root node properties:
|
||||
- compatible = "snps,hsdk";
|
@ -1,6 +1,18 @@
|
||||
Amlogic MesonX device tree bindings
|
||||
-------------------------------------------
|
||||
|
||||
Work in progress statement:
|
||||
|
||||
Device tree files and bindings applying to Amlogic SoCs and boards are
|
||||
considered "unstable". Any Amlogic device tree binding may change at
|
||||
any time. Be sure to use a device tree binary and a kernel image
|
||||
generated from the same source tree.
|
||||
|
||||
Please refer to Documentation/devicetree/bindings/ABI.txt for a definition of a
|
||||
stable binding/ABI.
|
||||
|
||||
---------------------------------------------------------------
|
||||
|
||||
Boards with the Amlogic Meson6 SoC shall have the following properties:
|
||||
Required root node property:
|
||||
compatible: "amlogic,meson6"
|
||||
@ -61,3 +73,32 @@ Board compatible values (alphabetically, grouped by SoC):
|
||||
- "amlogic,q201" (Meson gxm s912)
|
||||
- "kingnovel,r-box-pro" (Meson gxm S912)
|
||||
- "nexbox,a1" (Meson gxm s912)
|
||||
|
||||
Amlogic Meson Firmware registers Interface
|
||||
------------------------------------------
|
||||
|
||||
The Meson SoCs have a register bank with status and data shared with the
|
||||
secure firmware.
|
||||
|
||||
Required properties:
|
||||
- compatible: For Meson GX SoCs, must be "amlogic,meson-gx-ao-secure", "syscon"
|
||||
|
||||
Properties should identify components of this register interface :
|
||||
|
||||
Meson GX SoC Information
|
||||
------------------------
|
||||
A firmware register encodes the SoC type, package and revision information on
|
||||
the Meson GX SoCs.
|
||||
If present, the following property should be added :
|
||||
|
||||
Optional properties:
|
||||
- amlogic,has-chip-id: If present, the interface gives the current SoC version.
|
||||
|
||||
Example
|
||||
-------
|
||||
|
||||
ao-secure@140 {
|
||||
compatible = "amlogic,meson-gx-ao-secure", "syscon";
|
||||
reg = <0x0 0x140 0x0 0x140>;
|
||||
amlogic,has-chip-id;
|
||||
};
|
||||
|
@ -108,6 +108,5 @@ Example:
|
||||
frame-number = <1>
|
||||
interrupts = <0 15 0x8>;
|
||||
reg = <0xf0003000 0x1000>;
|
||||
status = "disabled";
|
||||
};
|
||||
};
|
||||
|
@ -42,6 +42,10 @@ Raspberry Pi Zero
|
||||
Required root node properties:
|
||||
compatible = "raspberrypi,model-zero", "brcm,bcm2835";
|
||||
|
||||
Raspberry Pi Zero W
|
||||
Required root node properties:
|
||||
compatible = "raspberrypi,model-zero-w", "brcm,bcm2835";
|
||||
|
||||
Generic BCM2835 board
|
||||
Required root node properties:
|
||||
compatible = "brcm,bcm2835";
|
||||
|
6
Documentation/devicetree/bindings/arm/bhf.txt
Normal file
6
Documentation/devicetree/bindings/arm/bhf.txt
Normal file
@ -0,0 +1,6 @@
|
||||
Beckhoff Automation Platforms Device Tree Bindings
|
||||
--------------------------------------------------
|
||||
|
||||
CX9020 Embedded PC
|
||||
Required root node properties:
|
||||
- compatible = "bhf,cx9020", "fsl,imx53";
|
@ -34,8 +34,8 @@ its hardware characteristcs.
|
||||
- Embedded Trace Macrocell (version 4.x):
|
||||
"arm,coresight-etm4x", "arm,primecell";
|
||||
|
||||
- Qualcomm Configurable Replicator (version 1.x):
|
||||
"qcom,coresight-replicator1x", "arm,primecell";
|
||||
- Coresight programmable Replicator :
|
||||
"arm,coresight-dynamic-replicator", "arm,primecell";
|
||||
|
||||
- System Trace Macrocell:
|
||||
"arm,coresight-stm", "arm,primecell"; [1]
|
||||
|
@ -200,6 +200,7 @@ described below.
|
||||
"arm,realview-smp"
|
||||
"brcm,bcm11351-cpu-method"
|
||||
"brcm,bcm23550"
|
||||
"brcm,bcm2836-smp"
|
||||
"brcm,bcm-nsp-smp"
|
||||
"brcm,brahma-b15"
|
||||
"marvell,armada-375-smp"
|
||||
|
15
Documentation/devicetree/bindings/arm/marvell/armada-8kp.txt
Normal file
15
Documentation/devicetree/bindings/arm/marvell/armada-8kp.txt
Normal file
@ -0,0 +1,15 @@
|
||||
Marvell Armada 8KPlus Platforms Device Tree Bindings
|
||||
----------------------------------------------------
|
||||
|
||||
Boards using a SoC of the Marvell Armada 8KP families must carry
|
||||
the following root node property:
|
||||
|
||||
- compatible, with one of the following values:
|
||||
|
||||
- "marvell,armada-8080", "marvell,armada-ap810-octa", "marvell,armada-ap810"
|
||||
when the SoC being used is the Armada 8080
|
||||
|
||||
Example:
|
||||
|
||||
compatible = "marvell,armada-8080-db", "marvell,armada-8080",
|
||||
"marvell,armada-ap810-octa", "marvell,armada-ap810"
|
@ -183,7 +183,6 @@ cpm_syscon0: system-controller@440000 {
|
||||
gpio-controller;
|
||||
#gpio-cells = <2>;
|
||||
gpio-ranges = <&cpm_pinctrl 0 0 32>;
|
||||
status = "disabled";
|
||||
};
|
||||
|
||||
};
|
||||
|
@ -1,12 +1,12 @@
|
||||
MediaTek mt65xx, mt67xx & mt81xx Platforms Device Tree Bindings
|
||||
MediaTek SoC based Platforms Device Tree Bindings
|
||||
|
||||
Boards with a MediaTek mt65xx/mt67xx/mt81xx SoC shall have the
|
||||
following property:
|
||||
Boards with a MediaTek SoC shall have the following property:
|
||||
|
||||
Required root node property:
|
||||
|
||||
compatible: Must contain one of
|
||||
"mediatek,mt2701"
|
||||
"mediatek,mt2712"
|
||||
"mediatek,mt6580"
|
||||
"mediatek,mt6589"
|
||||
"mediatek,mt6592"
|
||||
@ -14,7 +14,8 @@ compatible: Must contain one of
|
||||
"mediatek,mt6795"
|
||||
"mediatek,mt6797"
|
||||
"mediatek,mt7622"
|
||||
"mediatek,mt7623"
|
||||
"mediatek,mt7623" which refers to the MT7623N SoC
|
||||
"mediatek,mt7623a"
|
||||
"mediatek,mt8127"
|
||||
"mediatek,mt8135"
|
||||
"mediatek,mt8173"
|
||||
@ -25,6 +26,9 @@ Supported boards:
|
||||
- Evaluation board for MT2701:
|
||||
Required root node properties:
|
||||
- compatible = "mediatek,mt2701-evb", "mediatek,mt2701";
|
||||
- Evaluation board for MT2712:
|
||||
Required root node properties:
|
||||
- compatible = "mediatek,mt2712-evb", "mediatek,mt2712";
|
||||
- Evaluation board for MT6580:
|
||||
Required root node properties:
|
||||
- compatible = "mediatek,mt6580-evbp1", "mediatek,mt6580";
|
||||
@ -46,9 +50,11 @@ Supported boards:
|
||||
- Reference board variant 1 for MT7622:
|
||||
Required root node properties:
|
||||
- compatible = "mediatek,mt7622-rfb1", "mediatek,mt7622";
|
||||
- Evaluation board for MT7623:
|
||||
- Reference board for MT7623n with NAND:
|
||||
Required root node properties:
|
||||
- compatible = "mediatek,mt7623-evb", "mediatek,mt7623";
|
||||
- compatible = "mediatek,mt7623n-rfb-nand", "mediatek,mt7623";
|
||||
- Bananapi BPI-R2 board:
|
||||
- compatible = "bananapi,bpi-r2", "mediatek,mt7623";
|
||||
- MTK mt8127 tablet moose EVB:
|
||||
Required root node properties:
|
||||
- compatible = "mediatek,mt8127-moose", "mediatek,mt8127";
|
||||
|
@ -80,6 +80,9 @@ SoCs:
|
||||
- OMAP5432
|
||||
compatible = "ti,omap5432", "ti,omap5"
|
||||
|
||||
- DRA762
|
||||
compatible = "ti,dra762", "ti,dra7"
|
||||
|
||||
- DRA742
|
||||
compatible = "ti,dra742", "ti,dra74", "ti,dra7"
|
||||
|
||||
@ -154,6 +157,9 @@ Boards:
|
||||
- AM335X phyCORE-AM335x: Development kit
|
||||
compatible = "phytec,am335x-pcm-953", "phytec,am335x-phycore-som", "ti,am33xx"
|
||||
|
||||
- AM335X UC-8100-ME-T: Communication-centric industrial computing platform
|
||||
compatible = "moxa,uc-8100-me-t", "ti,am33xx";
|
||||
|
||||
- OMAP5 EVM : Evaluation Module
|
||||
compatible = "ti,omap5-evm", "ti,omap5"
|
||||
|
||||
@ -184,6 +190,9 @@ Boards:
|
||||
- AM5718 IDK
|
||||
compatible = "ti,am5718-idk", "ti,am5718", "ti,dra7"
|
||||
|
||||
- DRA762 EVM: Software Development Board for DRA762
|
||||
compatible = "ti,dra76-evm", "ti,dra762", "ti,dra7"
|
||||
|
||||
- DRA742 EVM: Software Development Board for DRA742
|
||||
compatible = "ti,dra7-evm", "ti,dra742", "ti,dra74", "ti,dra7"
|
||||
|
||||
|
@ -9,9 +9,11 @@ Required properties:
|
||||
- compatible : should be one of
|
||||
"apm,potenza-pmu"
|
||||
"arm,armv8-pmuv3"
|
||||
"arm,cortex-a73-pmu"
|
||||
"arm,cortex-a72-pmu"
|
||||
"arm,cortex-a57-pmu"
|
||||
"arm,cortex-a53-pmu"
|
||||
"arm,cortex-a35-pmu"
|
||||
"arm,cortex-a17-pmu"
|
||||
"arm,cortex-a15-pmu"
|
||||
"arm,cortex-a12-pmu"
|
||||
|
@ -25,6 +25,7 @@ The 'SoC' element must be one of the following strings:
|
||||
msm8994
|
||||
msm8996
|
||||
mdm9615
|
||||
ipq8074
|
||||
|
||||
The 'board' element must be one of the following strings:
|
||||
|
||||
@ -33,6 +34,7 @@ The 'board' element must be one of the following strings:
|
||||
dragonboard
|
||||
mtp
|
||||
sbc
|
||||
hk01
|
||||
|
||||
The 'soc_version' and 'board_version' elements take the form of v<Major>.<Minor>
|
||||
where the minor number may be omitted when it's zero, i.e. v1.0 is the same
|
||||
|
@ -134,6 +134,10 @@ Rockchip platforms device tree bindings
|
||||
Required root node properties:
|
||||
- compatible = "phytec,rk3288-pcm-947", "phytec,rk3288-phycore-som", "rockchip,rk3288";
|
||||
|
||||
- Pine64 Rock64 board:
|
||||
Required root node properties:
|
||||
- compatible = "pine64,rock64", "rockchip,rk3328";
|
||||
|
||||
- Rockchip PX3 Evaluation board:
|
||||
Required root node properties:
|
||||
- compatible = "rockchip,px3-evb", "rockchip,px3", "rockchip,rk3188";
|
||||
@ -173,6 +177,14 @@ Rockchip platforms device tree bindings
|
||||
Required root node properties:
|
||||
- compatible = "rockchip,rk3399-evb", "rockchip,rk3399";
|
||||
|
||||
- Rockchip RK3399 Sapphire Excavator board:
|
||||
Required root node properties:
|
||||
- compatible = "rockchip,rk3399-sapphire-excavator", "rockchip,rk3399";
|
||||
|
||||
- Theobroma Systems RK3399-Q7 Haikou Baseboard:
|
||||
Required root node properties:
|
||||
- compatible = "tsd,rk3399-q7-haikou", "rockchip,rk3399";
|
||||
|
||||
- Tronsmart Orion R68 Meta
|
||||
Required root node properties:
|
||||
- compatible = "tronsmart,orion-r68-meta", "rockchip,rk3368";
|
||||
|
@ -39,6 +39,8 @@ SoCs:
|
||||
compatible = "renesas,r8a7795"
|
||||
- R-Car M3-W (R8A77960)
|
||||
compatible = "renesas,r8a7796"
|
||||
- R-Car D3 (R8A77995)
|
||||
compatible = "renesas,r8a77995"
|
||||
|
||||
|
||||
Boards:
|
||||
@ -53,6 +55,8 @@ Boards:
|
||||
compatible = "renesas,blanche", "renesas,r8a7792"
|
||||
- BOCK-W
|
||||
compatible = "renesas,bockw", "renesas,r8a7778"
|
||||
- Draak (RTP0RC77995SEB0010S)
|
||||
compatible = "renesas,draak", "renesas,r8a77995"
|
||||
- Genmai (RTK772100BC00000BR)
|
||||
compatible = "renesas,genmai", "renesas,r7s72100"
|
||||
- GR-Peach (X28A-M01-E/F)
|
||||
@ -64,6 +68,10 @@ Boards:
|
||||
compatible = "renesas,h3ulcb", "renesas,r8a7795";
|
||||
- Henninger
|
||||
compatible = "renesas,henninger", "renesas,r8a7791"
|
||||
- iWave Systems RZ/G1E SODIMM SOM Development Platform (iW-RainboW-G22D)
|
||||
compatible = "iwave,g22d", "iwave,g22m", "renesas,r8a7745"
|
||||
- iWave Systems RZ/G1E SODIMM System On Module (iW-RainboW-G22M-SM)
|
||||
compatible = "iwave,g22m", "renesas,r8a7745"
|
||||
- iWave Systems RZ/G1M Qseven Development Platform (iW-RainboW-G20D-Qseven)
|
||||
compatible = "iwave,g20d", "iwave,g20m", "renesas,r8a7743"
|
||||
- iWave Systems RZ/G1M Qseven System On Module (iW-RainboW-G20M-Qseven)
|
||||
|
51
Documentation/devicetree/bindings/ata/ahci-mtk.txt
Normal file
51
Documentation/devicetree/bindings/ata/ahci-mtk.txt
Normal file
@ -0,0 +1,51 @@
|
||||
MediaTek Serial ATA controller
|
||||
|
||||
Required properties:
|
||||
- compatible : Must be "mediatek,<chip>-ahci", "mediatek,mtk-ahci".
|
||||
When using "mediatek,mtk-ahci" compatible strings, you
|
||||
need SoC specific ones in addition, one of:
|
||||
- "mediatek,mt7622-ahci"
|
||||
- reg : Physical base addresses and length of register sets.
|
||||
- interrupts : Interrupt associated with the SATA device.
|
||||
- interrupt-names : Associated name must be: "hostc".
|
||||
- clocks : A list of phandle and clock specifier pairs, one for each
|
||||
entry in clock-names.
|
||||
- clock-names : Associated names must be: "ahb", "axi", "asic", "rbc", "pm".
|
||||
- phys : A phandle and PHY specifier pair for the PHY port.
|
||||
- phy-names : Associated name must be: "sata-phy".
|
||||
- ports-implemented : See ./ahci-platform.txt for details.
|
||||
|
||||
Optional properties:
|
||||
- power-domains : A phandle and power domain specifier pair to the power
|
||||
domain which is responsible for collapsing and restoring
|
||||
power to the peripheral.
|
||||
- resets : Must contain an entry for each entry in reset-names.
|
||||
See ../reset/reset.txt for details.
|
||||
- reset-names : Associated names must be: "axi", "sw", "reg".
|
||||
- mediatek,phy-mode : A phandle to the system controller, used to enable
|
||||
SATA function.
|
||||
|
||||
Example:
|
||||
|
||||
sata: sata@1a200000 {
|
||||
compatible = "mediatek,mt7622-ahci",
|
||||
"mediatek,mtk-ahci";
|
||||
reg = <0 0x1a200000 0 0x1100>;
|
||||
interrupts = <GIC_SPI 233 IRQ_TYPE_LEVEL_HIGH>;
|
||||
interrupt-names = "hostc";
|
||||
clocks = <&pciesys CLK_SATA_AHB_EN>,
|
||||
<&pciesys CLK_SATA_AXI_EN>,
|
||||
<&pciesys CLK_SATA_ASIC_EN>,
|
||||
<&pciesys CLK_SATA_RBC_EN>,
|
||||
<&pciesys CLK_SATA_PM_EN>;
|
||||
clock-names = "ahb", "axi", "asic", "rbc", "pm";
|
||||
phys = <&u3port1 PHY_TYPE_SATA>;
|
||||
phy-names = "sata-phy";
|
||||
ports-implemented = <0x1>;
|
||||
power-domains = <&scpsys MT7622_POWER_DOMAIN_HIF0>;
|
||||
resets = <&pciesys MT7622_SATA_AXI_BUS_RST>,
|
||||
<&pciesys MT7622_SATA_PHY_SW_RST>,
|
||||
<&pciesys MT7622_SATA_PHY_REG_RST>;
|
||||
reset-names = "axi", "sw", "reg";
|
||||
mediatek,phy-mode = <&pciesys>;
|
||||
};
|
@ -57,7 +57,6 @@ Example:
|
||||
<0x0 0x1f227000 0x0 0x1000>;
|
||||
interrupts = <0x0 0x87 0x4>;
|
||||
dma-coherent;
|
||||
status = "ok";
|
||||
clocks = <&sataclk 0>;
|
||||
phys = <&phy2 0>;
|
||||
phy-names = "sata-phy";
|
||||
@ -72,7 +71,6 @@ Example:
|
||||
<0x0 0x1f237000 0x0 0x1000>;
|
||||
interrupts = <0x0 0x88 0x4>;
|
||||
dma-coherent;
|
||||
status = "ok";
|
||||
clocks = <&sataclk 0>;
|
||||
phys = <&phy3 0>;
|
||||
phy-names = "sata-phy";
|
||||
|
@ -13,5 +13,4 @@ Example:
|
||||
reg = <0x83fe0000 0x4000>;
|
||||
interrupts = <70>;
|
||||
clocks = <&clks 161>;
|
||||
status = "disabled";
|
||||
};
|
||||
|
@ -227,7 +227,6 @@ See the example below, where a more complete device tree is shown:
|
||||
};
|
||||
|
||||
devbus-bootcs {
|
||||
status = "okay";
|
||||
ranges = <0 MBUS_ID(0x01, 0x2f) 0 0x8000000>;
|
||||
|
||||
/* NOR */
|
||||
@ -240,7 +239,6 @@ See the example below, where a more complete device tree is shown:
|
||||
|
||||
pcie-controller {
|
||||
compatible = "marvell,armada-xp-pcie";
|
||||
status = "okay";
|
||||
device_type = "pci";
|
||||
|
||||
#address-cells = <3>;
|
||||
@ -258,7 +256,6 @@ See the example below, where a more complete device tree is shown:
|
||||
|
||||
pcie@1,0 {
|
||||
/* Port 0, Lane 0 */
|
||||
status = "okay";
|
||||
};
|
||||
};
|
||||
|
||||
|
@ -84,7 +84,6 @@ gmi@70090000 {
|
||||
reset-names = "gmi";
|
||||
ranges = <4 0 0xd0000000 0xfffffff>;
|
||||
|
||||
status = "okay";
|
||||
|
||||
bus@4,0 {
|
||||
compatible = "simple-bus";
|
||||
@ -121,7 +120,6 @@ gmi@70090000 {
|
||||
reset-names = "gmi";
|
||||
ranges = <4 0 0xd0000000 0xfffffff>;
|
||||
|
||||
status = "okay";
|
||||
|
||||
can@4,0 {
|
||||
reg = <4 0 0x100>;
|
||||
|
@ -33,7 +33,6 @@ Example:
|
||||
#size-cells = <1>;
|
||||
ranges = <0x702c0000 0x0 0x702c0000 0x00040000>;
|
||||
|
||||
status = "disabled";
|
||||
|
||||
child1 {
|
||||
...
|
||||
|
@ -5,9 +5,31 @@ The chosen node does not represent a real device, but serves as a place
|
||||
for passing data between firmware and the operating system, like boot
|
||||
arguments. Data in the chosen node does not represent the hardware.
|
||||
|
||||
The following properties are recognized:
|
||||
|
||||
stdout-path property
|
||||
--------------------
|
||||
|
||||
kaslr-seed
|
||||
-----------
|
||||
|
||||
This property is used when booting with CONFIG_RANDOMIZE_BASE as the
|
||||
entropy used to randomize the kernel image base address location. Since
|
||||
it is used directly, this value is intended only for KASLR, and should
|
||||
not be used for other purposes (as it may leak information about KASLR
|
||||
offsets). It is parsed as a u64 value, e.g.
|
||||
|
||||
/ {
|
||||
chosen {
|
||||
kaslr-seed = <0xfeedbeef 0xc0def00d>;
|
||||
};
|
||||
};
|
||||
|
||||
Note that if this property is set from UEFI (or a bootloader in EFI
|
||||
mode) when EFI_RNG_PROTOCOL is supported, it will be overwritten by
|
||||
the Linux EFI stub (which will populate the property itself, using
|
||||
EFI_RNG_PROTOCOL).
|
||||
|
||||
stdout-path
|
||||
-----------
|
||||
|
||||
Device trees may specify the device to be used for boot console output
|
||||
with a stdout-path property under /chosen, as described in the Devicetree
|
||||
|
@ -102,7 +102,6 @@ uart4: serial@80010000 {
|
||||
reg = <0x80010000 0x4000>;
|
||||
clocks = <&acc CLKID_SYS_UART4>, <&acc CLKID_AHB_UART4>;
|
||||
interrupts = <19>;
|
||||
status = "disabled";
|
||||
};
|
||||
|
||||
Clock consumer with only one, _AHB_ sink.
|
||||
|
@ -5,9 +5,11 @@ controllers within the Always-On part of the SoC.
|
||||
|
||||
Required Properties:
|
||||
|
||||
- compatible: should be "amlogic,gxbb-aoclkc"
|
||||
- reg: physical base address of the clock controller and length of memory
|
||||
mapped region.
|
||||
- compatible: value should be different for each SoC family as :
|
||||
- GXBB (S905) : "amlogic,meson-gxbb-aoclkc"
|
||||
- GXL (S905X, S905D) : "amlogic,meson-gxl-aoclkc"
|
||||
- GXM (S912) : "amlogic,meson-gxm-aoclkc"
|
||||
followed by the common "amlogic,meson-gx-aoclkc"
|
||||
|
||||
- #clock-cells: should be 1.
|
||||
|
||||
@ -23,14 +25,22 @@ to specify the reset which they consume. All available resets are defined as
|
||||
preprocessor macros in the dt-bindings/reset/gxbb-aoclkc.h header and can be
|
||||
used in device tree sources.
|
||||
|
||||
Parent node should have the following properties :
|
||||
- compatible: "amlogic,meson-gx-ao-sysctrl", "syscon", "simple-mfd"
|
||||
- reg: base address and size of the AO system control register space.
|
||||
|
||||
Example: AO Clock controller node:
|
||||
|
||||
clkc_AO: clock-controller@040 {
|
||||
compatible = "amlogic,gxbb-aoclkc";
|
||||
reg = <0x0 0x040 0x0 0x4>;
|
||||
ao_sysctrl: sys-ctrl@0 {
|
||||
compatible = "amlogic,meson-gx-ao-sysctrl", "syscon", "simple-mfd";
|
||||
reg = <0x0 0x0 0x0 0x100>;
|
||||
|
||||
clkc_AO: clock-controller {
|
||||
compatible = "amlogic,meson-gxbb-aoclkc", "amlogic,meson-gx-aoclkc";
|
||||
#clock-cells = <1>;
|
||||
#reset-cells = <1>;
|
||||
};
|
||||
};
|
||||
|
||||
Example: UART controller node that consumes the clock and reset generated
|
||||
by the clock controller:
|
||||
@ -41,5 +51,4 @@ Example: UART controller node that consumes the clock and reset generated
|
||||
interrupts = <0 90 1>;
|
||||
clocks = <&clkc_AO CLKID_AO_UART1>;
|
||||
resets = <&clkc_AO RESET_AO_UART1>;
|
||||
status = "disabled";
|
||||
};
|
||||
|
@ -33,5 +33,4 @@ Example: UART controller node that consumes the clock generated by the clock
|
||||
reg = <0xc81004c0 0x14>;
|
||||
interrupts = <0 90 1>;
|
||||
clocks = <&clkc CLKID_CLK81>;
|
||||
status = "disabled";
|
||||
};
|
||||
|
@ -16,18 +16,25 @@ Required Properties:
|
||||
mapped region.
|
||||
|
||||
- #clock-cells: should be 1.
|
||||
- #reset-cells: should be 1.
|
||||
|
||||
Each clock is assigned an identifier and client nodes can use this identifier
|
||||
to specify the clock which they consume. All available clocks are defined as
|
||||
preprocessor macros in the dt-bindings/clock/meson8b-clkc.h header and can be
|
||||
used in device tree sources.
|
||||
|
||||
Similarly a preprocessor macro for each reset line is defined in
|
||||
dt-bindings/reset/amlogic,meson8b-clkc-reset.h (which can be used from the
|
||||
device tree sources).
|
||||
|
||||
|
||||
Example: Clock controller node:
|
||||
|
||||
clkc: clock-controller@c1104000 {
|
||||
#clock-cells = <1>;
|
||||
compatible = "amlogic,meson8b-clkc";
|
||||
reg = <0xc1108000 0x4>, <0xc1104000 0x460>;
|
||||
#clock-cells = <1>;
|
||||
#reset-cells = <1>;
|
||||
};
|
||||
|
||||
|
||||
@ -39,5 +46,4 @@ Example: UART controller node that consumes the clock generated by the clock
|
||||
reg = <0xc81004c0 0x14>;
|
||||
interrupts = <0 90 1>;
|
||||
clocks = <&clkc CLKID_CLK81>;
|
||||
status = "disabled";
|
||||
};
|
||||
|
@ -81,6 +81,16 @@ Required properties:
|
||||
"atmel,sama5d2-clk-generated":
|
||||
at91 generated clock
|
||||
|
||||
"atmel,sama5d2-clk-audio-pll-frac":
|
||||
at91 audio fractional pll
|
||||
|
||||
"atmel,sama5d2-clk-audio-pll-pad":
|
||||
at91 audio pll CLK_AUDIO output pin
|
||||
|
||||
"atmel,sama5d2-clk-audio-pll-pmc":
|
||||
at91 audio pll output on AUDIOPLLCLK that feeds the PMC
|
||||
and can be used by peripheral clock or generic clock
|
||||
|
||||
Required properties for SCKC node:
|
||||
- reg : defines the IO memory reserved for the SCKC.
|
||||
- #size-cells : shall be 0 (reg is used to encode clk id).
|
||||
|
@ -46,7 +46,6 @@ Device tree example:
|
||||
|
||||
uart@3e002000 {
|
||||
compatible = "brcm,bcm11351-dw-apb-uart", "snps,dw-apb-uart";
|
||||
status = "disabled";
|
||||
reg = <0x3e002000 0x1000>;
|
||||
clocks = <&slave_ccu BCM281XX_SLAVE_CCU_UARTB3>;
|
||||
interrupts = <GIC_SPI 65 IRQ_TYPE_LEVEL_HIGH>;
|
||||
|
@ -465,5 +465,4 @@ Example 3: UART controller node that consumes the clock generated by the clock
|
||||
clock-names = "uart", "clk_uart_baud0";
|
||||
pinctrl-names = "default";
|
||||
pinctrl-0 = <&uart0_bus>;
|
||||
status = "disabled";
|
||||
};
|
||||
|
@ -38,5 +38,4 @@ Examples:
|
||||
clocks = <&crg_ctrl HI3660_CLK_MUX_UART0>,
|
||||
<&crg_ctrl HI3660_PCLK>;
|
||||
clock-names = "uartclk", "apb_pclk";
|
||||
status = "disabled";
|
||||
};
|
||||
|
@ -27,5 +27,4 @@ Examples:
|
||||
interrupts = <0 49 4>;
|
||||
clocks = <&clock HIX5HD2_FIXED_83M>;
|
||||
clock-names = "apb_pclk";
|
||||
status = "disabled";
|
||||
};
|
||||
|
@ -1,24 +1,32 @@
|
||||
Binding for IDT VersaClock5 programmable i2c clock generator.
|
||||
Binding for IDT VersaClock 5,6 programmable i2c clock generators.
|
||||
|
||||
The IDT VersaClock5 are programmable i2c clock generators providing
|
||||
from 3 to 12 output clocks.
|
||||
The IDT VersaClock 5 and VersaClock 6 are programmable i2c clock
|
||||
generators providing from 3 to 12 output clocks.
|
||||
|
||||
==I2C device node==
|
||||
|
||||
Required properties:
|
||||
- compatible: shall be one of "idt,5p49v5923" , "idt,5p49v5933" ,
|
||||
"idt,5p49v5935".
|
||||
- compatible: shall be one of
|
||||
"idt,5p49v5923"
|
||||
"idt,5p49v5925"
|
||||
"idt,5p49v5933"
|
||||
"idt,5p49v5935"
|
||||
"idt,5p49v6901"
|
||||
- reg: i2c device address, shall be 0x68 or 0x6a.
|
||||
- #clock-cells: from common clock binding; shall be set to 1.
|
||||
- clocks: from common clock binding; list of parent clock handles,
|
||||
- 5p49v5923: (required) either or both of XTAL or CLKIN
|
||||
- 5p49v5923 and
|
||||
5p49v5925 and
|
||||
5p49v6901: (required) either or both of XTAL or CLKIN
|
||||
reference clock.
|
||||
- 5p49v5933 and
|
||||
- 5p49v5935: (optional) property not present (internal
|
||||
Xtal used) or CLKIN reference
|
||||
clock.
|
||||
- clock-names: from common clock binding; clock input names, can be
|
||||
- 5p49v5923: (required) either or both of "xin", "clkin".
|
||||
- 5p49v5923 and
|
||||
5p49v5925 and
|
||||
5p49v6901: (required) either or both of "xin", "clkin".
|
||||
- 5p49v5933 and
|
||||
- 5p49v5935: (optional) property not present or "clkin".
|
||||
|
||||
@ -37,6 +45,7 @@ clock specifier, the following mapping applies:
|
||||
1 -- OUT1
|
||||
2 -- OUT4
|
||||
|
||||
5P49V5925 and
|
||||
5P49V5935:
|
||||
0 -- OUT0_SEL_I2CB
|
||||
1 -- OUT1
|
||||
@ -44,6 +53,13 @@ clock specifier, the following mapping applies:
|
||||
3 -- OUT3
|
||||
4 -- OUT4
|
||||
|
||||
5P49V6901:
|
||||
0 -- OUT0_SEL_I2CB
|
||||
1 -- OUT1
|
||||
2 -- OUT2
|
||||
3 -- OUT3
|
||||
4 -- OUT4
|
||||
|
||||
==Example==
|
||||
|
||||
/* 25MHz reference crystal */
|
||||
|
@ -24,5 +24,4 @@ Examples:
|
||||
clocks = <&clks IMX21_CLK_UART1_IPG_GATE>,
|
||||
<&clks IMX21_CLK_PER1>;
|
||||
clock-names = "ipg", "per";
|
||||
status = "disabled";
|
||||
};
|
||||
|
@ -67,5 +67,4 @@ auart0: serial@8006c000 {
|
||||
reg = <0x8006c000 0x2000>;
|
||||
interrupts = <24 25 23>;
|
||||
clocks = <&clks 32>;
|
||||
status = "disabled";
|
||||
};
|
||||
|
@ -157,5 +157,4 @@ uart1: serial@43f90000 {
|
||||
interrupts = <45>;
|
||||
clocks = <&clks 79>, <&clks 50>;
|
||||
clock-names = "ipg", "per";
|
||||
status = "disabled";
|
||||
};
|
||||
|
@ -24,5 +24,4 @@ Examples:
|
||||
clocks = <&clks IMX27_CLK_UART1_IPG_GATE>,
|
||||
<&clks IMX27_CLK_PER1_GATE>;
|
||||
clock-names = "ipg", "per";
|
||||
status = "disabled";
|
||||
};
|
||||
|
@ -90,5 +90,4 @@ auart0: serial@8006a000 {
|
||||
reg = <0x8006a000 0x2000>;
|
||||
interrupts = <112 70 71>;
|
||||
clocks = <&clks 45>;
|
||||
status = "disabled";
|
||||
};
|
||||
|
@ -87,5 +87,4 @@ uart1: serial@43f90000 {
|
||||
interrupts = <45>;
|
||||
clocks = <&clks 10>, <&clks 30>;
|
||||
clock-names = "ipg", "per";
|
||||
status = "disabled";
|
||||
};
|
||||
|
@ -25,5 +25,4 @@ can1: can@53fc8000 {
|
||||
interrupts = <82>;
|
||||
clocks = <&clks IMX5_CLK_CAN1_IPG_GATE>, <&clks IMX5_CLK_CAN1_SERIAL_GATE>;
|
||||
clock-names = "ipg", "per";
|
||||
status = "disabled";
|
||||
};
|
||||
|
@ -27,5 +27,4 @@ uart1: serial@02020000 {
|
||||
interrupts = <0 26 0x04>;
|
||||
clocks = <&clks IMX6QDL_CLK_UART_IPG>, <&clks IMX6QDL_CLK_UART_SERIAL>;
|
||||
clock-names = "ipg", "per";
|
||||
status = "disabled";
|
||||
};
|
||||
|
@ -1,83 +0,0 @@
|
||||
Device Tree Clock bindings for CPU DVFS of Mediatek MT8173 SoC
|
||||
|
||||
Required properties:
|
||||
- clocks: A list of phandle + clock-specifier pairs for the clocks listed in clock names.
|
||||
- clock-names: Should contain the following:
|
||||
"cpu" - The multiplexer for clock input of CPU cluster.
|
||||
"intermediate" - A parent of "cpu" clock which is used as "intermediate" clock
|
||||
source (usually MAINPLL) when the original CPU PLL is under
|
||||
transition and not stable yet.
|
||||
Please refer to Documentation/devicetree/bindings/clk/clock-bindings.txt for
|
||||
generic clock consumer properties.
|
||||
- proc-supply: Regulator for Vproc of CPU cluster.
|
||||
|
||||
Optional properties:
|
||||
- sram-supply: Regulator for Vsram of CPU cluster. When present, the cpufreq driver
|
||||
needs to do "voltage tracking" to step by step scale up/down Vproc and
|
||||
Vsram to fit SoC specific needs. When absent, the voltage scaling
|
||||
flow is handled by hardware, hence no software "voltage tracking" is
|
||||
needed.
|
||||
|
||||
Example:
|
||||
--------
|
||||
cpu0: cpu@0 {
|
||||
device_type = "cpu";
|
||||
compatible = "arm,cortex-a53";
|
||||
reg = <0x000>;
|
||||
enable-method = "psci";
|
||||
cpu-idle-states = <&CPU_SLEEP_0>;
|
||||
clocks = <&infracfg CLK_INFRA_CA53SEL>,
|
||||
<&apmixedsys CLK_APMIXED_MAINPLL>;
|
||||
clock-names = "cpu", "intermediate";
|
||||
};
|
||||
|
||||
cpu1: cpu@1 {
|
||||
device_type = "cpu";
|
||||
compatible = "arm,cortex-a53";
|
||||
reg = <0x001>;
|
||||
enable-method = "psci";
|
||||
cpu-idle-states = <&CPU_SLEEP_0>;
|
||||
clocks = <&infracfg CLK_INFRA_CA53SEL>,
|
||||
<&apmixedsys CLK_APMIXED_MAINPLL>;
|
||||
clock-names = "cpu", "intermediate";
|
||||
};
|
||||
|
||||
cpu2: cpu@100 {
|
||||
device_type = "cpu";
|
||||
compatible = "arm,cortex-a57";
|
||||
reg = <0x100>;
|
||||
enable-method = "psci";
|
||||
cpu-idle-states = <&CPU_SLEEP_0>;
|
||||
clocks = <&infracfg CLK_INFRA_CA57SEL>,
|
||||
<&apmixedsys CLK_APMIXED_MAINPLL>;
|
||||
clock-names = "cpu", "intermediate";
|
||||
};
|
||||
|
||||
cpu3: cpu@101 {
|
||||
device_type = "cpu";
|
||||
compatible = "arm,cortex-a57";
|
||||
reg = <0x101>;
|
||||
enable-method = "psci";
|
||||
cpu-idle-states = <&CPU_SLEEP_0>;
|
||||
clocks = <&infracfg CLK_INFRA_CA57SEL>,
|
||||
<&apmixedsys CLK_APMIXED_MAINPLL>;
|
||||
clock-names = "cpu", "intermediate";
|
||||
};
|
||||
|
||||
&cpu0 {
|
||||
proc-supply = <&mt6397_vpca15_reg>;
|
||||
};
|
||||
|
||||
&cpu1 {
|
||||
proc-supply = <&mt6397_vpca15_reg>;
|
||||
};
|
||||
|
||||
&cpu2 {
|
||||
proc-supply = <&da9211_vcpu_reg>;
|
||||
sram-supply = <&mt6397_vsramca7_reg>;
|
||||
};
|
||||
|
||||
&cpu3 {
|
||||
proc-supply = <&da9211_vcpu_reg>;
|
||||
sram-supply = <&mt6397_vsramca7_reg>;
|
||||
};
|
@ -66,7 +66,6 @@ clock@70110000 {
|
||||
#clock-cells = <0>;
|
||||
clock-output-names = "dfllCPU_out";
|
||||
vdd-cpu-supply = <&vdd_cpu>;
|
||||
status = "okay";
|
||||
|
||||
nvidia,sample-rate = <12500>;
|
||||
nvidia,droop-ctrl = <0x00000f00>;
|
||||
|
@ -12,5 +12,4 @@ Examples:
|
||||
pxa2xx_clks: pxa2xx_clks@41300004 {
|
||||
compatible = "marvell,pxa-clocks";
|
||||
#clock-cells = <1>;
|
||||
status = "okay";
|
||||
};
|
||||
|
@ -22,6 +22,7 @@ Required Properties:
|
||||
- "renesas,r8a7794-cpg-mssr" for the r8a7794 SoC (R-Car E2)
|
||||
- "renesas,r8a7795-cpg-mssr" for the r8a7795 SoC (R-Car H3)
|
||||
- "renesas,r8a7796-cpg-mssr" for the r8a7796 SoC (R-Car M3-W)
|
||||
- "renesas,r8a77995-cpg-mssr" for the r8a77995 SoC (R-Car D3)
|
||||
|
||||
- reg: Base address and length of the memory resource used by the CPG/MSSR
|
||||
block
|
||||
@ -30,7 +31,7 @@ Required Properties:
|
||||
clock-names
|
||||
- clock-names: List of external parent clock names. Valid names are:
|
||||
- "extal" (r8a7743, r8a7745, r8a7790, r8a7791, r8a7792, r8a7793, r8a7794,
|
||||
r8a7795, r8a7796)
|
||||
r8a7795, r8a7796, r8a77995)
|
||||
- "extalr" (r8a7795, r8a7796)
|
||||
- "usb_extal" (r8a7743, r8a7745, r8a7790, r8a7791, r8a7793, r8a7794)
|
||||
|
||||
@ -81,5 +82,4 @@ Examples
|
||||
dma-names = "tx", "rx";
|
||||
power-domains = <&cpg>;
|
||||
resets = <&cpg 310>;
|
||||
status = "disabled";
|
||||
};
|
||||
|
@ -44,5 +44,4 @@ Examples
|
||||
interrupts = <0 87 IRQ_TYPE_LEVEL_HIGH>;
|
||||
clocks = <&mstp3_clks R8A7778_CLK_SDHI0>;
|
||||
power-domains = <&cpg_clocks>;
|
||||
status = "disabled";
|
||||
};
|
||||
|
@ -0,0 +1,55 @@
|
||||
* Renesas R-Car USB 2.0 clock selector
|
||||
|
||||
This file provides information on what the device node for the R-Car USB 2.0
|
||||
clock selector contains.
|
||||
|
||||
If you connect an external clock to the USB_EXTAL pin only, you should set
|
||||
the clock rate to "usb_extal" node only.
|
||||
If you connect an oscillator to both the USB_XTAL and USB_EXTAL, this module
|
||||
is not needed because this is the default setting. (Of course, you can set the
|
||||
clock rates to both "usb_extal" and "usb_xtal" nodes.)
|
||||
|
||||
Case 1: An external clock connects to R-Car SoC
|
||||
+----------+ +--- R-Car ---------------------+
|
||||
|External |---|USB_EXTAL ---> all usb channels|
|
||||
|clock | |USB_XTAL |
|
||||
+----------+ +-------------------------------+
|
||||
In this case, we need this driver with "usb_extal" clock.
|
||||
|
||||
Case 2: An oscillator connects to R-Car SoC
|
||||
+----------+ +--- R-Car ---------------------+
|
||||
|Oscillator|---|USB_EXTAL -+-> all usb channels|
|
||||
| |---|USB_XTAL --+ |
|
||||
+----------+ +-------------------------------+
|
||||
In this case, we don't need this selector.
|
||||
|
||||
Required properties:
|
||||
- compatible: "renesas,r8a7795-rcar-usb2-clock-sel" if the device is a part of
|
||||
an R8A7795 SoC.
|
||||
"renesas,r8a7796-rcar-usb2-clock-sel" if the device is a part of
|
||||
an R8A7796 SoC.
|
||||
"renesas,rcar-gen3-usb2-clock-sel" for a generic R-Car Gen3
|
||||
compatible device.
|
||||
|
||||
When compatible with the generic version, nodes must list the
|
||||
SoC-specific version corresponding to the platform first
|
||||
followed by the generic version.
|
||||
|
||||
- reg: offset and length of the USB 2.0 clock selector register block.
|
||||
- clocks: A list of phandles and specifier pairs.
|
||||
- clock-names: Name of the clocks.
|
||||
- The functional clock must be "ehci_ohci"
|
||||
- The USB_EXTAL clock pin must be "usb_extal"
|
||||
- The USB_XTAL clock pin must be "usb_xtal"
|
||||
- #clock-cells: Must be 0
|
||||
|
||||
Example (R-Car H3):
|
||||
|
||||
usb2_clksel: clock-controller@e6590630 {
|
||||
compatible = "renesas,r8a7795-rcar-usb2-clock-sel",
|
||||
"renesas,rcar-gen3-usb2-clock-sel";
|
||||
reg = <0 0xe6590630 0 0x02>;
|
||||
clocks = <&cpg CPG_MOD 703>, <&usb_extal>, <&usb_xtal>;
|
||||
clock-names = "ehci_ohci", "usb_extal", "usb_xtal";
|
||||
#clock-cells = <0>;
|
||||
};
|
@ -50,5 +50,4 @@ Examples
|
||||
clocks = <&mstp3_clks R7S72100_CLK_MTU2>;
|
||||
clock-names = "fck";
|
||||
power-domains = <&cpg_clocks>;
|
||||
status = "disabled";
|
||||
};
|
||||
|
@ -1,12 +1,14 @@
|
||||
* Rockchip RK3128 Clock and Reset Unit
|
||||
* Rockchip RK3126/RK3128 Clock and Reset Unit
|
||||
|
||||
The RK3128 clock controller generates and supplies clock to various
|
||||
The RK3126/RK3128 clock controller generates and supplies clock to various
|
||||
controllers within the SoC and also implements a reset controller for SoC
|
||||
peripherals.
|
||||
|
||||
Required Properties:
|
||||
|
||||
- compatible: should be "rockchip,rk3128-cru"
|
||||
- compatible: should be "rockchip,rk3126-cru" or "rockchip,rk3128-cru"
|
||||
"rockchip,rk3126-cru" - controller compatible with RK3126 SoC.
|
||||
"rockchip,rk3128-cru" - controller compatible with RK3128 SoC.
|
||||
- reg: physical base address of the controller and length of memory mapped
|
||||
region.
|
||||
- #clock-cells: should be 1.
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user