mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git
synced 2025-01-04 04:02:26 +00:00
libnvdimm for 4.5
1/ Media error handling: The 'badblocks' implementation that originated in md-raid is up-levelled to a generic capability of a block device. This initial implementation is limited to being consulted in the pmem block-i/o path. Later, 'badblocks' will be consulted when creating dax mappings. 2/ Raw block device dax: For virtualization and other cases that want large contiguous mappings of persistent memory, add the capability to dax-mmap a block device directly. 3/ Increased /dev/mem restrictions: Add an option to treat all io-memory as IORESOURCE_EXCLUSIVE, i.e. disable /dev/mem access while a driver is actively using an address range. This behavior is controlled via the new CONFIG_IO_STRICT_DEVMEM option and can be overridden by the existing "iomem=relaxed" kernel command line option. 4/ Miscellaneous fixes include a 'pfn'-device huge page alignment fix, block device shutdown crash fix, and other small libnvdimm fixes. -----BEGIN PGP SIGNATURE----- Version: GnuPG v1 iQIcBAABAgAGBQJWlrhjAAoJEB7SkWpmfYgCFbAQALKsQfFwT6JFS+zlPgiNpbqw 2VMNKEH0AfGYGj96mT02j2q+vSUmXLMIDMTsbe0sDdtwFZtQbFmhmryzPWUVppSu KGTlLPW8vuEhQVs91+UI3BQKkvpi0+tbR8hPOh9W6QhjpRT+lyHFKnsNR5HZy5wB K4/VMaT5ffd5/pXRTjkYiPQYTwWyfcvNjICj0YtqhPvOwS031m77JpFsWJ8HSpEX K99VlzNUPMXd1pYkHmFNXWw52fhRGNhwAEomLeKMdQfKms+KnbKp8BOSA0aCqU8E kpujQcilDXJwykFQZOFI3Z5Dxvrv8lxFTU8HRMBvo3ESzfTWjfqcvyjGOjDUcruw ihESFSJtdZzhrBiMnf9RRqSpMFJvAT8MVT6Q4D3mZUHCMPbUqFJsQjMPt9hEH3ho 4F0D2lesOCkubUKFTZmjMoDb+szuKbVhYK8TeFVVEhizinc/Aj0NKuazJqi+CXB/ xh0ER4ZxD8wvzqFFWvS5UvR1G9I5fr7+3jGRUrqGLHlSdeXP9dkEg28ao3QbWk3x 1dPOen6ZqQ9WJ/E7eGmXbVEz2R4Xd79hMXQzdQwmKDk/KbxRoAp7hyU8BslAyrBf HCdmVt+RAgrxZYfFRXuLhqwEBThJnNrgZA3qu74FUpkpFg6xRUu1bAYBiF7N+bFi 82b5UbMkveBTtkXjJoiR =7V5r -----END PGP SIGNATURE----- Merge tag 'libnvdimm-for-4.5' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm Pull libnvdimm updates from Dan Williams: "The bulk of this has appeared in -next and independently received a build success notification from the kbuild 
robot. The 'for-4.5/block-dax' topic branch was rebased over the weekend to drop the "block device end-of-life" rework that Al would like to see re-implemented with a notifier, and to address bug reports against the badblocks integration. There is pending feedback against "libnvdimm: Add a poison list and export badblocks" received last week. Linda identified some localized fixups that we will handle incrementally. Summary: - Media error handling: The 'badblocks' implementation that originated in md-raid is up-levelled to a generic capability of a block device. This initial implementation is limited to being consulted in the pmem block-i/o path. Later, 'badblocks' will be consulted when creating dax mappings. - Raw block device dax: For virtualization and other cases that want large contiguous mappings of persistent memory, add the capability to dax-mmap a block device directly. - Increased /dev/mem restrictions: Add an option to treat all io-memory as IORESOURCE_EXCLUSIVE, i.e. disable /dev/mem access while a driver is actively using an address range. This behavior is controlled via the new CONFIG_IO_STRICT_DEVMEM option and can be overridden by the existing "iomem=relaxed" kernel command line option. 
- Miscellaneous fixes include a 'pfn'-device huge page alignment fix, block device shutdown crash fix, and other small libnvdimm fixes" * tag 'libnvdimm-for-4.5' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm: (32 commits) block: kill disk_{check|set|clear|alloc}_badblocks libnvdimm, pmem: nvdimm_read_bytes() badblocks support pmem, dax: disable dax in the presence of bad blocks pmem: fail io-requests to known bad blocks libnvdimm: convert to statically allocated badblocks libnvdimm: don't fail init for full badblocks list block, badblocks: introduce devm_init_badblocks block: clarify badblocks lifetime badblocks: rename badblocks_free to badblocks_exit libnvdimm, pmem: move definition of nvdimm_namespace_add_poison to nd.h libnvdimm: Add a poison list and export badblocks nfit_test: Enable DSMs for all test NFITs md: convert to use the generic badblocks code block: Add badblock management for gendisks badblocks: Add core badblock management code block: fix del_gendisk() vs blkdev_ioctl crash block: enable dax for raw block devices block: introduce bdev_file_inode() restrict /dev/mem to idle io memory ranges arch: consolidate CONFIG_STRICT_DEVM in lib/Kconfig.debug ...
This commit is contained in:
commit
d080827f85
@ -2,6 +2,7 @@ config ARM
|
||||
bool
|
||||
default y
|
||||
select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
|
||||
select ARCH_HAS_DEVMEM_IS_ALLOWED
|
||||
select ARCH_HAS_ELF_RANDOMIZE
|
||||
select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
|
||||
select ARCH_HAVE_CUSTOM_GPIO_H
|
||||
|
@ -15,20 +15,6 @@ config ARM_PTDUMP
|
||||
kernel.
|
||||
If in doubt, say "N"
|
||||
|
||||
config STRICT_DEVMEM
|
||||
bool "Filter access to /dev/mem"
|
||||
depends on MMU
|
||||
---help---
|
||||
If this option is disabled, you allow userspace (root) access to all
|
||||
of memory, including kernel and userspace memory. Accidental
|
||||
access to this is obviously disastrous, but specific access can
|
||||
be used by people debugging the kernel.
|
||||
|
||||
If this option is switched on, the /dev/mem file only allows
|
||||
userspace access to memory mapped peripherals.
|
||||
|
||||
If in doubt, say Y.
|
||||
|
||||
# RMK wants arm kernels compiled with frame pointers or stack unwinding.
|
||||
# If you know what you are doing and are willing to live without stack
|
||||
# traces, you can get a slightly smaller kernel by setting this option to
|
||||
|
@ -3,6 +3,7 @@ config ARM64
|
||||
select ACPI_CCA_REQUIRED if ACPI
|
||||
select ACPI_GENERIC_GSI if ACPI
|
||||
select ACPI_REDUCED_HARDWARE_ONLY if ACPI
|
||||
select ARCH_HAS_DEVMEM_IS_ALLOWED
|
||||
select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
|
||||
select ARCH_HAS_ELF_RANDOMIZE
|
||||
select ARCH_HAS_GCOV_PROFILE_ALL
|
||||
|
@ -14,20 +14,6 @@ config ARM64_PTDUMP
|
||||
kernel.
|
||||
If in doubt, say "N"
|
||||
|
||||
config STRICT_DEVMEM
|
||||
bool "Filter access to /dev/mem"
|
||||
depends on MMU
|
||||
help
|
||||
If this option is disabled, you allow userspace (root) access to all
|
||||
of memory, including kernel and userspace memory. Accidental
|
||||
access to this is obviously disastrous, but specific access can
|
||||
be used by people debugging the kernel.
|
||||
|
||||
If this option is switched on, the /dev/mem file only allows
|
||||
userspace access to memory mapped peripherals.
|
||||
|
||||
If in doubt, say Y.
|
||||
|
||||
config PID_IN_CONTEXTIDR
|
||||
bool "Write the current PID to the CONTEXTIDR register"
|
||||
help
|
||||
|
@ -10,6 +10,7 @@ config FRV
|
||||
select HAVE_DEBUG_BUGVERBOSE
|
||||
select ARCH_HAVE_NMI_SAFE_CMPXCHG
|
||||
select GENERIC_CPU_DEVICES
|
||||
select ARCH_HAS_DEVMEM_IS_ALLOWED
|
||||
select ARCH_WANT_IPC_PARSE_VERSION
|
||||
select OLD_SIGSUSPEND3
|
||||
select OLD_SIGACTION
|
||||
|
@ -13,6 +13,7 @@ config M32R
|
||||
select GENERIC_IRQ_PROBE
|
||||
select GENERIC_IRQ_SHOW
|
||||
select GENERIC_ATOMIC64
|
||||
select ARCH_HAS_DEVMEM_IS_ALLOWED
|
||||
select ARCH_USES_GETTIMEOFFSET
|
||||
select MODULES_USE_ELF_RELA
|
||||
select HAVE_DEBUG_STACKOVERFLOW
|
||||
|
@ -159,6 +159,7 @@ config PPC
|
||||
select EDAC_SUPPORT
|
||||
select EDAC_ATOMIC_SCRUB
|
||||
select ARCH_HAS_DMA_SET_COHERENT_MASK
|
||||
select ARCH_HAS_DEVMEM_IS_ALLOWED
|
||||
select HAVE_ARCH_SECCOMP_FILTER
|
||||
|
||||
config GENERIC_CSUM
|
||||
|
@ -335,18 +335,6 @@ config PPC_EARLY_DEBUG_CPM_ADDR
|
||||
platform probing is done, all platforms selected must
|
||||
share the same address.
|
||||
|
||||
config STRICT_DEVMEM
|
||||
def_bool y
|
||||
prompt "Filter access to /dev/mem"
|
||||
help
|
||||
This option restricts access to /dev/mem. If this option is
|
||||
disabled, you allow userspace access to all memory, including
|
||||
kernel and userspace memory. Accidental memory access is likely
|
||||
to be disastrous.
|
||||
Memory access is required for experts who want to debug the kernel.
|
||||
|
||||
If you are unsure, say Y.
|
||||
|
||||
config FAIL_IOMMU
|
||||
bool "Fault-injection capability for IOMMU"
|
||||
depends on FAULT_INJECTION
|
||||
|
@ -66,6 +66,7 @@ config S390
|
||||
def_bool y
|
||||
select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
|
||||
select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS
|
||||
select ARCH_HAS_DEVMEM_IS_ALLOWED
|
||||
select ARCH_HAS_ELF_RANDOMIZE
|
||||
select ARCH_HAS_GCOV_PROFILE_ALL
|
||||
select ARCH_HAS_SG_CHAIN
|
||||
|
@ -5,18 +5,6 @@ config TRACE_IRQFLAGS_SUPPORT
|
||||
|
||||
source "lib/Kconfig.debug"
|
||||
|
||||
config STRICT_DEVMEM
|
||||
def_bool y
|
||||
prompt "Filter access to /dev/mem"
|
||||
---help---
|
||||
This option restricts access to /dev/mem. If this option is
|
||||
disabled, you allow userspace access to all memory, including
|
||||
kernel and userspace memory. Accidental memory access is likely
|
||||
to be disastrous.
|
||||
Memory access is required for experts who want to debug the kernel.
|
||||
|
||||
If you are unsure, say Y.
|
||||
|
||||
config S390_PTDUMP
|
||||
bool "Export kernel pagetable layout to userspace via debugfs"
|
||||
depends on DEBUG_KERNEL
|
||||
|
@ -19,6 +19,7 @@ config TILE
|
||||
select VIRT_TO_BUS
|
||||
select SYS_HYPERVISOR
|
||||
select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS
|
||||
select ARCH_HAS_DEVMEM_IS_ALLOWED
|
||||
select ARCH_HAVE_NMI_SAFE_CMPXCHG
|
||||
select GENERIC_CLOCKEVENTS
|
||||
select MODULES_USE_ELF_RELA
|
||||
@ -116,9 +117,6 @@ config ARCH_DISCONTIGMEM_DEFAULT
|
||||
config TRACE_IRQFLAGS_SUPPORT
|
||||
def_bool y
|
||||
|
||||
config STRICT_DEVMEM
|
||||
def_bool y
|
||||
|
||||
# SMP is required for Tilera Linux.
|
||||
config SMP
|
||||
def_bool y
|
||||
|
@ -1,5 +1,6 @@
|
||||
config UNICORE32
|
||||
def_bool y
|
||||
select ARCH_HAS_DEVMEM_IS_ALLOWED
|
||||
select ARCH_MIGHT_HAVE_PC_PARPORT
|
||||
select ARCH_MIGHT_HAVE_PC_SERIO
|
||||
select HAVE_MEMBLOCK
|
||||
|
@ -2,20 +2,6 @@ menu "Kernel hacking"
|
||||
|
||||
source "lib/Kconfig.debug"
|
||||
|
||||
config STRICT_DEVMEM
|
||||
bool "Filter access to /dev/mem"
|
||||
depends on MMU
|
||||
---help---
|
||||
If this option is disabled, you allow userspace (root) access to all
|
||||
of memory, including kernel and userspace memory. Accidental
|
||||
access to this is obviously disastrous, but specific access can
|
||||
be used by people debugging the kernel.
|
||||
|
||||
If this option is switched on, the /dev/mem file only allows
|
||||
userspace access to memory mapped peripherals.
|
||||
|
||||
If in doubt, say Y.
|
||||
|
||||
config EARLY_PRINTK
|
||||
def_bool DEBUG_OCD
|
||||
help
|
||||
|
@ -24,6 +24,7 @@ config X86
|
||||
select ARCH_DISCARD_MEMBLOCK
|
||||
select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
|
||||
select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS
|
||||
select ARCH_HAS_DEVMEM_IS_ALLOWED
|
||||
select ARCH_HAS_ELF_RANDOMIZE
|
||||
select ARCH_HAS_FAST_MULTIPLIER
|
||||
select ARCH_HAS_GCOV_PROFILE_ALL
|
||||
|
@ -5,23 +5,6 @@ config TRACE_IRQFLAGS_SUPPORT
|
||||
|
||||
source "lib/Kconfig.debug"
|
||||
|
||||
config STRICT_DEVMEM
|
||||
bool "Filter access to /dev/mem"
|
||||
---help---
|
||||
If this option is disabled, you allow userspace (root) access to all
|
||||
of memory, including kernel and userspace memory. Accidental
|
||||
access to this is obviously disastrous, but specific access can
|
||||
be used by people debugging the kernel. Note that with PAT support
|
||||
enabled, even in this case there are restrictions on /dev/mem
|
||||
use due to the cache aliasing requirements.
|
||||
|
||||
If this option is switched on, the /dev/mem file only allows
|
||||
userspace access to PCI space and the BIOS code and data regions.
|
||||
This is sufficient for dosemu and X and all common users of
|
||||
/dev/mem.
|
||||
|
||||
If in doubt, say Y.
|
||||
|
||||
config X86_VERBOSE_BOOTUP
|
||||
bool "Enable verbose x86 bootup info messages"
|
||||
default y
|
||||
|
@ -8,7 +8,7 @@ obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \
|
||||
blk-iopoll.o blk-lib.o blk-mq.o blk-mq-tag.o \
|
||||
blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \
|
||||
genhd.o scsi_ioctl.o partition-generic.o ioprio.o \
|
||||
partitions/
|
||||
badblocks.o partitions/
|
||||
|
||||
obj-$(CONFIG_BOUNCE) += bounce.o
|
||||
obj-$(CONFIG_BLK_DEV_BSG) += bsg.o
|
||||
|
585
block/badblocks.c
Normal file
585
block/badblocks.c
Normal file
@ -0,0 +1,585 @@
|
||||
/*
|
||||
* Bad block management
|
||||
*
|
||||
* - Heavily based on MD badblocks code from Neil Brown
|
||||
*
|
||||
* Copyright (c) 2015, Intel Corporation.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it
|
||||
* under the terms and conditions of the GNU General Public License,
|
||||
* version 2, as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
|
||||
* more details.
|
||||
*/
|
||||
|
||||
#include <linux/badblocks.h>
|
||||
#include <linux/seqlock.h>
|
||||
#include <linux/device.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/stddef.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/slab.h>
|
||||
|
||||
/**
|
||||
* badblocks_check() - check a given range for bad sectors
|
||||
* @bb: the badblocks structure that holds all badblock information
|
||||
* @s: sector (start) at which to check for badblocks
|
||||
* @sectors: number of sectors to check for badblocks
|
||||
* @first_bad: pointer to store location of the first badblock
|
||||
* @bad_sectors: pointer to store number of badblocks after @first_bad
|
||||
*
|
||||
* We can record which blocks on each device are 'bad' and so just
|
||||
* fail those blocks, or that stripe, rather than the whole device.
|
||||
* Entries in the bad-block table are 64bits wide. This comprises:
|
||||
* Length of bad-range, in sectors: 0-511 for lengths 1-512
|
||||
* Start of bad-range, sector offset, 54 bits (allows 8 exbibytes)
|
||||
* A 'shift' can be set so that larger blocks are tracked and
|
||||
* consequently larger devices can be covered.
|
||||
* 'Acknowledged' flag - 1 bit. - the most significant bit.
|
||||
*
|
||||
* Locking of the bad-block table uses a seqlock so badblocks_check
|
||||
* might need to retry if it is very unlucky.
|
||||
* We will sometimes want to check for bad blocks in a bi_end_io function,
|
||||
* so we use the write_seqlock_irq variant.
|
||||
*
|
||||
* When looking for a bad block we specify a range and want to
|
||||
* know if any block in the range is bad. So we binary-search
|
||||
* to the last range that starts at-or-before the given endpoint,
|
||||
* (or "before the sector after the target range")
|
||||
* then see if it ends after the given start.
|
||||
*
|
||||
* Return:
|
||||
* 0: there are no known bad blocks in the range
|
||||
* 1: there are known bad block which are all acknowledged
|
||||
* -1: there are bad blocks which have not yet been acknowledged in metadata.
|
||||
* plus the start/length of the first bad section we overlap.
|
||||
*/
|
||||
int badblocks_check(struct badblocks *bb, sector_t s, int sectors,
|
||||
sector_t *first_bad, int *bad_sectors)
|
||||
{
|
||||
int hi;
|
||||
int lo;
|
||||
u64 *p = bb->page;
|
||||
int rv;
|
||||
sector_t target = s + sectors;
|
||||
unsigned seq;
|
||||
|
||||
if (bb->shift > 0) {
|
||||
/* round the start down, and the end up */
|
||||
s >>= bb->shift;
|
||||
target += (1<<bb->shift) - 1;
|
||||
target >>= bb->shift;
|
||||
sectors = target - s;
|
||||
}
|
||||
/* 'target' is now the first block after the bad range */
|
||||
|
||||
retry:
|
||||
seq = read_seqbegin(&bb->lock);
|
||||
lo = 0;
|
||||
rv = 0;
|
||||
hi = bb->count;
|
||||
|
||||
/* Binary search between lo and hi for 'target'
|
||||
* i.e. for the last range that starts before 'target'
|
||||
*/
|
||||
/* INVARIANT: ranges before 'lo' and at-or-after 'hi'
|
||||
* are known not to be the last range before target.
|
||||
* VARIANT: hi-lo is the number of possible
|
||||
* ranges, and decreases until it reaches 1
|
||||
*/
|
||||
while (hi - lo > 1) {
|
||||
int mid = (lo + hi) / 2;
|
||||
sector_t a = BB_OFFSET(p[mid]);
|
||||
|
||||
if (a < target)
|
||||
/* This could still be the one, earlier ranges
|
||||
* could not.
|
||||
*/
|
||||
lo = mid;
|
||||
else
|
||||
/* This and later ranges are definitely out. */
|
||||
hi = mid;
|
||||
}
|
||||
/* 'lo' might be the last that started before target, but 'hi' isn't */
|
||||
if (hi > lo) {
|
||||
/* need to check all range that end after 's' to see if
|
||||
* any are unacknowledged.
|
||||
*/
|
||||
while (lo >= 0 &&
|
||||
BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
|
||||
if (BB_OFFSET(p[lo]) < target) {
|
||||
/* starts before the end, and finishes after
|
||||
* the start, so they must overlap
|
||||
*/
|
||||
if (rv != -1 && BB_ACK(p[lo]))
|
||||
rv = 1;
|
||||
else
|
||||
rv = -1;
|
||||
*first_bad = BB_OFFSET(p[lo]);
|
||||
*bad_sectors = BB_LEN(p[lo]);
|
||||
}
|
||||
lo--;
|
||||
}
|
||||
}
|
||||
|
||||
if (read_seqretry(&bb->lock, seq))
|
||||
goto retry;
|
||||
|
||||
return rv;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(badblocks_check);
|
||||
|
||||
/**
|
||||
* badblocks_set() - Add a range of bad blocks to the table.
|
||||
* @bb: the badblocks structure that holds all badblock information
|
||||
* @s: first sector to mark as bad
|
||||
* @sectors: number of sectors to mark as bad
|
||||
* @acknowledged: weather to mark the bad sectors as acknowledged
|
||||
*
|
||||
* This might extend the table, or might contract it if two adjacent ranges
|
||||
* can be merged. We binary-search to find the 'insertion' point, then
|
||||
* decide how best to handle it.
|
||||
*
|
||||
* Return:
|
||||
* 0: success
|
||||
* 1: failed to set badblocks (out of space)
|
||||
*/
|
||||
int badblocks_set(struct badblocks *bb, sector_t s, int sectors,
|
||||
int acknowledged)
|
||||
{
|
||||
u64 *p;
|
||||
int lo, hi;
|
||||
int rv = 0;
|
||||
unsigned long flags;
|
||||
|
||||
if (bb->shift < 0)
|
||||
/* badblocks are disabled */
|
||||
return 0;
|
||||
|
||||
if (bb->shift) {
|
||||
/* round the start down, and the end up */
|
||||
sector_t next = s + sectors;
|
||||
|
||||
s >>= bb->shift;
|
||||
next += (1<<bb->shift) - 1;
|
||||
next >>= bb->shift;
|
||||
sectors = next - s;
|
||||
}
|
||||
|
||||
write_seqlock_irqsave(&bb->lock, flags);
|
||||
|
||||
p = bb->page;
|
||||
lo = 0;
|
||||
hi = bb->count;
|
||||
/* Find the last range that starts at-or-before 's' */
|
||||
while (hi - lo > 1) {
|
||||
int mid = (lo + hi) / 2;
|
||||
sector_t a = BB_OFFSET(p[mid]);
|
||||
|
||||
if (a <= s)
|
||||
lo = mid;
|
||||
else
|
||||
hi = mid;
|
||||
}
|
||||
if (hi > lo && BB_OFFSET(p[lo]) > s)
|
||||
hi = lo;
|
||||
|
||||
if (hi > lo) {
|
||||
/* we found a range that might merge with the start
|
||||
* of our new range
|
||||
*/
|
||||
sector_t a = BB_OFFSET(p[lo]);
|
||||
sector_t e = a + BB_LEN(p[lo]);
|
||||
int ack = BB_ACK(p[lo]);
|
||||
|
||||
if (e >= s) {
|
||||
/* Yes, we can merge with a previous range */
|
||||
if (s == a && s + sectors >= e)
|
||||
/* new range covers old */
|
||||
ack = acknowledged;
|
||||
else
|
||||
ack = ack && acknowledged;
|
||||
|
||||
if (e < s + sectors)
|
||||
e = s + sectors;
|
||||
if (e - a <= BB_MAX_LEN) {
|
||||
p[lo] = BB_MAKE(a, e-a, ack);
|
||||
s = e;
|
||||
} else {
|
||||
/* does not all fit in one range,
|
||||
* make p[lo] maximal
|
||||
*/
|
||||
if (BB_LEN(p[lo]) != BB_MAX_LEN)
|
||||
p[lo] = BB_MAKE(a, BB_MAX_LEN, ack);
|
||||
s = a + BB_MAX_LEN;
|
||||
}
|
||||
sectors = e - s;
|
||||
}
|
||||
}
|
||||
if (sectors && hi < bb->count) {
|
||||
/* 'hi' points to the first range that starts after 's'.
|
||||
* Maybe we can merge with the start of that range
|
||||
*/
|
||||
sector_t a = BB_OFFSET(p[hi]);
|
||||
sector_t e = a + BB_LEN(p[hi]);
|
||||
int ack = BB_ACK(p[hi]);
|
||||
|
||||
if (a <= s + sectors) {
|
||||
/* merging is possible */
|
||||
if (e <= s + sectors) {
|
||||
/* full overlap */
|
||||
e = s + sectors;
|
||||
ack = acknowledged;
|
||||
} else
|
||||
ack = ack && acknowledged;
|
||||
|
||||
a = s;
|
||||
if (e - a <= BB_MAX_LEN) {
|
||||
p[hi] = BB_MAKE(a, e-a, ack);
|
||||
s = e;
|
||||
} else {
|
||||
p[hi] = BB_MAKE(a, BB_MAX_LEN, ack);
|
||||
s = a + BB_MAX_LEN;
|
||||
}
|
||||
sectors = e - s;
|
||||
lo = hi;
|
||||
hi++;
|
||||
}
|
||||
}
|
||||
if (sectors == 0 && hi < bb->count) {
|
||||
/* we might be able to combine lo and hi */
|
||||
/* Note: 's' is at the end of 'lo' */
|
||||
sector_t a = BB_OFFSET(p[hi]);
|
||||
int lolen = BB_LEN(p[lo]);
|
||||
int hilen = BB_LEN(p[hi]);
|
||||
int newlen = lolen + hilen - (s - a);
|
||||
|
||||
if (s >= a && newlen < BB_MAX_LEN) {
|
||||
/* yes, we can combine them */
|
||||
int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]);
|
||||
|
||||
p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack);
|
||||
memmove(p + hi, p + hi + 1,
|
||||
(bb->count - hi - 1) * 8);
|
||||
bb->count--;
|
||||
}
|
||||
}
|
||||
while (sectors) {
|
||||
/* didn't merge (it all).
|
||||
* Need to add a range just before 'hi'
|
||||
*/
|
||||
if (bb->count >= MAX_BADBLOCKS) {
|
||||
/* No room for more */
|
||||
rv = 1;
|
||||
break;
|
||||
} else {
|
||||
int this_sectors = sectors;
|
||||
|
||||
memmove(p + hi + 1, p + hi,
|
||||
(bb->count - hi) * 8);
|
||||
bb->count++;
|
||||
|
||||
if (this_sectors > BB_MAX_LEN)
|
||||
this_sectors = BB_MAX_LEN;
|
||||
p[hi] = BB_MAKE(s, this_sectors, acknowledged);
|
||||
sectors -= this_sectors;
|
||||
s += this_sectors;
|
||||
}
|
||||
}
|
||||
|
||||
bb->changed = 1;
|
||||
if (!acknowledged)
|
||||
bb->unacked_exist = 1;
|
||||
write_sequnlock_irqrestore(&bb->lock, flags);
|
||||
|
||||
return rv;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(badblocks_set);
|
||||
|
||||
/**
|
||||
* badblocks_clear() - Remove a range of bad blocks to the table.
|
||||
* @bb: the badblocks structure that holds all badblock information
|
||||
* @s: first sector to mark as bad
|
||||
* @sectors: number of sectors to mark as bad
|
||||
*
|
||||
* This may involve extending the table if we spilt a region,
|
||||
* but it must not fail. So if the table becomes full, we just
|
||||
* drop the remove request.
|
||||
*
|
||||
* Return:
|
||||
* 0: success
|
||||
* 1: failed to clear badblocks
|
||||
*/
|
||||
int badblocks_clear(struct badblocks *bb, sector_t s, int sectors)
|
||||
{
|
||||
u64 *p;
|
||||
int lo, hi;
|
||||
sector_t target = s + sectors;
|
||||
int rv = 0;
|
||||
|
||||
if (bb->shift > 0) {
|
||||
/* When clearing we round the start up and the end down.
|
||||
* This should not matter as the shift should align with
|
||||
* the block size and no rounding should ever be needed.
|
||||
* However it is better the think a block is bad when it
|
||||
* isn't than to think a block is not bad when it is.
|
||||
*/
|
||||
s += (1<<bb->shift) - 1;
|
||||
s >>= bb->shift;
|
||||
target >>= bb->shift;
|
||||
sectors = target - s;
|
||||
}
|
||||
|
||||
write_seqlock_irq(&bb->lock);
|
||||
|
||||
p = bb->page;
|
||||
lo = 0;
|
||||
hi = bb->count;
|
||||
/* Find the last range that starts before 'target' */
|
||||
while (hi - lo > 1) {
|
||||
int mid = (lo + hi) / 2;
|
||||
sector_t a = BB_OFFSET(p[mid]);
|
||||
|
||||
if (a < target)
|
||||
lo = mid;
|
||||
else
|
||||
hi = mid;
|
||||
}
|
||||
if (hi > lo) {
|
||||
/* p[lo] is the last range that could overlap the
|
||||
* current range. Earlier ranges could also overlap,
|
||||
* but only this one can overlap the end of the range.
|
||||
*/
|
||||
if (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) {
|
||||
/* Partial overlap, leave the tail of this range */
|
||||
int ack = BB_ACK(p[lo]);
|
||||
sector_t a = BB_OFFSET(p[lo]);
|
||||
sector_t end = a + BB_LEN(p[lo]);
|
||||
|
||||
if (a < s) {
|
||||
/* we need to split this range */
|
||||
if (bb->count >= MAX_BADBLOCKS) {
|
||||
rv = -ENOSPC;
|
||||
goto out;
|
||||
}
|
||||
memmove(p+lo+1, p+lo, (bb->count - lo) * 8);
|
||||
bb->count++;
|
||||
p[lo] = BB_MAKE(a, s-a, ack);
|
||||
lo++;
|
||||
}
|
||||
p[lo] = BB_MAKE(target, end - target, ack);
|
||||
/* there is no longer an overlap */
|
||||
hi = lo;
|
||||
lo--;
|
||||
}
|
||||
while (lo >= 0 &&
|
||||
BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
|
||||
/* This range does overlap */
|
||||
if (BB_OFFSET(p[lo]) < s) {
|
||||
/* Keep the early parts of this range. */
|
||||
int ack = BB_ACK(p[lo]);
|
||||
sector_t start = BB_OFFSET(p[lo]);
|
||||
|
||||
p[lo] = BB_MAKE(start, s - start, ack);
|
||||
/* now low doesn't overlap, so.. */
|
||||
break;
|
||||
}
|
||||
lo--;
|
||||
}
|
||||
/* 'lo' is strictly before, 'hi' is strictly after,
|
||||
* anything between needs to be discarded
|
||||
*/
|
||||
if (hi - lo > 1) {
|
||||
memmove(p+lo+1, p+hi, (bb->count - hi) * 8);
|
||||
bb->count -= (hi - lo - 1);
|
||||
}
|
||||
}
|
||||
|
||||
bb->changed = 1;
|
||||
out:
|
||||
write_sequnlock_irq(&bb->lock);
|
||||
return rv;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(badblocks_clear);
|
||||
|
||||
/**
|
||||
* ack_all_badblocks() - Acknowledge all bad blocks in a list.
|
||||
* @bb: the badblocks structure that holds all badblock information
|
||||
*
|
||||
* This only succeeds if ->changed is clear. It is used by
|
||||
* in-kernel metadata updates
|
||||
*/
|
||||
void ack_all_badblocks(struct badblocks *bb)
|
||||
{
|
||||
if (bb->page == NULL || bb->changed)
|
||||
/* no point even trying */
|
||||
return;
|
||||
write_seqlock_irq(&bb->lock);
|
||||
|
||||
if (bb->changed == 0 && bb->unacked_exist) {
|
||||
u64 *p = bb->page;
|
||||
int i;
|
||||
|
||||
for (i = 0; i < bb->count ; i++) {
|
||||
if (!BB_ACK(p[i])) {
|
||||
sector_t start = BB_OFFSET(p[i]);
|
||||
int len = BB_LEN(p[i]);
|
||||
|
||||
p[i] = BB_MAKE(start, len, 1);
|
||||
}
|
||||
}
|
||||
bb->unacked_exist = 0;
|
||||
}
|
||||
write_sequnlock_irq(&bb->lock);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(ack_all_badblocks);
|
||||
|
||||
/**
|
||||
* badblocks_show() - sysfs access to bad-blocks list
|
||||
* @bb: the badblocks structure that holds all badblock information
|
||||
* @page: buffer received from sysfs
|
||||
* @unack: weather to show unacknowledged badblocks
|
||||
*
|
||||
* Return:
|
||||
* Length of returned data
|
||||
*/
|
||||
ssize_t badblocks_show(struct badblocks *bb, char *page, int unack)
|
||||
{
|
||||
size_t len;
|
||||
int i;
|
||||
u64 *p = bb->page;
|
||||
unsigned seq;
|
||||
|
||||
if (bb->shift < 0)
|
||||
return 0;
|
||||
|
||||
retry:
|
||||
seq = read_seqbegin(&bb->lock);
|
||||
|
||||
len = 0;
|
||||
i = 0;
|
||||
|
||||
while (len < PAGE_SIZE && i < bb->count) {
|
||||
sector_t s = BB_OFFSET(p[i]);
|
||||
unsigned int length = BB_LEN(p[i]);
|
||||
int ack = BB_ACK(p[i]);
|
||||
|
||||
i++;
|
||||
|
||||
if (unack && ack)
|
||||
continue;
|
||||
|
||||
len += snprintf(page+len, PAGE_SIZE-len, "%llu %u\n",
|
||||
(unsigned long long)s << bb->shift,
|
||||
length << bb->shift);
|
||||
}
|
||||
if (unack && len == 0)
|
||||
bb->unacked_exist = 0;
|
||||
|
||||
if (read_seqretry(&bb->lock, seq))
|
||||
goto retry;
|
||||
|
||||
return len;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(badblocks_show);
|
||||
|
||||
/**
|
||||
* badblocks_store() - sysfs access to bad-blocks list
|
||||
* @bb: the badblocks structure that holds all badblock information
|
||||
* @page: buffer received from sysfs
|
||||
* @len: length of data received from sysfs
|
||||
* @unack: weather to show unacknowledged badblocks
|
||||
*
|
||||
* Return:
|
||||
* Length of the buffer processed or -ve error.
|
||||
*/
|
||||
ssize_t badblocks_store(struct badblocks *bb, const char *page, size_t len,
|
||||
int unack)
|
||||
{
|
||||
unsigned long long sector;
|
||||
int length;
|
||||
char newline;
|
||||
|
||||
switch (sscanf(page, "%llu %d%c", §or, &length, &newline)) {
|
||||
case 3:
|
||||
if (newline != '\n')
|
||||
return -EINVAL;
|
||||
case 2:
|
||||
if (length <= 0)
|
||||
return -EINVAL;
|
||||
break;
|
||||
default:
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
if (badblocks_set(bb, sector, length, !unack))
|
||||
return -ENOSPC;
|
||||
else
|
||||
return len;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(badblocks_store);
|
||||
|
||||
static int __badblocks_init(struct device *dev, struct badblocks *bb,
|
||||
int enable)
|
||||
{
|
||||
bb->dev = dev;
|
||||
bb->count = 0;
|
||||
if (enable)
|
||||
bb->shift = 0;
|
||||
else
|
||||
bb->shift = -1;
|
||||
if (dev)
|
||||
bb->page = devm_kzalloc(dev, PAGE_SIZE, GFP_KERNEL);
|
||||
else
|
||||
bb->page = kzalloc(PAGE_SIZE, GFP_KERNEL);
|
||||
if (!bb->page) {
|
||||
bb->shift = -1;
|
||||
return -ENOMEM;
|
||||
}
|
||||
seqlock_init(&bb->lock);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* badblocks_init() - initialize the badblocks structure
|
||||
* @bb: the badblocks structure that holds all badblock information
|
||||
* @enable: weather to enable badblocks accounting
|
||||
*
|
||||
* Return:
|
||||
* 0: success
|
||||
* -ve errno: on error
|
||||
*/
|
||||
int badblocks_init(struct badblocks *bb, int enable)
|
||||
{
|
||||
return __badblocks_init(NULL, bb, enable);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(badblocks_init);
|
||||
|
||||
int devm_init_badblocks(struct device *dev, struct badblocks *bb)
|
||||
{
|
||||
if (!bb)
|
||||
return -EINVAL;
|
||||
return __badblocks_init(dev, bb, 1);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(devm_init_badblocks);
|
||||
|
||||
/**
|
||||
* badblocks_exit() - free the badblocks structure
|
||||
* @bb: the badblocks structure that holds all badblock information
|
||||
*/
|
||||
void badblocks_exit(struct badblocks *bb)
|
||||
{
|
||||
if (!bb)
|
||||
return;
|
||||
if (bb->dev)
|
||||
devm_kfree(bb->dev, bb->page);
|
||||
else
|
||||
kfree(bb->page);
|
||||
bb->page = NULL;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(badblocks_exit);
|
@ -20,6 +20,7 @@
|
||||
#include <linux/idr.h>
|
||||
#include <linux/log2.h>
|
||||
#include <linux/pm_runtime.h>
|
||||
#include <linux/badblocks.h>
|
||||
|
||||
#include "blk.h"
|
||||
|
||||
@ -664,7 +665,6 @@ void del_gendisk(struct gendisk *disk)
|
||||
|
||||
kobject_put(disk->part0.holder_dir);
|
||||
kobject_put(disk->slave_dir);
|
||||
disk->driverfs_dev = NULL;
|
||||
if (!sysfs_deprecated)
|
||||
sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk)));
|
||||
pm_runtime_set_memalloc_noio(disk_to_dev(disk), false);
|
||||
@ -672,6 +672,31 @@ void del_gendisk(struct gendisk *disk)
|
||||
}
|
||||
EXPORT_SYMBOL(del_gendisk);
|
||||
|
||||
/* sysfs access to bad-blocks list. */
|
||||
static ssize_t disk_badblocks_show(struct device *dev,
|
||||
struct device_attribute *attr,
|
||||
char *page)
|
||||
{
|
||||
struct gendisk *disk = dev_to_disk(dev);
|
||||
|
||||
if (!disk->bb)
|
||||
return sprintf(page, "\n");
|
||||
|
||||
return badblocks_show(disk->bb, page, 0);
|
||||
}
|
||||
|
||||
/*
 * sysfs 'badblocks' attribute, write side: hand the buffer to
 * badblocks_store(), or fail with -ENXIO when the disk has no
 * badblocks structure.
 */
static ssize_t disk_badblocks_store(struct device *dev,
					struct device_attribute *attr,
					const char *page, size_t len)
{
	struct gendisk *disk = dev_to_disk(dev);

	if (disk->bb)
		return badblocks_store(disk->bb, page, len, 0);

	return -ENXIO;
}
|
||||
|
||||
/**
|
||||
* get_gendisk - get partitioning information for a given device
|
||||
* @devt: device to get partitioning information for
|
||||
@ -990,6 +1015,8 @@ static DEVICE_ATTR(discard_alignment, S_IRUGO, disk_discard_alignment_show,
|
||||
static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL);
|
||||
static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
|
||||
static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL);
|
||||
static DEVICE_ATTR(badblocks, S_IRUGO | S_IWUSR, disk_badblocks_show,
|
||||
disk_badblocks_store);
|
||||
#ifdef CONFIG_FAIL_MAKE_REQUEST
|
||||
static struct device_attribute dev_attr_fail =
|
||||
__ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store);
|
||||
@ -1011,6 +1038,7 @@ static struct attribute *disk_attrs[] = {
|
||||
&dev_attr_capability.attr,
|
||||
&dev_attr_stat.attr,
|
||||
&dev_attr_inflight.attr,
|
||||
&dev_attr_badblocks.attr,
|
||||
#ifdef CONFIG_FAIL_MAKE_REQUEST
|
||||
&dev_attr_fail.attr,
|
||||
#endif
|
||||
|
@ -4,6 +4,7 @@
|
||||
#include <linux/gfp.h>
|
||||
#include <linux/blkpg.h>
|
||||
#include <linux/hdreg.h>
|
||||
#include <linux/badblocks.h>
|
||||
#include <linux/backing-dev.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/blktrace_api.h>
|
||||
@ -406,6 +407,71 @@ static inline int is_unrecognized_ioctl(int ret)
|
||||
ret == -ENOIOCTLCMD;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_FS_DAX
|
||||
bool blkdev_dax_capable(struct block_device *bdev)
|
||||
{
|
||||
struct gendisk *disk = bdev->bd_disk;
|
||||
|
||||
if (!disk->fops->direct_access)
|
||||
return false;
|
||||
|
||||
/*
|
||||
* If the partition is not aligned on a page boundary, we can't
|
||||
* do dax I/O to it.
|
||||
*/
|
||||
if ((bdev->bd_part->start_sect % (PAGE_SIZE / 512))
|
||||
|| (bdev->bd_part->nr_sects % (PAGE_SIZE / 512)))
|
||||
return false;
|
||||
|
||||
/*
|
||||
* If the device has known bad blocks, force all I/O through the
|
||||
* driver / page cache.
|
||||
*
|
||||
* TODO: support finer grained dax error handling
|
||||
*/
|
||||
if (disk->bb && disk->bb->count)
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static int blkdev_daxset(struct block_device *bdev, unsigned long argp)
|
||||
{
|
||||
unsigned long arg;
|
||||
int rc = 0;
|
||||
|
||||
if (!capable(CAP_SYS_ADMIN))
|
||||
return -EACCES;
|
||||
|
||||
if (get_user(arg, (int __user *)(argp)))
|
||||
return -EFAULT;
|
||||
arg = !!arg;
|
||||
if (arg == !!(bdev->bd_inode->i_flags & S_DAX))
|
||||
return 0;
|
||||
|
||||
if (arg)
|
||||
arg = S_DAX;
|
||||
|
||||
if (arg && !blkdev_dax_capable(bdev))
|
||||
return -ENOTTY;
|
||||
|
||||
mutex_lock(&bdev->bd_inode->i_mutex);
|
||||
if (bdev->bd_map_count == 0)
|
||||
inode_set_flags(bdev->bd_inode, arg, S_DAX);
|
||||
else
|
||||
rc = -EBUSY;
|
||||
mutex_unlock(&bdev->bd_inode->i_mutex);
|
||||
return rc;
|
||||
}
|
||||
#else
|
||||
static int blkdev_daxset(struct block_device *bdev, int arg)
|
||||
{
|
||||
if (arg)
|
||||
return -ENOTTY;
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
static int blkdev_flushbuf(struct block_device *bdev, fmode_t mode,
|
||||
unsigned cmd, unsigned long arg)
|
||||
{
|
||||
@ -568,6 +634,11 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
|
||||
case BLKTRACESETUP:
|
||||
case BLKTRACETEARDOWN:
|
||||
return blk_trace_ioctl(bdev, cmd, argp);
|
||||
case BLKDAXSET:
|
||||
return blkdev_daxset(bdev, arg);
|
||||
case BLKDAXGET:
|
||||
return put_int(arg, !!(bdev->bd_inode->i_flags & S_DAX));
|
||||
break;
|
||||
case IOC_PR_REGISTER:
|
||||
return blkdev_pr_register(bdev, argp);
|
||||
case IOC_PR_RESERVE:
|
||||
|
@ -15,6 +15,7 @@
|
||||
#include <linux/module.h>
|
||||
#include <linux/mutex.h>
|
||||
#include <linux/ndctl.h>
|
||||
#include <linux/delay.h>
|
||||
#include <linux/list.h>
|
||||
#include <linux/acpi.h>
|
||||
#include <linux/sort.h>
|
||||
@ -1473,6 +1474,201 @@ static void acpi_nfit_blk_region_disable(struct nvdimm_bus *nvdimm_bus,
|
||||
/* devm will free nfit_blk */
|
||||
}
|
||||
|
||||
/*
 * Issue ND_CMD_ARS_CAP for the range [@addr, @addr + @length).
 *
 * Fills in the request fields of @cmd and returns whatever the bus
 * ->ndctl() handler returns; the result payload is left in @cmd.
 */
static int ars_get_cap(struct nvdimm_bus_descriptor *nd_desc,
		struct nd_cmd_ars_cap *cmd, u64 addr, u64 length)
{
	int rc;

	cmd->address = addr;
	cmd->length = length;
	rc = nd_desc->ndctl(nd_desc, NULL, ND_CMD_ARS_CAP, cmd, sizeof(*cmd));

	return rc;
}
|
||||
|
||||
/*
 * Start a persistent-memory ARS over [@addr, @addr + @length).
 *
 * The command is re-issued once a second for as long as it reports
 * status 3 (an ARS is already in progress).
 *
 * Returns the ->ndctl() error if the call itself fails, 0 for status 0
 * or 1, -EINVAL for status 2 and -ENXIO for any other status.
 */
static int ars_do_start(struct nvdimm_bus_descriptor *nd_desc,
		struct nd_cmd_ars_start *cmd, u64 addr, u64 length)
{
	int rc;

	cmd->address = addr;
	cmd->length = length;
	cmd->type = ND_ARS_PERSISTENT;

	for (;;) {
		rc = nd_desc->ndctl(nd_desc, NULL, ND_CMD_ARS_START, cmd,
				sizeof(*cmd));
		if (rc)
			return rc;

		/* Anything other than "ARS is in progress" ends the polling */
		if (cmd->status != 3)
			break;

		msleep(1000);
	}

	switch (cmd->status) {
	case 0:
	case 1:
		/* 1 == ARS unsupported, but we should never get here */
		return 0;
	case 2:
		return -EINVAL;
	default:
		return -ENXIO;
	}
}
|
||||
|
||||
static int ars_get_status(struct nvdimm_bus_descriptor *nd_desc,
|
||||
struct nd_cmd_ars_status *cmd)
|
||||
{
|
||||
int rc;
|
||||
|
||||
while (1) {
|
||||
rc = nd_desc->ndctl(nd_desc, NULL, ND_CMD_ARS_STATUS, cmd,
|
||||
sizeof(*cmd));
|
||||
if (rc || cmd->status & 0xffff)
|
||||
return -ENXIO;
|
||||
|
||||
/* Check extended status (Upper two bytes) */
|
||||
switch (cmd->status >> 16) {
|
||||
case 0:
|
||||
return 0;
|
||||
case 1:
|
||||
/* ARS is in progress */
|
||||
msleep(1000);
|
||||
break;
|
||||
case 2:
|
||||
/* No ARS performed for the current boot */
|
||||
return 0;
|
||||
default:
|
||||
return -ENXIO;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Feed every error record in @ars_status to nvdimm_bus_add_poison().
 *
 * @start is the address the most recent ARS was started for.  The range
 * reported by @ars_status must begin at or before @start and must extend
 * beyond it, i.e. [address, address + length) has to contain @start.
 *
 * Returns 0 on success, -ENXIO if the reported range does not cover
 * @start, or the first error returned by nvdimm_bus_add_poison().
 */
static int ars_status_process_records(struct nvdimm_bus *nvdimm_bus,
		struct nd_cmd_ars_status *ars_status, u64 start)
{
	int rc;
	u32 i;

	/*
	 * The address field returned by ars_status must be less than or
	 * equal to the address we last started ARS for, and the
	 * (address, length) it reports must have non-zero overlap with the
	 * range we started ARS for.  If this is not the case, bail.
	 *
	 * The overlap test is written as a subtraction so that it cannot
	 * wrap, and it rejects a range that ends exactly at 'start': such a
	 * range covers nothing we asked for and would let a caller that
	 * advances by the covered amount make no progress.
	 */
	if (ars_status->address > start ||
	    start - ars_status->address >= ars_status->length)
		return -ENXIO;

	for (i = 0; i < ars_status->num_records; i++) {
		rc = nvdimm_bus_add_poison(nvdimm_bus,
				ars_status->records[i].err_address,
				ars_status->records[i].length);
		if (rc)
			return rc;
	}

	return 0;
}
|
||||
|
||||
static int acpi_nfit_find_poison(struct acpi_nfit_desc *acpi_desc,
|
||||
struct nd_region_desc *ndr_desc)
|
||||
{
|
||||
struct nvdimm_bus_descriptor *nd_desc = &acpi_desc->nd_desc;
|
||||
struct nvdimm_bus *nvdimm_bus = acpi_desc->nvdimm_bus;
|
||||
struct nd_cmd_ars_status *ars_status = NULL;
|
||||
struct nd_cmd_ars_start *ars_start = NULL;
|
||||
struct nd_cmd_ars_cap *ars_cap = NULL;
|
||||
u64 start, len, cur, remaining;
|
||||
int rc;
|
||||
|
||||
ars_cap = kzalloc(sizeof(*ars_cap), GFP_KERNEL);
|
||||
if (!ars_cap)
|
||||
return -ENOMEM;
|
||||
|
||||
start = ndr_desc->res->start;
|
||||
len = ndr_desc->res->end - ndr_desc->res->start + 1;
|
||||
|
||||
rc = ars_get_cap(nd_desc, ars_cap, start, len);
|
||||
if (rc)
|
||||
goto out;
|
||||
|
||||
/*
|
||||
* If ARS is unsupported, or if the 'Persistent Memory Scrub' flag in
|
||||
* extended status is not set, skip this but continue initialization
|
||||
*/
|
||||
if ((ars_cap->status & 0xffff) ||
|
||||
!(ars_cap->status >> 16 & ND_ARS_PERSISTENT)) {
|
||||
dev_warn(acpi_desc->dev,
|
||||
"ARS unsupported (status: 0x%x), won't create an error list\n",
|
||||
ars_cap->status);
|
||||
goto out;
|
||||
}
|
||||
|
||||
/*
|
||||
* Check if a full-range ARS has been run. If so, use those results
|
||||
* without having to start a new ARS.
|
||||
*/
|
||||
ars_status = kzalloc(ars_cap->max_ars_out + sizeof(*ars_status),
|
||||
GFP_KERNEL);
|
||||
if (!ars_status) {
|
||||
rc = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
rc = ars_get_status(nd_desc, ars_status);
|
||||
if (rc)
|
||||
goto out;
|
||||
|
||||
if (ars_status->address <= start &&
|
||||
(ars_status->address + ars_status->length >= start + len)) {
|
||||
rc = ars_status_process_records(nvdimm_bus, ars_status, start);
|
||||
goto out;
|
||||
}
|
||||
|
||||
/*
|
||||
* ARS_STATUS can overflow if the number of poison entries found is
|
||||
* greater than the maximum buffer size (ars_cap->max_ars_out)
|
||||
* To detect overflow, check if the length field of ars_status
|
||||
* is less than the length we supplied. If so, process the
|
||||
* error entries we got, adjust the start point, and start again
|
||||
*/
|
||||
ars_start = kzalloc(sizeof(*ars_start), GFP_KERNEL);
|
||||
if (!ars_start)
|
||||
return -ENOMEM;
|
||||
|
||||
cur = start;
|
||||
remaining = len;
|
||||
do {
|
||||
u64 done, end;
|
||||
|
||||
rc = ars_do_start(nd_desc, ars_start, cur, remaining);
|
||||
if (rc)
|
||||
goto out;
|
||||
|
||||
rc = ars_get_status(nd_desc, ars_status);
|
||||
if (rc)
|
||||
goto out;
|
||||
|
||||
rc = ars_status_process_records(nvdimm_bus, ars_status, cur);
|
||||
if (rc)
|
||||
goto out;
|
||||
|
||||
end = min(cur + remaining,
|
||||
ars_status->address + ars_status->length);
|
||||
done = end - cur;
|
||||
cur += done;
|
||||
remaining -= done;
|
||||
} while (remaining);
|
||||
|
||||
out:
|
||||
kfree(ars_cap);
|
||||
kfree(ars_start);
|
||||
kfree(ars_status);
|
||||
return rc;
|
||||
}
|
||||
|
||||
static int acpi_nfit_init_mapping(struct acpi_nfit_desc *acpi_desc,
|
||||
struct nd_mapping *nd_mapping, struct nd_region_desc *ndr_desc,
|
||||
struct acpi_nfit_memory_map *memdev,
|
||||
@ -1585,6 +1781,13 @@ static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc,
|
||||
|
||||
nvdimm_bus = acpi_desc->nvdimm_bus;
|
||||
if (nfit_spa_type(spa) == NFIT_SPA_PM) {
|
||||
rc = acpi_nfit_find_poison(acpi_desc, ndr_desc);
|
||||
if (rc) {
|
||||
dev_err(acpi_desc->dev,
|
||||
"error while performing ARS to find poison: %d\n",
|
||||
rc);
|
||||
return rc;
|
||||
}
|
||||
if (!nvdimm_pmem_region_create(nvdimm_bus, ndr_desc))
|
||||
return -ENOMEM;
|
||||
} else if (nfit_spa_type(spa) == NFIT_SPA_VOLATILE) {
|
||||
|
516
drivers/md/md.c
516
drivers/md/md.c
@ -34,6 +34,7 @@
|
||||
|
||||
#include <linux/kthread.h>
|
||||
#include <linux/blkdev.h>
|
||||
#include <linux/badblocks.h>
|
||||
#include <linux/sysctl.h>
|
||||
#include <linux/seq_file.h>
|
||||
#include <linux/fs.h>
|
||||
@ -710,8 +711,7 @@ void md_rdev_clear(struct md_rdev *rdev)
|
||||
put_page(rdev->bb_page);
|
||||
rdev->bb_page = NULL;
|
||||
}
|
||||
kfree(rdev->badblocks.page);
|
||||
rdev->badblocks.page = NULL;
|
||||
badblocks_exit(&rdev->badblocks);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(md_rdev_clear);
|
||||
|
||||
@ -1361,8 +1361,6 @@ static __le32 calc_sb_1_csum(struct mdp_superblock_1 *sb)
|
||||
return cpu_to_le32(csum);
|
||||
}
|
||||
|
||||
static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors,
|
||||
int acknowledged);
|
||||
static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
|
||||
{
|
||||
struct mdp_superblock_1 *sb;
|
||||
@ -1487,8 +1485,7 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_
|
||||
count <<= sb->bblog_shift;
|
||||
if (bb + 1 == 0)
|
||||
break;
|
||||
if (md_set_badblocks(&rdev->badblocks,
|
||||
sector, count, 1) == 0)
|
||||
if (badblocks_set(&rdev->badblocks, sector, count, 1))
|
||||
return -EINVAL;
|
||||
}
|
||||
} else if (sb->bblog_offset != 0)
|
||||
@ -2320,7 +2317,7 @@ void md_update_sb(struct mddev *mddev, int force_change)
|
||||
rdev_for_each(rdev, mddev) {
|
||||
if (rdev->badblocks.changed) {
|
||||
rdev->badblocks.changed = 0;
|
||||
md_ack_all_badblocks(&rdev->badblocks);
|
||||
ack_all_badblocks(&rdev->badblocks);
|
||||
md_error(mddev, rdev);
|
||||
}
|
||||
clear_bit(Blocked, &rdev->flags);
|
||||
@ -2446,7 +2443,7 @@ void md_update_sb(struct mddev *mddev, int force_change)
|
||||
clear_bit(Blocked, &rdev->flags);
|
||||
|
||||
if (any_badblocks_changed)
|
||||
md_ack_all_badblocks(&rdev->badblocks);
|
||||
ack_all_badblocks(&rdev->badblocks);
|
||||
clear_bit(BlockedBadBlocks, &rdev->flags);
|
||||
wake_up(&rdev->blocked_wait);
|
||||
}
|
||||
@ -3054,11 +3051,17 @@ static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_
|
||||
static struct rdev_sysfs_entry rdev_recovery_start =
|
||||
__ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store);
|
||||
|
||||
static ssize_t
|
||||
badblocks_show(struct badblocks *bb, char *page, int unack);
|
||||
static ssize_t
|
||||
badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack);
|
||||
|
||||
/* sysfs access to bad-blocks list.
|
||||
* We present two files.
|
||||
* 'bad-blocks' lists sector numbers and lengths of ranges that
|
||||
* are recorded as bad. The list is truncated to fit within
|
||||
* the one-page limit of sysfs.
|
||||
* Writing "sector length" to this file adds an acknowledged
|
||||
* bad block list.
|
||||
* 'unacknowledged-bad-blocks' lists bad blocks that have not yet
|
||||
* been acknowledged. Writing to this file adds bad blocks
|
||||
* without acknowledging them. This is largely for testing.
|
||||
*/
|
||||
static ssize_t bb_show(struct md_rdev *rdev, char *page)
|
||||
{
|
||||
return badblocks_show(&rdev->badblocks, page, 0);
|
||||
@ -3173,14 +3176,7 @@ int md_rdev_init(struct md_rdev *rdev)
|
||||
* This reserves the space even on arrays where it cannot
|
||||
* be used - I wonder if that matters
|
||||
*/
|
||||
rdev->badblocks.count = 0;
|
||||
rdev->badblocks.shift = -1; /* disabled until explicitly enabled */
|
||||
rdev->badblocks.page = kmalloc(PAGE_SIZE, GFP_KERNEL);
|
||||
seqlock_init(&rdev->badblocks.lock);
|
||||
if (rdev->badblocks.page == NULL)
|
||||
return -ENOMEM;
|
||||
|
||||
return 0;
|
||||
return badblocks_init(&rdev->badblocks, 0);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(md_rdev_init);
|
||||
/*
|
||||
@ -8489,254 +8485,9 @@ void md_finish_reshape(struct mddev *mddev)
|
||||
}
|
||||
EXPORT_SYMBOL(md_finish_reshape);
|
||||
|
||||
/* Bad block management.
|
||||
* We can record which blocks on each device are 'bad' and so just
|
||||
* fail those blocks, or that stripe, rather than the whole device.
|
||||
* Entries in the bad-block table are 64bits wide. This comprises:
|
||||
* Length of bad-range, in sectors: 0-511 for lengths 1-512
|
||||
* Start of bad-range, sector offset, 54 bits (allows 8 exbibytes)
|
||||
* A 'shift' can be set so that larger blocks are tracked and
|
||||
* consequently larger devices can be covered.
|
||||
* 'Acknowledged' flag - 1 bit. - the most significant bit.
|
||||
*
|
||||
* Locking of the bad-block table uses a seqlock so md_is_badblock
|
||||
* might need to retry if it is very unlucky.
|
||||
* We will sometimes want to check for bad blocks in a bi_end_io function,
|
||||
* so we use the write_seqlock_irq variant.
|
||||
*
|
||||
* When looking for a bad block we specify a range and want to
|
||||
* know if any block in the range is bad. So we binary-search
|
||||
* to the last range that starts at-or-before the given endpoint,
|
||||
* (or "before the sector after the target range")
|
||||
* then see if it ends after the given start.
|
||||
* We return
|
||||
* 0 if there are no known bad blocks in the range
|
||||
* 1 if there are known bad block which are all acknowledged
|
||||
* -1 if there are bad blocks which have not yet been acknowledged in metadata.
|
||||
* plus the start/length of the first bad section we overlap.
|
||||
*/
|
||||
/*
 * Check whether any part of [s, s + sectors) is recorded as bad in @bb.
 *
 * Returns 0 if nothing in the range is bad, 1 if every overlapping entry
 * is acknowledged, and -1 if at least one overlapping entry is not.  When
 * the result is non-zero, *first_bad / *bad_sectors describe the lowest
 * overlapping table entry.  The lookup runs under the table seqlock and
 * is repeated if a writer raced with it.
 */
int md_is_badblock(struct badblocks *bb, sector_t s, int sectors,
		   sector_t *first_bad, int *bad_sectors)
{
	u64 *table = bb->page;
	sector_t target = s + sectors;
	unsigned seq;
	int lo, hi, rv;

	if (bb->shift > 0) {
		/* round the start down, and the end up */
		s >>= bb->shift;
		target += (1<<bb->shift) - 1;
		target >>= bb->shift;
		sectors = target - s;
	}
	/* 'target' is now the first block after the bad range */

	do {
		seq = read_seqbegin(&bb->lock);
		rv = 0;
		lo = 0;
		hi = bb->count;

		/*
		 * Binary search for the last entry that starts before
		 * 'target'.  Entries below 'lo' and at-or-above 'hi' are
		 * known not to be it; hi - lo shrinks until it reaches 1.
		 */
		while (hi - lo > 1) {
			int mid = (lo + hi) / 2;
			sector_t entry_start = BB_OFFSET(table[mid]);

			if (entry_start < target)
				lo = mid;	/* still a candidate */
			else
				hi = mid;	/* this and later are out */
		}

		if (hi > lo) {
			/*
			 * Walk backwards over every entry that ends after
			 * 's', looking for unacknowledged ones.
			 */
			for (; lo >= 0 &&
			       BB_OFFSET(table[lo]) + BB_LEN(table[lo]) > s;
			     lo--) {
				if (BB_OFFSET(table[lo]) >= target)
					continue;

				/*
				 * starts before the end and finishes after
				 * the start, so the two must overlap
				 */
				if (rv != -1 && BB_ACK(table[lo]))
					rv = 1;
				else
					rv = -1;
				*first_bad = BB_OFFSET(table[lo]);
				*bad_sectors = BB_LEN(table[lo]);
			}
		}
	} while (read_seqretry(&bb->lock, seq));

	return rv;
}
|
||||
EXPORT_SYMBOL_GPL(md_is_badblock);
|
||||
|
||||
/*
|
||||
* Add a range of bad blocks to the table.
|
||||
* This might extend the table, or might contract it
|
||||
* if two adjacent ranges can be merged.
|
||||
* We binary-search to find the 'insertion' point, then
|
||||
* decide how best to handle it.
|
||||
*/
|
||||
/*
 * Record [s, s + sectors) as bad in @bb, marking it acknowledged or not
 * according to @acknowledged.
 *
 * Returns 1 on success and 0 if badblocks are disabled (negative shift)
 * or the table ran out of room before the whole range was recorded.
 * The table is modified under the write side of bb->lock with
 * interrupts saved/restored.
 */
static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors,
			    int acknowledged)
{
	u64 *p;
	int lo, hi;
	int rv = 1;
	unsigned long flags;

	if (bb->shift < 0)
		/* badblocks are disabled */
		return 0;

	if (bb->shift) {
		/* round the start down, and the end up */
		sector_t next = s + sectors;
		s >>= bb->shift;
		next += (1<<bb->shift) - 1;
		next >>= bb->shift;
		sectors = next - s;
	}

	write_seqlock_irqsave(&bb->lock, flags);

	p = bb->page;
	lo = 0;
	hi = bb->count;
	/* Find the last range that starts at-or-before 's' */
	while (hi - lo > 1) {
		int mid = (lo + hi) / 2;
		sector_t a = BB_OFFSET(p[mid]);
		if (a <= s)
			lo = mid;
		else
			hi = mid;
	}
	/* no entry starts at-or-before 's': collapse so that hi == lo */
	if (hi > lo && BB_OFFSET(p[lo]) > s)
		hi = lo;

	if (hi > lo) {
		/* we found a range that might merge with the start
		 * of our new range
		 */
		sector_t a = BB_OFFSET(p[lo]);
		sector_t e = a + BB_LEN(p[lo]);
		int ack = BB_ACK(p[lo]);
		if (e >= s) {
			/* Yes, we can merge with a previous range */
			if (s == a && s + sectors >= e)
				/* new range covers old */
				ack = acknowledged;
			else
				ack = ack && acknowledged;

			if (e < s + sectors)
				e = s + sectors;
			if (e - a <= BB_MAX_LEN) {
				p[lo] = BB_MAKE(a, e-a, ack);
				s = e;
			} else {
				/* does not all fit in one range,
				 * make p[lo] maximal
				 */
				if (BB_LEN(p[lo]) != BB_MAX_LEN)
					p[lo] = BB_MAKE(a, BB_MAX_LEN, ack);
				s = a + BB_MAX_LEN;
			}
			/* whatever is left of the request starts at the new 's' */
			sectors = e - s;
		}
	}
	if (sectors && hi < bb->count) {
		/* 'hi' points to the first range that starts after 's'.
		 * Maybe we can merge with the start of that range */
		sector_t a = BB_OFFSET(p[hi]);
		sector_t e = a + BB_LEN(p[hi]);
		int ack = BB_ACK(p[hi]);
		if (a <= s + sectors) {
			/* merging is possible */
			if (e <= s + sectors) {
				/* full overlap */
				e = s + sectors;
				ack = acknowledged;
			} else
				ack = ack && acknowledged;

			a = s;
			if (e - a <= BB_MAX_LEN) {
				p[hi] = BB_MAKE(a, e-a, ack);
				s = e;
			} else {
				p[hi] = BB_MAKE(a, BB_MAX_LEN, ack);
				s = a + BB_MAX_LEN;
			}
			sectors = e - s;
			lo = hi;
			hi++;
		}
	}
	if (sectors == 0 && hi < bb->count) {
		/* we might be able to combine lo and hi */
		/* Note: 's' is at the end of 'lo' */
		sector_t a = BB_OFFSET(p[hi]);
		int lolen = BB_LEN(p[lo]);
		int hilen = BB_LEN(p[hi]);
		int newlen = lolen + hilen - (s - a);
		if (s >= a && newlen < BB_MAX_LEN) {
			/* yes, we can combine them */
			int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]);
			p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack);
			/* close the gap left by the absorbed 'hi' entry (8 bytes each) */
			memmove(p + hi, p + hi + 1,
				(bb->count - hi - 1) * 8);
			bb->count--;
		}
	}
	while (sectors) {
		/* didn't merge (it all).
		 * Need to add a range just before 'hi' */
		if (bb->count >= MD_MAX_BADBLOCKS) {
			/* No room for more */
			rv = 0;
			break;
		} else {
			int this_sectors = sectors;
			/* open a slot at 'hi' for the new entry */
			memmove(p + hi + 1, p + hi,
				(bb->count - hi) * 8);
			bb->count++;

			/* one entry holds at most BB_MAX_LEN sectors */
			if (this_sectors > BB_MAX_LEN)
				this_sectors = BB_MAX_LEN;
			p[hi] = BB_MAKE(s, this_sectors, acknowledged);
			sectors -= this_sectors;
			s += this_sectors;
		}
	}

	/* 'changed' is set even when the table was full and rv is 0 */
	bb->changed = 1;
	if (!acknowledged)
		bb->unacked_exist = 1;
	write_sequnlock_irqrestore(&bb->lock, flags);

	return rv;
}
|
||||
/* Bad block management */
|
||||
|
||||
/* Returns 1 on success, 0 on failure */
|
||||
int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
|
||||
int is_new)
|
||||
{
|
||||
@ -8745,114 +8496,19 @@ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
|
||||
s += rdev->new_data_offset;
|
||||
else
|
||||
s += rdev->data_offset;
|
||||
rv = md_set_badblocks(&rdev->badblocks,
|
||||
s, sectors, 0);
|
||||
if (rv) {
|
||||
rv = badblocks_set(&rdev->badblocks, s, sectors, 0);
|
||||
if (rv == 0) {
|
||||
/* Make sure they get written out promptly */
|
||||
sysfs_notify_dirent_safe(rdev->sysfs_state);
|
||||
set_bit(MD_CHANGE_CLEAN, &rdev->mddev->flags);
|
||||
set_bit(MD_CHANGE_PENDING, &rdev->mddev->flags);
|
||||
md_wakeup_thread(rdev->mddev->thread);
|
||||
}
|
||||
return rv;
|
||||
return 1;
|
||||
} else
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(rdev_set_badblocks);
|
||||
|
||||
/*
|
||||
* Remove a range of bad blocks from the table.
|
||||
* This may involve extending the table if we split a region,
|
||||
* but it must not fail. So if the table becomes full, we just
|
||||
* drop the remove request.
|
||||
*/
|
||||
/*
 * Remove [s, s + sectors) from the bad-block table in @bb.
 *
 * Returns 0 on success, or -ENOSPC when an existing entry would have to
 * be split in two and the table is already full; in that case the table
 * is left as it was and 'changed' is not set.
 * The table is modified under write_seqlock_irq(&bb->lock).
 */
static int md_clear_badblocks(struct badblocks *bb, sector_t s, int sectors)
{
	u64 *p;
	int lo, hi;
	sector_t target = s + sectors;
	int rv = 0;

	if (bb->shift > 0) {
		/* When clearing we round the start up and the end down.
		 * This should not matter as the shift should align with
		 * the block size and no rounding should ever be needed.
		 * However it is better to think a block is bad when it
		 * isn't than to think a block is not bad when it is.
		 */
		s += (1<<bb->shift) - 1;
		s >>= bb->shift;
		target >>= bb->shift;
		sectors = target - s;
	}

	write_seqlock_irq(&bb->lock);

	p = bb->page;
	lo = 0;
	hi = bb->count;
	/* Find the last range that starts before 'target' */
	while (hi - lo > 1) {
		int mid = (lo + hi) / 2;
		sector_t a = BB_OFFSET(p[mid]);
		if (a < target)
			lo = mid;
		else
			hi = mid;
	}
	if (hi > lo) {
		/* p[lo] is the last range that could overlap the
		 * current range.  Earlier ranges could also overlap,
		 * but only this one can overlap the end of the range.
		 */
		if (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) {
			/* Partial overlap, leave the tail of this range */
			int ack = BB_ACK(p[lo]);
			sector_t a = BB_OFFSET(p[lo]);
			sector_t end = a + BB_LEN(p[lo]);

			if (a < s) {
				/* we need to split this range */
				if (bb->count >= MD_MAX_BADBLOCKS) {
					rv = -ENOSPC;
					goto out;
				}
				/* duplicate the entry: p[lo] keeps the head, p[lo+1] the tail */
				memmove(p+lo+1, p+lo, (bb->count - lo) * 8);
				bb->count++;
				p[lo] = BB_MAKE(a, s-a, ack);
				lo++;
			}
			p[lo] = BB_MAKE(target, end - target, ack);
			/* there is no longer an overlap */
			hi = lo;
			lo--;
		}
		while (lo >= 0 &&
		       BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
			/* This range does overlap */
			if (BB_OFFSET(p[lo]) < s) {
				/* Keep the early parts of this range. */
				int ack = BB_ACK(p[lo]);
				sector_t start = BB_OFFSET(p[lo]);
				p[lo] = BB_MAKE(start, s - start, ack);
				/* now low doesn't overlap, so.. */
				break;
			}
			lo--;
		}
		/* 'lo' is strictly before, 'hi' is strictly after,
		 * anything between needs to be discarded
		 */
		if (hi - lo > 1) {
			memmove(p+lo+1, p+hi, (bb->count - hi) * 8);
			bb->count -= (hi - lo - 1);
		}
	}

	bb->changed = 1;
out:
	write_sequnlock_irq(&bb->lock);
	return rv;
}
|
||||
|
||||
int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
|
||||
int is_new)
|
||||
{
|
||||
@ -8860,133 +8516,11 @@ int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
|
||||
s += rdev->new_data_offset;
|
||||
else
|
||||
s += rdev->data_offset;
|
||||
return md_clear_badblocks(&rdev->badblocks,
|
||||
return badblocks_clear(&rdev->badblocks,
|
||||
s, sectors);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(rdev_clear_badblocks);
|
||||
|
||||
/*
|
||||
* Acknowledge all bad blocks in a list.
|
||||
* This only succeeds if ->changed is clear. It is used by
|
||||
* in-kernel metadata updates
|
||||
*/
|
||||
void md_ack_all_badblocks(struct badblocks *bb)
|
||||
{
|
||||
if (bb->page == NULL || bb->changed)
|
||||
/* no point even trying */
|
||||
return;
|
||||
write_seqlock_irq(&bb->lock);
|
||||
|
||||
if (bb->changed == 0 && bb->unacked_exist) {
|
||||
u64 *p = bb->page;
|
||||
int i;
|
||||
for (i = 0; i < bb->count ; i++) {
|
||||
if (!BB_ACK(p[i])) {
|
||||
sector_t start = BB_OFFSET(p[i]);
|
||||
int len = BB_LEN(p[i]);
|
||||
p[i] = BB_MAKE(start, len, 1);
|
||||
}
|
||||
}
|
||||
bb->unacked_exist = 0;
|
||||
}
|
||||
write_sequnlock_irq(&bb->lock);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(md_ack_all_badblocks);
|
||||
|
||||
/* sysfs access to bad-blocks list.
|
||||
* We present two files.
|
||||
* 'bad-blocks' lists sector numbers and lengths of ranges that
|
||||
* are recorded as bad. The list is truncated to fit within
|
||||
* the one-page limit of sysfs.
|
||||
* Writing "sector length" to this file adds an acknowledged
|
||||
* bad block list.
|
||||
* 'unacknowledged-bad-blocks' lists bad blocks that have not yet
|
||||
* been acknowledged. Writing to this file adds bad blocks
|
||||
* without acknowledging them. This is largely for testing.
|
||||
*/
|
||||
|
||||
static ssize_t
|
||||
badblocks_show(struct badblocks *bb, char *page, int unack)
|
||||
{
|
||||
size_t len;
|
||||
int i;
|
||||
u64 *p = bb->page;
|
||||
unsigned seq;
|
||||
|
||||
if (bb->shift < 0)
|
||||
return 0;
|
||||
|
||||
retry:
|
||||
seq = read_seqbegin(&bb->lock);
|
||||
|
||||
len = 0;
|
||||
i = 0;
|
||||
|
||||
while (len < PAGE_SIZE && i < bb->count) {
|
||||
sector_t s = BB_OFFSET(p[i]);
|
||||
unsigned int length = BB_LEN(p[i]);
|
||||
int ack = BB_ACK(p[i]);
|
||||
i++;
|
||||
|
||||
if (unack && ack)
|
||||
continue;
|
||||
|
||||
len += snprintf(page+len, PAGE_SIZE-len, "%llu %u\n",
|
||||
(unsigned long long)s << bb->shift,
|
||||
length << bb->shift);
|
||||
}
|
||||
if (unack && len == 0)
|
||||
bb->unacked_exist = 0;
|
||||
|
||||
if (read_seqretry(&bb->lock, seq))
|
||||
goto retry;
|
||||
|
||||
return len;
|
||||
}
|
||||
|
||||
#define DO_DEBUG 1
|
||||
|
||||
static ssize_t
|
||||
badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack)
|
||||
{
|
||||
unsigned long long sector;
|
||||
int length;
|
||||
char newline;
|
||||
#ifdef DO_DEBUG
|
||||
/* Allow clearing via sysfs *only* for testing/debugging.
|
||||
* Normally only a successful write may clear a badblock
|
||||
*/
|
||||
int clear = 0;
|
||||
if (page[0] == '-') {
|
||||
clear = 1;
|
||||
page++;
|
||||
}
|
||||
#endif /* DO_DEBUG */
|
||||
|
||||
switch (sscanf(page, "%llu %d%c", §or, &length, &newline)) {
|
||||
case 3:
|
||||
if (newline != '\n')
|
||||
return -EINVAL;
|
||||
case 2:
|
||||
if (length <= 0)
|
||||
return -EINVAL;
|
||||
break;
|
||||
default:
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
#ifdef DO_DEBUG
|
||||
if (clear) {
|
||||
md_clear_badblocks(bb, sector, length);
|
||||
return len;
|
||||
}
|
||||
#endif /* DO_DEBUG */
|
||||
if (md_set_badblocks(bb, sector, length, !unack))
|
||||
return len;
|
||||
else
|
||||
return -ENOSPC;
|
||||
}
|
||||
|
||||
static int md_notify_reboot(struct notifier_block *this,
|
||||
unsigned long code, void *x)
|
||||
{
|
||||
|
@ -17,6 +17,7 @@
|
||||
|
||||
#include <linux/blkdev.h>
|
||||
#include <linux/backing-dev.h>
|
||||
#include <linux/badblocks.h>
|
||||
#include <linux/kobject.h>
|
||||
#include <linux/list.h>
|
||||
#include <linux/mm.h>
|
||||
@ -28,13 +29,6 @@
|
||||
|
||||
#define MaxSector (~(sector_t)0)
|
||||
|
||||
/* Bad block numbers are stored sorted in a single page.
|
||||
* 64bits is used for each block or extent.
|
||||
* 54 bits are sector number, 9 bits are extent size,
|
||||
* 1 bit is an 'acknowledged' flag.
|
||||
*/
|
||||
#define MD_MAX_BADBLOCKS (PAGE_SIZE/8)
|
||||
|
||||
/*
|
||||
* MD's 'extended' device
|
||||
*/
|
||||
@ -117,22 +111,7 @@ struct md_rdev {
|
||||
struct kernfs_node *sysfs_state; /* handle for 'state'
|
||||
* sysfs entry */
|
||||
|
||||
struct badblocks {
|
||||
int count; /* count of bad blocks */
|
||||
int unacked_exist; /* there probably are unacknowledged
|
||||
* bad blocks. This is only cleared
|
||||
* when a read discovers none
|
||||
*/
|
||||
int shift; /* shift from sectors to block size
|
||||
* a -ve shift means badblocks are
|
||||
* disabled.*/
|
||||
u64 *page; /* badblock list */
|
||||
int changed;
|
||||
seqlock_t lock;
|
||||
|
||||
sector_t sector;
|
||||
sector_t size; /* in sectors */
|
||||
} badblocks;
|
||||
struct badblocks badblocks;
|
||||
};
|
||||
enum flag_bits {
|
||||
Faulty, /* device is known to have a fault */
|
||||
@ -185,22 +164,11 @@ enum flag_bits {
|
||||
*/
|
||||
};
|
||||
|
||||
#define BB_LEN_MASK (0x00000000000001FFULL)
|
||||
#define BB_OFFSET_MASK (0x7FFFFFFFFFFFFE00ULL)
|
||||
#define BB_ACK_MASK (0x8000000000000000ULL)
|
||||
#define BB_MAX_LEN 512
|
||||
#define BB_OFFSET(x) (((x) & BB_OFFSET_MASK) >> 9)
|
||||
#define BB_LEN(x) (((x) & BB_LEN_MASK) + 1)
|
||||
#define BB_ACK(x) (!!((x) & BB_ACK_MASK))
|
||||
#define BB_MAKE(a, l, ack) (((a)<<9) | ((l)-1) | ((u64)(!!(ack)) << 63))
|
||||
|
||||
extern int md_is_badblock(struct badblocks *bb, sector_t s, int sectors,
|
||||
sector_t *first_bad, int *bad_sectors);
|
||||
static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors,
|
||||
sector_t *first_bad, int *bad_sectors)
|
||||
{
|
||||
if (unlikely(rdev->badblocks.count)) {
|
||||
int rv = md_is_badblock(&rdev->badblocks, rdev->data_offset + s,
|
||||
int rv = badblocks_check(&rdev->badblocks, rdev->data_offset + s,
|
||||
sectors,
|
||||
first_bad, bad_sectors);
|
||||
if (rv)
|
||||
@ -213,8 +181,6 @@ extern int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
|
||||
int is_new);
|
||||
extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
|
||||
int is_new);
|
||||
extern void md_ack_all_badblocks(struct badblocks *bb);
|
||||
|
||||
struct md_cluster_info;
|
||||
|
||||
struct mddev {
|
||||
|
@ -11,6 +11,7 @@
|
||||
* General Public License for more details.
|
||||
*/
|
||||
#include <linux/libnvdimm.h>
|
||||
#include <linux/badblocks.h>
|
||||
#include <linux/export.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/blkdev.h>
|
||||
@ -325,6 +326,7 @@ struct nvdimm_bus *__nvdimm_bus_register(struct device *parent,
|
||||
if (!nvdimm_bus)
|
||||
return NULL;
|
||||
INIT_LIST_HEAD(&nvdimm_bus->list);
|
||||
INIT_LIST_HEAD(&nvdimm_bus->poison_list);
|
||||
init_waitqueue_head(&nvdimm_bus->probe_wait);
|
||||
nvdimm_bus->id = ida_simple_get(&nd_ida, 0, 0, GFP_KERNEL);
|
||||
mutex_init(&nvdimm_bus->reconfig_mutex);
|
||||
@ -359,6 +361,172 @@ struct nvdimm_bus *__nvdimm_bus_register(struct device *parent,
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(__nvdimm_bus_register);
|
||||
|
||||
/*
 * Log the poison range and hand it to badblocks_set() for @bb.
 * @s is the first sector, @num the number of sectors; both are scaled by
 * 512 for the debug message only.
 */
static void set_badblock(struct badblocks *bb, sector_t s, int num)
{
	int rc;

	dev_dbg(bb->dev, "Found a poison range (0x%llx, 0x%llx)\n",
			(u64) s * 512, (u64) num * 512);

	rc = badblocks_set(bb, s, num, 1);
	if (!rc)
		return;

	/* this isn't an error as the hardware will still throw an exception */
	dev_info_once(bb->dev, "%s: failed for sector %llx\n",
			__func__, (u64) s);
}
|
||||
|
||||
/**
|
||||
* __add_badblock_range() - Convert a physical address range to bad sectors
|
||||
* @bb: badblocks instance to populate
|
||||
* @ns_offset: namespace offset where the error range begins (in bytes)
|
||||
* @len: number of bytes of poison to be added
|
||||
*
|
||||
* This assumes that the range provided with (ns_offset, len) is within
|
||||
* the bounds of physical addresses for this namespace, i.e. lies in the
|
||||
* interval [ns_start, ns_start + ns_size)
|
||||
*/
|
||||
static void __add_badblock_range(struct badblocks *bb, u64 ns_offset, u64 len)
|
||||
{
|
||||
const unsigned int sector_size = 512;
|
||||
sector_t start_sector;
|
||||
u64 num_sectors;
|
||||
u32 rem;
|
||||
|
||||
start_sector = div_u64(ns_offset, sector_size);
|
||||
num_sectors = div_u64_rem(len, sector_size, &rem);
|
||||
if (rem)
|
||||
num_sectors++;
|
||||
|
||||
if (unlikely(num_sectors > (u64)INT_MAX)) {
|
||||
u64 remaining = num_sectors;
|
||||
sector_t s = start_sector;
|
||||
|
||||
while (remaining) {
|
||||
int done = min_t(u64, remaining, INT_MAX);
|
||||
|
||||
set_badblock(bb, s, done);
|
||||
remaining -= done;
|
||||
s += done;
|
||||
}
|
||||
} else
|
||||
set_badblock(bb, start_sector, num_sectors);
|
||||
}
|
||||
|
||||
/**
|
||||
* nvdimm_namespace_add_poison() - Convert a list of poison ranges to badblocks
|
||||
* @ndns: the namespace containing poison ranges
|
||||
* @bb: badblocks instance to populate
|
||||
* @offset: offset at the start of the namespace before 'sector 0'
|
||||
*
|
||||
* The poison list generated during NFIT initialization may contain multiple,
|
||||
* possibly overlapping ranges in the SPA (System Physical Address) space.
|
||||
* Compare each of these ranges to the namespace currently being initialized,
|
||||
* and add badblocks to the gendisk for all matching sub-ranges
|
||||
*/
|
||||
void nvdimm_namespace_add_poison(struct nd_namespace_common *ndns,
|
||||
struct badblocks *bb, resource_size_t offset)
|
||||
{
|
||||
struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
|
||||
struct nd_region *nd_region = to_nd_region(ndns->dev.parent);
|
||||
struct nvdimm_bus *nvdimm_bus;
|
||||
struct list_head *poison_list;
|
||||
u64 ns_start, ns_end, ns_size;
|
||||
struct nd_poison *pl;
|
||||
|
||||
ns_size = nvdimm_namespace_capacity(ndns) - offset;
|
||||
ns_start = nsio->res.start + offset;
|
||||
ns_end = nsio->res.end;
|
||||
|
||||
nvdimm_bus = to_nvdimm_bus(nd_region->dev.parent);
|
||||
poison_list = &nvdimm_bus->poison_list;
|
||||
if (list_empty(poison_list))
|
||||
return;
|
||||
|
||||
list_for_each_entry(pl, poison_list, list) {
|
||||
u64 pl_end = pl->start + pl->length - 1;
|
||||
|
||||
/* Discard intervals with no intersection */
|
||||
if (pl_end < ns_start)
|
||||
continue;
|
||||
if (pl->start > ns_end)
|
||||
continue;
|
||||
/* Deal with any overlap after start of the namespace */
|
||||
if (pl->start >= ns_start) {
|
||||
u64 start = pl->start;
|
||||
u64 len;
|
||||
|
||||
if (pl_end <= ns_end)
|
||||
len = pl->length;
|
||||
else
|
||||
len = ns_start + ns_size - pl->start;
|
||||
__add_badblock_range(bb, start - ns_start, len);
|
||||
continue;
|
||||
}
|
||||
/* Deal with overlap for poison starting before the namespace */
|
||||
if (pl->start < ns_start) {
|
||||
u64 len;
|
||||
|
||||
if (pl_end < ns_end)
|
||||
len = pl->start + pl->length - ns_start;
|
||||
else
|
||||
len = ns_size;
|
||||
__add_badblock_range(bb, 0, len);
|
||||
}
|
||||
}
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(nvdimm_namespace_add_poison);
|
||||
|
||||
/*
 * Allocate a new poison entry for (addr, length) and append it to the
 * bus poison list. Returns 0 on success or -ENOMEM if allocation fails.
 */
static int __add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length)
{
	struct nd_poison *entry = kzalloc(sizeof(*entry), GFP_KERNEL);

	if (entry == NULL)
		return -ENOMEM;

	entry->start = addr;
	entry->length = length;
	list_add_tail(&entry->list, &nvdimm_bus->poison_list);

	return 0;
}
|
||||
|
||||
/*
 * Record a poison range on the bus. An existing entry with the same start
 * address is updated in place (its length is overwritten); otherwise a new
 * entry is appended. Returns 0 on success or the error from __add_poison().
 */
int nvdimm_bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length)
{
	struct nd_poison *entry;

	/*
	 * There is a chance this is a duplicate, check for those first.
	 * This will be the common case as ARS_STATUS returns all known
	 * errors in the SPA space, and we can't query it per region.
	 * An empty list simply falls through to the add below.
	 */
	list_for_each_entry(entry, &nvdimm_bus->poison_list, list) {
		if (entry->start != addr)
			continue;

		/* If length has changed, update this list entry */
		if (entry->length != length)
			entry->length = length;
		return 0;
	}

	/*
	 * If not a duplicate or a simple length update, add the entry as is,
	 * as any overlapping ranges will get resolved when the list is consumed
	 * and converted to badblocks
	 */
	return __add_poison(nvdimm_bus, addr, length);
}
|
||||
EXPORT_SYMBOL_GPL(nvdimm_bus_add_poison);
|
||||
|
||||
static void free_poison_list(struct list_head *poison_list)
|
||||
{
|
||||
struct nd_poison *pl, *next;
|
||||
|
||||
list_for_each_entry_safe(pl, next, poison_list, list) {
|
||||
list_del(&pl->list);
|
||||
kfree(pl);
|
||||
}
|
||||
list_del_init(poison_list);
|
||||
}
|
||||
|
||||
static int child_unregister(struct device *dev, void *data)
|
||||
{
|
||||
/*
|
||||
@ -385,6 +553,7 @@ void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus)
|
||||
|
||||
nd_synchronize();
|
||||
device_for_each_child(&nvdimm_bus->dev, NULL, child_unregister);
|
||||
free_poison_list(&nvdimm_bus->poison_list);
|
||||
nvdimm_bus_destroy_ndctl(nvdimm_bus);
|
||||
|
||||
device_unregister(&nvdimm_bus->dev);
|
||||
|
@ -77,6 +77,59 @@ static bool is_namespace_io(struct device *dev)
|
||||
return dev ? dev->type == &namespace_io_device_type : false;
|
||||
}
|
||||
|
||||
static int is_uuid_busy(struct device *dev, void *data)
|
||||
{
|
||||
u8 *uuid1 = data, *uuid2 = NULL;
|
||||
|
||||
if (is_namespace_pmem(dev)) {
|
||||
struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);
|
||||
|
||||
uuid2 = nspm->uuid;
|
||||
} else if (is_namespace_blk(dev)) {
|
||||
struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev);
|
||||
|
||||
uuid2 = nsblk->uuid;
|
||||
} else if (is_nd_btt(dev)) {
|
||||
struct nd_btt *nd_btt = to_nd_btt(dev);
|
||||
|
||||
uuid2 = nd_btt->uuid;
|
||||
} else if (is_nd_pfn(dev)) {
|
||||
struct nd_pfn *nd_pfn = to_nd_pfn(dev);
|
||||
|
||||
uuid2 = nd_pfn->uuid;
|
||||
}
|
||||
|
||||
if (uuid2 && memcmp(uuid1, uuid2, NSLABEL_UUID_LEN) == 0)
|
||||
return -EBUSY;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int is_namespace_uuid_busy(struct device *dev, void *data)
|
||||
{
|
||||
if (is_nd_pmem(dev) || is_nd_blk(dev))
|
||||
return device_for_each_child(dev, data, is_uuid_busy);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* nd_is_uuid_unique - verify that no other namespace has @uuid
|
||||
* @dev: any device on a nvdimm_bus
|
||||
* @uuid: uuid to check
|
||||
*/
|
||||
bool nd_is_uuid_unique(struct device *dev, u8 *uuid)
|
||||
{
|
||||
struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev);
|
||||
|
||||
if (!nvdimm_bus)
|
||||
return false;
|
||||
WARN_ON_ONCE(!is_nvdimm_bus_locked(&nvdimm_bus->dev));
|
||||
if (device_for_each_child(&nvdimm_bus->dev, uuid,
|
||||
is_namespace_uuid_busy) != 0)
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
bool pmem_should_map_pages(struct device *dev)
|
||||
{
|
||||
struct nd_region *nd_region = to_nd_region(dev->parent);
|
||||
@ -104,20 +157,10 @@ const char *nvdimm_namespace_disk_name(struct nd_namespace_common *ndns,
|
||||
struct nd_region *nd_region = to_nd_region(ndns->dev.parent);
|
||||
const char *suffix = NULL;
|
||||
|
||||
if (ndns->claim) {
|
||||
if (is_nd_btt(ndns->claim))
|
||||
suffix = "s";
|
||||
else if (is_nd_pfn(ndns->claim))
|
||||
suffix = "m";
|
||||
else
|
||||
dev_WARN_ONCE(&ndns->dev, 1,
|
||||
"unknown claim type by %s\n",
|
||||
dev_name(ndns->claim));
|
||||
}
|
||||
if (ndns->claim && is_nd_btt(ndns->claim))
|
||||
suffix = "s";
|
||||
|
||||
if (is_namespace_pmem(&ndns->dev) || is_namespace_io(&ndns->dev)) {
|
||||
if (!suffix && pmem_should_map_pages(&ndns->dev))
|
||||
suffix = "m";
|
||||
sprintf(name, "pmem%d%s", nd_region->id, suffix ? suffix : "");
|
||||
} else if (is_namespace_blk(&ndns->dev)) {
|
||||
struct nd_namespace_blk *nsblk;
|
||||
@ -791,6 +834,15 @@ static void nd_namespace_pmem_set_size(struct nd_region *nd_region,
|
||||
res->end = nd_region->ndr_start + size - 1;
|
||||
}
|
||||
|
||||
static bool uuid_not_set(const u8 *uuid, struct device *dev, const char *where)
|
||||
{
|
||||
if (!uuid) {
|
||||
dev_dbg(dev, "%s: uuid not set\n", where);
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
static ssize_t __size_store(struct device *dev, unsigned long long val)
|
||||
{
|
||||
resource_size_t allocated = 0, available = 0;
|
||||
@ -820,8 +872,12 @@ static ssize_t __size_store(struct device *dev, unsigned long long val)
|
||||
* We need a uuid for the allocation-label and dimm(s) on which
|
||||
* to store the label.
|
||||
*/
|
||||
if (!uuid || nd_region->ndr_mappings == 0)
|
||||
if (uuid_not_set(uuid, dev, __func__))
|
||||
return -ENXIO;
|
||||
if (nd_region->ndr_mappings == 0) {
|
||||
dev_dbg(dev, "%s: not associated with dimm(s)\n", __func__);
|
||||
return -ENXIO;
|
||||
}
|
||||
|
||||
div_u64_rem(val, SZ_4K * nd_region->ndr_mappings, &remainder);
|
||||
if (remainder) {
|
||||
@ -1211,6 +1267,29 @@ static ssize_t holder_show(struct device *dev,
|
||||
}
|
||||
static DEVICE_ATTR_RO(holder);
|
||||
|
||||
static ssize_t mode_show(struct device *dev,
|
||||
struct device_attribute *attr, char *buf)
|
||||
{
|
||||
struct nd_namespace_common *ndns = to_ndns(dev);
|
||||
struct device *claim;
|
||||
char *mode;
|
||||
ssize_t rc;
|
||||
|
||||
device_lock(dev);
|
||||
claim = ndns->claim;
|
||||
if (pmem_should_map_pages(dev) || (claim && is_nd_pfn(claim)))
|
||||
mode = "memory";
|
||||
else if (claim && is_nd_btt(claim))
|
||||
mode = "safe";
|
||||
else
|
||||
mode = "raw";
|
||||
rc = sprintf(buf, "%s\n", mode);
|
||||
device_unlock(dev);
|
||||
|
||||
return rc;
|
||||
}
|
||||
static DEVICE_ATTR_RO(mode);
|
||||
|
||||
static ssize_t force_raw_store(struct device *dev,
|
||||
struct device_attribute *attr, const char *buf, size_t len)
|
||||
{
|
||||
@ -1234,6 +1313,7 @@ static DEVICE_ATTR_RW(force_raw);
|
||||
static struct attribute *nd_namespace_attributes[] = {
|
||||
&dev_attr_nstype.attr,
|
||||
&dev_attr_size.attr,
|
||||
&dev_attr_mode.attr,
|
||||
&dev_attr_uuid.attr,
|
||||
&dev_attr_holder.attr,
|
||||
&dev_attr_resource.attr,
|
||||
@ -1267,7 +1347,8 @@ static umode_t namespace_visible(struct kobject *kobj,
|
||||
|
||||
if (a == &dev_attr_nstype.attr || a == &dev_attr_size.attr
|
||||
|| a == &dev_attr_holder.attr
|
||||
|| a == &dev_attr_force_raw.attr)
|
||||
|| a == &dev_attr_force_raw.attr
|
||||
|| a == &dev_attr_mode.attr)
|
||||
return a->mode;
|
||||
|
||||
return 0;
|
||||
@ -1343,14 +1424,19 @@ struct nd_namespace_common *nvdimm_namespace_common_probe(struct device *dev)
|
||||
struct nd_namespace_pmem *nspm;
|
||||
|
||||
nspm = to_nd_namespace_pmem(&ndns->dev);
|
||||
if (!nspm->uuid) {
|
||||
dev_dbg(&ndns->dev, "%s: uuid not set\n", __func__);
|
||||
if (uuid_not_set(nspm->uuid, &ndns->dev, __func__))
|
||||
return ERR_PTR(-ENODEV);
|
||||
}
|
||||
} else if (is_namespace_blk(&ndns->dev)) {
|
||||
struct nd_namespace_blk *nsblk;
|
||||
|
||||
nsblk = to_nd_namespace_blk(&ndns->dev);
|
||||
if (uuid_not_set(nsblk->uuid, &ndns->dev, __func__))
|
||||
return ERR_PTR(-ENODEV);
|
||||
if (!nsblk->lbasize) {
|
||||
dev_dbg(&ndns->dev, "%s: sector size not set\n",
|
||||
__func__);
|
||||
return ERR_PTR(-ENODEV);
|
||||
}
|
||||
if (!nd_namespace_blk_validate(nsblk))
|
||||
return ERR_PTR(-ENODEV);
|
||||
}
|
||||
@ -1689,6 +1775,18 @@ void nd_region_create_blk_seed(struct nd_region *nd_region)
|
||||
nd_device_register(nd_region->ns_seed);
|
||||
}
|
||||
|
||||
void nd_region_create_pfn_seed(struct nd_region *nd_region)
|
||||
{
|
||||
WARN_ON(!is_nvdimm_bus_locked(&nd_region->dev));
|
||||
nd_region->pfn_seed = nd_pfn_create(nd_region);
|
||||
/*
|
||||
* Seed creation failures are not fatal, provisioning is simply
|
||||
* disabled until memory becomes available
|
||||
*/
|
||||
if (!nd_region->pfn_seed)
|
||||
dev_err(&nd_region->dev, "failed to create pfn namespace\n");
|
||||
}
|
||||
|
||||
void nd_region_create_btt_seed(struct nd_region *nd_region)
|
||||
{
|
||||
WARN_ON(!is_nvdimm_bus_locked(&nd_region->dev));
|
||||
|
@ -30,6 +30,7 @@ struct nvdimm_bus {
|
||||
struct list_head list;
|
||||
struct device dev;
|
||||
int id, probe_active;
|
||||
struct list_head poison_list;
|
||||
struct mutex reconfig_mutex;
|
||||
};
|
||||
|
||||
@ -52,6 +53,7 @@ void nd_region_probe_success(struct nvdimm_bus *nvdimm_bus, struct device *dev);
|
||||
struct nd_region;
|
||||
void nd_region_create_blk_seed(struct nd_region *nd_region);
|
||||
void nd_region_create_btt_seed(struct nd_region *nd_region);
|
||||
void nd_region_create_pfn_seed(struct nd_region *nd_region);
|
||||
void nd_region_disable(struct nvdimm_bus *nvdimm_bus, struct device *dev);
|
||||
int nvdimm_bus_create_ndctl(struct nvdimm_bus *nvdimm_bus);
|
||||
void nvdimm_bus_destroy_ndctl(struct nvdimm_bus *nvdimm_bus);
|
||||
|
@ -29,13 +29,12 @@ enum {
|
||||
ND_MAX_LANES = 256,
|
||||
SECTOR_SHIFT = 9,
|
||||
INT_LBASIZE_ALIGNMENT = 64,
|
||||
#if IS_ENABLED(CONFIG_NVDIMM_PFN)
|
||||
ND_PFN_ALIGN = PAGES_PER_SECTION * PAGE_SIZE,
|
||||
ND_PFN_MASK = ND_PFN_ALIGN - 1,
|
||||
#else
|
||||
ND_PFN_ALIGN = 0,
|
||||
ND_PFN_MASK = 0,
|
||||
#endif
|
||||
};
|
||||
|
||||
/* One recorded poison range, linked onto a bus-wide poison list. */
struct nd_poison {
|
||||
u64 start; /* first address of the range */
|
||||
u64 length; /* size of the range; last address is start + length - 1 */
|
||||
struct list_head list; /* linkage on nvdimm_bus->poison_list */
|
||||
};
|
||||
|
||||
struct nvdimm_drvdata {
|
||||
@ -153,6 +152,7 @@ struct nd_pfn {
|
||||
int id;
|
||||
u8 *uuid;
|
||||
struct device dev;
|
||||
unsigned long align;
|
||||
unsigned long npfns;
|
||||
enum nd_pfn_mode mode;
|
||||
struct nd_pfn_sb *pfn_sb;
|
||||
@ -262,6 +262,8 @@ int nvdimm_namespace_attach_btt(struct nd_namespace_common *ndns);
|
||||
int nvdimm_namespace_detach_btt(struct nd_namespace_common *ndns);
|
||||
const char *nvdimm_namespace_disk_name(struct nd_namespace_common *ndns,
|
||||
char *name);
|
||||
void nvdimm_namespace_add_poison(struct nd_namespace_common *ndns,
|
||||
struct badblocks *bb, resource_size_t offset);
|
||||
int nd_blk_region_init(struct nd_region *nd_region);
|
||||
void __nd_iostat_start(struct bio *bio, unsigned long *start);
|
||||
static inline bool nd_iostat_start(struct bio *bio, unsigned long *start)
|
||||
|
@ -103,6 +103,52 @@ static ssize_t mode_store(struct device *dev,
|
||||
}
|
||||
static DEVICE_ATTR_RW(mode);
|
||||
|
||||
static ssize_t align_show(struct device *dev,
|
||||
struct device_attribute *attr, char *buf)
|
||||
{
|
||||
struct nd_pfn *nd_pfn = to_nd_pfn(dev);
|
||||
|
||||
return sprintf(buf, "%lx\n", nd_pfn->align);
|
||||
}
|
||||
|
||||
static ssize_t __align_store(struct nd_pfn *nd_pfn, const char *buf)
|
||||
{
|
||||
unsigned long val;
|
||||
int rc;
|
||||
|
||||
rc = kstrtoul(buf, 0, &val);
|
||||
if (rc)
|
||||
return rc;
|
||||
|
||||
if (!is_power_of_2(val) || val < PAGE_SIZE || val > SZ_1G)
|
||||
return -EINVAL;
|
||||
|
||||
if (nd_pfn->dev.driver)
|
||||
return -EBUSY;
|
||||
else
|
||||
nd_pfn->align = val;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * sysfs 'align' store handler: run __align_store() with the device lock
 * and the nvdimm bus lock held, log the outcome, and return @len on
 * success or the negative error code.
 */
static ssize_t align_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t len)
{
	struct nd_pfn *nd_pfn = to_nd_pfn(dev);
	const char *newline;
	ssize_t rc;

	device_lock(dev);
	nvdimm_bus_lock(dev);

	rc = __align_store(nd_pfn, buf);
	/* only append a newline to the log if the input lacked one */
	newline = buf[len - 1] == '\n' ? "" : "\n";
	dev_dbg(dev, "%s: result: %zd wrote: %s%s", __func__,
			rc, buf, newline);

	nvdimm_bus_unlock(dev);
	device_unlock(dev);

	if (rc)
		return rc;
	return len;
}
|
||||
static DEVICE_ATTR_RW(align);
|
||||
|
||||
static ssize_t uuid_show(struct device *dev,
|
||||
struct device_attribute *attr, char *buf)
|
||||
{
|
||||
@ -164,6 +210,7 @@ static struct attribute *nd_pfn_attributes[] = {
|
||||
&dev_attr_mode.attr,
|
||||
&dev_attr_namespace.attr,
|
||||
&dev_attr_uuid.attr,
|
||||
&dev_attr_align.attr,
|
||||
NULL,
|
||||
};
|
||||
|
||||
@ -179,7 +226,6 @@ static const struct attribute_group *nd_pfn_attribute_groups[] = {
|
||||
};
|
||||
|
||||
static struct device *__nd_pfn_create(struct nd_region *nd_region,
|
||||
u8 *uuid, enum nd_pfn_mode mode,
|
||||
struct nd_namespace_common *ndns)
|
||||
{
|
||||
struct nd_pfn *nd_pfn;
|
||||
@ -199,10 +245,8 @@ static struct device *__nd_pfn_create(struct nd_region *nd_region,
|
||||
return NULL;
|
||||
}
|
||||
|
||||
nd_pfn->mode = mode;
|
||||
if (uuid)
|
||||
uuid = kmemdup(uuid, 16, GFP_KERNEL);
|
||||
nd_pfn->uuid = uuid;
|
||||
nd_pfn->mode = PFN_MODE_NONE;
|
||||
nd_pfn->align = HPAGE_SIZE;
|
||||
dev = &nd_pfn->dev;
|
||||
dev_set_name(dev, "pfn%d.%d", nd_region->id, nd_pfn->id);
|
||||
dev->parent = &nd_region->dev;
|
||||
@ -220,8 +264,7 @@ static struct device *__nd_pfn_create(struct nd_region *nd_region,
|
||||
|
||||
struct device *nd_pfn_create(struct nd_region *nd_region)
|
||||
{
|
||||
struct device *dev = __nd_pfn_create(nd_region, NULL, PFN_MODE_NONE,
|
||||
NULL);
|
||||
struct device *dev = __nd_pfn_create(nd_region, NULL);
|
||||
|
||||
if (dev)
|
||||
__nd_device_register(dev);
|
||||
@ -230,10 +273,11 @@ struct device *nd_pfn_create(struct nd_region *nd_region)
|
||||
|
||||
int nd_pfn_validate(struct nd_pfn *nd_pfn)
|
||||
{
|
||||
struct nd_namespace_common *ndns = nd_pfn->ndns;
|
||||
struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb;
|
||||
struct nd_namespace_io *nsio;
|
||||
u64 checksum, offset;
|
||||
struct nd_namespace_io *nsio;
|
||||
struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb;
|
||||
struct nd_namespace_common *ndns = nd_pfn->ndns;
|
||||
const u8 *parent_uuid = nd_dev_to_uuid(&ndns->dev);
|
||||
|
||||
if (!pfn_sb || !ndns)
|
||||
return -ENODEV;
|
||||
@ -241,10 +285,6 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn)
|
||||
if (!is_nd_pmem(nd_pfn->dev.parent))
|
||||
return -ENODEV;
|
||||
|
||||
/* section alignment for simple hotplug */
|
||||
if (nvdimm_namespace_capacity(ndns) < ND_PFN_ALIGN)
|
||||
return -ENODEV;
|
||||
|
||||
if (nvdimm_read_bytes(ndns, SZ_4K, pfn_sb, sizeof(*pfn_sb)))
|
||||
return -ENXIO;
|
||||
|
||||
@ -257,6 +297,9 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn)
|
||||
return -ENODEV;
|
||||
pfn_sb->checksum = cpu_to_le64(checksum);
|
||||
|
||||
if (memcmp(pfn_sb->parent_uuid, parent_uuid, 16) != 0)
|
||||
return -ENODEV;
|
||||
|
||||
switch (le32_to_cpu(pfn_sb->mode)) {
|
||||
case PFN_MODE_RAM:
|
||||
break;
|
||||
@ -278,6 +321,12 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn)
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
if (nd_pfn->align > nvdimm_namespace_capacity(ndns)) {
|
||||
dev_err(&nd_pfn->dev, "alignment: %lx exceeds capacity %llx\n",
|
||||
nd_pfn->align, nvdimm_namespace_capacity(ndns));
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
/*
|
||||
* These warnings are verbose because they can only trigger in
|
||||
* the case where the physical address alignment of the
|
||||
@ -286,17 +335,19 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn)
|
||||
*/
|
||||
offset = le64_to_cpu(pfn_sb->dataoff);
|
||||
nsio = to_nd_namespace_io(&ndns->dev);
|
||||
if (nsio->res.start & ND_PFN_MASK) {
|
||||
dev_err(&nd_pfn->dev,
|
||||
"init failed: %s not section aligned\n",
|
||||
dev_name(&ndns->dev));
|
||||
return -EBUSY;
|
||||
} else if (offset >= resource_size(&nsio->res)) {
|
||||
if (offset >= resource_size(&nsio->res)) {
|
||||
dev_err(&nd_pfn->dev, "pfn array size exceeds capacity of %s\n",
|
||||
dev_name(&ndns->dev));
|
||||
return -EBUSY;
|
||||
}
|
||||
|
||||
nd_pfn->align = 1UL << ilog2(offset);
|
||||
if (!is_power_of_2(offset) || offset < PAGE_SIZE) {
|
||||
dev_err(&nd_pfn->dev, "bad offset: %#llx dax disabled\n",
|
||||
offset);
|
||||
return -ENXIO;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL(nd_pfn_validate);
|
||||
@ -313,7 +364,7 @@ int nd_pfn_probe(struct nd_namespace_common *ndns, void *drvdata)
|
||||
return -ENODEV;
|
||||
|
||||
nvdimm_bus_lock(&ndns->dev);
|
||||
dev = __nd_pfn_create(nd_region, NULL, PFN_MODE_NONE, ndns);
|
||||
dev = __nd_pfn_create(nd_region, ndns);
|
||||
nvdimm_bus_unlock(&ndns->dev);
|
||||
if (!dev)
|
||||
return -ENOMEM;
|
||||
|
@ -23,6 +23,7 @@
|
||||
#include <linux/module.h>
|
||||
#include <linux/memory_hotplug.h>
|
||||
#include <linux/moduleparam.h>
|
||||
#include <linux/badblocks.h>
|
||||
#include <linux/vmalloc.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/pmem.h>
|
||||
@ -41,11 +42,25 @@ struct pmem_device {
|
||||
phys_addr_t data_offset;
|
||||
void __pmem *virt_addr;
|
||||
size_t size;
|
||||
struct badblocks bb;
|
||||
};
|
||||
|
||||
static int pmem_major;
|
||||
|
||||
static void pmem_do_bvec(struct pmem_device *pmem, struct page *page,
|
||||
/*
 * Return true when badblocks_check() reports a non-zero result for the
 * @len-byte range starting at @sector. The check is skipped entirely when
 * @bb has no recorded entries. @len is converted to 512-byte sectors.
 */
static bool is_bad_pmem(struct badblocks *bb, sector_t sector, unsigned int len)
{
	sector_t first_bad;
	int num_bad;

	if (!bb->count)
		return false;

	return badblocks_check(bb, sector, len / 512, &first_bad,
			&num_bad) != 0;
}
|
||||
|
||||
static int pmem_do_bvec(struct pmem_device *pmem, struct page *page,
|
||||
unsigned int len, unsigned int off, int rw,
|
||||
sector_t sector)
|
||||
{
|
||||
@ -54,6 +69,8 @@ static void pmem_do_bvec(struct pmem_device *pmem, struct page *page,
|
||||
void __pmem *pmem_addr = pmem->virt_addr + pmem_off;
|
||||
|
||||
if (rw == READ) {
|
||||
if (unlikely(is_bad_pmem(&pmem->bb, sector, len)))
|
||||
return -EIO;
|
||||
memcpy_from_pmem(mem + off, pmem_addr, len);
|
||||
flush_dcache_page(page);
|
||||
} else {
|
||||
@ -62,10 +79,12 @@ static void pmem_do_bvec(struct pmem_device *pmem, struct page *page,
|
||||
}
|
||||
|
||||
kunmap_atomic(mem);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio)
|
||||
{
|
||||
int rc = 0;
|
||||
bool do_acct;
|
||||
unsigned long start;
|
||||
struct bio_vec bvec;
|
||||
@ -74,9 +93,15 @@ static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio)
|
||||
struct pmem_device *pmem = bdev->bd_disk->private_data;
|
||||
|
||||
do_acct = nd_iostat_start(bio, &start);
|
||||
bio_for_each_segment(bvec, bio, iter)
|
||||
pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len, bvec.bv_offset,
|
||||
bio_data_dir(bio), iter.bi_sector);
|
||||
bio_for_each_segment(bvec, bio, iter) {
|
||||
rc = pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len,
|
||||
bvec.bv_offset, bio_data_dir(bio),
|
||||
iter.bi_sector);
|
||||
if (rc) {
|
||||
bio->bi_error = rc;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (do_acct)
|
||||
nd_iostat_end(bio, start);
|
||||
|
||||
@ -91,13 +116,22 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector,
|
||||
struct page *page, int rw)
|
||||
{
|
||||
struct pmem_device *pmem = bdev->bd_disk->private_data;
|
||||
int rc;
|
||||
|
||||
pmem_do_bvec(pmem, page, PAGE_CACHE_SIZE, 0, rw, sector);
|
||||
rc = pmem_do_bvec(pmem, page, PAGE_CACHE_SIZE, 0, rw, sector);
|
||||
if (rw & WRITE)
|
||||
wmb_pmem();
|
||||
page_endio(page, rw & WRITE, 0);
|
||||
|
||||
return 0;
|
||||
/*
|
||||
* The ->rw_page interface is subtle and tricky. The core
|
||||
* retries on any error, so we can only invoke page_endio() in
|
||||
* the successful completion case. Otherwise, we'll see crashes
|
||||
* caused by double completion.
|
||||
*/
|
||||
if (rc == 0)
|
||||
page_endio(page, rw & WRITE, 0);
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
static long pmem_direct_access(struct block_device *bdev, sector_t sector,
|
||||
@ -195,7 +229,12 @@ static int pmem_attach_disk(struct device *dev,
|
||||
disk->driverfs_dev = dev;
|
||||
set_capacity(disk, (pmem->size - pmem->data_offset) / 512);
|
||||
pmem->pmem_disk = disk;
|
||||
devm_exit_badblocks(dev, &pmem->bb);
|
||||
if (devm_init_badblocks(dev, &pmem->bb))
|
||||
return -ENOMEM;
|
||||
nvdimm_namespace_add_poison(ndns, &pmem->bb, pmem->data_offset);
|
||||
|
||||
disk->bb = &pmem->bb;
|
||||
add_disk(disk);
|
||||
revalidate_disk(disk);
|
||||
|
||||
@ -212,9 +251,13 @@ static int pmem_rw_bytes(struct nd_namespace_common *ndns,
|
||||
return -EFAULT;
|
||||
}
|
||||
|
||||
if (rw == READ)
|
||||
if (rw == READ) {
|
||||
unsigned int sz_align = ALIGN(size + (offset & (512 - 1)), 512);
|
||||
|
||||
if (unlikely(is_bad_pmem(&pmem->bb, offset / 512, sz_align)))
|
||||
return -EIO;
|
||||
memcpy_from_pmem(buf, pmem->virt_addr + offset, size);
|
||||
else {
|
||||
} else {
|
||||
memcpy_to_pmem(pmem->virt_addr + offset, buf, size);
|
||||
wmb_pmem();
|
||||
}
|
||||
@ -238,14 +281,11 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
|
||||
|
||||
nd_pfn->pfn_sb = pfn_sb;
|
||||
rc = nd_pfn_validate(nd_pfn);
|
||||
if (rc == 0 || rc == -EBUSY)
|
||||
if (rc == -ENODEV)
|
||||
/* no info block, do init */;
|
||||
else
|
||||
return rc;
|
||||
|
||||
/* section alignment for simple hotplug */
|
||||
if (nvdimm_namespace_capacity(ndns) < ND_PFN_ALIGN
|
||||
|| pmem->phys_addr & ND_PFN_MASK)
|
||||
return -ENODEV;
|
||||
|
||||
nd_region = to_nd_region(nd_pfn->dev.parent);
|
||||
if (nd_region->ro) {
|
||||
dev_info(&nd_pfn->dev,
|
||||
@ -263,9 +303,9 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
|
||||
* ->direct_access() to those that are included in the memmap.
|
||||
*/
|
||||
if (nd_pfn->mode == PFN_MODE_PMEM)
|
||||
offset = ALIGN(SZ_8K + 64 * npfns, PMD_SIZE);
|
||||
offset = ALIGN(SZ_8K + 64 * npfns, nd_pfn->align);
|
||||
else if (nd_pfn->mode == PFN_MODE_RAM)
|
||||
offset = SZ_8K;
|
||||
offset = ALIGN(SZ_8K, nd_pfn->align);
|
||||
else
|
||||
goto err;
|
||||
|
||||
@ -275,6 +315,7 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
|
||||
pfn_sb->npfns = cpu_to_le64(npfns);
|
||||
memcpy(pfn_sb->signature, PFN_SIG, PFN_SIG_LEN);
|
||||
memcpy(pfn_sb->uuid, nd_pfn->uuid, 16);
|
||||
memcpy(pfn_sb->parent_uuid, nd_dev_to_uuid(&ndns->dev), 16);
|
||||
pfn_sb->version_major = cpu_to_le16(1);
|
||||
checksum = nd_sb_checksum((struct nd_gen_sb *) pfn_sb);
|
||||
pfn_sb->checksum = cpu_to_le64(checksum);
|
||||
@ -326,21 +367,11 @@ static int nvdimm_namespace_attach_pfn(struct nd_namespace_common *ndns)
|
||||
if (rc)
|
||||
return rc;
|
||||
|
||||
if (PAGE_SIZE != SZ_4K) {
|
||||
dev_err(dev, "only supported on systems with 4K PAGE_SIZE\n");
|
||||
return -ENXIO;
|
||||
}
|
||||
if (nsio->res.start & ND_PFN_MASK) {
|
||||
dev_err(dev, "%s not memory hotplug section aligned\n",
|
||||
dev_name(&ndns->dev));
|
||||
return -ENXIO;
|
||||
}
|
||||
|
||||
pfn_sb = nd_pfn->pfn_sb;
|
||||
offset = le64_to_cpu(pfn_sb->dataoff);
|
||||
nd_pfn->mode = le32_to_cpu(nd_pfn->pfn_sb->mode);
|
||||
if (nd_pfn->mode == PFN_MODE_RAM) {
|
||||
if (offset != SZ_8K)
|
||||
if (offset < SZ_8K)
|
||||
return -EINVAL;
|
||||
nd_pfn->npfns = le64_to_cpu(pfn_sb->npfns);
|
||||
altmap = NULL;
|
||||
@ -389,6 +420,9 @@ static int nd_pmem_probe(struct device *dev)
|
||||
pmem->ndns = ndns;
|
||||
dev_set_drvdata(dev, pmem);
|
||||
ndns->rw_bytes = pmem_rw_bytes;
|
||||
if (devm_init_badblocks(dev, &pmem->bb))
|
||||
return -ENOMEM;
|
||||
nvdimm_namespace_add_poison(ndns, &pmem->bb, 0);
|
||||
|
||||
if (is_nd_btt(dev))
|
||||
return nvdimm_namespace_attach_btt(ndns);
|
||||
|
@ -134,62 +134,6 @@ int nd_region_to_nstype(struct nd_region *nd_region)
|
||||
}
|
||||
EXPORT_SYMBOL(nd_region_to_nstype);
|
||||
|
||||
static int is_uuid_busy(struct device *dev, void *data)
|
||||
{
|
||||
struct nd_region *nd_region = to_nd_region(dev->parent);
|
||||
u8 *uuid = data;
|
||||
|
||||
switch (nd_region_to_nstype(nd_region)) {
|
||||
case ND_DEVICE_NAMESPACE_PMEM: {
|
||||
struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);
|
||||
|
||||
if (!nspm->uuid)
|
||||
break;
|
||||
if (memcmp(uuid, nspm->uuid, NSLABEL_UUID_LEN) == 0)
|
||||
return -EBUSY;
|
||||
break;
|
||||
}
|
||||
case ND_DEVICE_NAMESPACE_BLK: {
|
||||
struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev);
|
||||
|
||||
if (!nsblk->uuid)
|
||||
break;
|
||||
if (memcmp(uuid, nsblk->uuid, NSLABEL_UUID_LEN) == 0)
|
||||
return -EBUSY;
|
||||
break;
|
||||
}
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int is_namespace_uuid_busy(struct device *dev, void *data)
|
||||
{
|
||||
if (is_nd_pmem(dev) || is_nd_blk(dev))
|
||||
return device_for_each_child(dev, data, is_uuid_busy);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* nd_is_uuid_unique - verify that no other namespace has @uuid
|
||||
* @dev: any device on a nvdimm_bus
|
||||
* @uuid: uuid to check
|
||||
*/
|
||||
bool nd_is_uuid_unique(struct device *dev, u8 *uuid)
|
||||
{
|
||||
struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev);
|
||||
|
||||
if (!nvdimm_bus)
|
||||
return false;
|
||||
WARN_ON_ONCE(!is_nvdimm_bus_locked(&nvdimm_bus->dev));
|
||||
if (device_for_each_child(&nvdimm_bus->dev, uuid,
|
||||
is_namespace_uuid_busy) != 0)
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
static ssize_t size_show(struct device *dev,
|
||||
struct device_attribute *attr, char *buf)
|
||||
{
|
||||
@ -406,6 +350,9 @@ static umode_t region_visible(struct kobject *kobj, struct attribute *a, int n)
|
||||
struct nd_interleave_set *nd_set = nd_region->nd_set;
|
||||
int type = nd_region_to_nstype(nd_region);
|
||||
|
||||
if (!is_nd_pmem(dev) && a == &dev_attr_pfn_seed.attr)
|
||||
return 0;
|
||||
|
||||
if (a != &dev_attr_set_cookie.attr
|
||||
&& a != &dev_attr_available_size.attr)
|
||||
return a->mode;
|
||||
@ -487,6 +434,13 @@ static void nd_region_notify_driver_action(struct nvdimm_bus *nvdimm_bus,
|
||||
nd_region_create_blk_seed(nd_region);
|
||||
nvdimm_bus_unlock(dev);
|
||||
}
|
||||
if (is_nd_pfn(dev) && probe) {
|
||||
nd_region = to_nd_region(dev->parent);
|
||||
nvdimm_bus_lock(dev);
|
||||
if (nd_region->pfn_seed == dev)
|
||||
nd_region_create_pfn_seed(nd_region);
|
||||
nvdimm_bus_unlock(dev);
|
||||
}
|
||||
}
|
||||
|
||||
void nd_region_probe_success(struct nvdimm_bus *nvdimm_bus, struct device *dev)
|
||||
|
122
fs/block_dev.c
122
fs/block_dev.c
@ -156,11 +156,16 @@ blkdev_get_block(struct inode *inode, sector_t iblock,
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct inode *bdev_file_inode(struct file *file)
|
||||
{
|
||||
return file->f_mapping->host;
|
||||
}
|
||||
|
||||
static ssize_t
|
||||
blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
|
||||
{
|
||||
struct file *file = iocb->ki_filp;
|
||||
struct inode *inode = file->f_mapping->host;
|
||||
struct inode *inode = bdev_file_inode(file);
|
||||
|
||||
if (IS_DAX(inode))
|
||||
return dax_do_io(iocb, inode, iter, offset, blkdev_get_block,
|
||||
@ -338,7 +343,7 @@ static int blkdev_write_end(struct file *file, struct address_space *mapping,
|
||||
*/
|
||||
static loff_t block_llseek(struct file *file, loff_t offset, int whence)
|
||||
{
|
||||
struct inode *bd_inode = file->f_mapping->host;
|
||||
struct inode *bd_inode = bdev_file_inode(file);
|
||||
loff_t retval;
|
||||
|
||||
mutex_lock(&bd_inode->i_mutex);
|
||||
@ -349,7 +354,7 @@ static loff_t block_llseek(struct file *file, loff_t offset, int whence)
|
||||
|
||||
int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
|
||||
{
|
||||
struct inode *bd_inode = filp->f_mapping->host;
|
||||
struct inode *bd_inode = bdev_file_inode(filp);
|
||||
struct block_device *bdev = I_BDEV(bd_inode);
|
||||
int error;
|
||||
|
||||
@ -1224,8 +1229,11 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
|
||||
}
|
||||
}
|
||||
|
||||
if (!ret)
|
||||
if (!ret) {
|
||||
bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
|
||||
if (!blkdev_dax_capable(bdev))
|
||||
bdev->bd_inode->i_flags &= ~S_DAX;
|
||||
}
|
||||
|
||||
/*
|
||||
* If the device is invalidated, rescan partition
|
||||
@ -1239,6 +1247,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
|
||||
else if (ret == -ENOMEDIUM)
|
||||
invalidate_partitions(disk, bdev);
|
||||
}
|
||||
|
||||
if (ret)
|
||||
goto out_clear;
|
||||
} else {
|
||||
@ -1259,12 +1268,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
|
||||
goto out_clear;
|
||||
}
|
||||
bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
|
||||
/*
|
||||
* If the partition is not aligned on a page
|
||||
* boundary, we can't do dax I/O to it.
|
||||
*/
|
||||
if ((bdev->bd_part->start_sect % (PAGE_SIZE / 512)) ||
|
||||
(bdev->bd_part->nr_sects % (PAGE_SIZE / 512)))
|
||||
if (!blkdev_dax_capable(bdev))
|
||||
bdev->bd_inode->i_flags &= ~S_DAX;
|
||||
}
|
||||
} else {
|
||||
@ -1599,14 +1603,14 @@ EXPORT_SYMBOL(blkdev_put);
|
||||
|
||||
static int blkdev_close(struct inode * inode, struct file * filp)
|
||||
{
|
||||
struct block_device *bdev = I_BDEV(filp->f_mapping->host);
|
||||
struct block_device *bdev = I_BDEV(bdev_file_inode(filp));
|
||||
blkdev_put(bdev, filp->f_mode);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
|
||||
{
|
||||
struct block_device *bdev = I_BDEV(file->f_mapping->host);
|
||||
struct block_device *bdev = I_BDEV(bdev_file_inode(file));
|
||||
fmode_t mode = file->f_mode;
|
||||
|
||||
/*
|
||||
@ -1631,7 +1635,7 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
|
||||
ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
|
||||
{
|
||||
struct file *file = iocb->ki_filp;
|
||||
struct inode *bd_inode = file->f_mapping->host;
|
||||
struct inode *bd_inode = bdev_file_inode(file);
|
||||
loff_t size = i_size_read(bd_inode);
|
||||
struct blk_plug plug;
|
||||
ssize_t ret;
|
||||
@ -1663,7 +1667,7 @@ EXPORT_SYMBOL_GPL(blkdev_write_iter);
|
||||
ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
|
||||
{
|
||||
struct file *file = iocb->ki_filp;
|
||||
struct inode *bd_inode = file->f_mapping->host;
|
||||
struct inode *bd_inode = bdev_file_inode(file);
|
||||
loff_t size = i_size_read(bd_inode);
|
||||
loff_t pos = iocb->ki_pos;
|
||||
|
||||
@ -1702,13 +1706,101 @@ static const struct address_space_operations def_blk_aops = {
|
||||
.is_dirty_writeback = buffer_check_dirty_writeback,
|
||||
};
|
||||
|
||||
#ifdef CONFIG_FS_DAX
|
||||
/*
|
||||
* In the raw block case we do not need to contend with truncation nor
|
||||
* unwritten file extents. Without those concerns there is no need for
|
||||
* additional locking beyond the mmap_sem context that these routines
|
||||
* are already executing under.
|
||||
*
|
||||
* Note, there is no protection if the block device is dynamically
|
||||
* resized (partition grow/shrink) during a fault. A stable block device
|
||||
* size is already not enforced in the blkdev_direct_IO path.
|
||||
*
|
||||
* For DAX, it is the responsibility of the block device driver to
|
||||
* ensure the whole-disk device size is stable while requests are in
|
||||
* flight.
|
||||
*
|
||||
* Finally, unlike the filemap_page_mkwrite() case there is no
|
||||
* filesystem superblock to sync against freezing. We still include a
|
||||
* pfn_mkwrite callback for dax drivers to receive write fault
|
||||
* notifications.
|
||||
*/
|
||||
/*
 * Fault handler for DAX mappings of a raw block device. Forwards @vma and
 * @vmf to __dax_fault() together with blkdev_get_block and returns its
 * result unchanged.
 * NOTE(review): the trailing NULL is presumably an optional completion
 * callback -- confirm against the __dax_fault() prototype.
 */
static int blkdev_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
|
||||
{
|
||||
return __dax_fault(vma, vmf, blkdev_get_block, NULL);
|
||||
}
|
||||
|
||||
/*
 * PMD-level counterpart of blkdev_dax_fault(). Forwards @vma, @addr, @pmd
 * and @flags to __dax_pmd_fault() together with blkdev_get_block and
 * returns its result unchanged.
 * NOTE(review): the trailing NULL is presumably an optional completion
 * callback -- confirm against the __dax_pmd_fault() prototype.
 */
static int blkdev_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
|
||||
pmd_t *pmd, unsigned int flags)
|
||||
{
|
||||
return __dax_pmd_fault(vma, addr, pmd, flags, blkdev_get_block, NULL);
|
||||
}
|
||||
|
||||
/*
 * vm_operations_struct .open hook for block-device mappings: increments
 * the block device's bd_map_count. The counter is updated while holding
 * the backing inode's i_mutex.
 */
static void blkdev_vm_open(struct vm_area_struct *vma)
|
||||
{
|
||||
struct inode *bd_inode = bdev_file_inode(vma->vm_file);
|
||||
struct block_device *bdev = I_BDEV(bd_inode);
|
||||
|
|
||||
mutex_lock(&bd_inode->i_mutex);
|
||||
bdev->bd_map_count++;
|
||||
mutex_unlock(&bd_inode->i_mutex);
|
||||
}
|
||||
|
||||
/*
 * vm_operations_struct .close hook for block-device mappings: decrements
 * the block device's bd_map_count, mirroring blkdev_vm_open(). The counter
 * is updated while holding the backing inode's i_mutex.
 */
static void blkdev_vm_close(struct vm_area_struct *vma)
|
||||
{
|
||||
struct inode *bd_inode = bdev_file_inode(vma->vm_file);
|
||||
struct block_device *bdev = I_BDEV(bd_inode);
|
||||
|
|
||||
mutex_lock(&bd_inode->i_mutex);
|
||||
bdev->bd_map_count--;
|
||||
mutex_unlock(&bd_inode->i_mutex);
|
||||
}
|
||||
|
||||
/*
 * VMA operations installed by blkdev_mmap() when the block-device inode is
 * DAX (IS_DAX() is true). open/close maintain bd_map_count; fault and
 * pmd_fault go through the DAX fault helpers.
 */
static const struct vm_operations_struct blkdev_dax_vm_ops = {
|
||||
.open = blkdev_vm_open,
|
||||
.close = blkdev_vm_close,
|
||||
.fault = blkdev_dax_fault,
|
||||
.pmd_fault = blkdev_dax_pmd_fault,
|
||||
/* Write-fault notification reuses the regular DAX fault handler. */
.pfn_mkwrite = blkdev_dax_fault,
|
||||
};
|
||||
|
||||
/*
 * VMA operations installed by blkdev_mmap() when the block-device inode is
 * not DAX. open/close maintain bd_map_count exactly as in the DAX table;
 * faults are served by the generic filemap helpers.
 */
static const struct vm_operations_struct blkdev_default_vm_ops = {
|
||||
.open = blkdev_vm_open,
|
||||
.close = blkdev_vm_close,
|
||||
.fault = filemap_fault,
|
||||
.map_pages = filemap_map_pages,
|
||||
};
|
||||
|
||||
/*
 * mmap handler for block-device files (built only under CONFIG_FS_DAX;
 * otherwise blkdev_mmap is defined as generic_file_mmap).
 *
 * Calls file_accessed() on @file, then, under the inode's i_mutex, counts
 * the new mapping in bd_map_count and selects the VMA operations table:
 * the DAX table (plus VM_MIXEDMAP | VM_HUGEPAGE) when the inode is DAX,
 * the default filemap-backed table otherwise.
 *
 * Always returns 0.
 */
static int blkdev_mmap(struct file *file, struct vm_area_struct *vma)
|
||||
{
|
||||
struct inode *bd_inode = bdev_file_inode(file);
|
||||
struct block_device *bdev = I_BDEV(bd_inode);
|
||||
|
|
||||
file_accessed(file);
|
||||
mutex_lock(&bd_inode->i_mutex);
|
||||
/* Account for this mapping; blkdev_vm_close() drops the count again. */
bdev->bd_map_count++;
|
||||
if (IS_DAX(bd_inode)) {
|
||||
vma->vm_ops = &blkdev_dax_vm_ops;
|
||||
vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
|
||||
} else {
|
||||
vma->vm_ops = &blkdev_default_vm_ops;
|
||||
}
|
||||
mutex_unlock(&bd_inode->i_mutex);
|
||||
|
|
||||
return 0;
|
||||
}
|
||||
#else
|
||||
#define blkdev_mmap generic_file_mmap
|
||||
#endif
|
||||
|
||||
const struct file_operations def_blk_fops = {
|
||||
.open = blkdev_open,
|
||||
.release = blkdev_close,
|
||||
.llseek = block_llseek,
|
||||
.read_iter = blkdev_read_iter,
|
||||
.write_iter = blkdev_write_iter,
|
||||
.mmap = generic_file_mmap,
|
||||
.mmap = blkdev_mmap,
|
||||
.fsync = blkdev_fsync,
|
||||
.unlocked_ioctl = block_ioctl,
|
||||
#ifdef CONFIG_COMPAT
|
||||
|
65
include/linux/badblocks.h
Normal file
65
include/linux/badblocks.h
Normal file
@ -0,0 +1,65 @@
|
||||
#ifndef _LINUX_BADBLOCKS_H
|
||||
#define _LINUX_BADBLOCKS_H
|
||||
|
||||
#include <linux/seqlock.h>
|
||||
#include <linux/device.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/stddef.h>
|
||||
#include <linux/types.h>
|
||||
|
||||
/*
 * Layout of one 64-bit bad-block entry, as implied by the masks below:
 *   bits  0..8  : extent length minus one (BB_LEN_MASK)
 *   bits  9..62 : starting sector         (BB_OFFSET_MASK)
 *   bit   63    : 'acknowledged' flag     (BB_ACK_MASK)
 */
#define BB_LEN_MASK (0x00000000000001FFULL)
|
||||
#define BB_OFFSET_MASK (0x7FFFFFFFFFFFFE00ULL)
|
||||
#define BB_ACK_MASK (0x8000000000000000ULL)
|
||||
/* Largest extent length one entry can hold: 9 length bits, stored as len - 1. */
#define BB_MAX_LEN 512
|
||||
/* Extract the starting sector of entry x. */
#define BB_OFFSET(x) (((x) & BB_OFFSET_MASK) >> 9)
|
||||
/* Extract the extent length of entry x (stored value plus one). */
#define BB_LEN(x) (((x) & BB_LEN_MASK) + 1)
|
||||
/* 1 if entry x has the acknowledged bit set, else 0. */
#define BB_ACK(x) (!!((x) & BB_ACK_MASK))
|
||||
/*
 * Build an entry from sector a, length l and ack flag.
 * NOTE(review): (a)<<9 is evaluated in the type of a; callers presumably
 * pass a 64-bit value -- confirm.
 */
#define BB_MAKE(a, l, ack) (((a)<<9) | ((l)-1) | ((u64)(!!(ack)) << 63))
|
||||
|
||||
/* Bad block numbers are stored sorted in a single page.
|
||||
* 64bits is used for each block or extent.
|
||||
* 54 bits are sector number, 9 bits are extent size,
|
||||
* 1 bit is an 'acknowledged' flag.
|
||||
*/
|
||||
#define MAX_BADBLOCKS (PAGE_SIZE/8)
|
||||
|
||||
/*
 * Per-device bad-block bookkeeping: a list of 64-bit entries (see the
 * BB_* macros) plus the state needed to maintain it.
 */
struct badblocks {
|
||||
struct device *dev; /* set by devm_init_badblocks */
|
||||
int count; /* count of bad blocks */
|
||||
int unacked_exist; /* there probably are unacknowledged
|
||||
* bad blocks. This is only cleared
|
||||
* when a read discovers none
|
||||
*/
|
||||
int shift; /* shift from sectors to block size
|
||||
* a negative shift means badblocks are
|
||||
* disabled.*/
|
||||
u64 *page; /* badblock list */
|
||||
/* NOTE(review): presumably flags that the list was modified -- confirm. */
int changed;
|
||||
/* NOTE(review): presumably guards the list in 'page' -- confirm. */
seqlock_t lock;
|
||||
/* NOTE(review): meaning not visible here -- confirm against users. */
sector_t sector;
|
||||
sector_t size; /* in sectors */
|
||||
};
|
||||
|
||||
int badblocks_check(struct badblocks *bb, sector_t s, int sectors,
|
||||
sector_t *first_bad, int *bad_sectors);
|
||||
int badblocks_set(struct badblocks *bb, sector_t s, int sectors,
|
||||
int acknowledged);
|
||||
int badblocks_clear(struct badblocks *bb, sector_t s, int sectors);
|
||||
void ack_all_badblocks(struct badblocks *bb);
|
||||
ssize_t badblocks_show(struct badblocks *bb, char *page, int unack);
|
||||
ssize_t badblocks_store(struct badblocks *bb, const char *page, size_t len,
|
||||
int unack);
|
||||
int badblocks_init(struct badblocks *bb, int enable);
|
||||
void badblocks_exit(struct badblocks *bb);
|
||||
struct device;
|
||||
int devm_init_badblocks(struct device *dev, struct badblocks *bb);
|
||||
/*
 * Tear down @bb on behalf of @dev.
 *
 * If @bb is not associated with @dev (bb->dev != dev), emit a one-time
 * warning and return without touching @bb; otherwise hand @bb to
 * badblocks_exit().
 */
static inline void devm_exit_badblocks(struct device *dev, struct badblocks *bb)
|
||||
{
|
||||
if (bb->dev != dev) {
|
||||
dev_WARN_ONCE(dev, 1, "%s: badblocks instance not associated\n",
|
||||
__func__);
|
||||
return;
|
||||
}
|
||||
badblocks_exit(bb);
|
||||
}
|
||||
#endif
|
@ -483,6 +483,9 @@ struct block_device {
|
||||
int bd_fsfreeze_count;
|
||||
/* Mutex for freeze */
|
||||
struct mutex bd_fsfreeze_mutex;
|
||||
#ifdef CONFIG_FS_DAX
|
||||
int bd_map_count;
|
||||
#endif
|
||||
};
|
||||
|
||||
/*
|
||||
@ -2280,6 +2283,14 @@ extern struct super_block *freeze_bdev(struct block_device *);
|
||||
extern void emergency_thaw_all(void);
|
||||
extern int thaw_bdev(struct block_device *bdev, struct super_block *sb);
|
||||
extern int fsync_bdev(struct block_device *);
|
||||
#ifdef CONFIG_FS_DAX
|
||||
extern bool blkdev_dax_capable(struct block_device *bdev);
|
||||
#else
|
||||
/*
 * Stub used when CONFIG_FS_DAX is not defined: no block device is
 * reported as DAX capable.
 */
static inline bool blkdev_dax_capable(struct block_device *bdev)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
#endif
|
||||
|
||||
extern struct super_block *blockdev_superblock;
|
||||
|
||||
|
@ -162,6 +162,7 @@ struct disk_part_tbl {
|
||||
};
|
||||
|
||||
struct disk_events;
|
||||
struct badblocks;
|
||||
|
||||
#if defined(CONFIG_BLK_DEV_INTEGRITY)
|
||||
|
||||
@ -213,6 +214,7 @@ struct gendisk {
|
||||
struct kobject integrity_kobj;
|
||||
#endif /* CONFIG_BLK_DEV_INTEGRITY */
|
||||
int node_id;
|
||||
struct badblocks *bb;
|
||||
};
|
||||
|
||||
static inline struct gendisk *part_to_disk(struct hd_struct *part)
|
||||
|
@ -116,6 +116,7 @@ static inline struct nd_blk_region_desc *to_blk_region_desc(
|
||||
|
||||
}
|
||||
|
||||
int nvdimm_bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length);
|
||||
struct nvdimm_bus *__nvdimm_bus_register(struct device *parent,
|
||||
struct nvdimm_bus_descriptor *nfit_desc, struct module *module);
|
||||
#define nvdimm_bus_register(parent, desc) \
|
||||
|
@ -188,6 +188,8 @@ struct inodes_stat_t {
|
||||
#define BLKSECDISCARD _IO(0x12,125)
|
||||
#define BLKROTATIONAL _IO(0x12,126)
|
||||
#define BLKZEROOUT _IO(0x12,127)
|
||||
#define BLKDAXSET _IO(0x12,128)
|
||||
#define BLKDAXGET _IO(0x12,129)
|
||||
|
||||
#define BMAP_IOCTL 1 /* obsolete - kept for compatibility */
|
||||
#define FIBMAP _IO(0x00,1) /* bmap access */
|
||||
|
@ -1498,8 +1498,15 @@ int iomem_is_exclusive(u64 addr)
|
||||
break;
|
||||
if (p->end < addr)
|
||||
continue;
|
||||
if (p->flags & IORESOURCE_BUSY &&
|
||||
p->flags & IORESOURCE_EXCLUSIVE) {
|
||||
/*
|
||||
* A resource is exclusive if IORESOURCE_EXCLUSIVE is set
|
||||
* or CONFIG_IO_STRICT_DEVMEM is enabled and the
|
||||
* resource is busy.
|
||||
*/
|
||||
if ((p->flags & IORESOURCE_BUSY) == 0)
|
||||
continue;
|
||||
if (IS_ENABLED(CONFIG_IO_STRICT_DEVMEM)
|
||||
|| p->flags & IORESOURCE_EXCLUSIVE) {
|
||||
err = 1;
|
||||
break;
|
||||
}
|
||||
|
@ -1886,3 +1886,42 @@ source "samples/Kconfig"
|
||||
|
||||
source "lib/Kconfig.kgdb"
|
||||
|
||||
config ARCH_HAS_DEVMEM_IS_ALLOWED
|
||||
bool
|
||||
|
||||
config STRICT_DEVMEM
|
||||
bool "Filter access to /dev/mem"
|
||||
depends on MMU
|
||||
depends on ARCH_HAS_DEVMEM_IS_ALLOWED
|
||||
default y if TILE || PPC
|
||||
---help---
|
||||
If this option is disabled, you allow userspace (root) access to all
|
||||
of memory, including kernel and userspace memory. Accidental
|
||||
access to this is obviously disastrous, but specific access can
|
||||
be used by people debugging the kernel. Note that with PAT support
|
||||
enabled, even in this case there are restrictions on /dev/mem
|
||||
use due to the cache aliasing requirements.
|
||||
|
||||
If this option is switched on, and IO_STRICT_DEVMEM=n, the /dev/mem
|
||||
file only allows userspace access to PCI space and the BIOS code and
|
||||
data regions. This is sufficient for dosemu and X and all common
|
||||
users of /dev/mem.
|
||||
|
||||
If in doubt, say Y.
|
||||
|
||||
config IO_STRICT_DEVMEM
|
||||
bool "Filter I/O access to /dev/mem"
|
||||
depends on STRICT_DEVMEM
|
||||
default STRICT_DEVMEM
|
||||
---help---
|
||||
If this option is disabled, you allow userspace (root) access to all
|
||||
io-memory regardless of whether a driver is actively using that
|
||||
range. Accidental access to this is obviously disastrous, but
|
||||
specific access can be used by people debugging kernel drivers.
|
||||
|
||||
If this option is switched on, the /dev/mem file only allows
|
||||
userspace access to *idle* io-memory ranges (see /proc/iomem). This
|
||||
may break traditional users of /dev/mem (dosemu, legacy X, etc...)
|
||||
if the driver using a given range cannot be disabled.
|
||||
|
||||
If in doubt, say Y.
|
||||
|
@ -9,6 +9,8 @@ ldflags-y += --wrap=memunmap
|
||||
ldflags-y += --wrap=__devm_request_region
|
||||
ldflags-y += --wrap=__request_region
|
||||
ldflags-y += --wrap=__release_region
|
||||
ldflags-y += --wrap=devm_memremap_pages
|
||||
ldflags-y += --wrap=phys_to_pfn_t
|
||||
|
||||
DRIVERS := ../../../drivers
|
||||
NVDIMM_SRC := $(DRIVERS)/nvdimm
|
||||
|
@ -16,6 +16,7 @@
|
||||
#include <linux/module.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/io.h>
|
||||
#include <linux/mm.h>
|
||||
#include "nfit_test.h"
|
||||
|
||||
static LIST_HEAD(iomap_head);
|
||||
@ -41,7 +42,7 @@ void nfit_test_teardown(void)
|
||||
}
|
||||
EXPORT_SYMBOL(nfit_test_teardown);
|
||||
|
||||
static struct nfit_test_resource *get_nfit_res(resource_size_t resource)
|
||||
static struct nfit_test_resource *__get_nfit_res(resource_size_t resource)
|
||||
{
|
||||
struct iomap_ops *ops;
|
||||
|
||||
@ -51,14 +52,22 @@ static struct nfit_test_resource *get_nfit_res(resource_size_t resource)
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/*
 * Look up the nfit_test resource for @resource, taking and releasing the
 * RCU read lock around __get_nfit_res() so callers do not have to.
 * Returns whatever __get_nfit_res() returned.
 * NOTE(review): the result is handed back after rcu_read_unlock(); its
 * lifetime guarantee is not visible here -- confirm.
 */
static struct nfit_test_resource *get_nfit_res(resource_size_t resource)
|
||||
{
|
||||
struct nfit_test_resource *res;
|
||||
|
|
||||
rcu_read_lock();
|
||||
res = __get_nfit_res(resource);
|
||||
rcu_read_unlock();
|
||||
|
|
||||
return res;
|
||||
}
|
||||
|
||||
void __iomem *__nfit_test_ioremap(resource_size_t offset, unsigned long size,
|
||||
void __iomem *(*fallback_fn)(resource_size_t, unsigned long))
|
||||
{
|
||||
struct nfit_test_resource *nfit_res;
|
||||
struct nfit_test_resource *nfit_res = get_nfit_res(offset);
|
||||
|
||||
rcu_read_lock();
|
||||
nfit_res = get_nfit_res(offset);
|
||||
rcu_read_unlock();
|
||||
if (nfit_res)
|
||||
return (void __iomem *) nfit_res->buf + offset
|
||||
- nfit_res->res->start;
|
||||
@ -68,11 +77,8 @@ void __iomem *__nfit_test_ioremap(resource_size_t offset, unsigned long size,
|
||||
void __iomem *__wrap_devm_ioremap_nocache(struct device *dev,
|
||||
resource_size_t offset, unsigned long size)
|
||||
{
|
||||
struct nfit_test_resource *nfit_res;
|
||||
struct nfit_test_resource *nfit_res = get_nfit_res(offset);
|
||||
|
||||
rcu_read_lock();
|
||||
nfit_res = get_nfit_res(offset);
|
||||
rcu_read_unlock();
|
||||
if (nfit_res)
|
||||
return (void __iomem *) nfit_res->buf + offset
|
||||
- nfit_res->res->start;
|
||||
@ -83,25 +89,58 @@ EXPORT_SYMBOL(__wrap_devm_ioremap_nocache);
|
||||
void *__wrap_devm_memremap(struct device *dev, resource_size_t offset,
|
||||
size_t size, unsigned long flags)
|
||||
{
|
||||
struct nfit_test_resource *nfit_res;
|
||||
struct nfit_test_resource *nfit_res = get_nfit_res(offset);
|
||||
|
||||
rcu_read_lock();
|
||||
nfit_res = get_nfit_res(offset);
|
||||
rcu_read_unlock();
|
||||
if (nfit_res)
|
||||
return nfit_res->buf + offset - nfit_res->res->start;
|
||||
return devm_memremap(dev, offset, size, flags);
|
||||
}
|
||||
EXPORT_SYMBOL(__wrap_devm_memremap);
|
||||
|
||||
#ifdef __HAVE_ARCH_PTE_DEVMAP
|
||||
#include <linux/memremap.h>
|
||||
#include <linux/pfn_t.h>
|
||||
|
||||
/*
 * Test wrapper for devm_memremap_pages() (variant built when
 * __HAVE_ARCH_PTE_DEVMAP is defined).
 *
 * If res->start resolves to an nfit_test resource, return the matching
 * address inside that resource's buffer (buf + offset from the start of
 * the emulated resource). Otherwise fall through to the real
 * devm_memremap_pages() with all arguments unchanged.
 */
void *__wrap_devm_memremap_pages(struct device *dev, struct resource *res,
|
||||
struct percpu_ref *ref, struct vmem_altmap *altmap)
|
||||
{
|
||||
resource_size_t offset = res->start;
|
||||
struct nfit_test_resource *nfit_res = get_nfit_res(offset);
|
||||
|
|
||||
if (nfit_res)
|
||||
return nfit_res->buf + offset - nfit_res->res->start;
|
||||
return devm_memremap_pages(dev, res, ref, altmap);
|
||||
}
|
||||
EXPORT_SYMBOL(__wrap_devm_memremap_pages);
|
||||
|
||||
/*
 * Test wrapper for phys_to_pfn_t(): when @addr resolves to an nfit_test
 * resource, PFN_MAP is cleared from @flags before the conversion;
 * otherwise @flags is passed through untouched.
 */
pfn_t __wrap_phys_to_pfn_t(dma_addr_t addr, unsigned long flags)
|
||||
{
|
||||
struct nfit_test_resource *nfit_res = get_nfit_res(addr);
|
||||
|
|
||||
if (nfit_res)
|
||||
flags &= ~PFN_MAP;
|
||||
return phys_to_pfn_t(addr, flags);
|
||||
}
|
||||
EXPORT_SYMBOL(__wrap_phys_to_pfn_t);
|
||||
#else
|
||||
/* to be removed post 4.5-rc1 */
|
||||
/*
 * Test wrapper for the two-argument devm_memremap_pages() (variant built
 * when __HAVE_ARCH_PTE_DEVMAP is not defined).
 *
 * If res->start resolves to an nfit_test resource, return the matching
 * address inside that resource's buffer; otherwise call the real
 * devm_memremap_pages().
 */
void *__wrap_devm_memremap_pages(struct device *dev, struct resource *res)
|
||||
{
|
||||
resource_size_t offset = res->start;
|
||||
struct nfit_test_resource *nfit_res = get_nfit_res(offset);
|
||||
|
|
||||
if (nfit_res)
|
||||
return nfit_res->buf + offset - nfit_res->res->start;
|
||||
return devm_memremap_pages(dev, res);
|
||||
}
|
||||
EXPORT_SYMBOL(__wrap_devm_memremap_pages);
|
||||
#endif
|
||||
|
||||
void *__wrap_memremap(resource_size_t offset, size_t size,
|
||||
unsigned long flags)
|
||||
{
|
||||
struct nfit_test_resource *nfit_res;
|
||||
struct nfit_test_resource *nfit_res = get_nfit_res(offset);
|
||||
|
||||
rcu_read_lock();
|
||||
nfit_res = get_nfit_res(offset);
|
||||
rcu_read_unlock();
|
||||
if (nfit_res)
|
||||
return nfit_res->buf + offset - nfit_res->res->start;
|
||||
return memremap(offset, size, flags);
|
||||
@ -110,11 +149,8 @@ EXPORT_SYMBOL(__wrap_memremap);
|
||||
|
||||
void __wrap_devm_memunmap(struct device *dev, void *addr)
|
||||
{
|
||||
struct nfit_test_resource *nfit_res;
|
||||
struct nfit_test_resource *nfit_res = get_nfit_res((long) addr);
|
||||
|
||||
rcu_read_lock();
|
||||
nfit_res = get_nfit_res((unsigned long) addr);
|
||||
rcu_read_unlock();
|
||||
if (nfit_res)
|
||||
return;
|
||||
return devm_memunmap(dev, addr);
|
||||
@ -135,11 +171,7 @@ EXPORT_SYMBOL(__wrap_ioremap_wc);
|
||||
|
||||
void __wrap_iounmap(volatile void __iomem *addr)
|
||||
{
|
||||
struct nfit_test_resource *nfit_res;
|
||||
|
||||
rcu_read_lock();
|
||||
nfit_res = get_nfit_res((unsigned long) addr);
|
||||
rcu_read_unlock();
|
||||
struct nfit_test_resource *nfit_res = get_nfit_res((long) addr);
|
||||
if (nfit_res)
|
||||
return;
|
||||
return iounmap(addr);
|
||||
@ -148,11 +180,8 @@ EXPORT_SYMBOL(__wrap_iounmap);
|
||||
|
||||
void __wrap_memunmap(void *addr)
|
||||
{
|
||||
struct nfit_test_resource *nfit_res;
|
||||
struct nfit_test_resource *nfit_res = get_nfit_res((long) addr);
|
||||
|
||||
rcu_read_lock();
|
||||
nfit_res = get_nfit_res((unsigned long) addr);
|
||||
rcu_read_unlock();
|
||||
if (nfit_res)
|
||||
return;
|
||||
return memunmap(addr);
|
||||
@ -166,9 +195,7 @@ static struct resource *nfit_test_request_region(struct device *dev,
|
||||
struct nfit_test_resource *nfit_res;
|
||||
|
||||
if (parent == &iomem_resource) {
|
||||
rcu_read_lock();
|
||||
nfit_res = get_nfit_res(start);
|
||||
rcu_read_unlock();
|
||||
if (nfit_res) {
|
||||
struct resource *res = nfit_res->res + 1;
|
||||
|
||||
@ -218,9 +245,7 @@ void __wrap___release_region(struct resource *parent, resource_size_t start,
|
||||
struct nfit_test_resource *nfit_res;
|
||||
|
||||
if (parent == &iomem_resource) {
|
||||
rcu_read_lock();
|
||||
nfit_res = get_nfit_res(start);
|
||||
rcu_read_unlock();
|
||||
if (nfit_res) {
|
||||
struct resource *res = nfit_res->res + 1;
|
||||
|
||||
|
@ -248,6 +248,8 @@ static int nfit_test_cmd_ars_status(struct nd_cmd_ars_status *nd_cmd,
|
||||
|
||||
nd_cmd->out_length = 256;
|
||||
nd_cmd->num_records = 0;
|
||||
nd_cmd->address = 0;
|
||||
nd_cmd->length = -1ULL;
|
||||
nd_cmd->status = 0;
|
||||
|
||||
return 0;
|
||||
@ -1088,6 +1090,8 @@ static void nfit_test1_setup(struct nfit_test *t)
|
||||
struct acpi_nfit_memory_map *memdev;
|
||||
struct acpi_nfit_control_region *dcr;
|
||||
struct acpi_nfit_system_address *spa;
|
||||
struct nvdimm_bus_descriptor *nd_desc;
|
||||
struct acpi_nfit_desc *acpi_desc;
|
||||
|
||||
offset = 0;
|
||||
/* spa0 (flat range with no bdw aliasing) */
|
||||
@ -1135,6 +1139,13 @@ static void nfit_test1_setup(struct nfit_test *t)
|
||||
dcr->command_size = 0;
|
||||
dcr->status_offset = 0;
|
||||
dcr->status_size = 0;
|
||||
|
||||
acpi_desc = &t->acpi_desc;
|
||||
set_bit(ND_CMD_ARS_CAP, &acpi_desc->bus_dsm_force_en);
|
||||
set_bit(ND_CMD_ARS_START, &acpi_desc->bus_dsm_force_en);
|
||||
set_bit(ND_CMD_ARS_STATUS, &acpi_desc->bus_dsm_force_en);
|
||||
nd_desc = &acpi_desc->nd_desc;
|
||||
nd_desc->ndctl = nfit_test_ctl;
|
||||
}
|
||||
|
||||
static int nfit_test_blk_do_io(struct nd_blk_region *ndbr, resource_size_t dpa,
|
||||
|
Loading…
Reference in New Issue
Block a user