mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git
synced 2025-01-01 10:42:11 +00:00
libnvdimm for 5.7
- Add support for region alignment configuration and enforcement to fix compatibility across architectures and PowerPC page size configurations. - Introduce 'zero_page_range' as a dax operation. This facilitates filesystem-dax operation without a block-device. - Introduce phys_to_target_node() to facilitate drivers that want to know resulting numa node if a given reserved address range was onlined. - Advertise a persistence-domain for of_pmem and papr_scm. The persistence domain indicates where cpu-store cycles need to reach in the platform-memory subsystem before the platform will consider them power-fail protected. - Promote numa_map_to_online_node() to a cross-kernel generic facility. - Save x86 numa information to allow for node-id lookups for reserved memory ranges, deploy that capability for the e820-pmem driver. - Pick up some miscellaneous minor fixes, that missed v5.6-final, including some smatch reports in the ioctl path and some unit test compilation fixups. - Fixup some flexible-array declarations. 
-----BEGIN PGP SIGNATURE----- iQIzBAABCAAdFiEEf41QbsdZzFdA8EfZHtKRamZ9iAIFAl6LtIAACgkQHtKRamZ9 iAIwRA/8CLVVuQpgHQ1tqK4h8CZPrISFXh7wy7uhocEU2xrDh6iGVnLztmoLRr2k 5f8T9lRzreSAwIVL5DbGqP1pFncqIt9VMnKsFlaPMBGCBNR+hURY0iBCNjIT+jiq BOzLd52MR2rqJxeXGTMUbWrBrbmuj4mZPdmGVuFFe7GFRpoaVpCgOo+296eWa/ot gIOFUTonZY7STYjNvDok0TXCmiCFuJb+P+y5ldfCPShHvZhTiaF53jircja8vAjO G5dt8ixBKUK0rXRc4SEQsQhAZNcAFHb6Gy5lg4C2QzhTF374xTc9usJZNWbIE9iM 5mipBYvjVuoY+XaCNZDkaRcJIy/jqB15O6l3QIWbZLGaK9m95YPp9LmkPFwd3JpO e3rO24ML471DxqB9iWIiJCNcBBocLOlnd6qAQTpppWDpGNbudwXvfsmKHmKIScSE x+IDCdscLmmm+WG2dLmLraWOVPu42xZFccoQCi4M3TTqfeB9pZ9XckFQ37zX62zG 5t+7Ek+t1W4QVt/JQYVKH03XT15sqUpVknvx0Hl4Y5TtbDOkFLkO8RN0/HyExDef 7iegS35kqTsM4EfZQ+9juKbI2JBAjHANcbj0V4dogqaRj6vr3akumBzUtuYqAofv qU3s9skmLsEemOJC+ns2PT8vl5dyIoeDfH0r2XvGWxYqolMqJpA= =sY4N -----END PGP SIGNATURE----- Merge tag 'libnvdimm-for-5.7' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm Pull libnvdimm and dax updates from Dan Williams: "There were multiple touches outside of drivers/nvdimm/ this round to add cross arch compatibility to the devm_memremap_pages() interface, enhance numa information for persistent memory ranges, and add a zero_page_range() dax operation. This cycle I switched from the patchwork api to Konstantin's b4 script for collecting tags (from x86, PowerPC, filesystem, and device-mapper folks), and everything looks to have gone ok there. This has all appeared in -next with no reported issues. Summary: - Add support for region alignment configuration and enforcement to fix compatibility across architectures and PowerPC page size configurations. - Introduce 'zero_page_range' as a dax operation. This facilitates filesystem-dax operation without a block-device. - Introduce phys_to_target_node() to facilitate drivers that want to know resulting numa node if a given reserved address range was onlined. - Advertise a persistence-domain for of_pmem and papr_scm. 
The persistence domain indicates where cpu-store cycles need to reach in the platform-memory subsystem before the platform will consider them power-fail protected. - Promote numa_map_to_online_node() to a cross-kernel generic facility. - Save x86 numa information to allow for node-id lookups for reserved memory ranges, deploy that capability for the e820-pmem driver. - Pick up some miscellaneous minor fixes, that missed v5.6-final, including some smatch reports in the ioctl path and some unit test compilation fixups. - Fixup some flexible-array declarations" * tag 'libnvdimm-for-5.7' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm: (29 commits) dax: Move mandatory ->zero_page_range() check in alloc_dax() dax,iomap: Add helper dax_iomap_zero() to zero a range dax: Use new dax zero page method for zeroing a page dm,dax: Add dax zero_page_range operation s390,dcssblk,dax: Add dax zero_page_range operation to dcssblk driver dax, pmem: Add a dax operation zero_page_range pmem: Add functions for reading/writing page to/from pmem libnvdimm: Update persistence domain value for of_pmem and papr_scm device tools/test/nvdimm: Fix out of tree build libnvdimm/region: Fix build error libnvdimm/region: Replace zero-length array with flexible-array member libnvdimm/label: Replace zero-length array with flexible-array member ACPI: NFIT: Replace zero-length array with flexible-array member libnvdimm/region: Introduce an 'align' attribute libnvdimm/region: Introduce NDD_LABELING libnvdimm/namespace: Enforce memremap_compat_align() libnvdimm/pfn: Prevent raw mode fallback if pfn-infoblock valid libnvdimm: Out of bounds read in __nd_ioctl() acpi/nfit: improve bounds checking for 'func' mm/memremap_pages: Introduce memremap_compat_align() ...
This commit is contained in:
commit
9b06860d7c
@ -9670,6 +9670,7 @@ F: drivers/acpi/nfit/*
|
||||
F: include/linux/nd.h
|
||||
F: include/linux/libnvdimm.h
|
||||
F: include/uapi/linux/ndctl.h
|
||||
F: tools/testing/nvdimm/
|
||||
|
||||
LICENSES and SPDX stuff
|
||||
M: Thomas Gleixner <tglx@linutronix.de>
|
||||
|
@ -122,6 +122,7 @@ config PPC
|
||||
select ARCH_HAS_GCOV_PROFILE_ALL
|
||||
select ARCH_HAS_KCOV
|
||||
select ARCH_HAS_HUGEPD if HUGETLB_PAGE
|
||||
select ARCH_HAS_MEMREMAP_COMPAT_ALIGN
|
||||
select ARCH_HAS_MMIOWB if PPC64
|
||||
select ARCH_HAS_PHYS_TO_DMA
|
||||
select ARCH_HAS_PMEM_API
|
||||
|
@ -2,6 +2,7 @@
|
||||
|
||||
#include <linux/io.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/mmzone.h>
|
||||
#include <linux/vmalloc.h>
|
||||
#include <asm/io-workarounds.h>
|
||||
|
||||
@ -97,3 +98,23 @@ void __iomem *do_ioremap(phys_addr_t pa, phys_addr_t offset, unsigned long size,
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_ZONE_DEVICE
|
||||
/*
|
||||
* Override the generic version in mm/memremap.c.
|
||||
*
|
||||
* With hash translation, the direct-map range is mapped with just one
|
||||
* page size selected by htab_init_page_sizes(). Consult
|
||||
* mmu_psize_defs[] to determine the minimum page size alignment.
|
||||
*/
|
||||
unsigned long memremap_compat_align(void)
|
||||
{
|
||||
unsigned int shift = mmu_psize_defs[mmu_linear_psize].shift;
|
||||
|
||||
if (radix_enabled())
|
||||
return SUBSECTION_SIZE;
|
||||
return max(SUBSECTION_SIZE, 1UL << shift);
|
||||
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(memremap_compat_align);
|
||||
#endif
|
||||
|
@ -286,25 +286,6 @@ static int papr_scm_ndctl(struct nvdimm_bus_descriptor *nd_desc,
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline int papr_scm_node(int node)
|
||||
{
|
||||
int min_dist = INT_MAX, dist;
|
||||
int nid, min_node;
|
||||
|
||||
if ((node == NUMA_NO_NODE) || node_online(node))
|
||||
return node;
|
||||
|
||||
min_node = first_online_node;
|
||||
for_each_online_node(nid) {
|
||||
dist = node_distance(node, nid);
|
||||
if (dist < min_dist) {
|
||||
min_dist = dist;
|
||||
min_node = nid;
|
||||
}
|
||||
}
|
||||
return min_node;
|
||||
}
|
||||
|
||||
static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
|
||||
{
|
||||
struct device *dev = &p->pdev->dev;
|
||||
@ -329,7 +310,7 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
|
||||
}
|
||||
|
||||
dimm_flags = 0;
|
||||
set_bit(NDD_ALIASING, &dimm_flags);
|
||||
set_bit(NDD_LABELING, &dimm_flags);
|
||||
|
||||
p->nvdimm = nvdimm_create(p->bus, p, NULL, dimm_flags,
|
||||
PAPR_SCM_DIMM_CMD_MASK, 0, NULL);
|
||||
@ -350,7 +331,7 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
|
||||
|
||||
memset(&ndr_desc, 0, sizeof(ndr_desc));
|
||||
target_nid = dev_to_node(&p->pdev->dev);
|
||||
online_nid = papr_scm_node(target_nid);
|
||||
online_nid = numa_map_to_online_node(target_nid);
|
||||
ndr_desc.numa_node = online_nid;
|
||||
ndr_desc.target_node = target_nid;
|
||||
ndr_desc.res = &p->res;
|
||||
@ -362,8 +343,10 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
|
||||
|
||||
if (p->is_volatile)
|
||||
p->region = nvdimm_volatile_region_create(p->bus, &ndr_desc);
|
||||
else
|
||||
else {
|
||||
set_bit(ND_REGION_PERSIST_MEMCTRL, &ndr_desc.flags);
|
||||
p->region = nvdimm_pmem_region_create(p->bus, &ndr_desc);
|
||||
}
|
||||
if (!p->region) {
|
||||
dev_err(dev, "Error registering region %pR from %pOF\n",
|
||||
ndr_desc.res, p->dn);
|
||||
|
@ -1661,6 +1661,7 @@ config X86_PMEM_LEGACY
|
||||
depends on PHYS_ADDR_T_64BIT
|
||||
depends on BLK_DEV
|
||||
select X86_PMEM_LEGACY_DEVICE
|
||||
select NUMA_KEEP_MEMINFO if NUMA
|
||||
select LIBNVDIMM
|
||||
help
|
||||
Treat memory marked using the non-standard e820 type of 12 as used
|
||||
|
@ -25,11 +25,8 @@ nodemask_t numa_nodes_parsed __initdata;
|
||||
struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
|
||||
EXPORT_SYMBOL(node_data);
|
||||
|
||||
static struct numa_meminfo numa_meminfo
|
||||
#ifndef CONFIG_MEMORY_HOTPLUG
|
||||
__initdata
|
||||
#endif
|
||||
;
|
||||
static struct numa_meminfo numa_meminfo __initdata_or_meminfo;
|
||||
static struct numa_meminfo numa_reserved_meminfo __initdata_or_meminfo;
|
||||
|
||||
static int numa_distance_cnt;
|
||||
static u8 *numa_distance;
|
||||
@ -168,6 +165,19 @@ void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi)
|
||||
(mi->nr_blks - idx) * sizeof(mi->blk[0]));
|
||||
}
|
||||
|
||||
/**
|
||||
* numa_move_tail_memblk - Move a numa_memblk from one numa_meminfo to another
|
||||
* @dst: numa_meminfo to append block to
|
||||
* @idx: Index of memblk to remove
|
||||
* @src: numa_meminfo to remove memblk from
|
||||
*/
|
||||
static void __init numa_move_tail_memblk(struct numa_meminfo *dst, int idx,
|
||||
struct numa_meminfo *src)
|
||||
{
|
||||
dst->blk[dst->nr_blks++] = src->blk[idx];
|
||||
numa_remove_memblk_from(idx, src);
|
||||
}
|
||||
|
||||
/**
|
||||
* numa_add_memblk - Add one numa_memblk to numa_meminfo
|
||||
* @nid: NUMA node ID of the new memblk
|
||||
@ -237,14 +247,19 @@ int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
|
||||
for (i = 0; i < mi->nr_blks; i++) {
|
||||
struct numa_memblk *bi = &mi->blk[i];
|
||||
|
||||
/* make sure all blocks are inside the limits */
|
||||
/* move / save reserved memory ranges */
|
||||
if (!memblock_overlaps_region(&memblock.memory,
|
||||
bi->start, bi->end - bi->start)) {
|
||||
numa_move_tail_memblk(&numa_reserved_meminfo, i--, mi);
|
||||
continue;
|
||||
}
|
||||
|
||||
/* make sure all non-reserved blocks are inside the limits */
|
||||
bi->start = max(bi->start, low);
|
||||
bi->end = min(bi->end, high);
|
||||
|
||||
/* and there's no empty or non-exist block */
|
||||
if (bi->start >= bi->end ||
|
||||
!memblock_overlaps_region(&memblock.memory,
|
||||
bi->start, bi->end - bi->start))
|
||||
/* and there's no empty block */
|
||||
if (bi->start >= bi->end)
|
||||
numa_remove_memblk_from(i--, mi);
|
||||
}
|
||||
|
||||
@ -881,16 +896,38 @@ EXPORT_SYMBOL(cpumask_of_node);
|
||||
|
||||
#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
|
||||
|
||||
#ifdef CONFIG_MEMORY_HOTPLUG
|
||||
int memory_add_physaddr_to_nid(u64 start)
|
||||
#ifdef CONFIG_NUMA_KEEP_MEMINFO
|
||||
static int meminfo_to_nid(struct numa_meminfo *mi, u64 start)
|
||||
{
|
||||
struct numa_meminfo *mi = &numa_meminfo;
|
||||
int nid = mi->blk[0].nid;
|
||||
int i;
|
||||
|
||||
for (i = 0; i < mi->nr_blks; i++)
|
||||
if (mi->blk[i].start <= start && mi->blk[i].end > start)
|
||||
nid = mi->blk[i].nid;
|
||||
return mi->blk[i].nid;
|
||||
return NUMA_NO_NODE;
|
||||
}
|
||||
|
||||
/*
 * Look up the node id for physical address @start.
 *
 * numa_meminfo is consulted first; only when it has no match is the
 * search continued in numa_reserved_meminfo (ranges that might be
 * hot-added later). Returns whatever meminfo_to_nid() yields for the
 * last table searched.
 */
int phys_to_target_node(phys_addr_t start)
{
	int nid = meminfo_to_nid(&numa_meminfo, start);

	/* No hit in the primary table: fall back to reserved ranges. */
	if (nid == NUMA_NO_NODE)
		nid = meminfo_to_nid(&numa_reserved_meminfo, start);

	return nid;
}
|
||||
EXPORT_SYMBOL_GPL(phys_to_target_node);
|
||||
|
||||
/*
 * Return the node id recorded in numa_meminfo for physical address
 * @start, defaulting to the node of the first numa_meminfo block when
 * no block matches.
 */
int memory_add_physaddr_to_nid(u64 start)
{
	int nid = meminfo_to_nid(&numa_meminfo, start);

	return nid != NUMA_NO_NODE ? nid : numa_meminfo.blk[0].nid;
}
|
||||
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
|
||||
|
@ -360,7 +360,7 @@ static union acpi_object *acpi_label_info(acpi_handle handle)
|
||||
|
||||
static u8 nfit_dsm_revid(unsigned family, unsigned func)
|
||||
{
|
||||
static const u8 revid_table[NVDIMM_FAMILY_MAX+1][32] = {
|
||||
static const u8 revid_table[NVDIMM_FAMILY_MAX+1][NVDIMM_CMD_MAX+1] = {
|
||||
[NVDIMM_FAMILY_INTEL] = {
|
||||
[NVDIMM_INTEL_GET_MODES] = 2,
|
||||
[NVDIMM_INTEL_GET_FWINFO] = 2,
|
||||
@ -386,7 +386,7 @@ static u8 nfit_dsm_revid(unsigned family, unsigned func)
|
||||
|
||||
if (family > NVDIMM_FAMILY_MAX)
|
||||
return 0;
|
||||
if (func > 31)
|
||||
if (func > NVDIMM_CMD_MAX)
|
||||
return 0;
|
||||
id = revid_table[family][func];
|
||||
if (id == 0)
|
||||
@ -492,7 +492,8 @@ int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm,
|
||||
* Check for a valid command. For ND_CMD_CALL, we also have to
|
||||
* make sure that the DSM function is supported.
|
||||
*/
|
||||
if (cmd == ND_CMD_CALL && !test_bit(func, &dsm_mask))
|
||||
if (cmd == ND_CMD_CALL &&
|
||||
(func > NVDIMM_CMD_MAX || !test_bit(func, &dsm_mask)))
|
||||
return -ENOTTY;
|
||||
else if (!test_bit(cmd, &cmd_mask))
|
||||
return -ENOTTY;
|
||||
@ -2026,8 +2027,10 @@ static int acpi_nfit_register_dimms(struct acpi_nfit_desc *acpi_desc)
|
||||
continue;
|
||||
}
|
||||
|
||||
if (nfit_mem->bdw && nfit_mem->memdev_pmem)
|
||||
if (nfit_mem->bdw && nfit_mem->memdev_pmem) {
|
||||
set_bit(NDD_ALIASING, &flags);
|
||||
set_bit(NDD_LABELING, &flags);
|
||||
}
|
||||
|
||||
/* collate flags across all memdevs for this dimm */
|
||||
list_for_each_entry(nfit_memdev, &acpi_desc->memdevs, list) {
|
||||
@ -3492,7 +3495,8 @@ static int acpi_nfit_clear_to_send(struct nvdimm_bus_descriptor *nd_desc,
|
||||
if (nvdimm && cmd == ND_CMD_CALL &&
|
||||
call_pkg->nd_family == NVDIMM_FAMILY_INTEL) {
|
||||
func = call_pkg->nd_command;
|
||||
if ((1 << func) & NVDIMM_INTEL_SECURITY_CMDMASK)
|
||||
if (func > NVDIMM_CMD_MAX ||
|
||||
(1 << func) & NVDIMM_INTEL_SECURITY_CMDMASK)
|
||||
return -EOPNOTSUPP;
|
||||
}
|
||||
|
||||
|
@ -34,6 +34,7 @@
|
||||
| ACPI_NFIT_MEM_NOT_ARMED | ACPI_NFIT_MEM_MAP_FAILED)
|
||||
|
||||
#define NVDIMM_FAMILY_MAX NVDIMM_FAMILY_HYPERV
|
||||
#define NVDIMM_CMD_MAX 31
|
||||
|
||||
#define NVDIMM_STANDARD_CMDMASK \
|
||||
(1 << ND_CMD_SMART | 1 << ND_CMD_SMART_THRESHOLD | 1 << ND_CMD_DIMM_FLAGS \
|
||||
@ -144,32 +145,32 @@ struct nfit_spa {
|
||||
unsigned long ars_state;
|
||||
u32 clear_err_unit;
|
||||
u32 max_ars;
|
||||
struct acpi_nfit_system_address spa[0];
|
||||
struct acpi_nfit_system_address spa[];
|
||||
};
|
||||
|
||||
struct nfit_dcr {
|
||||
struct list_head list;
|
||||
struct acpi_nfit_control_region dcr[0];
|
||||
struct acpi_nfit_control_region dcr[];
|
||||
};
|
||||
|
||||
struct nfit_bdw {
|
||||
struct list_head list;
|
||||
struct acpi_nfit_data_region bdw[0];
|
||||
struct acpi_nfit_data_region bdw[];
|
||||
};
|
||||
|
||||
struct nfit_idt {
|
||||
struct list_head list;
|
||||
struct acpi_nfit_interleave idt[0];
|
||||
struct acpi_nfit_interleave idt[];
|
||||
};
|
||||
|
||||
struct nfit_flush {
|
||||
struct list_head list;
|
||||
struct acpi_nfit_flush_address flush[0];
|
||||
struct acpi_nfit_flush_address flush[];
|
||||
};
|
||||
|
||||
struct nfit_memdev {
|
||||
struct list_head list;
|
||||
struct acpi_nfit_memory_map memdev[0];
|
||||
struct acpi_nfit_memory_map memdev[];
|
||||
};
|
||||
|
||||
enum nfit_mem_flags {
|
||||
|
@ -72,47 +72,6 @@ int acpi_map_pxm_to_node(int pxm)
|
||||
}
|
||||
EXPORT_SYMBOL(acpi_map_pxm_to_node);
|
||||
|
||||
/**
|
||||
* acpi_map_pxm_to_online_node - Map proximity ID to online node
|
||||
* @pxm: ACPI proximity ID
|
||||
*
|
||||
* This is similar to acpi_map_pxm_to_node(), but always returns an online
|
||||
* node. When the mapped node from a given proximity ID is offline, it
|
||||
* looks up the node distance table and returns the nearest online node.
|
||||
*
|
||||
* ACPI device drivers, which are called after the NUMA initialization has
|
||||
* completed in the kernel, can call this interface to obtain their device
|
||||
* NUMA topology from ACPI tables. Such drivers do not have to deal with
|
||||
* offline nodes. A node may be offline when a device proximity ID is
|
||||
* unique, SRAT memory entry does not exist, or NUMA is disabled, ex.
|
||||
* "numa=off" on x86.
|
||||
*/
|
||||
int acpi_map_pxm_to_online_node(int pxm)
|
||||
{
|
||||
int node, min_node;
|
||||
|
||||
node = acpi_map_pxm_to_node(pxm);
|
||||
|
||||
if (node == NUMA_NO_NODE)
|
||||
node = 0;
|
||||
|
||||
min_node = node;
|
||||
if (!node_online(node)) {
|
||||
int min_dist = INT_MAX, dist, n;
|
||||
|
||||
for_each_online_node(n) {
|
||||
dist = node_distance(node, n);
|
||||
if (dist < min_dist) {
|
||||
min_dist = dist;
|
||||
min_node = n;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return min_node;
|
||||
}
|
||||
EXPORT_SYMBOL(acpi_map_pxm_to_online_node);
|
||||
|
||||
static void __init
|
||||
acpi_table_print_srat_entry(struct acpi_subtable_header *header)
|
||||
{
|
||||
|
@ -421,8 +421,10 @@ struct dev_dax *__devm_create_dev_dax(struct dax_region *dax_region, int id,
|
||||
* device outside of mmap of the resulting character device.
|
||||
*/
|
||||
dax_dev = alloc_dax(dev_dax, NULL, NULL, DAXDEV_F_SYNC);
|
||||
if (!dax_dev)
|
||||
if (IS_ERR(dax_dev)) {
|
||||
rc = PTR_ERR(dax_dev);
|
||||
goto err;
|
||||
}
|
||||
|
||||
/* a device_dax instance is dead while the driver is not attached */
|
||||
kill_dax(dax_dev);
|
||||
|
@ -344,6 +344,23 @@ size_t dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(dax_copy_to_iter);
|
||||
|
||||
int dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
|
||||
size_t nr_pages)
|
||||
{
|
||||
if (!dax_alive(dax_dev))
|
||||
return -ENXIO;
|
||||
/*
|
||||
* There are no callers that want to zero more than one page as of now.
|
||||
* Once users are there, this check can be removed after the
|
||||
* device mapper code has been updated to split ranges across targets.
|
||||
*/
|
||||
if (nr_pages != 1)
|
||||
return -EIO;
|
||||
|
||||
return dax_dev->ops->zero_page_range(dax_dev, pgoff, nr_pages);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(dax_zero_page_range);
|
||||
|
||||
#ifdef CONFIG_ARCH_HAS_PMEM_API
|
||||
void arch_wb_cache_pmem(void *addr, size_t size);
|
||||
void dax_flush(struct dax_device *dax_dev, void *addr, size_t size)
|
||||
@ -551,9 +568,16 @@ struct dax_device *alloc_dax(void *private, const char *__host,
|
||||
dev_t devt;
|
||||
int minor;
|
||||
|
||||
if (ops && !ops->zero_page_range) {
|
||||
pr_debug("%s: error: device does not provide dax"
|
||||
" operation zero_page_range()\n",
|
||||
__host ? __host : "Unknown");
|
||||
return ERR_PTR(-EINVAL);
|
||||
}
|
||||
|
||||
host = kstrdup(__host, GFP_KERNEL);
|
||||
if (__host && !host)
|
||||
return NULL;
|
||||
return ERR_PTR(-ENOMEM);
|
||||
|
||||
minor = ida_simple_get(&dax_minor_ida, 0, MINORMASK+1, GFP_KERNEL);
|
||||
if (minor < 0)
|
||||
@ -576,7 +600,7 @@ struct dax_device *alloc_dax(void *private, const char *__host,
|
||||
ida_simple_remove(&dax_minor_ida, minor);
|
||||
err_minor:
|
||||
kfree(host);
|
||||
return NULL;
|
||||
return ERR_PTR(-ENOMEM);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(alloc_dax);
|
||||
|
||||
|
@ -201,10 +201,27 @@ static size_t linear_dax_copy_to_iter(struct dm_target *ti, pgoff_t pgoff,
|
||||
return dax_copy_to_iter(dax_dev, pgoff, addr, bytes, i);
|
||||
}
|
||||
|
||||
static int linear_dax_zero_page_range(struct dm_target *ti, pgoff_t pgoff,
|
||||
size_t nr_pages)
|
||||
{
|
||||
int ret;
|
||||
struct linear_c *lc = ti->private;
|
||||
struct block_device *bdev = lc->dev->bdev;
|
||||
struct dax_device *dax_dev = lc->dev->dax_dev;
|
||||
sector_t dev_sector, sector = pgoff * PAGE_SECTORS;
|
||||
|
||||
dev_sector = linear_map_sector(ti, sector);
|
||||
ret = bdev_dax_pgoff(bdev, dev_sector, nr_pages << PAGE_SHIFT, &pgoff);
|
||||
if (ret)
|
||||
return ret;
|
||||
return dax_zero_page_range(dax_dev, pgoff, nr_pages);
|
||||
}
|
||||
|
||||
#else
|
||||
#define linear_dax_direct_access NULL
|
||||
#define linear_dax_copy_from_iter NULL
|
||||
#define linear_dax_copy_to_iter NULL
|
||||
#define linear_dax_zero_page_range NULL
|
||||
#endif
|
||||
|
||||
static struct target_type linear_target = {
|
||||
@ -226,6 +243,7 @@ static struct target_type linear_target = {
|
||||
.direct_access = linear_dax_direct_access,
|
||||
.dax_copy_from_iter = linear_dax_copy_from_iter,
|
||||
.dax_copy_to_iter = linear_dax_copy_to_iter,
|
||||
.dax_zero_page_range = linear_dax_zero_page_range,
|
||||
};
|
||||
|
||||
int __init dm_linear_init(void)
|
||||
|
@ -994,10 +994,26 @@ static size_t log_writes_dax_copy_to_iter(struct dm_target *ti,
|
||||
return dax_copy_to_iter(lc->dev->dax_dev, pgoff, addr, bytes, i);
|
||||
}
|
||||
|
||||
static int log_writes_dax_zero_page_range(struct dm_target *ti, pgoff_t pgoff,
|
||||
size_t nr_pages)
|
||||
{
|
||||
int ret;
|
||||
struct log_writes_c *lc = ti->private;
|
||||
sector_t sector = pgoff * PAGE_SECTORS;
|
||||
|
||||
ret = bdev_dax_pgoff(lc->dev->bdev, sector, nr_pages << PAGE_SHIFT,
|
||||
&pgoff);
|
||||
if (ret)
|
||||
return ret;
|
||||
return dax_zero_page_range(lc->dev->dax_dev, pgoff,
|
||||
nr_pages << PAGE_SHIFT);
|
||||
}
|
||||
|
||||
#else
|
||||
#define log_writes_dax_direct_access NULL
|
||||
#define log_writes_dax_copy_from_iter NULL
|
||||
#define log_writes_dax_copy_to_iter NULL
|
||||
#define log_writes_dax_zero_page_range NULL
|
||||
#endif
|
||||
|
||||
static struct target_type log_writes_target = {
|
||||
@ -1016,6 +1032,7 @@ static struct target_type log_writes_target = {
|
||||
.direct_access = log_writes_dax_direct_access,
|
||||
.dax_copy_from_iter = log_writes_dax_copy_from_iter,
|
||||
.dax_copy_to_iter = log_writes_dax_copy_to_iter,
|
||||
.dax_zero_page_range = log_writes_dax_zero_page_range,
|
||||
};
|
||||
|
||||
static int __init dm_log_writes_init(void)
|
||||
|
@ -360,10 +360,32 @@ static size_t stripe_dax_copy_to_iter(struct dm_target *ti, pgoff_t pgoff,
|
||||
return dax_copy_to_iter(dax_dev, pgoff, addr, bytes, i);
|
||||
}
|
||||
|
||||
static int stripe_dax_zero_page_range(struct dm_target *ti, pgoff_t pgoff,
|
||||
size_t nr_pages)
|
||||
{
|
||||
int ret;
|
||||
sector_t dev_sector, sector = pgoff * PAGE_SECTORS;
|
||||
struct stripe_c *sc = ti->private;
|
||||
struct dax_device *dax_dev;
|
||||
struct block_device *bdev;
|
||||
uint32_t stripe;
|
||||
|
||||
stripe_map_sector(sc, sector, &stripe, &dev_sector);
|
||||
dev_sector += sc->stripe[stripe].physical_start;
|
||||
dax_dev = sc->stripe[stripe].dev->dax_dev;
|
||||
bdev = sc->stripe[stripe].dev->bdev;
|
||||
|
||||
ret = bdev_dax_pgoff(bdev, dev_sector, nr_pages << PAGE_SHIFT, &pgoff);
|
||||
if (ret)
|
||||
return ret;
|
||||
return dax_zero_page_range(dax_dev, pgoff, nr_pages);
|
||||
}
|
||||
|
||||
#else
|
||||
#define stripe_dax_direct_access NULL
|
||||
#define stripe_dax_copy_from_iter NULL
|
||||
#define stripe_dax_copy_to_iter NULL
|
||||
#define stripe_dax_zero_page_range NULL
|
||||
#endif
|
||||
|
||||
/*
|
||||
@ -486,6 +508,7 @@ static struct target_type stripe_target = {
|
||||
.direct_access = stripe_dax_direct_access,
|
||||
.dax_copy_from_iter = stripe_dax_copy_from_iter,
|
||||
.dax_copy_to_iter = stripe_dax_copy_to_iter,
|
||||
.dax_zero_page_range = stripe_dax_zero_page_range,
|
||||
};
|
||||
|
||||
int __init dm_stripe_init(void)
|
||||
|
@ -1199,6 +1199,35 @@ static size_t dm_dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff,
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int dm_dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
|
||||
size_t nr_pages)
|
||||
{
|
||||
struct mapped_device *md = dax_get_private(dax_dev);
|
||||
sector_t sector = pgoff * PAGE_SECTORS;
|
||||
struct dm_target *ti;
|
||||
int ret = -EIO;
|
||||
int srcu_idx;
|
||||
|
||||
ti = dm_dax_get_live_target(md, sector, &srcu_idx);
|
||||
|
||||
if (!ti)
|
||||
goto out;
|
||||
if (WARN_ON(!ti->type->dax_zero_page_range)) {
|
||||
/*
|
||||
* ->zero_page_range() is mandatory dax operation. If we are
|
||||
* here, something is wrong.
|
||||
*/
|
||||
dm_put_live_table(md, srcu_idx);
|
||||
goto out;
|
||||
}
|
||||
ret = ti->type->dax_zero_page_range(ti, pgoff, nr_pages);
|
||||
|
||||
out:
|
||||
dm_put_live_table(md, srcu_idx);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* A target may call dm_accept_partial_bio only from the map routine. It is
|
||||
* allowed for all bio types except REQ_PREFLUSH, REQ_OP_ZONE_RESET,
|
||||
@ -1969,7 +1998,7 @@ static struct mapped_device *alloc_dev(int minor)
|
||||
if (IS_ENABLED(CONFIG_DAX_DRIVER)) {
|
||||
md->dax_dev = alloc_dax(md, md->disk->disk_name,
|
||||
&dm_dax_ops, 0);
|
||||
if (!md->dax_dev)
|
||||
if (IS_ERR(md->dax_dev))
|
||||
goto bad;
|
||||
}
|
||||
|
||||
@ -3200,6 +3229,7 @@ static const struct dax_operations dm_dax_ops = {
|
||||
.dax_supported = dm_dax_supported,
|
||||
.copy_from_iter = dm_dax_copy_from_iter,
|
||||
.copy_to_iter = dm_dax_copy_to_iter,
|
||||
.zero_page_range = dm_dax_zero_page_range,
|
||||
};
|
||||
|
||||
/*
|
||||
|
@ -1042,8 +1042,10 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm,
|
||||
return -EFAULT;
|
||||
}
|
||||
|
||||
if (!desc || (desc->out_num + desc->in_num == 0) ||
|
||||
!test_bit(cmd, &cmd_mask))
|
||||
if (!desc ||
|
||||
(desc->out_num + desc->in_num == 0) ||
|
||||
cmd > ND_CMD_CALL ||
|
||||
!test_bit(cmd, &cmd_mask))
|
||||
return -ENOTTY;
|
||||
|
||||
/* fail write commands (when read-only) */
|
||||
|
@ -99,7 +99,7 @@ static int nvdimm_probe(struct device *dev)
|
||||
if (ndd->ns_current >= 0) {
|
||||
rc = nd_label_reserve_dpa(ndd);
|
||||
if (rc == 0)
|
||||
nvdimm_set_aliasing(dev);
|
||||
nvdimm_set_labeling(dev);
|
||||
}
|
||||
nvdimm_bus_unlock(dev);
|
||||
|
||||
|
@ -32,7 +32,7 @@ int nvdimm_check_config_data(struct device *dev)
|
||||
|
||||
if (!nvdimm->cmd_mask ||
|
||||
!test_bit(ND_CMD_GET_CONFIG_DATA, &nvdimm->cmd_mask)) {
|
||||
if (test_bit(NDD_ALIASING, &nvdimm->flags))
|
||||
if (test_bit(NDD_LABELING, &nvdimm->flags))
|
||||
return -ENXIO;
|
||||
else
|
||||
return -ENOTTY;
|
||||
@ -173,11 +173,11 @@ int nvdimm_set_config_data(struct nvdimm_drvdata *ndd, size_t offset,
|
||||
return rc;
|
||||
}
|
||||
|
||||
void nvdimm_set_aliasing(struct device *dev)
|
||||
void nvdimm_set_labeling(struct device *dev)
|
||||
{
|
||||
struct nvdimm *nvdimm = to_nvdimm(dev);
|
||||
|
||||
set_bit(NDD_ALIASING, &nvdimm->flags);
|
||||
set_bit(NDD_LABELING, &nvdimm->flags);
|
||||
}
|
||||
|
||||
void nvdimm_set_locked(struct device *dev)
|
||||
@ -312,8 +312,9 @@ static ssize_t flags_show(struct device *dev,
|
||||
{
|
||||
struct nvdimm *nvdimm = to_nvdimm(dev);
|
||||
|
||||
return sprintf(buf, "%s%s\n",
|
||||
return sprintf(buf, "%s%s%s\n",
|
||||
test_bit(NDD_ALIASING, &nvdimm->flags) ? "alias " : "",
|
||||
test_bit(NDD_LABELING, &nvdimm->flags) ? "label " : "",
|
||||
test_bit(NDD_LOCKED, &nvdimm->flags) ? "lock " : "");
|
||||
}
|
||||
static DEVICE_ATTR_RO(flags);
|
||||
@ -562,6 +563,21 @@ int nvdimm_security_freeze(struct nvdimm *nvdimm)
|
||||
return rc;
|
||||
}
|
||||
|
||||
static unsigned long dpa_align(struct nd_region *nd_region)
|
||||
{
|
||||
struct device *dev = &nd_region->dev;
|
||||
|
||||
if (dev_WARN_ONCE(dev, !is_nvdimm_bus_locked(dev),
|
||||
"bus lock required for capacity provision\n"))
|
||||
return 0;
|
||||
if (dev_WARN_ONCE(dev, !nd_region->ndr_mappings || nd_region->align
|
||||
% nd_region->ndr_mappings,
|
||||
"invalid region align %#lx mappings: %d\n",
|
||||
nd_region->align, nd_region->ndr_mappings))
|
||||
return 0;
|
||||
return nd_region->align / nd_region->ndr_mappings;
|
||||
}
|
||||
|
||||
int alias_dpa_busy(struct device *dev, void *data)
|
||||
{
|
||||
resource_size_t map_end, blk_start, new;
|
||||
@ -570,6 +586,7 @@ int alias_dpa_busy(struct device *dev, void *data)
|
||||
struct nd_region *nd_region;
|
||||
struct nvdimm_drvdata *ndd;
|
||||
struct resource *res;
|
||||
unsigned long align;
|
||||
int i;
|
||||
|
||||
if (!is_memory(dev))
|
||||
@ -607,13 +624,21 @@ int alias_dpa_busy(struct device *dev, void *data)
|
||||
* Find the free dpa from the end of the last pmem allocation to
|
||||
* the end of the interleave-set mapping.
|
||||
*/
|
||||
align = dpa_align(nd_region);
|
||||
if (!align)
|
||||
return 0;
|
||||
|
||||
for_each_dpa_resource(ndd, res) {
|
||||
resource_size_t start, end;
|
||||
|
||||
if (strncmp(res->name, "pmem", 4) != 0)
|
||||
continue;
|
||||
if ((res->start >= blk_start && res->start < map_end)
|
||||
|| (res->end >= blk_start
|
||||
&& res->end <= map_end)) {
|
||||
new = max(blk_start, min(map_end + 1, res->end + 1));
|
||||
|
||||
start = ALIGN_DOWN(res->start, align);
|
||||
end = ALIGN(res->end + 1, align) - 1;
|
||||
if ((start >= blk_start && start < map_end)
|
||||
|| (end >= blk_start && end <= map_end)) {
|
||||
new = max(blk_start, min(map_end, end) + 1);
|
||||
if (new != blk_start) {
|
||||
blk_start = new;
|
||||
goto retry;
|
||||
@ -653,6 +678,7 @@ resource_size_t nd_blk_available_dpa(struct nd_region *nd_region)
|
||||
.res = NULL,
|
||||
};
|
||||
struct resource *res;
|
||||
unsigned long align;
|
||||
|
||||
if (!ndd)
|
||||
return 0;
|
||||
@ -660,10 +686,20 @@ resource_size_t nd_blk_available_dpa(struct nd_region *nd_region)
|
||||
device_for_each_child(&nvdimm_bus->dev, &info, alias_dpa_busy);
|
||||
|
||||
/* now account for busy blk allocations in unaliased dpa */
|
||||
align = dpa_align(nd_region);
|
||||
if (!align)
|
||||
return 0;
|
||||
for_each_dpa_resource(ndd, res) {
|
||||
resource_size_t start, end, size;
|
||||
|
||||
if (strncmp(res->name, "blk", 3) != 0)
|
||||
continue;
|
||||
info.available -= resource_size(res);
|
||||
start = ALIGN_DOWN(res->start, align);
|
||||
end = ALIGN(res->end + 1, align) - 1;
|
||||
size = end - start + 1;
|
||||
if (size >= info.available)
|
||||
return 0;
|
||||
info.available -= size;
|
||||
}
|
||||
|
||||
return info.available;
|
||||
@ -682,19 +718,31 @@ resource_size_t nd_pmem_max_contiguous_dpa(struct nd_region *nd_region,
|
||||
struct nvdimm_bus *nvdimm_bus;
|
||||
resource_size_t max = 0;
|
||||
struct resource *res;
|
||||
unsigned long align;
|
||||
|
||||
/* if a dimm is disabled the available capacity is zero */
|
||||
if (!ndd)
|
||||
return 0;
|
||||
|
||||
align = dpa_align(nd_region);
|
||||
if (!align)
|
||||
return 0;
|
||||
|
||||
nvdimm_bus = walk_to_nvdimm_bus(ndd->dev);
|
||||
if (__reserve_free_pmem(&nd_region->dev, nd_mapping->nvdimm))
|
||||
return 0;
|
||||
for_each_dpa_resource(ndd, res) {
|
||||
resource_size_t start, end;
|
||||
|
||||
if (strcmp(res->name, "pmem-reserve") != 0)
|
||||
continue;
|
||||
if (resource_size(res) > max)
|
||||
max = resource_size(res);
|
||||
/* trim free space relative to current alignment setting */
|
||||
start = ALIGN(res->start, align);
|
||||
end = ALIGN_DOWN(res->end + 1, align) - 1;
|
||||
if (end < start)
|
||||
continue;
|
||||
if (end - start + 1 > max)
|
||||
max = end - start + 1;
|
||||
}
|
||||
release_free_pmem(nvdimm_bus, nd_mapping);
|
||||
return max;
|
||||
@ -722,24 +770,33 @@ resource_size_t nd_pmem_available_dpa(struct nd_region *nd_region,
|
||||
struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
|
||||
struct resource *res;
|
||||
const char *reason;
|
||||
unsigned long align;
|
||||
|
||||
if (!ndd)
|
||||
return 0;
|
||||
|
||||
align = dpa_align(nd_region);
|
||||
if (!align)
|
||||
return 0;
|
||||
|
||||
map_start = nd_mapping->start;
|
||||
map_end = map_start + nd_mapping->size - 1;
|
||||
blk_start = max(map_start, map_end + 1 - *overlap);
|
||||
for_each_dpa_resource(ndd, res) {
|
||||
if (res->start >= map_start && res->start < map_end) {
|
||||
resource_size_t start, end;
|
||||
|
||||
start = ALIGN_DOWN(res->start, align);
|
||||
end = ALIGN(res->end + 1, align) - 1;
|
||||
if (start >= map_start && start < map_end) {
|
||||
if (strncmp(res->name, "blk", 3) == 0)
|
||||
blk_start = min(blk_start,
|
||||
max(map_start, res->start));
|
||||
else if (res->end > map_end) {
|
||||
max(map_start, start));
|
||||
else if (end > map_end) {
|
||||
reason = "misaligned to iset";
|
||||
goto err;
|
||||
} else
|
||||
busy += resource_size(res);
|
||||
} else if (res->end >= map_start && res->end <= map_end) {
|
||||
busy += end - start + 1;
|
||||
} else if (end >= map_start && end <= map_end) {
|
||||
if (strncmp(res->name, "blk", 3) == 0) {
|
||||
/*
|
||||
* If a BLK allocation overlaps the start of
|
||||
@ -748,8 +805,8 @@ resource_size_t nd_pmem_available_dpa(struct nd_region *nd_region,
|
||||
*/
|
||||
blk_start = map_start;
|
||||
} else
|
||||
busy += resource_size(res);
|
||||
} else if (map_start > res->start && map_start < res->end) {
|
||||
busy += end - start + 1;
|
||||
} else if (map_start > start && map_start < end) {
|
||||
/* total eclipse of the mapping */
|
||||
busy += nd_mapping->size;
|
||||
blk_start = map_start;
|
||||
@ -759,7 +816,7 @@ resource_size_t nd_pmem_available_dpa(struct nd_region *nd_region,
|
||||
*overlap = map_end + 1 - blk_start;
|
||||
available = blk_start - map_start;
|
||||
if (busy < available)
|
||||
return available - busy;
|
||||
return ALIGN_DOWN(available - busy, align);
|
||||
return 0;
|
||||
|
||||
err:
|
||||
|
@ -7,6 +7,7 @@
|
||||
#include <linux/memory_hotplug.h>
|
||||
#include <linux/libnvdimm.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/numa.h>
|
||||
|
||||
static int e820_pmem_remove(struct platform_device *pdev)
|
||||
{
|
||||
@ -16,27 +17,16 @@ static int e820_pmem_remove(struct platform_device *pdev)
|
||||
return 0;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_MEMORY_HOTPLUG
|
||||
static int e820_range_to_nid(resource_size_t addr)
|
||||
{
|
||||
return memory_add_physaddr_to_nid(addr);
|
||||
}
|
||||
#else
|
||||
static int e820_range_to_nid(resource_size_t addr)
|
||||
{
|
||||
return NUMA_NO_NODE;
|
||||
}
|
||||
#endif
|
||||
|
||||
static int e820_register_one(struct resource *res, void *data)
|
||||
{
|
||||
struct nd_region_desc ndr_desc;
|
||||
struct nvdimm_bus *nvdimm_bus = data;
|
||||
int nid = phys_to_target_node(res->start);
|
||||
|
||||
memset(&ndr_desc, 0, sizeof(ndr_desc));
|
||||
ndr_desc.res = res;
|
||||
ndr_desc.numa_node = e820_range_to_nid(res->start);
|
||||
ndr_desc.target_node = ndr_desc.numa_node;
|
||||
ndr_desc.numa_node = numa_map_to_online_node(nid);
|
||||
ndr_desc.target_node = nid;
|
||||
set_bit(ND_REGION_PAGEMAP, &ndr_desc.flags);
|
||||
if (!nvdimm_pmem_region_create(nvdimm_bus, &ndr_desc))
|
||||
return -ENXIO;
|
||||
|
@ -62,7 +62,7 @@ struct nd_namespace_index {
|
||||
__le16 major;
|
||||
__le16 minor;
|
||||
__le64 checksum;
|
||||
u8 free[0];
|
||||
u8 free[];
|
||||
};
|
||||
|
||||
/**
|
||||
|
@ -10,6 +10,7 @@
|
||||
#include <linux/nd.h>
|
||||
#include "nd-core.h"
|
||||
#include "pmem.h"
|
||||
#include "pfn.h"
|
||||
#include "nd.h"
|
||||
|
||||
static void namespace_io_release(struct device *dev)
|
||||
@ -541,6 +542,11 @@ static void space_valid(struct nd_region *nd_region, struct nvdimm_drvdata *ndd,
|
||||
{
|
||||
bool is_reserve = strcmp(label_id->id, "pmem-reserve") == 0;
|
||||
bool is_pmem = strncmp(label_id->id, "pmem", 4) == 0;
|
||||
unsigned long align;
|
||||
|
||||
align = nd_region->align / nd_region->ndr_mappings;
|
||||
valid->start = ALIGN(valid->start, align);
|
||||
valid->end = ALIGN_DOWN(valid->end + 1, align) - 1;
|
||||
|
||||
if (valid->start >= valid->end)
|
||||
goto invalid;
|
||||
@ -980,10 +986,10 @@ static ssize_t __size_store(struct device *dev, unsigned long long val)
|
||||
return -ENXIO;
|
||||
}
|
||||
|
||||
div_u64_rem(val, PAGE_SIZE * nd_region->ndr_mappings, &remainder);
|
||||
div_u64_rem(val, nd_region->align, &remainder);
|
||||
if (remainder) {
|
||||
dev_dbg(dev, "%llu is not %ldK aligned\n", val,
|
||||
(PAGE_SIZE * nd_region->ndr_mappings) / SZ_1K);
|
||||
nd_region->align / SZ_1K);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
@ -1739,6 +1745,22 @@ struct nd_namespace_common *nvdimm_namespace_common_probe(struct device *dev)
|
||||
return ERR_PTR(-ENODEV);
|
||||
}
|
||||
|
||||
/*
|
||||
* Note, alignment validation for fsdax and devdax mode
|
||||
* namespaces happens in nd_pfn_validate() where infoblock
|
||||
* padding parameters can be applied.
|
||||
*/
|
||||
if (pmem_should_map_pages(dev)) {
|
||||
struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
|
||||
struct resource *res = &nsio->res;
|
||||
|
||||
if (!IS_ALIGNED(res->start | (res->end + 1),
|
||||
memremap_compat_align())) {
|
||||
dev_err(&ndns->dev, "%pr misaligned, unable to map\n", res);
|
||||
return ERR_PTR(-EOPNOTSUPP);
|
||||
}
|
||||
}
|
||||
|
||||
if (is_namespace_pmem(&ndns->dev)) {
|
||||
struct nd_namespace_pmem *nspm;
|
||||
|
||||
@ -2521,7 +2543,7 @@ static int init_active_labels(struct nd_region *nd_region)
|
||||
if (!ndd) {
|
||||
if (test_bit(NDD_LOCKED, &nvdimm->flags))
|
||||
/* fail, label data may be unreadable */;
|
||||
else if (test_bit(NDD_ALIASING, &nvdimm->flags))
|
||||
else if (test_bit(NDD_LABELING, &nvdimm->flags))
|
||||
/* fail, labels needed to disambiguate dpa */;
|
||||
else
|
||||
return 0;
|
||||
|
@ -39,7 +39,7 @@ struct nd_region_data {
|
||||
int ns_count;
|
||||
int ns_active;
|
||||
unsigned int hints_shift;
|
||||
void __iomem *flush_wpq[0];
|
||||
void __iomem *flush_wpq[];
|
||||
};
|
||||
|
||||
static inline void __iomem *ndrd_get_flush_wpq(struct nd_region_data *ndrd,
|
||||
@ -146,6 +146,7 @@ struct nd_region {
|
||||
struct device *btt_seed;
|
||||
struct device *pfn_seed;
|
||||
struct device *dax_seed;
|
||||
unsigned long align;
|
||||
u16 ndr_mappings;
|
||||
u64 ndr_size;
|
||||
u64 ndr_start;
|
||||
@ -156,7 +157,7 @@ struct nd_region {
|
||||
struct nd_interleave_set *nd_set;
|
||||
struct nd_percpu_lane __percpu *lane;
|
||||
int (*flush)(struct nd_region *nd_region, struct bio *bio);
|
||||
struct nd_mapping mapping[0];
|
||||
struct nd_mapping mapping[];
|
||||
};
|
||||
|
||||
struct nd_blk_region {
|
||||
@ -252,7 +253,7 @@ int nvdimm_set_config_data(struct nvdimm_drvdata *ndd, size_t offset,
|
||||
void *buf, size_t len);
|
||||
long nvdimm_clear_poison(struct device *dev, phys_addr_t phys,
|
||||
unsigned int len);
|
||||
void nvdimm_set_aliasing(struct device *dev);
|
||||
void nvdimm_set_labeling(struct device *dev);
|
||||
void nvdimm_set_locked(struct device *dev);
|
||||
void nvdimm_clear_locked(struct device *dev);
|
||||
int nvdimm_security_setup_events(struct device *dev);
|
||||
|
@ -62,8 +62,10 @@ static int of_pmem_region_probe(struct platform_device *pdev)
|
||||
|
||||
if (is_volatile)
|
||||
region = nvdimm_volatile_region_create(bus, &ndr_desc);
|
||||
else
|
||||
else {
|
||||
set_bit(ND_REGION_PERSIST_MEMCTRL, &ndr_desc.flags);
|
||||
region = nvdimm_pmem_region_create(bus, &ndr_desc);
|
||||
}
|
||||
|
||||
if (!region)
|
||||
dev_warn(&pdev->dev, "Unable to register region %pR from %pOF\n",
|
||||
|
@ -24,6 +24,18 @@ struct nd_pfn_sb {
|
||||
__le64 npfns;
|
||||
__le32 mode;
|
||||
/* minor-version-1 additions for section alignment */
|
||||
/**
|
||||
* @start_pad: Deprecated attribute to pad start-misaligned namespaces
|
||||
*
|
||||
* start_pad is deprecated because the original definition did
|
||||
* not comprehend that dataoff is relative to the base address
|
||||
* of the namespace not the start_pad adjusted base. The result
|
||||
* is that the dax path is broken, but the block-I/O path is
|
||||
* not. The kernel will no longer create namespaces using start
|
||||
* padding, but it still supports block-I/O for legacy
|
||||
* configurations mainly to allow a backup, reconfigure the
|
||||
* namespace, and restore flow to repair dax operation.
|
||||
*/
|
||||
__le32 start_pad;
|
||||
__le32 end_trunc;
|
||||
/* minor-version-2 record the base alignment of the mapping */
|
||||
|
@ -446,6 +446,7 @@ static bool nd_supported_alignment(unsigned long align)
|
||||
int nd_pfn_validate(struct nd_pfn *nd_pfn, const char *sig)
|
||||
{
|
||||
u64 checksum, offset;
|
||||
struct resource *res;
|
||||
enum nd_pfn_mode mode;
|
||||
struct nd_namespace_io *nsio;
|
||||
unsigned long align, start_pad;
|
||||
@ -561,14 +562,14 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn, const char *sig)
|
||||
dev_dbg(&nd_pfn->dev, "align: %lx:%lx mode: %d:%d\n",
|
||||
nd_pfn->align, align, nd_pfn->mode,
|
||||
mode);
|
||||
return -EINVAL;
|
||||
return -EOPNOTSUPP;
|
||||
}
|
||||
}
|
||||
|
||||
if (align > nvdimm_namespace_capacity(ndns)) {
|
||||
dev_err(&nd_pfn->dev, "alignment: %lx exceeds capacity %llx\n",
|
||||
align, nvdimm_namespace_capacity(ndns));
|
||||
return -EINVAL;
|
||||
return -EOPNOTSUPP;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -578,18 +579,31 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn, const char *sig)
|
||||
* established.
|
||||
*/
|
||||
nsio = to_nd_namespace_io(&ndns->dev);
|
||||
if (offset >= resource_size(&nsio->res)) {
|
||||
res = &nsio->res;
|
||||
if (offset >= resource_size(res)) {
|
||||
dev_err(&nd_pfn->dev, "pfn array size exceeds capacity of %s\n",
|
||||
dev_name(&ndns->dev));
|
||||
return -EBUSY;
|
||||
return -EOPNOTSUPP;
|
||||
}
|
||||
|
||||
if ((align && !IS_ALIGNED(nsio->res.start + offset + start_pad, align))
|
||||
if ((align && !IS_ALIGNED(res->start + offset + start_pad, align))
|
||||
|| !IS_ALIGNED(offset, PAGE_SIZE)) {
|
||||
dev_err(&nd_pfn->dev,
|
||||
"bad offset: %#llx dax disabled align: %#lx\n",
|
||||
offset, align);
|
||||
return -ENXIO;
|
||||
return -EOPNOTSUPP;
|
||||
}
|
||||
|
||||
if (!IS_ALIGNED(res->start + le32_to_cpu(pfn_sb->start_pad),
|
||||
memremap_compat_align())) {
|
||||
dev_err(&nd_pfn->dev, "resource start misaligned\n");
|
||||
return -EOPNOTSUPP;
|
||||
}
|
||||
|
||||
if (!IS_ALIGNED(res->end + 1 - le32_to_cpu(pfn_sb->end_trunc),
|
||||
memremap_compat_align())) {
|
||||
dev_err(&nd_pfn->dev, "resource end misaligned\n");
|
||||
return -EOPNOTSUPP;
|
||||
}
|
||||
|
||||
return 0;
|
||||
@ -750,7 +764,19 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
|
||||
start = nsio->res.start;
|
||||
size = resource_size(&nsio->res);
|
||||
npfns = PHYS_PFN(size - SZ_8K);
|
||||
align = max(nd_pfn->align, (1UL << SUBSECTION_SHIFT));
|
||||
align = max(nd_pfn->align, memremap_compat_align());
|
||||
|
||||
/*
|
||||
* When @start is misaligned fail namespace creation. See
|
||||
* the 'struct nd_pfn_sb' commentary on why ->start_pad is not
|
||||
* an option.
|
||||
*/
|
||||
if (!IS_ALIGNED(start, memremap_compat_align())) {
|
||||
dev_err(&nd_pfn->dev, "%s: start %pa misaligned to %#lx\n",
|
||||
dev_name(&ndns->dev), &start,
|
||||
memremap_compat_align());
|
||||
return -EINVAL;
|
||||
}
|
||||
end_trunc = start + size - ALIGN_DOWN(start + size, align);
|
||||
if (nd_pfn->mode == PFN_MODE_PMEM) {
|
||||
/*
|
||||
|
@ -136,9 +136,25 @@ static blk_status_t read_pmem(struct page *page, unsigned int off,
|
||||
return BLK_STS_OK;
|
||||
}
|
||||
|
||||
static blk_status_t pmem_do_bvec(struct pmem_device *pmem, struct page *page,
|
||||
unsigned int len, unsigned int off, unsigned int op,
|
||||
sector_t sector)
|
||||
static blk_status_t pmem_do_read(struct pmem_device *pmem,
|
||||
struct page *page, unsigned int page_off,
|
||||
sector_t sector, unsigned int len)
|
||||
{
|
||||
blk_status_t rc;
|
||||
phys_addr_t pmem_off = sector * 512 + pmem->data_offset;
|
||||
void *pmem_addr = pmem->virt_addr + pmem_off;
|
||||
|
||||
if (unlikely(is_bad_pmem(&pmem->bb, sector, len)))
|
||||
return BLK_STS_IOERR;
|
||||
|
||||
rc = read_pmem(page, page_off, pmem_addr, len);
|
||||
flush_dcache_page(page);
|
||||
return rc;
|
||||
}
|
||||
|
||||
static blk_status_t pmem_do_write(struct pmem_device *pmem,
|
||||
struct page *page, unsigned int page_off,
|
||||
sector_t sector, unsigned int len)
|
||||
{
|
||||
blk_status_t rc = BLK_STS_OK;
|
||||
bool bad_pmem = false;
|
||||
@ -148,34 +164,25 @@ static blk_status_t pmem_do_bvec(struct pmem_device *pmem, struct page *page,
|
||||
if (unlikely(is_bad_pmem(&pmem->bb, sector, len)))
|
||||
bad_pmem = true;
|
||||
|
||||
if (!op_is_write(op)) {
|
||||
if (unlikely(bad_pmem))
|
||||
rc = BLK_STS_IOERR;
|
||||
else {
|
||||
rc = read_pmem(page, off, pmem_addr, len);
|
||||
flush_dcache_page(page);
|
||||
}
|
||||
} else {
|
||||
/*
|
||||
* Note that we write the data both before and after
|
||||
* clearing poison. The write before clear poison
|
||||
* handles situations where the latest written data is
|
||||
* preserved and the clear poison operation simply marks
|
||||
* the address range as valid without changing the data.
|
||||
* In this case application software can assume that an
|
||||
* interrupted write will either return the new good
|
||||
* data or an error.
|
||||
*
|
||||
* However, if pmem_clear_poison() leaves the data in an
|
||||
* indeterminate state we need to perform the write
|
||||
* after clear poison.
|
||||
*/
|
||||
flush_dcache_page(page);
|
||||
write_pmem(pmem_addr, page, off, len);
|
||||
if (unlikely(bad_pmem)) {
|
||||
rc = pmem_clear_poison(pmem, pmem_off, len);
|
||||
write_pmem(pmem_addr, page, off, len);
|
||||
}
|
||||
/*
|
||||
* Note that we write the data both before and after
|
||||
* clearing poison. The write before clear poison
|
||||
* handles situations where the latest written data is
|
||||
* preserved and the clear poison operation simply marks
|
||||
* the address range as valid without changing the data.
|
||||
* In this case application software can assume that an
|
||||
* interrupted write will either return the new good
|
||||
* data or an error.
|
||||
*
|
||||
* However, if pmem_clear_poison() leaves the data in an
|
||||
* indeterminate state we need to perform the write
|
||||
* after clear poison.
|
||||
*/
|
||||
flush_dcache_page(page);
|
||||
write_pmem(pmem_addr, page, page_off, len);
|
||||
if (unlikely(bad_pmem)) {
|
||||
rc = pmem_clear_poison(pmem, pmem_off, len);
|
||||
write_pmem(pmem_addr, page, page_off, len);
|
||||
}
|
||||
|
||||
return rc;
|
||||
@ -197,8 +204,12 @@ static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio)
|
||||
|
||||
do_acct = nd_iostat_start(bio, &start);
|
||||
bio_for_each_segment(bvec, bio, iter) {
|
||||
rc = pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len,
|
||||
bvec.bv_offset, bio_op(bio), iter.bi_sector);
|
||||
if (op_is_write(bio_op(bio)))
|
||||
rc = pmem_do_write(pmem, bvec.bv_page, bvec.bv_offset,
|
||||
iter.bi_sector, bvec.bv_len);
|
||||
else
|
||||
rc = pmem_do_read(pmem, bvec.bv_page, bvec.bv_offset,
|
||||
iter.bi_sector, bvec.bv_len);
|
||||
if (rc) {
|
||||
bio->bi_status = rc;
|
||||
break;
|
||||
@ -223,9 +234,12 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector,
|
||||
struct pmem_device *pmem = bdev->bd_queue->queuedata;
|
||||
blk_status_t rc;
|
||||
|
||||
rc = pmem_do_bvec(pmem, page, hpage_nr_pages(page) * PAGE_SIZE,
|
||||
0, op, sector);
|
||||
|
||||
if (op_is_write(op))
|
||||
rc = pmem_do_write(pmem, page, 0, sector,
|
||||
hpage_nr_pages(page) * PAGE_SIZE);
|
||||
else
|
||||
rc = pmem_do_read(pmem, page, 0, sector,
|
||||
hpage_nr_pages(page) * PAGE_SIZE);
|
||||
/*
|
||||
* The ->rw_page interface is subtle and tricky. The core
|
||||
* retries on any error, so we can only invoke page_endio() in
|
||||
@ -268,6 +282,16 @@ static const struct block_device_operations pmem_fops = {
|
||||
.revalidate_disk = nvdimm_revalidate_disk,
|
||||
};
|
||||
|
||||
/*
 * dax_operations ->zero_page_range() handler for pmem: zero @nr_pages pages
 * of the device starting at page offset @pgoff.
 *
 * Zeroing is done by writing ZERO_PAGE(0) through pmem_do_write(), one
 * PAGE_SIZE write per target page since the source is a single zero page.
 * Stops at the first page that fails and returns its status converted by
 * blk_status_to_errno(); returns 0 once every page has been written.
 */
static int pmem_dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
				    size_t nr_pages)
{
	struct pmem_device *pmem = dax_get_private(dax_dev);
	size_t i;

	for (i = 0; i < nr_pages; i++) {
		blk_status_t rc;

		rc = pmem_do_write(pmem, ZERO_PAGE(0), 0,
				   PFN_PHYS(pgoff + i) >> SECTOR_SHIFT,
				   PAGE_SIZE);
		if (rc)
			return blk_status_to_errno(rc);
	}

	return 0;
}
|
||||
|
||||
static long pmem_dax_direct_access(struct dax_device *dax_dev,
|
||||
pgoff_t pgoff, long nr_pages, void **kaddr, pfn_t *pfn)
|
||||
{
|
||||
@ -299,6 +323,7 @@ static const struct dax_operations pmem_dax_ops = {
|
||||
.dax_supported = generic_fsdax_supported,
|
||||
.copy_from_iter = pmem_copy_from_iter,
|
||||
.copy_to_iter = pmem_copy_to_iter,
|
||||
.zero_page_range = pmem_dax_zero_page_range,
|
||||
};
|
||||
|
||||
static const struct attribute_group *pmem_attribute_groups[] = {
|
||||
@ -461,9 +486,9 @@ static int pmem_attach_disk(struct device *dev,
|
||||
if (is_nvdimm_sync(nd_region))
|
||||
flags = DAXDEV_F_SYNC;
|
||||
dax_dev = alloc_dax(pmem, disk->disk_name, &pmem_dax_ops, flags);
|
||||
if (!dax_dev) {
|
||||
if (IS_ERR(dax_dev)) {
|
||||
put_disk(disk);
|
||||
return -ENOMEM;
|
||||
return PTR_ERR(dax_dev);
|
||||
}
|
||||
dax_write_cache(dax_dev, nvdimm_has_cache(nd_region));
|
||||
pmem->dax_dev = dax_dev;
|
||||
|
@ -195,16 +195,16 @@ EXPORT_SYMBOL_GPL(nd_blk_region_set_provider_data);
|
||||
int nd_region_to_nstype(struct nd_region *nd_region)
|
||||
{
|
||||
if (is_memory(&nd_region->dev)) {
|
||||
u16 i, alias;
|
||||
u16 i, label;
|
||||
|
||||
for (i = 0, alias = 0; i < nd_region->ndr_mappings; i++) {
|
||||
for (i = 0, label = 0; i < nd_region->ndr_mappings; i++) {
|
||||
struct nd_mapping *nd_mapping = &nd_region->mapping[i];
|
||||
struct nvdimm *nvdimm = nd_mapping->nvdimm;
|
||||
|
||||
if (test_bit(NDD_ALIASING, &nvdimm->flags))
|
||||
alias++;
|
||||
if (test_bit(NDD_LABELING, &nvdimm->flags))
|
||||
label++;
|
||||
}
|
||||
if (alias)
|
||||
if (label)
|
||||
return ND_DEVICE_NAMESPACE_PMEM;
|
||||
else
|
||||
return ND_DEVICE_NAMESPACE_IO;
|
||||
@ -216,21 +216,25 @@ int nd_region_to_nstype(struct nd_region *nd_region)
|
||||
}
|
||||
EXPORT_SYMBOL(nd_region_to_nstype);
|
||||
|
||||
static unsigned long long region_size(struct nd_region *nd_region)
|
||||
{
|
||||
if (is_memory(&nd_region->dev)) {
|
||||
return nd_region->ndr_size;
|
||||
} else if (nd_region->ndr_mappings == 1) {
|
||||
struct nd_mapping *nd_mapping = &nd_region->mapping[0];
|
||||
|
||||
return nd_mapping->size;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static ssize_t size_show(struct device *dev,
|
||||
struct device_attribute *attr, char *buf)
|
||||
{
|
||||
struct nd_region *nd_region = to_nd_region(dev);
|
||||
unsigned long long size = 0;
|
||||
|
||||
if (is_memory(dev)) {
|
||||
size = nd_region->ndr_size;
|
||||
} else if (nd_region->ndr_mappings == 1) {
|
||||
struct nd_mapping *nd_mapping = &nd_region->mapping[0];
|
||||
|
||||
size = nd_mapping->size;
|
||||
}
|
||||
|
||||
return sprintf(buf, "%llu\n", size);
|
||||
return sprintf(buf, "%llu\n", region_size(nd_region));
|
||||
}
|
||||
static DEVICE_ATTR_RO(size);
|
||||
|
||||
@ -529,6 +533,54 @@ static ssize_t read_only_store(struct device *dev,
|
||||
}
|
||||
static DEVICE_ATTR_RW(read_only);
|
||||
|
||||
static ssize_t align_show(struct device *dev,
|
||||
struct device_attribute *attr, char *buf)
|
||||
{
|
||||
struct nd_region *nd_region = to_nd_region(dev);
|
||||
|
||||
return sprintf(buf, "%#lx\n", nd_region->align);
|
||||
}
|
||||
|
||||
/*
 * sysfs write side of 'align': parse a new alignment from @buf and store it
 * in the region.
 *
 * Returns @len on success, the kstrtoul() error when @buf does not parse,
 * -ENXIO when the region has no mappings, and -EINVAL when the value fails
 * any of the checks below.
 */
static ssize_t align_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t len)
{
	struct nd_region *nd_region = to_nd_region(dev);
	unsigned long new_align, per_dimm;
	u32 rem;
	int err;

	err = kstrtoul(buf, 0, &new_align);
	if (err)
		return err;

	if (nd_region->ndr_mappings == 0)
		return -ENXIO;

	/*
	 * Ensure space-align is evenly divisible by the region
	 * interleave-width because the kernel typically has no facility
	 * to determine which DIMM(s), dimm-physical-addresses, would
	 * contribute to the tail capacity in system-physical-address
	 * space for the namespace.
	 */
	per_dimm = div_u64_rem(new_align, nd_region->ndr_mappings, &rem);
	if (rem)
		return -EINVAL;

	/* the per-mapping share must be a power of two of at least a page */
	if (!is_power_of_2(per_dimm) || per_dimm < PAGE_SIZE)
		return -EINVAL;

	/* an alignment larger than the region itself is rejected */
	if (new_align > region_size(nd_region))
		return -EINVAL;

	/*
	 * Given that space allocation consults this value multiple
	 * times ensure it does not change for the duration of the
	 * allocation.
	 */
	nvdimm_bus_lock(dev);
	nd_region->align = new_align;
	nvdimm_bus_unlock(dev);

	return len;
}
|
||||
static DEVICE_ATTR_RW(align);
|
||||
|
||||
static ssize_t region_badblocks_show(struct device *dev,
|
||||
struct device_attribute *attr, char *buf)
|
||||
{
|
||||
@ -571,6 +623,7 @@ static DEVICE_ATTR_RO(persistence_domain);
|
||||
|
||||
static struct attribute *nd_region_attributes[] = {
|
||||
&dev_attr_size.attr,
|
||||
&dev_attr_align.attr,
|
||||
&dev_attr_nstype.attr,
|
||||
&dev_attr_mappings.attr,
|
||||
&dev_attr_btt_seed.attr,
|
||||
@ -626,6 +679,19 @@ static umode_t region_visible(struct kobject *kobj, struct attribute *a, int n)
|
||||
return a->mode;
|
||||
}
|
||||
|
||||
if (a == &dev_attr_align.attr) {
|
||||
int i;
|
||||
|
||||
for (i = 0; i < nd_region->ndr_mappings; i++) {
|
||||
struct nd_mapping *nd_mapping = &nd_region->mapping[i];
|
||||
struct nvdimm *nvdimm = nd_mapping->nvdimm;
|
||||
|
||||
if (test_bit(NDD_LABELING, &nvdimm->flags))
|
||||
return a->mode;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (a != &dev_attr_set_cookie.attr
|
||||
&& a != &dev_attr_available_size.attr)
|
||||
return a->mode;
|
||||
@ -935,6 +1001,41 @@ void nd_region_release_lane(struct nd_region *nd_region, unsigned int lane)
|
||||
}
|
||||
EXPORT_SYMBOL(nd_region_release_lane);
|
||||
|
||||
/*
|
||||
* PowerPC requires this alignment for memremap_pages(). All other archs
|
||||
* should be ok with SUBSECTION_SIZE (see memremap_compat_align()).
|
||||
*/
|
||||
#define MEMREMAP_COMPAT_ALIGN_MAX SZ_16M
|
||||
|
||||
static unsigned long default_align(struct nd_region *nd_region)
|
||||
{
|
||||
unsigned long align;
|
||||
int i, mappings;
|
||||
u32 remainder;
|
||||
|
||||
if (is_nd_blk(&nd_region->dev))
|
||||
align = PAGE_SIZE;
|
||||
else
|
||||
align = MEMREMAP_COMPAT_ALIGN_MAX;
|
||||
|
||||
for (i = 0; i < nd_region->ndr_mappings; i++) {
|
||||
struct nd_mapping *nd_mapping = &nd_region->mapping[i];
|
||||
struct nvdimm *nvdimm = nd_mapping->nvdimm;
|
||||
|
||||
if (test_bit(NDD_ALIASING, &nvdimm->flags)) {
|
||||
align = MEMREMAP_COMPAT_ALIGN_MAX;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
mappings = max_t(u16, 1, nd_region->ndr_mappings);
|
||||
div_u64_rem(align, mappings, &remainder);
|
||||
if (remainder)
|
||||
align *= mappings;
|
||||
|
||||
return align;
|
||||
}
|
||||
|
||||
static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus,
|
||||
struct nd_region_desc *ndr_desc,
|
||||
const struct device_type *dev_type, const char *caller)
|
||||
@ -1039,6 +1140,7 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus,
|
||||
dev->of_node = ndr_desc->of_node;
|
||||
nd_region->ndr_size = resource_size(ndr_desc->res);
|
||||
nd_region->ndr_start = ndr_desc->res->start;
|
||||
nd_region->align = default_align(nd_region);
|
||||
if (ndr_desc->flush)
|
||||
nd_region->flush = ndr_desc->flush;
|
||||
else
|
||||
|
@ -57,11 +57,26 @@ static size_t dcssblk_dax_copy_to_iter(struct dax_device *dax_dev,
|
||||
return copy_to_iter(addr, bytes, i);
|
||||
}
|
||||
|
||||
static int dcssblk_dax_zero_page_range(struct dax_device *dax_dev,
|
||||
pgoff_t pgoff, size_t nr_pages)
|
||||
{
|
||||
long rc;
|
||||
void *kaddr;
|
||||
|
||||
rc = dax_direct_access(dax_dev, pgoff, nr_pages, &kaddr, NULL);
|
||||
if (rc < 0)
|
||||
return rc;
|
||||
memset(kaddr, 0, nr_pages << PAGE_SHIFT);
|
||||
dax_flush(dax_dev, kaddr, nr_pages << PAGE_SHIFT);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static const struct dax_operations dcssblk_dax_ops = {
|
||||
.direct_access = dcssblk_dax_direct_access,
|
||||
.dax_supported = generic_fsdax_supported,
|
||||
.copy_from_iter = dcssblk_dax_copy_from_iter,
|
||||
.copy_to_iter = dcssblk_dax_copy_to_iter,
|
||||
.zero_page_range = dcssblk_dax_zero_page_range,
|
||||
};
|
||||
|
||||
struct dcssblk_dev_info {
|
||||
@ -680,8 +695,9 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char
|
||||
|
||||
dev_info->dax_dev = alloc_dax(dev_info, dev_info->gd->disk_name,
|
||||
&dcssblk_dax_ops, DAXDEV_F_SYNC);
|
||||
if (!dev_info->dax_dev) {
|
||||
rc = -ENOMEM;
|
||||
if (IS_ERR(dev_info->dax_dev)) {
|
||||
rc = PTR_ERR(dev_info->dax_dev);
|
||||
dev_info->dax_dev = NULL;
|
||||
goto put_dev;
|
||||
}
|
||||
|
||||
|
61
fs/dax.c
61
fs/dax.c
@ -1038,50 +1038,43 @@ static vm_fault_t dax_load_hole(struct xa_state *xas,
|
||||
return ret;
|
||||
}
|
||||
|
||||
static bool dax_range_is_aligned(struct block_device *bdev,
|
||||
unsigned int offset, unsigned int length)
|
||||
int dax_iomap_zero(loff_t pos, unsigned offset, unsigned size,
|
||||
struct iomap *iomap)
|
||||
{
|
||||
unsigned short sector_size = bdev_logical_block_size(bdev);
|
||||
sector_t sector = iomap_sector(iomap, pos & PAGE_MASK);
|
||||
pgoff_t pgoff;
|
||||
long rc, id;
|
||||
void *kaddr;
|
||||
bool page_aligned = false;
|
||||
|
||||
if (!IS_ALIGNED(offset, sector_size))
|
||||
return false;
|
||||
if (!IS_ALIGNED(length, sector_size))
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
if (IS_ALIGNED(sector << SECTOR_SHIFT, PAGE_SIZE) &&
|
||||
IS_ALIGNED(size, PAGE_SIZE))
|
||||
page_aligned = true;
|
||||
|
||||
int __dax_zero_page_range(struct block_device *bdev,
|
||||
struct dax_device *dax_dev, sector_t sector,
|
||||
unsigned int offset, unsigned int size)
|
||||
{
|
||||
if (dax_range_is_aligned(bdev, offset, size)) {
|
||||
sector_t start_sector = sector + (offset >> 9);
|
||||
rc = bdev_dax_pgoff(iomap->bdev, sector, PAGE_SIZE, &pgoff);
|
||||
if (rc)
|
||||
return rc;
|
||||
|
||||
return blkdev_issue_zeroout(bdev, start_sector,
|
||||
size >> 9, GFP_NOFS, 0);
|
||||
} else {
|
||||
pgoff_t pgoff;
|
||||
long rc, id;
|
||||
void *kaddr;
|
||||
id = dax_read_lock();
|
||||
|
||||
rc = bdev_dax_pgoff(bdev, sector, PAGE_SIZE, &pgoff);
|
||||
if (rc)
|
||||
return rc;
|
||||
|
||||
id = dax_read_lock();
|
||||
rc = dax_direct_access(dax_dev, pgoff, 1, &kaddr, NULL);
|
||||
if (rc < 0) {
|
||||
dax_read_unlock(id);
|
||||
return rc;
|
||||
}
|
||||
memset(kaddr + offset, 0, size);
|
||||
dax_flush(dax_dev, kaddr + offset, size);
|
||||
if (page_aligned)
|
||||
rc = dax_zero_page_range(iomap->dax_dev, pgoff,
|
||||
size >> PAGE_SHIFT);
|
||||
else
|
||||
rc = dax_direct_access(iomap->dax_dev, pgoff, 1, &kaddr, NULL);
|
||||
if (rc < 0) {
|
||||
dax_read_unlock(id);
|
||||
return rc;
|
||||
}
|
||||
|
||||
if (!page_aligned) {
|
||||
memset(kaddr + offset, 0, size);
|
||||
dax_flush(iomap->dax_dev, kaddr + offset, size);
|
||||
}
|
||||
dax_read_unlock(id);
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(__dax_zero_page_range);
|
||||
|
||||
static loff_t
|
||||
dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
|
||||
|
@ -975,13 +975,6 @@ static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset,
|
||||
return iomap_write_end(inode, pos, bytes, bytes, page, iomap, srcmap);
|
||||
}
|
||||
|
||||
static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes,
|
||||
struct iomap *iomap)
|
||||
{
|
||||
return __dax_zero_page_range(iomap->bdev, iomap->dax_dev,
|
||||
iomap_sector(iomap, pos & PAGE_MASK), offset, bytes);
|
||||
}
|
||||
|
||||
static loff_t
|
||||
iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count,
|
||||
void *data, struct iomap *iomap, struct iomap *srcmap)
|
||||
@ -1001,7 +994,7 @@ iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count,
|
||||
bytes = min_t(loff_t, PAGE_SIZE - offset, count);
|
||||
|
||||
if (IS_DAX(inode))
|
||||
status = iomap_dax_zero(pos, offset, bytes, iomap);
|
||||
status = dax_iomap_zero(pos, offset, bytes, iomap);
|
||||
else
|
||||
status = iomap_zero(inode, pos, offset, bytes, iomap,
|
||||
srcmap);
|
||||
|
@ -416,9 +416,30 @@ extern void acpi_osi_setup(char *str);
|
||||
extern bool acpi_osi_is_win8(void);
|
||||
|
||||
#ifdef CONFIG_ACPI_NUMA
|
||||
int acpi_map_pxm_to_online_node(int pxm);
|
||||
int acpi_map_pxm_to_node(int pxm);
|
||||
int acpi_get_node(acpi_handle handle);
|
||||
|
||||
/**
|
||||
* acpi_map_pxm_to_online_node - Map proximity ID to online node
|
||||
* @pxm: ACPI proximity ID
|
||||
*
|
||||
* This is similar to acpi_map_pxm_to_node(), but always returns an online
|
||||
* node. When the mapped node from a given proximity ID is offline, it
|
||||
* looks up the node distance table and returns the nearest online node.
|
||||
*
|
||||
* ACPI device drivers, which are called after the NUMA initialization has
|
||||
* completed in the kernel, can call this interface to obtain their device
|
||||
* NUMA topology from ACPI tables. Such drivers do not have to deal with
|
||||
* offline nodes. A node may be offline when a device proximity ID is
|
||||
* unique, SRAT memory entry does not exist, or NUMA is disabled, ex.
|
||||
* "numa=off" on x86.
|
||||
*/
|
||||
static inline int acpi_map_pxm_to_online_node(int pxm)
{
	/* translate the proximity ID to a node, then to an online node */
	return numa_map_to_online_node(acpi_map_pxm_to_node(pxm));
}
|
||||
#else
|
||||
static inline int acpi_map_pxm_to_online_node(int pxm)
|
||||
{
|
||||
|
@ -13,6 +13,7 @@
|
||||
typedef unsigned long dax_entry_t;
|
||||
|
||||
struct iomap_ops;
|
||||
struct iomap;
|
||||
struct dax_device;
|
||||
struct dax_operations {
|
||||
/*
|
||||
@ -34,6 +35,8 @@ struct dax_operations {
|
||||
/* copy_to_iter: required operation for fs-dax direct-i/o */
|
||||
size_t (*copy_to_iter)(struct dax_device *, pgoff_t, void *, size_t,
|
||||
struct iov_iter *);
|
||||
/* zero_page_range: required operation. Zero page range */
|
||||
int (*zero_page_range)(struct dax_device *, pgoff_t, size_t);
|
||||
};
|
||||
|
||||
extern struct attribute_group dax_attribute_group;
|
||||
@ -199,6 +202,8 @@ size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
|
||||
size_t bytes, struct iov_iter *i);
|
||||
size_t dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
|
||||
size_t bytes, struct iov_iter *i);
|
||||
int dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
|
||||
size_t nr_pages);
|
||||
void dax_flush(struct dax_device *dax_dev, void *addr, size_t size);
|
||||
|
||||
ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
|
||||
@ -210,20 +215,8 @@ vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf,
|
||||
int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
|
||||
int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
|
||||
pgoff_t index);
|
||||
|
||||
#ifdef CONFIG_FS_DAX
|
||||
int __dax_zero_page_range(struct block_device *bdev,
|
||||
struct dax_device *dax_dev, sector_t sector,
|
||||
unsigned int offset, unsigned int length);
|
||||
#else
|
||||
static inline int __dax_zero_page_range(struct block_device *bdev,
|
||||
struct dax_device *dax_dev, sector_t sector,
|
||||
unsigned int offset, unsigned int length)
|
||||
{
|
||||
return -ENXIO;
|
||||
}
|
||||
#endif
|
||||
|
||||
int dax_iomap_zero(loff_t pos, unsigned offset, unsigned size,
|
||||
struct iomap *iomap);
|
||||
static inline bool dax_mapping(struct address_space *mapping)
|
||||
{
|
||||
return mapping->host && IS_DAX(mapping->host);
|
||||
|
@ -141,6 +141,8 @@ typedef long (*dm_dax_direct_access_fn) (struct dm_target *ti, pgoff_t pgoff,
|
||||
long nr_pages, void **kaddr, pfn_t *pfn);
|
||||
typedef size_t (*dm_dax_copy_iter_fn)(struct dm_target *ti, pgoff_t pgoff,
|
||||
void *addr, size_t bytes, struct iov_iter *i);
|
||||
typedef int (*dm_dax_zero_page_range_fn)(struct dm_target *ti, pgoff_t pgoff,
|
||||
size_t nr_pages);
|
||||
#define PAGE_SECTORS (PAGE_SIZE / 512)
|
||||
|
||||
void dm_error(const char *message);
|
||||
@ -195,6 +197,7 @@ struct target_type {
|
||||
dm_dax_direct_access_fn direct_access;
|
||||
dm_dax_copy_iter_fn dax_copy_from_iter;
|
||||
dm_dax_copy_iter_fn dax_copy_to_iter;
|
||||
dm_dax_zero_page_range_fn dax_zero_page_range;
|
||||
|
||||
/* For internal device-mapper use. */
|
||||
struct list_head list;
|
||||
|
@ -77,8 +77,6 @@ void *devm_memremap(struct device *dev, resource_size_t offset,
|
||||
size_t size, unsigned long flags);
|
||||
void devm_memunmap(struct device *dev, void *addr);
|
||||
|
||||
void *__devm_memremap_pages(struct device *dev, struct resource *res);
|
||||
|
||||
#ifdef CONFIG_PCI
|
||||
/*
|
||||
* The PCI specifications (Rev 3.0, 3.2.5 "Transaction Ordering and
|
||||
|
@ -37,6 +37,8 @@ enum {
|
||||
NDD_WORK_PENDING = 4,
|
||||
/* ignore / filter NSLABEL_FLAG_LOCAL for this DIMM, i.e. no aliasing */
|
||||
NDD_NOBLK = 5,
|
||||
/* dimm supports namespace labels */
|
||||
NDD_LABELING = 6,
|
||||
|
||||
/* need to set a limit somewhere, but yes, this is likely overkill */
|
||||
ND_IOCTL_MAX_BUFLEN = SZ_4M,
|
||||
|
@ -134,6 +134,7 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
|
||||
|
||||
unsigned long vmem_altmap_offset(struct vmem_altmap *altmap);
|
||||
void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns);
|
||||
unsigned long memremap_compat_align(void);
|
||||
#else
|
||||
static inline void *devm_memremap_pages(struct device *dev,
|
||||
struct dev_pagemap *pgmap)
|
||||
@ -167,6 +168,12 @@ static inline void vmem_altmap_free(struct vmem_altmap *altmap,
|
||||
unsigned long nr_pfns)
|
||||
{
|
||||
}
|
||||
|
||||
/*
 * When memremap_pages() is disabled (the !CONFIG_ZONE_DEVICE branch) all
 * archs can remap a single page, so PAGE_SIZE is reported as the alignment.
 */
|
||||
static inline unsigned long memremap_compat_align(void)
|
||||
{
|
||||
return PAGE_SIZE;
|
||||
}
|
||||
#endif /* CONFIG_ZONE_DEVICE */
|
||||
|
||||
static inline void put_dev_pagemap(struct dev_pagemap *pgmap)
|
||||
@ -174,4 +181,5 @@ static inline void put_dev_pagemap(struct dev_pagemap *pgmap)
|
||||
if (pgmap)
|
||||
percpu_ref_put(pgmap->ref);
|
||||
}
|
||||
|
||||
#endif /* _LINUX_MEMREMAP_H_ */
|
||||
|
@ -1127,6 +1127,7 @@ static inline unsigned long section_nr_to_pfn(unsigned long sec)
|
||||
#define SECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SECTION_MASK)
|
||||
|
||||
#define SUBSECTION_SHIFT 21
|
||||
#define SUBSECTION_SIZE (1UL << SUBSECTION_SHIFT)
|
||||
|
||||
#define PFN_SUBSECTION_SHIFT (SUBSECTION_SHIFT - PAGE_SHIFT)
|
||||
#define PAGES_PER_SUBSECTION (1UL << PFN_SUBSECTION_SHIFT)
|
||||
|
@ -1,7 +1,7 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
#ifndef _LINUX_NUMA_H
|
||||
#define _LINUX_NUMA_H
|
||||
|
||||
#include <linux/types.h>
|
||||
|
||||
#ifdef CONFIG_NODES_SHIFT
|
||||
#define NODES_SHIFT CONFIG_NODES_SHIFT
|
||||
@ -13,4 +13,32 @@
|
||||
|
||||
#define NUMA_NO_NODE (-1)
|
||||
|
||||
/* optionally keep NUMA memory info available post init */
|
||||
#ifdef CONFIG_NUMA_KEEP_MEMINFO
|
||||
#define __initdata_or_meminfo
|
||||
#else
|
||||
#define __initdata_or_meminfo __initdata
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_NUMA
|
||||
/* Generic implementation available */
|
||||
int numa_map_to_online_node(int node);
|
||||
|
||||
/*
|
||||
* Optional architecture specific implementation, users need a "depends
|
||||
* on $ARCH"
|
||||
*/
|
||||
int phys_to_target_node(phys_addr_t addr);
|
||||
#else
|
||||
/* !CONFIG_NUMA stub: no lookup is performed; NUMA_NO_NODE is returned for any @node. */
static inline int numa_map_to_online_node(int node)
|
||||
{
|
||||
return NUMA_NO_NODE;
|
||||
}
|
||||
|
||||
/* !CONFIG_NUMA stub: no address-to-node lookup; NUMA_NO_NODE is returned for any @addr. */
static inline int phys_to_target_node(phys_addr_t addr)
|
||||
{
|
||||
return NUMA_NO_NODE;
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* _LINUX_NUMA_H */
|
||||
|
@ -615,6 +615,9 @@ config ARCH_HAS_PMEM_API
|
||||
config MEMREGION
|
||||
bool
|
||||
|
||||
config ARCH_HAS_MEMREMAP_COMPAT_ALIGN
|
||||
bool
|
||||
|
||||
# use memcpy to implement user copies for nommu architectures
|
||||
config UACCESS_MEMCPY
|
||||
bool
|
||||
|
@ -139,6 +139,10 @@ config HAVE_FAST_GUP
|
||||
config ARCH_KEEP_MEMBLOCK
|
||||
bool
|
||||
|
||||
# Keep arch NUMA mapping infrastructure post-init.
|
||||
config NUMA_KEEP_MEMINFO
|
||||
bool
|
||||
|
||||
config MEMORY_ISOLATION
|
||||
bool
|
||||
|
||||
@ -154,6 +158,7 @@ config MEMORY_HOTPLUG
|
||||
bool "Allow for memory hot-add"
|
||||
depends on SPARSEMEM || X86_64_ACPI_NUMA
|
||||
depends on ARCH_ENABLE_MEMORY_HOTPLUG
|
||||
select NUMA_KEEP_MEMINFO if NUMA
|
||||
|
||||
config MEMORY_HOTPLUG_SPARSE
|
||||
def_bool y
|
||||
|
@ -127,6 +127,32 @@ static struct mempolicy default_policy = {
|
||||
|
||||
static struct mempolicy preferred_node_policy[MAX_NUMNODES];
|
||||
|
||||
/**
|
||||
* numa_map_to_online_node - Find closest online node
|
||||
* @node: Node id to start the search
|
||||
*
|
||||
* Lookup the next closest node by distance if @node is not online.
|
||||
*
* Return: @node itself when it is NUMA_NO_NODE or already online, or when
* the search below finds no online node; otherwise the online node with the
* smallest node_distance() from @node.
*/
|
||||
int numa_map_to_online_node(int node)
|
||||
{
|
||||
int min_dist = INT_MAX, dist, n, min_node;
|
||||
|
||||
if (node == NUMA_NO_NODE || node_online(node)) /* nothing to remap */
|
||||
return node;
|
||||
|
||||
min_node = node; /* result if the loop below never finds a closer online node */
|
||||
for_each_online_node(n) {
|
||||
dist = node_distance(node, n);
|
||||
if (dist < min_dist) { /* strict '<': the first node seen wins a distance tie */
|
||||
min_dist = dist;
|
||||
min_node = n;
|
||||
}
|
||||
}
|
||||
|
||||
return min_node;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(numa_map_to_online_node);
|
||||
|
||||
struct mempolicy *get_task_policy(struct task_struct *p)
|
||||
{
|
||||
struct mempolicy *pol = p->mempolicy;
|
||||
|
@ -7,6 +7,7 @@
|
||||
#include <linux/mm.h>
|
||||
#include <linux/pfn_t.h>
|
||||
#include <linux/swap.h>
|
||||
#include <linux/mmzone.h>
|
||||
#include <linux/swapops.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/wait_bit.h>
|
||||
@ -14,6 +15,28 @@
|
||||
|
||||
static DEFINE_XARRAY(pgmap_array);
|
||||
|
||||
/*
|
||||
* The memremap() and memremap_pages() interfaces are alternately used
|
||||
* to map persistent memory namespaces. These interfaces place different
|
||||
* constraints on the alignment and size of the mapping (namespace).
|
||||
* memremap() can map individual PAGE_SIZE pages. memremap_pages() can
|
||||
* only map subsections (2MB), and on at least one architecture (PowerPC)
|
||||
* the minimum mapping granularity of memremap_pages() is 16MB.
|
||||
*
|
||||
* The role of memremap_compat_align() is to communicate the minimum
|
||||
* arch supported alignment of a namespace such that it can freely
|
||||
* switch modes without violating the arch constraint. Namely, do not
|
||||
* allow a namespace to be PAGE_SIZE aligned since that namespace may be
|
||||
* reconfigured into a mode that requires SUBSECTION_SIZE alignment.
|
||||
*
* The definition below is the generic default: it is built only when
* CONFIG_ARCH_HAS_MEMREMAP_COMPAT_ALIGN is not set and reports
* SUBSECTION_SIZE.
*/
|
||||
#ifndef CONFIG_ARCH_HAS_MEMREMAP_COMPAT_ALIGN
|
||||
unsigned long memremap_compat_align(void)
|
||||
{
|
||||
return SUBSECTION_SIZE;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(memremap_compat_align);
|
||||
#endif /* !CONFIG_ARCH_HAS_MEMREMAP_COMPAT_ALIGN */
|
||||
|
||||
#ifdef CONFIG_DEV_PAGEMAP_OPS
|
||||
DEFINE_STATIC_KEY_FALSE(devmap_managed_key);
|
||||
EXPORT_SYMBOL(devmap_managed_key);
|
||||
|
@ -21,8 +21,8 @@ DRIVERS := ../../../drivers
|
||||
NVDIMM_SRC := $(DRIVERS)/nvdimm
|
||||
ACPI_SRC := $(DRIVERS)/acpi/nfit
|
||||
DAX_SRC := $(DRIVERS)/dax
|
||||
ccflags-y := -I$(src)/$(NVDIMM_SRC)/
|
||||
ccflags-y += -I$(src)/$(ACPI_SRC)/
|
||||
ccflags-y := -I$(srctree)/drivers/nvdimm/
|
||||
ccflags-y += -I$(srctree)/drivers/acpi/nfit/
|
||||
|
||||
obj-$(CONFIG_LIBNVDIMM) += libnvdimm.o
|
||||
obj-$(CONFIG_BLK_DEV_PMEM) += nd_pmem.o
|
||||
|
@ -1,6 +1,6 @@
|
||||
# SPDX-License-Identifier: GPL-2.0
|
||||
ccflags-y := -I$(src)/../../../../drivers/nvdimm/
|
||||
ccflags-y += -I$(src)/../../../../drivers/acpi/nfit/
|
||||
ccflags-y := -I$(srctree)/drivers/nvdimm/
|
||||
ccflags-y += -I$(srctree)/drivers/acpi/nfit/
|
||||
|
||||
obj-m += nfit_test.o
|
||||
obj-m += nfit_test_iomap.o
|
||||
|
@ -3164,7 +3164,9 @@ static __init int nfit_test_init(void)
|
||||
mcsafe_test();
|
||||
dax_pmem_test();
|
||||
dax_pmem_core_test();
|
||||
#ifdef CONFIG_DEV_DAX_PMEM_COMPAT
|
||||
dax_pmem_compat_test();
|
||||
#endif
|
||||
|
||||
nfit_test_setup(nfit_test_lookup, nfit_test_evaluate_dsm);
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user