372 lines
9.0 KiB
C
Raw Normal View History

// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright(c) 2013-2015 Intel Corporation. All rights reserved.
*/
#include <linux/blkdev.h>
#include <linux/device.h>
#include <linux/sizes.h>
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include "nd-core.h"
#include "btt.h"
#include "nd.h"
static void nd_btt_release(struct device *dev)
{
struct nd_region *nd_region = to_nd_region(dev->parent);
struct nd_btt *nd_btt = to_nd_btt(dev);
dev_dbg(dev, "trace\n");
nd_detach_ndns(&nd_btt->dev, &nd_btt->ndns);
ida_simple_remove(&nd_region->btt_ida, nd_btt->id);
kfree(nd_btt->uuid);
kfree(nd_btt);
}
struct nd_btt *to_nd_btt(struct device *dev)
{
struct nd_btt *nd_btt = container_of(dev, struct nd_btt, dev);
WARN_ON(!is_nd_btt(dev));
return nd_btt;
}
EXPORT_SYMBOL(to_nd_btt);
static const unsigned long btt_lbasize_supported[] = { 512, 520, 528,
4096, 4104, 4160, 4224, 0 };
static ssize_t sector_size_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct nd_btt *nd_btt = to_nd_btt(dev);
return nd_size_select_show(nd_btt->lbasize, btt_lbasize_supported, buf);
}
static ssize_t sector_size_store(struct device *dev,
struct device_attribute *attr, const char *buf, size_t len)
{
struct nd_btt *nd_btt = to_nd_btt(dev);
ssize_t rc;
device_lock(dev);
nvdimm_bus_lock(dev);
rc = nd_size_select_store(dev, buf, &nd_btt->lbasize,
btt_lbasize_supported);
dev_dbg(dev, "result: %zd wrote: %s%s", rc, buf,
buf[len - 1] == '\n' ? "" : "\n");
nvdimm_bus_unlock(dev);
device_unlock(dev);
return rc ? rc : len;
}
static DEVICE_ATTR_RW(sector_size);
static ssize_t uuid_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct nd_btt *nd_btt = to_nd_btt(dev);
if (nd_btt->uuid)
return sprintf(buf, "%pUb\n", nd_btt->uuid);
return sprintf(buf, "\n");
}
static ssize_t uuid_store(struct device *dev,
struct device_attribute *attr, const char *buf, size_t len)
{
struct nd_btt *nd_btt = to_nd_btt(dev);
ssize_t rc;
device_lock(dev);
rc = nd_uuid_store(dev, &nd_btt->uuid, buf, len);
dev_dbg(dev, "result: %zd wrote: %s%s", rc, buf,
buf[len - 1] == '\n' ? "" : "\n");
device_unlock(dev);
return rc ? rc : len;
}
static DEVICE_ATTR_RW(uuid);
static ssize_t namespace_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct nd_btt *nd_btt = to_nd_btt(dev);
ssize_t rc;
nvdimm_bus_lock(dev);
rc = sprintf(buf, "%s\n", nd_btt->ndns
? dev_name(&nd_btt->ndns->dev) : "");
nvdimm_bus_unlock(dev);
return rc;
}
static ssize_t namespace_store(struct device *dev,
struct device_attribute *attr, const char *buf, size_t len)
{
struct nd_btt *nd_btt = to_nd_btt(dev);
ssize_t rc;
device_lock(dev);
nvdimm_bus_lock(dev);
rc = nd_namespace_store(dev, &nd_btt->ndns, buf, len);
dev_dbg(dev, "result: %zd wrote: %s%s", rc, buf,
buf[len - 1] == '\n' ? "" : "\n");
nvdimm_bus_unlock(dev);
device_unlock(dev);
return rc;
}
static DEVICE_ATTR_RW(namespace);
static ssize_t size_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct nd_btt *nd_btt = to_nd_btt(dev);
ssize_t rc;
device_lock(dev);
if (dev->driver)
rc = sprintf(buf, "%llu\n", nd_btt->size);
else {
/* no size to convey if the btt instance is disabled */
rc = -ENXIO;
}
device_unlock(dev);
return rc;
}
static DEVICE_ATTR_RO(size);
libnvdimm/btt: Fix LBA masking during 'free list' population The Linux BTT implementation assumes that log entries will never have the 'zero' flag set, and indeed it never sets that flag for log entries itself. However, the UEFI spec is ambiguous on the exact format of the LBA field of a log entry, specifically as to whether it should include the additional flag bits or not. While a zero bit doesn't make sense in the context of a log entry, other BTT implementations might still have it set. If an implementation does happen to have it set, we would happily read it in as the next block to write to for writes. Since a high bit is set, it pushes the block number out of the range of an 'arena', and we fail such a write with an EIO. Follow the robustness principle, and tolerate such implementations by stripping out the zero flag when populating the free list during initialization. Additionally, use the same stripped out entries for detection of incomplete writes and map restoration that happens at this stage. Add a sysfs file 'log_zero_flags' that indicates the ability to accept such a layout to userspace applications. This enables 'ndctl check-namespace' to recognize whether the kernel is able to handle zero flags, or whether it should attempt a fix-up under the --repair option. Cc: Dan Williams <dan.j.williams@intel.com> Reported-by: Dexuan Cui <decui@microsoft.com> Reported-by: Pedro d'Aquino Filocre F S Barbuda <pbarbuda@microsoft.com> Tested-by: Dexuan Cui <decui@microsoft.com> Signed-off-by: Vishal Verma <vishal.l.verma@intel.com> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
2019-02-27 17:06:27 -07:00
static ssize_t log_zero_flags_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
return sprintf(buf, "Y\n");
}
static DEVICE_ATTR_RO(log_zero_flags);
static struct attribute *nd_btt_attributes[] = {
&dev_attr_sector_size.attr,
&dev_attr_namespace.attr,
&dev_attr_uuid.attr,
&dev_attr_size.attr,
libnvdimm/btt: Fix LBA masking during 'free list' population The Linux BTT implementation assumes that log entries will never have the 'zero' flag set, and indeed it never sets that flag for log entries itself. However, the UEFI spec is ambiguous on the exact format of the LBA field of a log entry, specifically as to whether it should include the additional flag bits or not. While a zero bit doesn't make sense in the context of a log entry, other BTT implementations might still have it set. If an implementation does happen to have it set, we would happily read it in as the next block to write to for writes. Since a high bit is set, it pushes the block number out of the range of an 'arena', and we fail such a write with an EIO. Follow the robustness principle, and tolerate such implementations by stripping out the zero flag when populating the free list during initialization. Additionally, use the same stripped out entries for detection of incomplete writes and map restoration that happens at this stage. Add a sysfs file 'log_zero_flags' that indicates the ability to accept such a layout to userspace applications. This enables 'ndctl check-namespace' to recognize whether the kernel is able to handle zero flags, or whether it should attempt a fix-up under the --repair option. Cc: Dan Williams <dan.j.williams@intel.com> Reported-by: Dexuan Cui <decui@microsoft.com> Reported-by: Pedro d'Aquino Filocre F S Barbuda <pbarbuda@microsoft.com> Tested-by: Dexuan Cui <decui@microsoft.com> Signed-off-by: Vishal Verma <vishal.l.verma@intel.com> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
2019-02-27 17:06:27 -07:00
&dev_attr_log_zero_flags.attr,
NULL,
};
static struct attribute_group nd_btt_attribute_group = {
.attrs = nd_btt_attributes,
};
static const struct attribute_group *nd_btt_attribute_groups[] = {
&nd_btt_attribute_group,
&nd_device_attribute_group,
&nd_numa_attribute_group,
NULL,
};
static const struct device_type nd_btt_device_type = {
.name = "nd_btt",
.release = nd_btt_release,
.groups = nd_btt_attribute_groups,
};
bool is_nd_btt(struct device *dev)
{
return dev->type == &nd_btt_device_type;
}
EXPORT_SYMBOL(is_nd_btt);
static struct lock_class_key nvdimm_btt_key;
static struct device *__nd_btt_create(struct nd_region *nd_region,
unsigned long lbasize, uuid_t *uuid,
struct nd_namespace_common *ndns)
{
struct nd_btt *nd_btt;
struct device *dev;
nd_btt = kzalloc(sizeof(*nd_btt), GFP_KERNEL);
if (!nd_btt)
return NULL;
nd_btt->id = ida_simple_get(&nd_region->btt_ida, 0, 0, GFP_KERNEL);
if (nd_btt->id < 0)
goto out_nd_btt;
nd_btt->lbasize = lbasize;
if (uuid) {
uuid = kmemdup(uuid, 16, GFP_KERNEL);
if (!uuid)
goto out_put_id;
}
nd_btt->uuid = uuid;
dev = &nd_btt->dev;
dev_set_name(dev, "btt%d.%d", nd_region->id, nd_btt->id);
dev->parent = &nd_region->dev;
dev->type = &nd_btt_device_type;
device_initialize(&nd_btt->dev);
lockdep_set_class(&nd_btt->dev.mutex, &nvdimm_btt_key);
if (ndns && !__nd_attach_ndns(&nd_btt->dev, ndns, &nd_btt->ndns)) {
dev_dbg(&ndns->dev, "failed, already claimed by %s\n",
dev_name(ndns->claim));
put_device(dev);
return NULL;
}
return dev;
out_put_id:
ida_simple_remove(&nd_region->btt_ida, nd_btt->id);
out_nd_btt:
kfree(nd_btt);
return NULL;
}
struct device *nd_btt_create(struct nd_region *nd_region)
{
struct device *dev = __nd_btt_create(nd_region, 0, NULL, NULL);
nd_device_register(dev);
return dev;
}
/**
* nd_btt_arena_is_valid - check if the metadata layout is valid
* @nd_btt: device with BTT geometry and backing device info
* @super: pointer to the arena's info block being tested
*
* Check consistency of the btt info block with itself by validating
* the checksum, and with the parent namespace by verifying the
* parent_uuid contained in the info block with the one supplied in.
*
* Returns:
* false for an invalid info block, true for a valid one
*/
bool nd_btt_arena_is_valid(struct nd_btt *nd_btt, struct btt_sb *super)
{
const uuid_t *ns_uuid = nd_dev_to_uuid(&nd_btt->ndns->dev);
uuid_t parent_uuid;
u64 checksum;
if (memcmp(super->signature, BTT_SIG, BTT_SIG_LEN) != 0)
return false;
import_uuid(&parent_uuid, super->parent_uuid);
if (!uuid_is_null(&parent_uuid))
if (!uuid_equal(&parent_uuid, ns_uuid))
return false;
checksum = le64_to_cpu(super->checksum);
super->checksum = 0;
if (checksum != nd_sb_checksum((struct nd_gen_sb *) super))
return false;
super->checksum = cpu_to_le64(checksum);
/* TODO: figure out action for this */
if ((le32_to_cpu(super->flags) & IB_FLAG_ERROR_MASK) != 0)
dev_info(&nd_btt->dev, "Found arena with an error flag\n");
return true;
}
EXPORT_SYMBOL(nd_btt_arena_is_valid);
int nd_btt_version(struct nd_btt *nd_btt, struct nd_namespace_common *ndns,
struct btt_sb *btt_sb)
{
if (ndns->claim_class == NVDIMM_CCLASS_BTT2) {
/* Probe/setup for BTT v2.0 */
nd_btt->initial_offset = 0;
nd_btt->version_major = 2;
nd_btt->version_minor = 0;
if (nvdimm_read_bytes(ndns, 0, btt_sb, sizeof(*btt_sb), 0))
return -ENXIO;
if (!nd_btt_arena_is_valid(nd_btt, btt_sb))
return -ENODEV;
if ((le16_to_cpu(btt_sb->version_major) != 2) ||
(le16_to_cpu(btt_sb->version_minor) != 0))
return -ENODEV;
} else {
/*
* Probe/setup for BTT v1.1 (NVDIMM_CCLASS_NONE or
* NVDIMM_CCLASS_BTT)
*/
nd_btt->initial_offset = SZ_4K;
nd_btt->version_major = 1;
nd_btt->version_minor = 1;
if (nvdimm_read_bytes(ndns, SZ_4K, btt_sb, sizeof(*btt_sb), 0))
return -ENXIO;
if (!nd_btt_arena_is_valid(nd_btt, btt_sb))
return -ENODEV;
if ((le16_to_cpu(btt_sb->version_major) != 1) ||
(le16_to_cpu(btt_sb->version_minor) != 1))
return -ENODEV;
}
return 0;
}
EXPORT_SYMBOL(nd_btt_version);
static int __nd_btt_probe(struct nd_btt *nd_btt,
struct nd_namespace_common *ndns, struct btt_sb *btt_sb)
{
int rc;
if (!btt_sb || !ndns || !nd_btt)
return -ENODEV;
if (nvdimm_namespace_capacity(ndns) < SZ_16M)
return -ENXIO;
rc = nd_btt_version(nd_btt, ndns, btt_sb);
if (rc < 0)
return rc;
nd_btt->lbasize = le32_to_cpu(btt_sb->external_lbasize);
nd_btt->uuid = kmemdup(&btt_sb->uuid, sizeof(uuid_t), GFP_KERNEL);
if (!nd_btt->uuid)
return -ENOMEM;
nd_device_register(&nd_btt->dev);
return 0;
}
int nd_btt_probe(struct device *dev, struct nd_namespace_common *ndns)
{
int rc;
struct device *btt_dev;
struct btt_sb *btt_sb;
struct nd_region *nd_region = to_nd_region(ndns->dev.parent);
if (ndns->force_raw)
return -ENODEV;
switch (ndns->claim_class) {
case NVDIMM_CCLASS_NONE:
case NVDIMM_CCLASS_BTT:
case NVDIMM_CCLASS_BTT2:
break;
default:
return -ENODEV;
}
nvdimm_bus_lock(&ndns->dev);
btt_dev = __nd_btt_create(nd_region, 0, NULL, ndns);
nvdimm_bus_unlock(&ndns->dev);
if (!btt_dev)
return -ENOMEM;
btt_sb = devm_kzalloc(dev, sizeof(*btt_sb), GFP_KERNEL);
rc = __nd_btt_probe(to_nd_btt(btt_dev), ndns, btt_sb);
dev_dbg(dev, "btt: %s\n", rc == 0 ? dev_name(btt_dev) : "<none>");
if (rc < 0) {
struct nd_btt *nd_btt = to_nd_btt(btt_dev);
libnvdimm: fix nvdimm_bus_lock() vs device_lock() ordering A debug patch to turn the standard device_lock() into something that lockdep can analyze yielded the following: ====================================================== [ INFO: possible circular locking dependency detected ] 4.11.0-rc4+ #106 Tainted: G O ------------------------------------------------------- lt-libndctl/1898 is trying to acquire lock: (&dev->nvdimm_mutex/3){+.+.+.}, at: [<ffffffffc023c948>] nd_attach_ndns+0x178/0x1b0 [libnvdimm] but task is already holding lock: (&nvdimm_bus->reconfig_mutex){+.+.+.}, at: [<ffffffffc022e0b1>] nvdimm_bus_lock+0x21/0x30 [libnvdimm] which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (&nvdimm_bus->reconfig_mutex){+.+.+.}: lock_acquire+0xf6/0x1f0 __mutex_lock+0x88/0x980 mutex_lock_nested+0x1b/0x20 nvdimm_bus_lock+0x21/0x30 [libnvdimm] nvdimm_namespace_capacity+0x1b/0x40 [libnvdimm] nvdimm_namespace_common_probe+0x230/0x510 [libnvdimm] nd_pmem_probe+0x14/0x180 [nd_pmem] nvdimm_bus_probe+0xa9/0x260 [libnvdimm] -> #0 (&dev->nvdimm_mutex/3){+.+.+.}: __lock_acquire+0x1107/0x1280 lock_acquire+0xf6/0x1f0 __mutex_lock+0x88/0x980 mutex_lock_nested+0x1b/0x20 nd_attach_ndns+0x178/0x1b0 [libnvdimm] nd_namespace_store+0x308/0x3c0 [libnvdimm] namespace_store+0x87/0x220 [libnvdimm] In this case '&dev->nvdimm_mutex/3' mirrors '&dev->mutex'. Fix this by replacing the use of device_lock() with nvdimm_bus_lock() to protect nd_{attach,detach}_ndns() operations. Cc: <stable@vger.kernel.org> Fixes: 8c2f7e8658df ("libnvdimm: infrastructure for btt devices") Reported-by: Yi Zhang <yizhan@redhat.com> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
2017-04-28 22:05:14 -07:00
nd_detach_ndns(btt_dev, &nd_btt->ndns);
put_device(btt_dev);
}
return rc;
}
EXPORT_SYMBOL(nd_btt_probe);