From 6c7413c0f5ab61d2339cf516d25bb44d71f4bad4 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Mon, 23 Jul 2018 09:46:45 -0400 Subject: [PATCH 01/26] dm thin: update stale "Status" Documentation Documentation/device-mapper-/thin-provisioning.txt's "Status" section no longer reflected the current fitness level of DM thin-provisioning. That is, DM thinp is no longer "EXPERIMENTAL". It has since seen considerable improvement, has been fairly widely deployed and has performed in a robust manner. Update Documentation to dispel concern raised by potential DM thinp users. Reported-by: Drew Hastings Signed-off-by: Mike Snitzer --- Documentation/device-mapper/thin-provisioning.txt | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/Documentation/device-mapper/thin-provisioning.txt b/Documentation/device-mapper/thin-provisioning.txt index 3d01948ea061..b8a57b9cec19 100644 --- a/Documentation/device-mapper/thin-provisioning.txt +++ b/Documentation/device-mapper/thin-provisioning.txt @@ -28,17 +28,18 @@ administrator some freedom, for example to: Status ====== -These targets are very much still in the EXPERIMENTAL state. Please -do not yet rely on them in production. But do experiment and offer us -feedback. Different use cases will have different performance -characteristics, for example due to fragmentation of the data volume. +These targets are considered safe for production use. But different use +cases will have different performance characteristics, for example due +to fragmentation of the data volume. If you find this software is not performing as expected please mail dm-devel@redhat.com with details and we'll try our best to improve things for you. -Userspace tools for checking and repairing the metadata are under -development. +Userspace tools for checking and repairing the metadata have been fully +developed and are available as 'thin_check' and 'thin_repair'. The name +of the package that provides these utilities varies by distribution (on +a Red Hat distribution it is named 'device-mapper-persistent-data'). Cookbook ======== From af9313c32c0fa2a0ac3b113669273833d60cc9de Mon Sep 17 00:00:00 2001 From: John Pittman Date: Thu, 21 Jun 2018 17:35:33 -0400 Subject: [PATCH 02/26] dm cache: only allow a single io_mode cache feature to be requested More than one io_mode feature can be requested when creating a dm cache device (as is: last one wins). The io_mode selections are incompatible with one another, we should force them to be selected exclusively. Add a counter to check for more than one io_mode selection. Fixes: 629d0a8a1a10 ("dm cache metadata: add "metadata2" feature") Signed-off-by: John Pittman Signed-off-by: Mike Snitzer --- drivers/md/dm-cache-target.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index ce14a3d1f609..44df244807e5 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -2250,7 +2250,7 @@ static int parse_features(struct cache_args *ca, struct dm_arg_set *as, {0, 2, "Invalid number of cache feature arguments"}, }; - int r; + int r, mode_ctr = 0; unsigned argc; const char *arg; struct cache_features *cf = &ca->features; @@ -2264,14 +2264,20 @@ static int parse_features(struct cache_args *ca, struct dm_arg_set *as, while (argc--) { arg = dm_shift_arg(as); - if (!strcasecmp(arg, "writeback")) + if (!strcasecmp(arg, "writeback")) { cf->io_mode = CM_IO_WRITEBACK; + mode_ctr++; + } - else if (!strcasecmp(arg, "writethrough")) + else if (!strcasecmp(arg, "writethrough")) { cf->io_mode = CM_IO_WRITETHROUGH; + mode_ctr++; + } - else if (!strcasecmp(arg, "passthrough")) + else if (!strcasecmp(arg, "passthrough")) { cf->io_mode = CM_IO_PASSTHROUGH; + mode_ctr++; + } else if (!strcasecmp(arg, "metadata2")) cf->metadata_version = 2; @@ -2282,6 +2288,11 @@ static int parse_features(struct cache_args *ca, struct dm_arg_set *as, } } + if (mode_ctr > 1) { + *error = "Duplicate cache io_mode features requested"; + return -EINVAL; + } + return 0; } From 3876ac76f02ac2a7b3d96f813c0ee1070e7a8c8e Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Tue, 17 Apr 2018 00:33:13 +0200 Subject: [PATCH 03/26] dm delay: refactor repetitive code dm-delay has a lot of code that is repeated for delaying read and write bios. Repetitive code is generally bad; refactor out the repetitive code in preperation for adding another delay class for flush bios. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-delay.c | 237 ++++++++++++++++++++---------------------- 1 file changed, 110 insertions(+), 127 deletions(-) diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index 1783d80c9cad..c5ebe56bc28b 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c @@ -17,6 +17,13 @@ #define DM_MSG_PREFIX "delay" +struct delay_class { + struct dm_dev *dev; + sector_t start; + unsigned delay; + unsigned ops; +}; + struct delay_c { struct timer_list delay_timer; struct mutex timer_lock; @@ -25,19 +32,15 @@ struct delay_c { struct list_head delayed_bios; atomic_t may_delay; - struct dm_dev *dev_read; - sector_t start_read; - unsigned read_delay; - unsigned reads; + struct delay_class read; + struct delay_class write; - struct dm_dev *dev_write; - sector_t start_write; - unsigned write_delay; - unsigned writes; + int argc; }; struct dm_delay_info { struct delay_c *context; + struct delay_class *class; struct list_head list; unsigned long expires; }; @@ -77,7 +80,7 @@ static struct bio *flush_delayed_bios(struct delay_c *dc, int flush_all) { struct dm_delay_info *delayed, *next; unsigned long next_expires = 0; - int start_timer = 0; + unsigned long start_timer = 0; struct bio_list flush_bios = { }; mutex_lock(&delayed_bios_lock); @@ -87,10 +90,7 @@ static struct bio *flush_delayed_bios(struct delay_c *dc, int flush_all) sizeof(struct dm_delay_info)); list_del(&delayed->list); bio_list_add(&flush_bios, bio); - if ((bio_data_dir(bio) == WRITE)) - delayed->context->writes--; - else - delayed->context->reads--; + delayed->class->ops--; continue; } @@ -100,7 +100,6 @@ static struct bio *flush_delayed_bios(struct delay_c *dc, int flush_all) } else next_expires = min(next_expires, delayed->expires); } - mutex_unlock(&delayed_bios_lock); if (start_timer) @@ -117,6 +116,48 @@ static void flush_expired_bios(struct work_struct *work) flush_bios(flush_delayed_bios(dc, 0)); } +static void delay_dtr(struct dm_target *ti) +{ + struct delay_c *dc = ti->private; + + destroy_workqueue(dc->kdelayd_wq); + + if (dc->read.dev) + dm_put_device(ti, dc->read.dev); + if (dc->write.dev) + dm_put_device(ti, dc->write.dev); + + mutex_destroy(&dc->timer_lock); + + kfree(dc); +} + +static int delay_class_ctr(struct dm_target *ti, struct delay_class *c, char **argv) +{ + int ret; + unsigned long long tmpll; + char dummy; + + if (sscanf(argv[1], "%llu%c", &tmpll, &dummy) != 1) { + ti->error = "Invalid device sector"; + return -EINVAL; + } + c->start = tmpll; + + if (sscanf(argv[2], "%u%c", &c->delay, &dummy) != 1) { + ti->error = "Invalid delay"; + return -EINVAL; + } + + ret = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &c->dev); + if (ret) { + ti->error = "Device lookup failed"; + return ret; + } + + return 0; +} + /* * Mapping parameters: * [ ] @@ -128,8 +169,6 @@ static void flush_expired_bios(struct work_struct *work) static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv) { struct delay_c *dc; - unsigned long long tmpll; - char dummy; int ret; if (argc != 3 && argc != 6) { @@ -137,125 +176,69 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv) return -EINVAL; } - dc = kmalloc(sizeof(*dc), GFP_KERNEL); + dc = kzalloc(sizeof(*dc), GFP_KERNEL); if (!dc) { ti->error = "Cannot allocate context"; return -ENOMEM; } - dc->reads = dc->writes = 0; - - ret = -EINVAL; - if (sscanf(argv[1], "%llu%c", &tmpll, &dummy) != 1) { - ti->error = "Invalid device sector"; - goto bad; - } - dc->start_read = tmpll; - - if (sscanf(argv[2], "%u%c", &dc->read_delay, &dummy) != 1) { - ti->error = "Invalid delay"; - goto bad; - } - - ret = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), - &dc->dev_read); - if (ret) { - ti->error = "Device lookup failed"; - goto bad; - } - - ret = -EINVAL; - dc->dev_write = NULL; - if (argc == 3) - goto out; - - if (sscanf(argv[4], "%llu%c", &tmpll, &dummy) != 1) { - ti->error = "Invalid write device sector"; - goto bad_dev_read; - } - dc->start_write = tmpll; - - if (sscanf(argv[5], "%u%c", &dc->write_delay, &dummy) != 1) { - ti->error = "Invalid write delay"; - goto bad_dev_read; - } - - ret = dm_get_device(ti, argv[3], dm_table_get_mode(ti->table), - &dc->dev_write); - if (ret) { - ti->error = "Write device lookup failed"; - goto bad_dev_read; - } - -out: - ret = -EINVAL; - dc->kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0); - if (!dc->kdelayd_wq) { - DMERR("Couldn't start kdelayd"); - goto bad_queue; - } - + ti->private = dc; timer_setup(&dc->delay_timer, handle_delayed_timer, 0); - INIT_WORK(&dc->flush_expired_bios, flush_expired_bios); INIT_LIST_HEAD(&dc->delayed_bios); mutex_init(&dc->timer_lock); atomic_set(&dc->may_delay, 1); + dc->argc = argc; + + ret = delay_class_ctr(ti, &dc->read, argv); + if (ret) + goto bad; + + if (argc == 3) { + ret = delay_class_ctr(ti, &dc->write, argv); + if (ret) + goto bad; + goto out; + } + + ret = delay_class_ctr(ti, &dc->write, argv + 3); + if (ret) + goto bad; + +out: + dc->kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0); + if (!dc->kdelayd_wq) { + ret = -EINVAL; + DMERR("Couldn't start kdelayd"); + goto bad; + } ti->num_flush_bios = 1; ti->num_discard_bios = 1; ti->per_io_data_size = sizeof(struct dm_delay_info); - ti->private = dc; return 0; -bad_queue: - if (dc->dev_write) - dm_put_device(ti, dc->dev_write); -bad_dev_read: - dm_put_device(ti, dc->dev_read); bad: - kfree(dc); + delay_dtr(ti); return ret; } -static void delay_dtr(struct dm_target *ti) -{ - struct delay_c *dc = ti->private; - - destroy_workqueue(dc->kdelayd_wq); - - dm_put_device(ti, dc->dev_read); - - if (dc->dev_write) - dm_put_device(ti, dc->dev_write); - - mutex_destroy(&dc->timer_lock); - - kfree(dc); -} - -static int delay_bio(struct delay_c *dc, int delay, struct bio *bio) +static int delay_bio(struct delay_c *dc, struct delay_class *c, struct bio *bio) { struct dm_delay_info *delayed; unsigned long expires = 0; - if (!delay || !atomic_read(&dc->may_delay)) + if (!c->delay || !atomic_read(&dc->may_delay)) return DM_MAPIO_REMAPPED; delayed = dm_per_bio_data(bio, sizeof(struct dm_delay_info)); delayed->context = dc; - delayed->expires = expires = jiffies + msecs_to_jiffies(delay); + delayed->expires = expires = jiffies + msecs_to_jiffies(c->delay); mutex_lock(&delayed_bios_lock); - - if (bio_data_dir(bio) == WRITE) - dc->writes++; - else - dc->reads++; - + c->ops++; list_add_tail(&delayed->list, &dc->delayed_bios); - mutex_unlock(&delayed_bios_lock); queue_timeout(dc, expires); @@ -282,23 +265,25 @@ static void delay_resume(struct dm_target *ti) static int delay_map(struct dm_target *ti, struct bio *bio) { struct delay_c *dc = ti->private; + struct delay_class *c; + struct dm_delay_info *delayed = dm_per_bio_data(bio, sizeof(struct dm_delay_info)); - if ((bio_data_dir(bio) == WRITE) && (dc->dev_write)) { - bio_set_dev(bio, dc->dev_write->bdev); - if (bio_sectors(bio)) - bio->bi_iter.bi_sector = dc->start_write + - dm_target_offset(ti, bio->bi_iter.bi_sector); - - return delay_bio(dc, dc->write_delay, bio); + if (bio_data_dir(bio) == WRITE) { + c = &dc->write; + } else { + c = &dc->read; } + delayed->class = c; + bio_set_dev(bio, c->dev->bdev); + if (bio_sectors(bio)) + bio->bi_iter.bi_sector = c->start + dm_target_offset(ti, bio->bi_iter.bi_sector); - bio_set_dev(bio, dc->dev_read->bdev); - bio->bi_iter.bi_sector = dc->start_read + - dm_target_offset(ti, bio->bi_iter.bi_sector); - - return delay_bio(dc, dc->read_delay, bio); + return delay_bio(dc, c, bio); } +#define DMEMIT_DELAY_CLASS(c) \ + DMEMIT("%s %llu %u", (c)->dev->name, (unsigned long long)(c)->start, (c)->delay) + static void delay_status(struct dm_target *ti, status_type_t type, unsigned status_flags, char *result, unsigned maxlen) { @@ -307,17 +292,15 @@ static void delay_status(struct dm_target *ti, status_type_t type, switch (type) { case STATUSTYPE_INFO: - DMEMIT("%u %u", dc->reads, dc->writes); + DMEMIT("%u %u", dc->read.ops, dc->write.ops); break; case STATUSTYPE_TABLE: - DMEMIT("%s %llu %u", dc->dev_read->name, - (unsigned long long) dc->start_read, - dc->read_delay); - if (dc->dev_write) - DMEMIT(" %s %llu %u", dc->dev_write->name, - (unsigned long long) dc->start_write, - dc->write_delay); + DMEMIT_DELAY_CLASS(&dc->read); + if (dc->argc >= 6) { + DMEMIT(" "); + DMEMIT_DELAY_CLASS(&dc->write); + } break; } } @@ -328,12 +311,12 @@ static int delay_iterate_devices(struct dm_target *ti, struct delay_c *dc = ti->private; int ret = 0; - ret = fn(ti, dc->dev_read, dc->start_read, ti->len, data); + ret = fn(ti, dc->read.dev, dc->read.start, ti->len, data); + if (ret) + goto out; + ret = fn(ti, dc->write.dev, dc->write.start, ti->len, data); if (ret) goto out; - - if (dc->dev_write) - ret = fn(ti, dc->dev_write, dc->start_write, ti->len, data); out: return ret; From cda6b5ab7f5935565ed5b9bbc385bd1d0a3feb75 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Tue, 17 Apr 2018 00:33:14 +0200 Subject: [PATCH 04/26] dm delay: add flush as a third class of IO Add a new class for dm-delay that delays flush requests. Previously, flushes were delayed as writes, but it caused problems if the user needed to create a device with one or a few slow sectors for the purpose of testing - all flushes would be forwarded to this device and delayed, and that skews the test results. Fix this by allowing to select 0 delay for flushes. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- Documentation/device-mapper/delay.txt | 3 ++- drivers/md/dm-delay.c | 34 +++++++++++++++++++++++---- 2 files changed, 32 insertions(+), 5 deletions(-) diff --git a/Documentation/device-mapper/delay.txt b/Documentation/device-mapper/delay.txt index 4b1d22a44ce4..6426c45273cb 100644 --- a/Documentation/device-mapper/delay.txt +++ b/Documentation/device-mapper/delay.txt @@ -5,7 +5,8 @@ Device-Mapper's "delay" target delays reads and/or writes and maps them to different devices. Parameters: - [ ] + [ + [ ]] With separate write parameters, the first set is only used for reads. Offsets are specified in sectors. diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index c5ebe56bc28b..2fb7bb4304ad 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c @@ -34,6 +34,7 @@ struct delay_c { struct delay_class read; struct delay_class write; + struct delay_class flush; int argc; }; @@ -126,6 +127,8 @@ static void delay_dtr(struct dm_target *ti) dm_put_device(ti, dc->read.dev); if (dc->write.dev) dm_put_device(ti, dc->write.dev); + if (dc->flush.dev) + dm_put_device(ti, dc->flush.dev); mutex_destroy(&dc->timer_lock); @@ -171,8 +174,8 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv) struct delay_c *dc; int ret; - if (argc != 3 && argc != 6) { - ti->error = "Requires exactly 3 or 6 arguments"; + if (argc != 3 && argc != 6 && argc != 9) { + ti->error = "Requires exactly 3, 6 or 9 arguments"; return -EINVAL; } @@ -196,6 +199,9 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv) if (argc == 3) { ret = delay_class_ctr(ti, &dc->write, argv); + if (ret) + goto bad; + ret = delay_class_ctr(ti, &dc->flush, argv); if (ret) goto bad; goto out; @@ -204,6 +210,16 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv) ret = delay_class_ctr(ti, &dc->write, argv + 3); if (ret) goto bad; + if (argc == 6) { + ret = delay_class_ctr(ti, &dc->flush, argv + 3); + if (ret) + goto bad; + goto out; + } + + ret = delay_class_ctr(ti, &dc->flush, argv + 6); + if (ret) + goto bad; out: dc->kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0); @@ -269,7 +285,10 @@ static int delay_map(struct dm_target *ti, struct bio *bio) struct dm_delay_info *delayed = dm_per_bio_data(bio, sizeof(struct dm_delay_info)); if (bio_data_dir(bio) == WRITE) { - c = &dc->write; + if (unlikely(bio->bi_opf & REQ_PREFLUSH)) + c = &dc->flush; + else + c = &dc->write; } else { c = &dc->read; } @@ -292,7 +311,7 @@ static void delay_status(struct dm_target *ti, status_type_t type, switch (type) { case STATUSTYPE_INFO: - DMEMIT("%u %u", dc->read.ops, dc->write.ops); + DMEMIT("%u %u %u", dc->read.ops, dc->write.ops, dc->flush.ops); break; case STATUSTYPE_TABLE: @@ -301,6 +320,10 @@ static void delay_status(struct dm_target *ti, status_type_t type, DMEMIT(" "); DMEMIT_DELAY_CLASS(&dc->write); } + if (dc->argc >= 9) { + DMEMIT(" "); + DMEMIT_DELAY_CLASS(&dc->flush); + } break; } } @@ -317,6 +340,9 @@ static int delay_iterate_devices(struct dm_target *ti, ret = fn(ti, dc->write.dev, dc->write.start, ti->len, data); if (ret) goto out; + ret = fn(ti, dc->flush.dev, dc->flush.start, ti->len, data); + if (ret) + goto out; out: return ret; From c21b16392701543d61e366dca84e15fe7f0cf0cf Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Tue, 3 Jul 2018 20:13:25 +0200 Subject: [PATCH 05/26] dm integrity: change 'suspending' variable from bool to int Early alpha processors can't write a byte or short atomically - they read 8 bytes, modify the byte or two bytes in registers and write back 8 bytes. The modification of the variable "suspending" may race with modification of the variable "failed". Fix this by changing "suspending" to an int. Cc: stable@vger.kernel.org Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-integrity.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 86438b2f10dd..0a8a4c2aa3ea 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -178,7 +178,7 @@ struct dm_integrity_c { __u8 sectors_per_block; unsigned char mode; - bool suspending; + int suspending; int failed; @@ -2210,7 +2210,7 @@ static void dm_integrity_postsuspend(struct dm_target *ti) del_timer_sync(&ic->autocommit_timer); - ic->suspending = true; + WRITE_ONCE(ic->suspending, 1); queue_work(ic->commit_wq, &ic->commit_work); drain_workqueue(ic->commit_wq); @@ -2220,7 +2220,7 @@ static void dm_integrity_postsuspend(struct dm_target *ti) dm_integrity_flush_buffers(ic); } - ic->suspending = false; + WRITE_ONCE(ic->suspending, 0); BUG_ON(!RB_EMPTY_ROOT(&ic->in_progress)); From 518748b1a744c496a657a5a7923e49e002a6f259 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Tue, 3 Jul 2018 20:13:26 +0200 Subject: [PATCH 06/26] dm integrity: decouple common code in dm_integrity_map_continue() Decouple how dm_integrity_map_continue() responds to being out of free sectors and when add_new_range() fails. This has no functional change, but helps prepare for the next commit. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-integrity.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 0a8a4c2aa3ea..0829f18d91a3 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -1599,8 +1599,12 @@ retry: dio->range.n_sectors = min(dio->range.n_sectors, ic->free_sectors << ic->sb->log2_sectors_per_block); - if (unlikely(!dio->range.n_sectors)) - goto sleep; + if (unlikely(!dio->range.n_sectors)) { + if (from_map) + goto offload_to_thread; + sleep_on_endio_wait(ic); + goto retry; + } range_sectors = dio->range.n_sectors >> ic->sb->log2_sectors_per_block; ic->free_sectors -= range_sectors; journal_section = ic->free_section; @@ -1660,8 +1664,8 @@ retry: * stall bios on current->bio_list. * So, we offload the bio to a workqueue if we have to sleep. */ -sleep: if (from_map) { +offload_to_thread: spin_unlock_irq(&ic->endio_wait.lock); INIT_WORK(&dio->work, integrity_bio_wait); queue_work(ic->wait_wq, &dio->work); From 724376a04d1a63d145fdfe7b24c0b13128a3ffd6 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Tue, 3 Jul 2018 20:13:27 +0200 Subject: [PATCH 07/26] dm integrity: implement fair range locks dm-integrity locks a range of sectors to prevent concurrent I/O or journal writeback. These locks were not fair - so that many small overlapping I/Os could starve a large I/O indefinitely. Fix this by making the range locks fair. The ranges that are waiting are added to the list "wait_list". If a new I/O overlaps some of the waiting I/Os, it is not dispatched, but it is also added to that wait list. Entries on the wait list are processed in first-in-first-out order, so that an I/O can't starve indefinitely. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-integrity.c | 68 +++++++++++++++++++++++++++++++++------ 1 file changed, 59 insertions(+), 9 deletions(-) diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 0829f18d91a3..510665253820 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -186,6 +186,7 @@ struct dm_integrity_c { /* these variables are locked with endio_wait.lock */ struct rb_root in_progress; + struct list_head wait_list; wait_queue_head_t endio_wait; struct workqueue_struct *wait_wq; @@ -233,7 +234,14 @@ struct dm_integrity_c { struct dm_integrity_range { sector_t logical_sector; unsigned n_sectors; - struct rb_node node; + bool waiting; + union { + struct rb_node node; + struct { + struct task_struct *task; + struct list_head wait_entry; + }; + }; }; struct dm_integrity_io { @@ -867,13 +875,27 @@ static void copy_from_journal(struct dm_integrity_c *ic, unsigned section, unsig } } -static bool add_new_range(struct dm_integrity_c *ic, struct dm_integrity_range *new_range) +static bool ranges_overlap(struct dm_integrity_range *range1, struct dm_integrity_range *range2) +{ + return range1->logical_sector < range2->logical_sector + range2->n_sectors && + range2->logical_sector + range2->n_sectors > range2->logical_sector; +} + +static bool add_new_range(struct dm_integrity_c *ic, struct dm_integrity_range *new_range, bool check_waiting) { struct rb_node **n = &ic->in_progress.rb_node; struct rb_node *parent; BUG_ON((new_range->logical_sector | new_range->n_sectors) & (unsigned)(ic->sectors_per_block - 1)); + if (likely(check_waiting)) { + struct dm_integrity_range *range; + list_for_each_entry(range, &ic->wait_list, wait_entry) { + if (unlikely(ranges_overlap(range, new_range))) + return false; + } + } + parent = NULL; while (*n) { @@ -898,7 +920,22 @@ static bool add_new_range(struct dm_integrity_c *ic, struct dm_integrity_range * static void remove_range_unlocked(struct dm_integrity_c *ic, struct dm_integrity_range *range) { rb_erase(&range->node, &ic->in_progress); - wake_up_locked(&ic->endio_wait); + while (unlikely(!list_empty(&ic->wait_list))) { + struct dm_integrity_range *last_range = + list_first_entry(&ic->wait_list, struct dm_integrity_range, wait_entry); + struct task_struct *last_range_task; + if (!ranges_overlap(range, last_range)) + break; + last_range_task = last_range->task; + list_del(&last_range->wait_entry); + if (!add_new_range(ic, last_range, false)) { + last_range->task = last_range_task; + list_add(&last_range->wait_entry, &ic->wait_list); + break; + } + last_range->waiting = false; + wake_up_process(last_range_task); + } } static void remove_range(struct dm_integrity_c *ic, struct dm_integrity_range *range) @@ -910,6 +947,19 @@ static void remove_range(struct dm_integrity_c *ic, struct dm_integrity_range *r spin_unlock_irqrestore(&ic->endio_wait.lock, flags); } +static void wait_and_add_new_range(struct dm_integrity_c *ic, struct dm_integrity_range *new_range) +{ + new_range->waiting = true; + list_add_tail(&new_range->wait_entry, &ic->wait_list); + new_range->task = current; + do { + __set_current_state(TASK_UNINTERRUPTIBLE); + spin_unlock_irq(&ic->endio_wait.lock); + io_schedule(); + spin_lock_irq(&ic->endio_wait.lock); + } while (unlikely(new_range->waiting)); +} + static void init_journal_node(struct journal_node *node) { RB_CLEAR_NODE(&node->node); @@ -1658,7 +1708,7 @@ retry: } } } - if (unlikely(!add_new_range(ic, &dio->range))) { + if (unlikely(!add_new_range(ic, &dio->range, true))) { /* * We must not sleep in the request routine because it could * stall bios on current->bio_list. @@ -1670,10 +1720,8 @@ offload_to_thread: INIT_WORK(&dio->work, integrity_bio_wait); queue_work(ic->wait_wq, &dio->work); return; - } else { - sleep_on_endio_wait(ic); - goto retry; } + wait_and_add_new_range(ic, &dio->range); } spin_unlock_irq(&ic->endio_wait.lock); @@ -1896,8 +1944,8 @@ static void do_journal_write(struct dm_integrity_c *ic, unsigned write_start, io->range.n_sectors = (k - j) << ic->sb->log2_sectors_per_block; spin_lock_irq(&ic->endio_wait.lock); - while (unlikely(!add_new_range(ic, &io->range))) - sleep_on_endio_wait(ic); + if (unlikely(!add_new_range(ic, &io->range, true))) + wait_and_add_new_range(ic, &io->range); if (likely(!from_replay)) { struct journal_node *section_node = &ic->journal_tree[i * ic->journal_section_entries]; @@ -2852,6 +2900,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) ti->per_io_data_size = sizeof(struct dm_integrity_io); ic->in_progress = RB_ROOT; + INIT_LIST_HEAD(&ic->wait_list); init_waitqueue_head(&ic->endio_wait); bio_list_init(&ic->flush_bio_list); init_waitqueue_head(&ic->copy_to_journal_wait); @@ -3196,6 +3245,7 @@ static void dm_integrity_dtr(struct dm_target *ti) struct dm_integrity_c *ic = ti->private; BUG_ON(!RB_EMPTY_ROOT(&ic->in_progress)); + BUG_ON(!list_empty(&ic->wait_list)); if (ic->metadata_wq) destroy_workqueue(ic->metadata_wq); From f84fd2c98480fa0e3c0b43996c4e235bdf4a9527 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Tue, 3 Jul 2018 20:13:28 +0200 Subject: [PATCH 08/26] dm integrity: report provided data sectors in the status Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-integrity.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 510665253820..09dadb771a62 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -2295,7 +2295,9 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type, switch (type) { case STATUSTYPE_INFO: - DMEMIT("%llu", (unsigned long long)atomic64_read(&ic->number_of_mismatches)); + DMEMIT("%llu %llu", + (unsigned long long)atomic64_read(&ic->number_of_mismatches), + (unsigned long long)ic->provided_data_sectors); break; case STATUSTYPE_TABLE: { From 71e9ddbcb99e9bf1f968906b395e172ea8328365 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Tue, 3 Jul 2018 20:13:29 +0200 Subject: [PATCH 09/26] dm integrity: add ic->start in get_data_sector() A small refactoring. Add the variable ic->start to the result returned by get_data_sector() and not in the callers. This is a prerequisite for the commit that adds the ability to use an external metadata device. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-integrity.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 09dadb771a62..39d465e92f74 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -394,6 +394,8 @@ static sector_t get_data_sector(struct dm_integrity_c *ic, sector_t area, sector result += (area + 1) * ic->metadata_run; result += (sector_t)ic->initial_sectors + offset; + result += ic->start; + return result; } @@ -865,7 +867,7 @@ static void copy_from_journal(struct dm_integrity_c *ic, unsigned section, unsig io_req.notify.context = data; io_req.client = ic->io; io_loc.bdev = ic->dev->bdev; - io_loc.sector = ic->start + target; + io_loc.sector = target; io_loc.count = n_sectors; r = dm_io(&io_req, 1, &io_loc, NULL); @@ -1753,7 +1755,6 @@ offload_to_thread: bio->bi_end_io = integrity_end_io; bio->bi_iter.bi_size = dio->range.n_sectors << SECTOR_SHIFT; - bio->bi_iter.bi_sector += ic->start; generic_make_request(bio); if (need_sync_io) { @@ -2391,7 +2392,7 @@ static int calculate_device_limits(struct dm_integrity_c *ic) get_area_and_offset(ic, ic->provided_data_sectors - 1, &last_area, &last_offset); last_sector = get_data_sector(ic, last_area, last_offset); - if (ic->start + last_sector < last_sector || ic->start + last_sector >= ic->device_sectors) + if (last_sector < ic->start || last_sector >= ic->device_sectors) return -EINVAL; return 0; From 356d9d52e1221ba0c9f10b8b38652f78a5298329 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Tue, 3 Jul 2018 20:13:30 +0200 Subject: [PATCH 10/26] dm integrity: allow separate metadata device Add the ability to store DM integrity metadata on a separate device. This feature is activated with the option "meta_device:/dev/device". Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-integrity.c | 201 ++++++++++++++++++++++++++++---------- 1 file changed, 148 insertions(+), 53 deletions(-) diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 39d465e92f74..fb5c8ef5b519 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -139,6 +139,7 @@ struct alg_spec { struct dm_integrity_c { struct dm_dev *dev; + struct dm_dev *meta_dev; unsigned tag_size; __s8 log2_tag_size; sector_t start; @@ -170,7 +171,8 @@ struct dm_integrity_c { unsigned short journal_section_sectors; unsigned journal_sections; unsigned journal_entries; - sector_t device_sectors; + sector_t data_device_sectors; + sector_t meta_device_sectors; unsigned initial_sectors; unsigned metadata_run; __s8 log2_metadata_run; @@ -345,10 +347,14 @@ static commit_id_t dm_integrity_commit_id(struct dm_integrity_c *ic, unsigned i, static void get_area_and_offset(struct dm_integrity_c *ic, sector_t data_sector, sector_t *area, sector_t *offset) { - __u8 log2_interleave_sectors = ic->sb->log2_interleave_sectors; - - *area = data_sector >> log2_interleave_sectors; - *offset = (unsigned)data_sector & ((1U << log2_interleave_sectors) - 1); + if (!ic->meta_dev) { + __u8 log2_interleave_sectors = ic->sb->log2_interleave_sectors; + *area = data_sector >> log2_interleave_sectors; + *offset = (unsigned)data_sector & ((1U << log2_interleave_sectors) - 1); + } else { + *area = 0; + *offset = data_sector; + } } #define sector_to_block(ic, n) \ @@ -387,6 +393,9 @@ static sector_t get_data_sector(struct dm_integrity_c *ic, sector_t area, sector { sector_t result; + if (ic->meta_dev) + return offset; + result = area << ic->sb->log2_interleave_sectors; if (likely(ic->log2_metadata_run >= 0)) result += (area + 1) << ic->log2_metadata_run; @@ -416,7 +425,7 @@ static int sync_rw_sb(struct dm_integrity_c *ic, int op, int op_flags) io_req.mem.ptr.addr = ic->sb; io_req.notify.fn = NULL; io_req.client = ic->io; - io_loc.bdev = ic->dev->bdev; + io_loc.bdev = ic->meta_dev ? ic->meta_dev->bdev : ic->dev->bdev; io_loc.sector = ic->start; io_loc.count = SB_SECTORS; @@ -763,7 +772,7 @@ static void rw_journal(struct dm_integrity_c *ic, int op, int op_flags, unsigned io_req.notify.fn = NULL; } io_req.client = ic->io; - io_loc.bdev = ic->dev->bdev; + io_loc.bdev = ic->meta_dev ? ic->meta_dev->bdev : ic->dev->bdev; io_loc.sector = ic->start + SB_SECTORS + sector; io_loc.count = n_sectors; @@ -2306,12 +2315,15 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type, watermark_percentage += ic->journal_entries / 2; do_div(watermark_percentage, ic->journal_entries); arg_count = 5; + arg_count += !!ic->meta_dev; arg_count += ic->sectors_per_block != 1; arg_count += !!ic->internal_hash_alg.alg_string; arg_count += !!ic->journal_crypt_alg.alg_string; arg_count += !!ic->journal_mac_alg.alg_string; DMEMIT("%s %llu %u %c %u", ic->dev->name, (unsigned long long)ic->start, ic->tag_size, ic->mode, arg_count); + if (ic->meta_dev) + DMEMIT(" meta_device:%s", ic->meta_dev->name); DMEMIT(" journal_sectors:%u", ic->initial_sectors - SB_SECTORS); DMEMIT(" interleave_sectors:%u", 1U << ic->sb->log2_interleave_sectors); DMEMIT(" buffer_sectors:%u", 1U << ic->log2_buffer_sectors); @@ -2341,7 +2353,10 @@ static int dm_integrity_iterate_devices(struct dm_target *ti, { struct dm_integrity_c *ic = ti->private; - return fn(ti, ic->dev, ic->start + ic->initial_sectors + ic->metadata_run, ti->len, data); + if (!ic->meta_dev) + return fn(ti, ic->dev, ic->start + ic->initial_sectors + ic->metadata_run, ti->len, data); + else + return fn(ti, ic->dev, 0, ti->len, data); } static void dm_integrity_io_hints(struct dm_target *ti, struct queue_limits *limits) @@ -2374,26 +2389,38 @@ static void calculate_journal_section_size(struct dm_integrity_c *ic) static int calculate_device_limits(struct dm_integrity_c *ic) { __u64 initial_sectors; - sector_t last_sector, last_area, last_offset; calculate_journal_section_size(ic); initial_sectors = SB_SECTORS + (__u64)ic->journal_section_sectors * ic->journal_sections; - if (initial_sectors + METADATA_PADDING_SECTORS >= ic->device_sectors || initial_sectors > UINT_MAX) + if (initial_sectors + METADATA_PADDING_SECTORS >= ic->meta_device_sectors || initial_sectors > UINT_MAX) return -EINVAL; ic->initial_sectors = initial_sectors; - ic->metadata_run = roundup((__u64)ic->tag_size << (ic->sb->log2_interleave_sectors - ic->sb->log2_sectors_per_block), - (__u64)(1 << SECTOR_SHIFT << METADATA_PADDING_SECTORS)) >> SECTOR_SHIFT; - if (!(ic->metadata_run & (ic->metadata_run - 1))) - ic->log2_metadata_run = __ffs(ic->metadata_run); - else - ic->log2_metadata_run = -1; + if (!ic->meta_dev) { + sector_t last_sector, last_area, last_offset; - get_area_and_offset(ic, ic->provided_data_sectors - 1, &last_area, &last_offset); - last_sector = get_data_sector(ic, last_area, last_offset); + ic->metadata_run = roundup((__u64)ic->tag_size << (ic->sb->log2_interleave_sectors - ic->sb->log2_sectors_per_block), + (__u64)(1 << SECTOR_SHIFT << METADATA_PADDING_SECTORS)) >> SECTOR_SHIFT; + if (!(ic->metadata_run & (ic->metadata_run - 1))) + ic->log2_metadata_run = __ffs(ic->metadata_run); + else + ic->log2_metadata_run = -1; - if (last_sector < ic->start || last_sector >= ic->device_sectors) - return -EINVAL; + get_area_and_offset(ic, ic->provided_data_sectors - 1, &last_area, &last_offset); + last_sector = get_data_sector(ic, last_area, last_offset); + if (last_sector < ic->start || last_sector >= ic->meta_device_sectors) + return -EINVAL; + } else { + __u64 meta_size = ic->provided_data_sectors * ic->tag_size; + meta_size = (meta_size + ((1U << (ic->log2_buffer_sectors + SECTOR_SHIFT)) - 1)) + >> (ic->log2_buffer_sectors + SECTOR_SHIFT); + meta_size <<= ic->log2_buffer_sectors; + if (ic->initial_sectors + meta_size < ic->initial_sectors || + ic->initial_sectors + meta_size > ic->meta_device_sectors) + return -EINVAL; + ic->metadata_run = 1; + ic->log2_metadata_run = 0; + } return 0; } @@ -2415,26 +2442,51 @@ static int initialize_superblock(struct dm_integrity_c *ic, unsigned journal_sec journal_sections = journal_sectors / ic->journal_section_sectors; if (!journal_sections) journal_sections = 1; - ic->sb->journal_sections = cpu_to_le32(journal_sections); - if (!interleave_sectors) - interleave_sectors = DEFAULT_INTERLEAVE_SECTORS; - ic->sb->log2_interleave_sectors = __fls(interleave_sectors); - ic->sb->log2_interleave_sectors = max((__u8)MIN_LOG2_INTERLEAVE_SECTORS, ic->sb->log2_interleave_sectors); - ic->sb->log2_interleave_sectors = min((__u8)MAX_LOG2_INTERLEAVE_SECTORS, ic->sb->log2_interleave_sectors); + if (!ic->meta_dev) { + ic->sb->journal_sections = cpu_to_le32(journal_sections); + if (!interleave_sectors) + interleave_sectors = DEFAULT_INTERLEAVE_SECTORS; + ic->sb->log2_interleave_sectors = __fls(interleave_sectors); + ic->sb->log2_interleave_sectors = max((__u8)MIN_LOG2_INTERLEAVE_SECTORS, ic->sb->log2_interleave_sectors); + ic->sb->log2_interleave_sectors = min((__u8)MAX_LOG2_INTERLEAVE_SECTORS, ic->sb->log2_interleave_sectors); - ic->provided_data_sectors = 0; - for (test_bit = fls64(ic->device_sectors) - 1; test_bit >= 3; test_bit--) { - __u64 prev_data_sectors = ic->provided_data_sectors; + ic->provided_data_sectors = 0; + for (test_bit = fls64(ic->meta_device_sectors) - 1; test_bit >= 3; test_bit--) { + __u64 prev_data_sectors = ic->provided_data_sectors; - ic->provided_data_sectors |= (sector_t)1 << test_bit; - if (calculate_device_limits(ic)) - ic->provided_data_sectors = prev_data_sectors; + ic->provided_data_sectors |= (sector_t)1 << test_bit; + if (calculate_device_limits(ic)) + ic->provided_data_sectors = prev_data_sectors; + } + if (!ic->provided_data_sectors) + return -EINVAL; + } else { + ic->sb->log2_interleave_sectors = 0; + ic->provided_data_sectors = ic->data_device_sectors; + ic->provided_data_sectors &= ~(sector_t)(ic->sectors_per_block - 1); + +try_smaller_buffer: + ic->sb->journal_sections = cpu_to_le32(0); + for (test_bit = fls(journal_sections) - 1; test_bit >= 0; test_bit--) { + __u32 prev_journal_sections = le32_to_cpu(ic->sb->journal_sections); + __u32 test_journal_sections = prev_journal_sections | (1U << test_bit); + if (test_journal_sections > journal_sections) + continue; + ic->sb->journal_sections = cpu_to_le32(test_journal_sections); + if (calculate_device_limits(ic)) + ic->sb->journal_sections = cpu_to_le32(prev_journal_sections); + + } + if (!le32_to_cpu(ic->sb->journal_sections)) { + if (ic->log2_buffer_sectors > 3) { + ic->log2_buffer_sectors--; + goto try_smaller_buffer; + } + return -EINVAL; + } } - if (!ic->provided_data_sectors) - return -EINVAL; - ic->sb->provided_data_sectors = cpu_to_le64(ic->provided_data_sectors); return 0; @@ -2939,9 +2991,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) goto bad; } - ic->device_sectors = i_size_read(ic->dev->bdev->bd_inode) >> SECTOR_SHIFT; - journal_sectors = min((sector_t)DEFAULT_MAX_JOURNAL_SECTORS, - ic->device_sectors >> DEFAULT_JOURNAL_SIZE_FACTOR); + journal_sectors = 0; interleave_sectors = DEFAULT_INTERLEAVE_SECTORS; buffer_sectors = DEFAULT_BUFFER_SECTORS; journal_watermark = DEFAULT_JOURNAL_WATERMARK; @@ -2964,7 +3014,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) goto bad; } if (sscanf(opt_string, "journal_sectors:%u%c", &val, &dummy) == 1) - journal_sectors = val; + journal_sectors = val ? val : 1; else if (sscanf(opt_string, "interleave_sectors:%u%c", &val, &dummy) == 1) interleave_sectors = val; else if (sscanf(opt_string, "buffer_sectors:%u%c", &val, &dummy) == 1) @@ -2973,7 +3023,17 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) journal_watermark = val; else if (sscanf(opt_string, "commit_time:%u%c", &val, &dummy) == 1) sync_msec = val; - else if (sscanf(opt_string, "block_size:%u%c", &val, &dummy) == 1) { + else if (!memcmp(opt_string, "meta_device:", strlen("meta_device:"))) { + if (ic->meta_dev) { + dm_put_device(ti, ic->meta_dev); + ic->meta_dev = NULL; + } + r = dm_get_device(ti, strchr(opt_string, ':') + 1, dm_table_get_mode(ti->table), &ic->meta_dev); + if (r) { + ti->error = "Device lookup failed"; + goto bad; + } + } else if (sscanf(opt_string, "block_size:%u%c", &val, &dummy) == 1) { if (val < 1 << SECTOR_SHIFT || val > MAX_SECTORS_PER_BLOCK << SECTOR_SHIFT || (val & (val -1))) { @@ -3004,6 +3064,21 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) } } + ic->data_device_sectors = i_size_read(ic->dev->bdev->bd_inode) >> SECTOR_SHIFT; + if (!ic->meta_dev) + ic->meta_device_sectors = ic->data_device_sectors; + else + ic->meta_device_sectors = i_size_read(ic->meta_dev->bdev->bd_inode) >> SECTOR_SHIFT; + + if (!journal_sectors) { + journal_sectors = min((sector_t)DEFAULT_MAX_JOURNAL_SECTORS, + ic->data_device_sectors >> DEFAULT_JOURNAL_SIZE_FACTOR); + } + + if (!buffer_sectors) + buffer_sectors = 1; + ic->log2_buffer_sectors = min((int)__fls(buffer_sectors), 31 - SECTOR_SHIFT); + r = get_mac(&ic->internal_hash, &ic->internal_hash_alg, &ti->error, "Invalid internal hash", "Error setting internal hash key"); if (r) @@ -3139,11 +3214,19 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) goto bad; } /* make sure that ti->max_io_len doesn't overflow */ - if (ic->sb->log2_interleave_sectors < MIN_LOG2_INTERLEAVE_SECTORS || - ic->sb->log2_interleave_sectors > MAX_LOG2_INTERLEAVE_SECTORS) { - r = -EINVAL; - ti->error = "Invalid interleave_sectors in the superblock"; - goto bad; + if (!ic->meta_dev) { + if (ic->sb->log2_interleave_sectors < MIN_LOG2_INTERLEAVE_SECTORS || + ic->sb->log2_interleave_sectors > MAX_LOG2_INTERLEAVE_SECTORS) { + r = -EINVAL; + ti->error = "Invalid interleave_sectors in the superblock"; + goto bad; + } + } else { + if (ic->sb->log2_interleave_sectors) { + r = -EINVAL; + ti->error = "Invalid interleave_sectors in the superblock"; + goto bad; + } } ic->provided_data_sectors = le64_to_cpu(ic->sb->provided_data_sectors); if (ic->provided_data_sectors != le64_to_cpu(ic->sb->provided_data_sectors)) { @@ -3157,20 +3240,28 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) ti->error = "Journal mac mismatch"; goto bad; } + +try_smaller_buffer: r = calculate_device_limits(ic); if (r) { + if (ic->meta_dev) { + if (ic->log2_buffer_sectors > 3) { + ic->log2_buffer_sectors--; + goto try_smaller_buffer; + } + } ti->error = "The device is too small"; goto bad; } + if (!ic->meta_dev) + ic->log2_buffer_sectors = min(ic->log2_buffer_sectors, (__u8)__ffs(ic->metadata_run)); + if (ti->len > ic->provided_data_sectors) { r = -EINVAL; ti->error = "Not enough provided sectors for requested mapping size"; goto bad; } - if (!buffer_sectors) - buffer_sectors = 1; - ic->log2_buffer_sectors = min3((int)__fls(buffer_sectors), (int)__ffs(ic->metadata_run), 31 - SECTOR_SHIFT); threshold = (__u64)ic->journal_entries * (100 - journal_watermark); threshold += 50; @@ -3194,8 +3285,8 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) (unsigned long long)ic->provided_data_sectors); DEBUG_print(" log2_buffer_sectors %u\n", ic->log2_buffer_sectors); - ic->bufio = dm_bufio_client_create(ic->dev->bdev, 1U << (SECTOR_SHIFT + ic->log2_buffer_sectors), - 1, 0, NULL, NULL); + ic->bufio = dm_bufio_client_create(ic->meta_dev ? ic->meta_dev->bdev : ic->dev->bdev, + 1U << (SECTOR_SHIFT + ic->log2_buffer_sectors), 1, 0, NULL, NULL); if (IS_ERR(ic->bufio)) { r = PTR_ERR(ic->bufio); ti->error = "Cannot initialize dm-bufio"; @@ -3227,9 +3318,11 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) ic->just_formatted = true; } - r = dm_set_target_max_io_len(ti, 1U << ic->sb->log2_interleave_sectors); - if (r) - goto bad; + if (!ic->meta_dev) { + r = dm_set_target_max_io_len(ti, 1U << ic->sb->log2_interleave_sectors); + if (r) + goto bad; + } if (!ic->internal_hash) dm_integrity_set(ti, ic); @@ -3265,6 +3358,8 @@ static void dm_integrity_dtr(struct dm_target *ti) dm_io_client_destroy(ic->io); if (ic->dev) dm_put_device(ti, ic->dev); + if (ic->meta_dev) + dm_put_device(ti, ic->meta_dev); dm_integrity_free_page_list(ic, ic->journal); dm_integrity_free_page_list(ic, ic->journal_io); dm_integrity_free_page_list(ic, ic->journal_xor); From 1f9fc0b826119f8d76d33c3bf60b7426be6dc19e Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Tue, 3 Jul 2018 20:13:31 +0200 Subject: [PATCH 11/26] dm integrity: use version 2 for separate metadata Use version "2" in the superblock when data and metadata devices are separate, so that the device is not accidentally read by older kernel version. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-integrity.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index fb5c8ef5b519..1097d8c25577 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -44,7 +44,8 @@ */ #define SB_MAGIC "integrt" -#define SB_VERSION 1 +#define SB_VERSION_1 1 +#define SB_VERSION_2 2 #define SB_SECTORS 8 #define MAX_SECTORS_PER_BLOCK 8 @@ -414,6 +415,14 @@ static void wraparound_section(struct dm_integrity_c *ic, unsigned *sec_ptr) *sec_ptr -= ic->journal_sections; } +static void sb_set_version(struct dm_integrity_c *ic) +{ + if (ic->meta_dev) + ic->sb->version = SB_VERSION_2; + else + ic->sb->version = SB_VERSION_1; +} + static int sync_rw_sb(struct dm_integrity_c *ic, int op, int op_flags) { struct dm_io_request io_req; @@ -2432,7 +2441,6 @@ static int initialize_superblock(struct dm_integrity_c *ic, unsigned journal_sec memset(ic->sb, 0, SB_SECTORS << SECTOR_SHIFT); memcpy(ic->sb->magic, SB_MAGIC, 8); - ic->sb->version = SB_VERSION; ic->sb->integrity_tag_size = cpu_to_le16(ic->tag_size); ic->sb->log2_sectors_per_block = __ffs(ic->sectors_per_block); if (ic->journal_mac_alg.alg_string) @@ -2489,6 +2497,8 @@ try_smaller_buffer: ic->sb->provided_data_sectors = cpu_to_le64(ic->provided_data_sectors); + sb_set_version(ic); + return 0; } @@ -3193,7 +3203,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) should_write_sb = true; } - if (ic->sb->version != SB_VERSION) { + if (!ic->sb->version || ic->sb->version > SB_VERSION_2) { r = -EINVAL; ti->error = "Unknown version"; goto bad; From 747829a8e6c6a65e096ce8dd79506cbcf83951ad Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Tue, 3 Jul 2018 20:13:32 +0200 Subject: [PATCH 12/26] dm integrity: flush journal on suspend when using separate metadata device Flush the journal on suspend when using separate data and metadata devices, so that the metadata device can be discarded and the table can be reloaded with a linear target pointing to the data device. NOTE: the journal is deliberately not flushed when using the same device for metadata and data, so that the journal replay code is tested. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-integrity.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 1097d8c25577..2d47519c54d7 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -2052,7 +2052,7 @@ static void integrity_writer(struct work_struct *w) unsigned prev_free_sectors; /* the following test is not needed, but it tests the replay code */ - if (READ_ONCE(ic->suspending)) + if (READ_ONCE(ic->suspending) && !ic->meta_dev) return; spin_lock_irq(&ic->endio_wait.lock); @@ -2287,6 +2287,8 @@ static void dm_integrity_postsuspend(struct dm_target *ti) drain_workqueue(ic->commit_wq); if (ic->mode == 'J') { + if (ic->meta_dev) + queue_work(ic->writer_wq, &ic->writer_work); drain_workqueue(ic->writer_wq); dm_integrity_flush_buffers(ic); } From a3fcf7253139609bf9ff901fbf955fba047e75dd Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Tue, 3 Jul 2018 20:13:33 +0200 Subject: [PATCH 13/26] dm integrity: recalculate checksums on creation When using external metadata device and internal hash, recalculate the checksums when the device is created - so that dm-integrity doesn't have to overwrite the device. The superblock stores the last position when the recalculation ended, so that it is properly restarted. Integrity tags that haven't been recalculated yet are ignored. Also bump the target version. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- Documentation/device-mapper/dm-integrity.txt | 4 + drivers/md/dm-integrity.c | 187 ++++++++++++++++++- 2 files changed, 187 insertions(+), 4 deletions(-) diff --git a/Documentation/device-mapper/dm-integrity.txt b/Documentation/device-mapper/dm-integrity.txt index f33e3ade7a09..297251b0d2d5 100644 --- a/Documentation/device-mapper/dm-integrity.txt +++ b/Documentation/device-mapper/dm-integrity.txt @@ -113,6 +113,10 @@ internal_hash:algorithm(:key) (the key is optional) from an upper layer target, such as dm-crypt. The upper layer target should check the validity of the integrity tags. +recalculate + Recalculate the integrity tags automatically. It is only valid + when using internal hash. + journal_crypt:algorithm(:key) (the key is optional) Encrypt the journal using given algorithm to make sure that the attacker can't read the journal. You can use a block cipher here diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 2d47519c54d7..378878599466 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -31,6 +31,8 @@ #define MIN_LOG2_INTERLEAVE_SECTORS 3 #define MAX_LOG2_INTERLEAVE_SECTORS 31 #define METADATA_WORKQUEUE_MAX_ACTIVE 16 +#define RECALC_SECTORS 8192 +#define RECALC_WRITE_SUPER 16 /* * Warning - DEBUG_PRINT prints security-sensitive data to the log, @@ -58,9 +60,12 @@ struct superblock { __u64 provided_data_sectors; /* userspace uses this value */ __u32 flags; __u8 log2_sectors_per_block; + __u8 pad[3]; + __u64 recalc_sector; }; #define SB_FLAG_HAVE_JOURNAL_MAC 0x1 +#define SB_FLAG_RECALCULATING 0x2 #define JOURNAL_ENTRY_ROUNDUP 8 @@ -214,6 +219,11 @@ struct dm_integrity_c { struct workqueue_struct *writer_wq; struct work_struct writer_work; + struct workqueue_struct *recalc_wq; + struct work_struct recalc_work; + u8 *recalc_buffer; + u8 *recalc_tags; + struct bio_list flush_bio_list; unsigned long autocommit_jiffies; @@ -417,7 +427,7 @@ static void wraparound_section(struct dm_integrity_c *ic, unsigned *sec_ptr) static void sb_set_version(struct dm_integrity_c *ic) { - if (ic->meta_dev) + if (ic->meta_dev || ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) ic->sb->version = SB_VERSION_2; else ic->sb->version = SB_VERSION_1; @@ -1777,9 +1787,14 @@ offload_to_thread: if (need_sync_io) { wait_for_completion_io(&read_comp); + if (unlikely(ic->recalc_wq != NULL) && + ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING) && + dio->range.logical_sector + dio->range.n_sectors > le64_to_cpu(ic->sb->recalc_sector)) + goto skip_check; if (likely(!bio->bi_status)) integrity_metadata(&dio->work); else +skip_check: dec_in_flight(dio); } else { @@ -2079,6 +2094,108 @@ static void integrity_writer(struct work_struct *w) spin_unlock_irq(&ic->endio_wait.lock); } +static void recalc_write_super(struct dm_integrity_c *ic) +{ + int r; + + dm_integrity_flush_buffers(ic); + if (dm_integrity_failed(ic)) + return; + + sb_set_version(ic); + r = sync_rw_sb(ic, REQ_OP_WRITE, 0); + if (unlikely(r)) + dm_integrity_io_error(ic, "writing superblock", r); +} + +static void integrity_recalc(struct work_struct *w) +{ + struct dm_integrity_c *ic = container_of(w, struct dm_integrity_c, recalc_work); + struct dm_integrity_range range; + struct dm_io_request io_req; + struct dm_io_region io_loc; + sector_t area, offset; + sector_t metadata_block; + unsigned metadata_offset; + __u8 *t; + unsigned i; + int r; + unsigned super_counter = 0; + + spin_lock_irq(&ic->endio_wait.lock); + +next_chunk: + + if (unlikely(READ_ONCE(ic->suspending))) + goto unlock_ret; + + range.logical_sector = le64_to_cpu(ic->sb->recalc_sector); + if (unlikely(range.logical_sector >= ic->provided_data_sectors)) + goto unlock_ret; + + get_area_and_offset(ic, range.logical_sector, &area, &offset); + range.n_sectors = min((sector_t)RECALC_SECTORS, ic->provided_data_sectors - range.logical_sector); + if (!ic->meta_dev) + range.n_sectors = min(range.n_sectors, (1U << ic->sb->log2_interleave_sectors) - (unsigned)offset); + + if (unlikely(!add_new_range(ic, &range, true))) + wait_and_add_new_range(ic, &range); + + spin_unlock_irq(&ic->endio_wait.lock); + + if (unlikely(++super_counter == RECALC_WRITE_SUPER)) { + recalc_write_super(ic); + super_counter = 0; + } + + if (unlikely(dm_integrity_failed(ic))) + goto err; + + io_req.bi_op = REQ_OP_READ; + io_req.bi_op_flags = 0; + io_req.mem.type = DM_IO_VMA; + io_req.mem.ptr.addr = ic->recalc_buffer; + io_req.notify.fn = NULL; + io_req.client = ic->io; + io_loc.bdev = ic->dev->bdev; + io_loc.sector = get_data_sector(ic, area, offset); + io_loc.count = range.n_sectors; + + r = dm_io(&io_req, 1, &io_loc, NULL); + if (unlikely(r)) { + dm_integrity_io_error(ic, "reading data", r); + goto err; + } + + t = ic->recalc_tags; + for (i = 0; i < range.n_sectors; i += ic->sectors_per_block) { + integrity_sector_checksum(ic, range.logical_sector + i, ic->recalc_buffer + (i << SECTOR_SHIFT), t); + t += ic->tag_size; + } + + metadata_block = get_metadata_sector_and_offset(ic, area, offset, &metadata_offset); + + r = dm_integrity_rw_tag(ic, ic->recalc_tags, &metadata_block, &metadata_offset, t - ic->recalc_tags, TAG_WRITE); + if (unlikely(r)) { + dm_integrity_io_error(ic, "writing tags", r); + goto err; + } + + spin_lock_irq(&ic->endio_wait.lock); + remove_range_unlocked(ic, &range); + ic->sb->recalc_sector = cpu_to_le64(range.logical_sector + range.n_sectors); + goto next_chunk; + +err: + remove_range(ic, &range); + return; + +unlock_ret: + spin_unlock_irq(&ic->endio_wait.lock); + + recalc_write_super(ic); +} + static void init_journal(struct dm_integrity_c *ic, unsigned start_section, unsigned n_sections, unsigned char commit_seq) { @@ -2283,6 +2400,9 @@ static void dm_integrity_postsuspend(struct dm_target *ti) WRITE_ONCE(ic->suspending, 1); + if (ic->recalc_wq) + drain_workqueue(ic->recalc_wq); + queue_work(ic->commit_wq, &ic->commit_work); drain_workqueue(ic->commit_wq); @@ -2305,6 +2425,16 @@ static void dm_integrity_resume(struct dm_target *ti) struct dm_integrity_c *ic = (struct dm_integrity_c *)ti->private; replay_journal(ic); + + if (ic->recalc_wq && ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) { + __u64 recalc_pos = le64_to_cpu(ic->sb->recalc_sector); + if (recalc_pos < ic->provided_data_sectors) { + queue_work(ic->recalc_wq, &ic->recalc_work); + } else if (recalc_pos > ic->provided_data_sectors) { + ic->sb->recalc_sector = cpu_to_le64(ic->provided_data_sectors); + recalc_write_super(ic); + } + } } static void dm_integrity_status(struct dm_target *ti, status_type_t type, @@ -2319,6 +2449,10 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type, DMEMIT("%llu %llu", (unsigned long long)atomic64_read(&ic->number_of_mismatches), (unsigned long long)ic->provided_data_sectors); + if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) + DMEMIT(" %llu", (unsigned long long)le64_to_cpu(ic->sb->recalc_sector)); + else + DMEMIT(" -"); break; case STATUSTYPE_TABLE: { @@ -2328,6 +2462,7 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type, arg_count = 5; arg_count += !!ic->meta_dev; arg_count += ic->sectors_per_block != 1; + arg_count += !!(ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)); arg_count += !!ic->internal_hash_alg.alg_string; arg_count += !!ic->journal_crypt_alg.alg_string; arg_count += !!ic->journal_mac_alg.alg_string; @@ -2335,13 +2470,15 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type, ic->tag_size, ic->mode, arg_count); if (ic->meta_dev) DMEMIT(" meta_device:%s", ic->meta_dev->name); + if (ic->sectors_per_block != 1) + DMEMIT(" block_size:%u", ic->sectors_per_block << SECTOR_SHIFT); + if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) + DMEMIT(" recalculate"); DMEMIT(" journal_sectors:%u", ic->initial_sectors - SB_SECTORS); DMEMIT(" interleave_sectors:%u", 1U << ic->sb->log2_interleave_sectors); DMEMIT(" buffer_sectors:%u", 1U << ic->log2_buffer_sectors); DMEMIT(" journal_watermark:%u", (unsigned)watermark_percentage); DMEMIT(" commit_time:%u", ic->autocommit_msec); - if (ic->sectors_per_block != 1) - DMEMIT(" block_size:%u", ic->sectors_per_block << SECTOR_SHIFT); #define EMIT_ALG(a, n) \ do { \ @@ -2947,6 +3084,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) {0, 9, "Invalid number of feature args"}, }; unsigned journal_sectors, interleave_sectors, buffer_sectors, journal_watermark, sync_msec; + bool recalculate; bool should_write_sb; __u64 threshold; unsigned long long start; @@ -3008,6 +3146,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) buffer_sectors = DEFAULT_BUFFER_SECTORS; journal_watermark = DEFAULT_JOURNAL_WATERMARK; sync_msec = DEFAULT_SYNC_MSEC; + recalculate = false; ic->sectors_per_block = 1; as.argc = argc - DIRECT_ARGUMENTS; @@ -3069,6 +3208,8 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) "Invalid journal_mac argument"); if (r) goto bad; + } else if (!strcmp(opt_string, "recalculate")) { + recalculate = true; } else { r = -EINVAL; ti->error = "Invalid argument"; @@ -3297,6 +3438,38 @@ try_smaller_buffer: (unsigned long long)ic->provided_data_sectors); DEBUG_print(" log2_buffer_sectors %u\n", ic->log2_buffer_sectors); + if (recalculate && !(ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING))) { + ic->sb->flags |= cpu_to_le32(SB_FLAG_RECALCULATING); + ic->sb->recalc_sector = cpu_to_le64(0); + } + + if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) { + if (!ic->internal_hash) { + r = -EINVAL; + ti->error = "Recalculate is only valid with internal hash"; + goto bad; + } + ic->recalc_wq = alloc_workqueue("dm-intergrity-recalc", WQ_MEM_RECLAIM, 1); + if (!ic->recalc_wq ) { + ti->error = "Cannot allocate workqueue"; + r = -ENOMEM; + goto bad; + } + INIT_WORK(&ic->recalc_work, integrity_recalc); + ic->recalc_buffer = vmalloc(RECALC_SECTORS << SECTOR_SHIFT); + if (!ic->recalc_buffer) { + ti->error = "Cannot allocate buffer for recalculating"; + r = -ENOMEM; + goto bad; + } + ic->recalc_tags = kvmalloc((RECALC_SECTORS >> ic->sb->log2_sectors_per_block) * ic->tag_size, GFP_KERNEL); + if (!ic->recalc_tags) { + ti->error = "Cannot allocate tags for recalculating"; + r = -ENOMEM; + goto bad; + } + } + ic->bufio = dm_bufio_client_create(ic->meta_dev ? ic->meta_dev->bdev : ic->dev->bdev, 1U << (SECTOR_SHIFT + ic->log2_buffer_sectors), 1, 0, NULL, NULL); if (IS_ERR(ic->bufio)) { @@ -3363,6 +3536,12 @@ static void dm_integrity_dtr(struct dm_target *ti) destroy_workqueue(ic->commit_wq); if (ic->writer_wq) destroy_workqueue(ic->writer_wq); + if (ic->recalc_wq) + destroy_workqueue(ic->recalc_wq); + if (ic->recalc_buffer) + vfree(ic->recalc_buffer); + if (ic->recalc_tags) + kvfree(ic->recalc_tags); if (ic->bufio) dm_bufio_client_destroy(ic->bufio); mempool_exit(&ic->journal_io_mempool); @@ -3412,7 +3591,7 @@ static void dm_integrity_dtr(struct dm_target *ti) static struct target_type integrity_target = { .name = "integrity", - .version = {1, 1, 0}, + .version = {1, 2, 0}, .module = THIS_MODULE, .features = DM_TARGET_SINGLETON | DM_TARGET_INTEGRITY, .ctr = dm_integrity_ctr, From c7329eff72aa237d6bedef6dc57c93dc048d2a16 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Wed, 11 Jul 2018 12:10:51 -0400 Subject: [PATCH 14/26] dm crypt: use wake_up_process() instead of a wait queue This is a small simplification of dm-crypt - use wake_up_process() instead of a wait queue in a case where only one process may be waiting. dm-writecache uses a similar pattern. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-crypt.c | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index b61b069c33af..c406767cb9b7 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -144,7 +144,7 @@ struct crypt_config { struct workqueue_struct *io_queue; struct workqueue_struct *crypt_queue; - wait_queue_head_t write_thread_wait; + spinlock_t write_thread_lock; struct task_struct *write_thread; struct rb_root write_tree; @@ -1620,36 +1620,31 @@ static int dmcrypt_write(void *data) struct rb_root write_tree; struct blk_plug plug; - DECLARE_WAITQUEUE(wait, current); - - spin_lock_irq(&cc->write_thread_wait.lock); + spin_lock_irq(&cc->write_thread_lock); continue_locked: if (!RB_EMPTY_ROOT(&cc->write_tree)) goto pop_from_list; set_current_state(TASK_INTERRUPTIBLE); - __add_wait_queue(&cc->write_thread_wait, &wait); - spin_unlock_irq(&cc->write_thread_wait.lock); + spin_unlock_irq(&cc->write_thread_lock); if (unlikely(kthread_should_stop())) { set_current_state(TASK_RUNNING); - remove_wait_queue(&cc->write_thread_wait, &wait); break; } schedule(); set_current_state(TASK_RUNNING); - spin_lock_irq(&cc->write_thread_wait.lock); - __remove_wait_queue(&cc->write_thread_wait, &wait); + spin_lock_irq(&cc->write_thread_lock); goto continue_locked; pop_from_list: write_tree = cc->write_tree; cc->write_tree = RB_ROOT; - spin_unlock_irq(&cc->write_thread_wait.lock); + spin_unlock_irq(&cc->write_thread_lock); BUG_ON(rb_parent(write_tree.rb_node)); @@ -1693,7 +1688,9 @@ static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async) return; } - spin_lock_irqsave(&cc->write_thread_wait.lock, flags); + spin_lock_irqsave(&cc->write_thread_lock, flags); + if (RB_EMPTY_ROOT(&cc->write_tree)) + wake_up_process(cc->write_thread); rbp = &cc->write_tree.rb_node; parent = NULL; sector = io->sector; @@ -1706,9 +1703,7 @@ static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async) } rb_link_node(&io->rb_node, parent, rbp); rb_insert_color(&io->rb_node, &cc->write_tree); - - wake_up_locked(&cc->write_thread_wait); - spin_unlock_irqrestore(&cc->write_thread_wait.lock, flags); + spin_unlock_irqrestore(&cc->write_thread_lock, flags); } static void kcryptd_crypt_write_convert(struct dm_crypt_io *io) @@ -2831,7 +2826,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad; } - init_waitqueue_head(&cc->write_thread_wait); + spin_lock_init(&cc->write_thread_lock); cc->write_tree = RB_ROOT; cc->write_thread = kthread_create(dmcrypt_write, cc, "dmcrypt_write"); From c07c88f54f2323516e8038aa9301ab0db4812c81 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sun, 15 Jul 2018 20:59:12 -0700 Subject: [PATCH 15/26] dm crypt: convert essiv from ahash to shash In preparing to remove all stack VLA usage from the kernel[1], remove the discouraged use of AHASH_REQUEST_ON_STACK in favor of the smaller SHASH_DESC_ON_STACK by converting from ahash-wrapped-shash to direct shash. The stack allocation will be made a fixed size in a later patch to the crypto subsystem. [1] https://lkml.kernel.org/r/CA+55aFzCG-zNmZwX4A2FQpadafLfEzK6CC=qPXydAacU1RqZWA@mail.gmail.com Signed-off-by: Kees Cook Reviewed-by: Eric Biggers Signed-off-by: Mike Snitzer --- drivers/md/dm-crypt.c | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index c406767cb9b7..d412bd4b911c 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -99,7 +99,7 @@ struct crypt_iv_operations { }; struct iv_essiv_private { - struct crypto_ahash *hash_tfm; + struct crypto_shash *hash_tfm; u8 *salt; }; @@ -327,25 +327,22 @@ static int crypt_iv_plain64be_gen(struct crypt_config *cc, u8 *iv, static int crypt_iv_essiv_init(struct crypt_config *cc) { struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; - AHASH_REQUEST_ON_STACK(req, essiv->hash_tfm); - struct scatterlist sg; + SHASH_DESC_ON_STACK(desc, essiv->hash_tfm); struct crypto_cipher *essiv_tfm; int err; - sg_init_one(&sg, cc->key, cc->key_size); - ahash_request_set_tfm(req, essiv->hash_tfm); - ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL); - ahash_request_set_crypt(req, &sg, essiv->salt, cc->key_size); + desc->tfm = essiv->hash_tfm; + desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; - err = crypto_ahash_digest(req); - ahash_request_zero(req); + err = crypto_shash_digest(desc, cc->key, cc->key_size, essiv->salt); + shash_desc_zero(desc); if (err) return err; essiv_tfm = cc->iv_private; err = crypto_cipher_setkey(essiv_tfm, essiv->salt, - crypto_ahash_digestsize(essiv->hash_tfm)); + crypto_shash_digestsize(essiv->hash_tfm)); if (err) return err; @@ -356,7 +353,7 @@ static int crypt_iv_essiv_init(struct crypt_config *cc) static int crypt_iv_essiv_wipe(struct crypt_config *cc) { struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; - unsigned salt_size = crypto_ahash_digestsize(essiv->hash_tfm); + unsigned salt_size = crypto_shash_digestsize(essiv->hash_tfm); struct crypto_cipher *essiv_tfm; int r, err = 0; @@ -408,7 +405,7 @@ static void crypt_iv_essiv_dtr(struct crypt_config *cc) struct crypto_cipher *essiv_tfm; struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; - crypto_free_ahash(essiv->hash_tfm); + crypto_free_shash(essiv->hash_tfm); essiv->hash_tfm = NULL; kzfree(essiv->salt); @@ -426,7 +423,7 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, const char *opts) { struct crypto_cipher *essiv_tfm = NULL; - struct crypto_ahash *hash_tfm = NULL; + struct crypto_shash *hash_tfm = NULL; u8 *salt = NULL; int err; @@ -436,14 +433,14 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, } /* Allocate hash algorithm */ - hash_tfm = crypto_alloc_ahash(opts, 0, CRYPTO_ALG_ASYNC); + hash_tfm = crypto_alloc_shash(opts, 0, 0); if (IS_ERR(hash_tfm)) { ti->error = "Error initializing ESSIV hash"; err = PTR_ERR(hash_tfm); goto bad; } - salt = kzalloc(crypto_ahash_digestsize(hash_tfm), GFP_KERNEL); + salt = kzalloc(crypto_shash_digestsize(hash_tfm), GFP_KERNEL); if (!salt) { ti->error = "Error kmallocing salt storage in ESSIV"; err = -ENOMEM; @@ -454,7 +451,7 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, cc->iv_gen_private.essiv.hash_tfm = hash_tfm; essiv_tfm = alloc_essiv_cipher(cc, ti, salt, - crypto_ahash_digestsize(hash_tfm)); + crypto_shash_digestsize(hash_tfm)); if (IS_ERR(essiv_tfm)) { crypt_iv_essiv_dtr(cc); return PTR_ERR(essiv_tfm); @@ -465,7 +462,7 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, bad: if (hash_tfm && !IS_ERR(hash_tfm)) - crypto_free_ahash(hash_tfm); + crypto_free_shash(hash_tfm); kfree(salt); return err; } From 9ff07e7d634cb005e7c5dc5e2c28a06508eb4fbf Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Wed, 25 Jul 2018 02:34:06 -0400 Subject: [PATCH 16/26] dm writecache: report start_sector in status line Fixes: d284f8248c7 ("dm writecache: support optional offset for start of device") Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-writecache.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c index 87107c995cb5..e672e1d17bf1 100644 --- a/drivers/md/dm-writecache.c +++ b/drivers/md/dm-writecache.c @@ -2240,6 +2240,8 @@ static void writecache_status(struct dm_target *ti, status_type_t type, DMEMIT("%c %s %s %u ", WC_MODE_PMEM(wc) ? 'p' : 's', wc->dev->name, wc->ssd_dev->name, wc->block_size); extra_args = 0; + if (wc->start_sector) + extra_args += 2; if (wc->high_wm_percent_set) extra_args += 2; if (wc->low_wm_percent_set) @@ -2254,6 +2256,8 @@ static void writecache_status(struct dm_target *ti, status_type_t type, extra_args++; DMEMIT("%u", extra_args); + if (wc->start_sector) + DMEMIT(" start_sector %llu", (unsigned long long)wc->start_sector); if (wc->high_wm_percent_set) { x = (uint64_t)wc->freelist_high_watermark * 100; x += wc->n_blocks / 2; @@ -2280,7 +2284,7 @@ static void writecache_status(struct dm_target *ti, status_type_t type, static struct target_type writecache_target = { .name = "writecache", - .version = {1, 1, 0}, + .version = {1, 1, 1}, .module = THIS_MODULE, .ctr = writecache_ctr, .dtr = writecache_dtr, From 63c8ecb6261abcb79191a264778e8dae222e67cf Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Fri, 27 Jul 2018 15:51:57 -0700 Subject: [PATCH 17/26] dm thin: include metadata_low_watermark threshold in pool status The metadata low watermark threshold is set by the kernel. But the kernel depends on userspace to extend the thinpool metadata device when the threshold is crossed. Since the metadata low watermark threshold is not visible to userspace, upon receiving an event, userspace cannot tell that the kernel wants the metadata device extended, instead of some other eventing condition. Making it visible (but not settable) enables userspace to affirmatively know the kernel is asking for a metadata device extension, by comparing metadata_low_watermark against nr_free_blocks_metadata, also reported in status. Current solutions like dmeventd have their own thresholds for extending the data and metadata devices, and both devices are checked against their thresholds on each event. This lessens the value of the kernel-set threshold, since userspace will either extend the metadata device sooner, when receiving another event; or will receive the metadata lowater event and do nothing, if dmeventd's threshold is less than the kernel's. (This second case is dangerous. The metadata lowater event will not be re-sent, so no further event will be generated before the metadata device is out if space, unless some other event causes userspace to recheck its thresholds.) Signed-off-by: Andy Grover Signed-off-by: Mike Snitzer --- Documentation/device-mapper/thin-provisioning.txt | 7 ++++++- drivers/md/dm-thin.c | 6 ++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/Documentation/device-mapper/thin-provisioning.txt b/Documentation/device-mapper/thin-provisioning.txt index b8a57b9cec19..883e7ca5f745 100644 --- a/Documentation/device-mapper/thin-provisioning.txt +++ b/Documentation/device-mapper/thin-provisioning.txt @@ -281,7 +281,7 @@ ii) Status / / ro|rw|out_of_data_space [no_]discard_passdown [error|queue]_if_no_space - needs_check|- + needs_check|- metadata_low_watermark transaction id: A 64-bit number used by userspace to help synchronise with metadata @@ -328,6 +328,11 @@ ii) Status thin-pool can be made fully operational again. '-' indicates needs_check is not set. + metadata_low_watermark: + Value of metadata low watermark in blocks. The kernel sets this + value internally but userspace needs to know this value to + determine if an event was caused by crossing this threshold. + iii) Messages create_thin diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index b900723bbd0f..c44477d7a9ea 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -3890,6 +3890,8 @@ static void pool_status(struct dm_target *ti, status_type_t type, else DMEMIT("- "); + DMEMIT("%llu ", (unsigned long long)calc_metadata_threshold(pt)); + break; case STATUSTYPE_TABLE: @@ -3979,7 +3981,7 @@ static struct target_type pool_target = { .name = "thin-pool", .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | DM_TARGET_IMMUTABLE, - .version = {1, 19, 0}, + .version = {1, 20, 0}, .module = THIS_MODULE, .ctr = pool_ctr, .dtr = pool_dtr, @@ -4353,7 +4355,7 @@ static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits) static struct target_type thin_target = { .name = "thin", - .version = {1, 19, 0}, + .version = {1, 20, 0}, .module = THIS_MODULE, .ctr = thin_ctr, .dtr = thin_dtr, From 7209049d40dc37791ce0f3738965296f30e26044 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Tue, 31 Jul 2018 17:27:02 -0400 Subject: [PATCH 18/26] dm kcopyd: return void from dm_kcopyd_copy() dm_kcopyd_copy() only ever returns 0 so there is no need for callers to account for possible failure. Same goes for dm_kcopyd_zero(). Signed-off-by: Mike Snitzer --- drivers/md/dm-cache-target.c | 16 ++++------------ drivers/md/dm-kcopyd.c | 16 +++++++--------- drivers/md/dm-raid1.c | 17 +++++------------ drivers/md/dm-thin.c | 23 +++-------------------- drivers/md/dm-zoned-reclaim.c | 6 ++---- include/linux/dm-kcopyd.h | 12 ++++++------ 6 files changed, 27 insertions(+), 63 deletions(-) diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 44df244807e5..a53413371725 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -1188,9 +1188,8 @@ static void copy_complete(int read_err, unsigned long write_err, void *context) queue_continuation(mg->cache->wq, &mg->k); } -static int copy(struct dm_cache_migration *mg, bool promote) +static void copy(struct dm_cache_migration *mg, bool promote) { - int r; struct dm_io_region o_region, c_region; struct cache *cache = mg->cache; @@ -1203,11 +1202,9 @@ static int copy(struct dm_cache_migration *mg, bool promote) c_region.count = cache->sectors_per_block; if (promote) - r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, &mg->k); + dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, &mg->k); else - r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, &mg->k); - - return r; + dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, &mg->k); } static void bio_drop_shared_lock(struct cache *cache, struct bio *bio) @@ -1449,12 +1446,7 @@ static void mg_full_copy(struct work_struct *ws) } init_continuation(&mg->k, mg_upgrade_lock); - - if (copy(mg, is_policy_promote)) { - DMERR_LIMIT("%s: migration copy failed", cache_device_name(cache)); - mg->k.input = BLK_STS_IOERR; - mg_complete(mg, false); - } + copy(mg, is_policy_promote); } static void mg_copy(struct work_struct *ws) diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c index 3c7547a3c371..cc101f3ec42c 100644 --- a/drivers/md/dm-kcopyd.c +++ b/drivers/md/dm-kcopyd.c @@ -741,9 +741,9 @@ static void split_job(struct kcopyd_job *master_job) } } -int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from, - unsigned int num_dests, struct dm_io_region *dests, - unsigned int flags, dm_kcopyd_notify_fn fn, void *context) +void dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from, + unsigned int num_dests, struct dm_io_region *dests, + unsigned int flags, dm_kcopyd_notify_fn fn, void *context) { struct kcopyd_job *job; int i; @@ -818,16 +818,14 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from, job->progress = 0; split_job(job); } - - return 0; } EXPORT_SYMBOL(dm_kcopyd_copy); -int dm_kcopyd_zero(struct dm_kcopyd_client *kc, - unsigned num_dests, struct dm_io_region *dests, - unsigned flags, dm_kcopyd_notify_fn fn, void *context) +void dm_kcopyd_zero(struct dm_kcopyd_client *kc, + unsigned num_dests, struct dm_io_region *dests, + unsigned flags, dm_kcopyd_notify_fn fn, void *context) { - return dm_kcopyd_copy(kc, NULL, num_dests, dests, flags, fn, context); + dm_kcopyd_copy(kc, NULL, num_dests, dests, flags, fn, context); } EXPORT_SYMBOL(dm_kcopyd_zero); diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 5903e492bb34..79eab1071ec2 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -326,9 +326,8 @@ static void recovery_complete(int read_err, unsigned long write_err, dm_rh_recovery_end(reg, !(read_err || write_err)); } -static int recover(struct mirror_set *ms, struct dm_region *reg) +static void recover(struct mirror_set *ms, struct dm_region *reg) { - int r; unsigned i; struct dm_io_region from, to[DM_KCOPYD_MAX_REGIONS], *dest; struct mirror *m; @@ -367,10 +366,8 @@ static int recover(struct mirror_set *ms, struct dm_region *reg) if (!errors_handled(ms)) set_bit(DM_KCOPYD_IGNORE_ERROR, &flags); - r = dm_kcopyd_copy(ms->kcopyd_client, &from, ms->nr_mirrors - 1, to, - flags, recovery_complete, reg); - - return r; + dm_kcopyd_copy(ms->kcopyd_client, &from, ms->nr_mirrors - 1, to, + flags, recovery_complete, reg); } static void reset_ms_flags(struct mirror_set *ms) @@ -388,7 +385,6 @@ static void do_recovery(struct mirror_set *ms) { struct dm_region *reg; struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); - int r; /* * Start quiescing some regions. @@ -398,11 +394,8 @@ static void do_recovery(struct mirror_set *ms) /* * Copy any already quiesced regions. */ - while ((reg = dm_rh_recovery_start(ms->rh))) { - r = recover(ms, reg); - if (r) - dm_rh_recovery_end(reg, 0); - } + while ((reg = dm_rh_recovery_start(ms->rh))) + recover(ms, reg); /* * Update the in sync flag. diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index c44477d7a9ea..5997d6808b57 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -1220,18 +1220,13 @@ static struct dm_thin_new_mapping *get_next_mapping(struct pool *pool) static void ll_zero(struct thin_c *tc, struct dm_thin_new_mapping *m, sector_t begin, sector_t end) { - int r; struct dm_io_region to; to.bdev = tc->pool_dev->bdev; to.sector = begin; to.count = end - begin; - r = dm_kcopyd_zero(tc->pool->copier, 1, &to, 0, copy_complete, m); - if (r < 0) { - DMERR_LIMIT("dm_kcopyd_zero() failed"); - copy_complete(1, 1, m); - } + dm_kcopyd_zero(tc->pool->copier, 1, &to, 0, copy_complete, m); } static void remap_and_issue_overwrite(struct thin_c *tc, struct bio *bio, @@ -1257,7 +1252,6 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, struct dm_bio_prison_cell *cell, struct bio *bio, sector_t len) { - int r; struct pool *pool = tc->pool; struct dm_thin_new_mapping *m = get_next_mapping(pool); @@ -1296,19 +1290,8 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, to.sector = data_dest * pool->sectors_per_block; to.count = len; - r = dm_kcopyd_copy(pool->copier, &from, 1, &to, - 0, copy_complete, m); - if (r < 0) { - DMERR_LIMIT("dm_kcopyd_copy() failed"); - copy_complete(1, 1, m); - - /* - * We allow the zero to be issued, to simplify the - * error path. Otherwise we'd need to start - * worrying about decrementing the prepare_actions - * counter. - */ - } + dm_kcopyd_copy(pool->copier, &from, 1, &to, + 0, copy_complete, m); /* * Do we need to zero a tail region? diff --git a/drivers/md/dm-zoned-reclaim.c b/drivers/md/dm-zoned-reclaim.c index 44a119e12f1a..edf4b95eb075 100644 --- a/drivers/md/dm-zoned-reclaim.c +++ b/drivers/md/dm-zoned-reclaim.c @@ -161,10 +161,8 @@ static int dmz_reclaim_copy(struct dmz_reclaim *zrc, /* Copy the valid region */ set_bit(DMZ_RECLAIM_KCOPY, &zrc->flags); - ret = dm_kcopyd_copy(zrc->kc, &src, 1, &dst, flags, - dmz_reclaim_kcopy_end, zrc); - if (ret) - return ret; + dm_kcopyd_copy(zrc->kc, &src, 1, &dst, flags, + dmz_reclaim_kcopy_end, zrc); /* Wait for copy to complete */ wait_on_bit_io(&zrc->flags, DMZ_RECLAIM_KCOPY, diff --git a/include/linux/dm-kcopyd.h b/include/linux/dm-kcopyd.h index cfac8588ed56..e42de7750c88 100644 --- a/include/linux/dm-kcopyd.h +++ b/include/linux/dm-kcopyd.h @@ -62,9 +62,9 @@ void dm_kcopyd_client_destroy(struct dm_kcopyd_client *kc); typedef void (*dm_kcopyd_notify_fn)(int read_err, unsigned long write_err, void *context); -int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from, - unsigned num_dests, struct dm_io_region *dests, - unsigned flags, dm_kcopyd_notify_fn fn, void *context); +void dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from, + unsigned num_dests, struct dm_io_region *dests, + unsigned flags, dm_kcopyd_notify_fn fn, void *context); /* * Prepare a callback and submit it via the kcopyd thread. @@ -81,9 +81,9 @@ void *dm_kcopyd_prepare_callback(struct dm_kcopyd_client *kc, dm_kcopyd_notify_fn fn, void *context); void dm_kcopyd_do_callback(void *job, int read_err, unsigned long write_err); -int dm_kcopyd_zero(struct dm_kcopyd_client *kc, - unsigned num_dests, struct dm_io_region *dests, - unsigned flags, dm_kcopyd_notify_fn fn, void *context); +void dm_kcopyd_zero(struct dm_kcopyd_client *kc, + unsigned num_dests, struct dm_io_region *dests, + unsigned flags, dm_kcopyd_notify_fn fn, void *context); #endif /* __KERNEL__ */ #endif /* _LINUX_DM_KCOPYD_H */ From 75294442d896f2767be34f75aca7cc2b0d01301f Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Thu, 2 Aug 2018 16:18:24 +0800 Subject: [PATCH 19/26] dm thin: stop no_space_timeout worker when switching to write-mode Now both check_for_space() and do_no_space_timeout() will read & write pool->pf.error_if_no_space. If these functions run concurrently, as shown in the following case, the default setting of "queue_if_no_space" can get lost. precondition: * error_if_no_space = false (aka "queue_if_no_space") * pool is in Out-of-Data-Space (OODS) mode * no_space_timeout worker has been queued CPU 0: CPU 1: // delete a thin device process_delete_mesg() // check_for_space() invoked by commit() set_pool_mode(pool, PM_WRITE) pool->pf.error_if_no_space = \ pt->requested_pf.error_if_no_space // timeout, pool is still in OODS mode do_no_space_timeout // "queue_if_no_space" config is lost pool->pf.error_if_no_space = true pool->pf.mode = new_mode Fix it by stopping no_space_timeout worker when switching to write mode. Fixes: bcc696fac11f ("dm thin: stay in out-of-data-space mode once no_space_timeout expires") Cc: stable@vger.kernel.org Signed-off-by: Hou Tao Signed-off-by: Mike Snitzer --- drivers/md/dm-thin.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 5997d6808b57..7bd60a150f8f 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -2503,6 +2503,8 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode) case PM_WRITE: if (old_mode != new_mode) notify_of_pool_mode_change(pool, "write"); + if (old_mode == PM_OUT_OF_DATA_SPACE) + cancel_delayed_work_sync(&pool->no_space_timeout); pool->out_of_data_space = false; pool->pf.error_if_no_space = pt->requested_pf.error_if_no_space; dm_pool_metadata_read_write(pool->pmd); From fd2fa95416188a767a63979296fa3e169a9ef5ec Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 2 Aug 2018 16:08:52 -0400 Subject: [PATCH 20/26] dm cache metadata: save in-core policy_hint_size to on-disk superblock policy_hint_size starts as 0 during __write_initial_superblock(). It isn't until the policy is loaded that policy_hint_size is set in-core (cmd->policy_hint_size). But it never got recorded in the on-disk superblock because __commit_transaction() didn't deal with transfering the in-core cmd->policy_hint_size to the on-disk superblock. The in-core cmd->policy_hint_size gets initialized by metadata_open()'s __begin_transaction_flags() which re-reads all superblock fields. Because the superblock's policy_hint_size was never properly stored, when the cache was created, hints_array_available() would always return false when re-activating a previously created cache. This means __load_mappings() always considered the hints invalid and never made use of the hints (these hints served to optimize). Another detremental side-effect of this oversight is the cache_check utility would fail with: "invalid hint width: 0" Cc: stable@vger.kernel.org Signed-off-by: Mike Snitzer --- drivers/md/dm-cache-metadata.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c index 0d7212410e21..1a449105b007 100644 --- a/drivers/md/dm-cache-metadata.c +++ b/drivers/md/dm-cache-metadata.c @@ -363,7 +363,7 @@ static int __write_initial_superblock(struct dm_cache_metadata *cmd) disk_super->version = cpu_to_le32(cmd->version); memset(disk_super->policy_name, 0, sizeof(disk_super->policy_name)); memset(disk_super->policy_version, 0, sizeof(disk_super->policy_version)); - disk_super->policy_hint_size = 0; + disk_super->policy_hint_size = cpu_to_le32(0); __copy_sm_root(cmd, disk_super); @@ -701,6 +701,7 @@ static int __commit_transaction(struct dm_cache_metadata *cmd, disk_super->policy_version[0] = cpu_to_le32(cmd->policy_version[0]); disk_super->policy_version[1] = cpu_to_le32(cmd->policy_version[1]); disk_super->policy_version[2] = cpu_to_le32(cmd->policy_version[2]); + disk_super->policy_hint_size = cpu_to_le32(cmd->policy_hint_size); disk_super->read_hits = cpu_to_le32(cmd->stats.read_hits); disk_super->read_misses = cpu_to_le32(cmd->stats.read_misses); From 784c9a29e99eb40b842c29ecf1cc3a79e00fb629 Mon Sep 17 00:00:00 2001 From: John Pittman Date: Mon, 6 Aug 2018 15:53:12 -0400 Subject: [PATCH 21/26] dm kcopyd: avoid softlockup in run_complete_job It was reported that softlockups occur when using dm-snapshot ontop of slow (rbd) storage. E.g.: [ 4047.990647] watchdog: BUG: soft lockup - CPU#10 stuck for 22s! [kworker/10:23:26177] ... [ 4048.034151] Workqueue: kcopyd do_work [dm_mod] [ 4048.034156] RIP: 0010:copy_callback+0x41/0x160 [dm_snapshot] ... [ 4048.034190] Call Trace: [ 4048.034196] ? __chunk_is_tracked+0x70/0x70 [dm_snapshot] [ 4048.034200] run_complete_job+0x5f/0xb0 [dm_mod] [ 4048.034205] process_jobs+0x91/0x220 [dm_mod] [ 4048.034210] ? kcopyd_put_pages+0x40/0x40 [dm_mod] [ 4048.034214] do_work+0x46/0xa0 [dm_mod] [ 4048.034219] process_one_work+0x171/0x370 [ 4048.034221] worker_thread+0x1fc/0x3f0 [ 4048.034224] kthread+0xf8/0x130 [ 4048.034226] ? max_active_store+0x80/0x80 [ 4048.034227] ? kthread_bind+0x10/0x10 [ 4048.034231] ret_from_fork+0x35/0x40 [ 4048.034233] Kernel panic - not syncing: softlockup: hung tasks Fix this by calling cond_resched() after run_complete_job()'s callout to the dm_kcopyd_notify_fn (which is dm-snap.c:copy_callback in the above trace). Signed-off-by: John Pittman Signed-off-by: Mike Snitzer --- drivers/md/dm-kcopyd.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c index cc101f3ec42c..2fc4213e02b5 100644 --- a/drivers/md/dm-kcopyd.c +++ b/drivers/md/dm-kcopyd.c @@ -487,6 +487,8 @@ static int run_complete_job(struct kcopyd_job *job) if (atomic_dec_and_test(&kc->nr_jobs)) wake_up(&kc->destroyq); + cond_resched(); + return 0; } From 3db2776d9fca45305e6c2065905d9a0e7b2c8212 Mon Sep 17 00:00:00 2001 From: David Jeffery Date: Tue, 7 Aug 2018 16:56:00 -0400 Subject: [PATCH 22/26] dm snapshot: improve performance by switching out_of_order_list to rbtree copy_complete()'s processing of out_of_order_list can result in quadratic complexity in the worst case. As such it was the source of consuming too much cpu and the source of significant loss in performance. Fix this by converting out_of_order_list to an rbtree. This improved a dm-snapshot test copy workload from 32 seconds to 4 seconds. Signed-off-by: David Jeffery Signed-off-by: Mikulas Patocka Tested-by: Brett Hull Signed-off-by: Mike Snitzer --- drivers/md/dm-snap.c | 39 ++++++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 13 deletions(-) diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 97de7a7334d4..6f72ac7bbf9a 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -85,7 +85,7 @@ struct dm_snapshot { * A list of pending exceptions that completed out of order. * Protected by kcopyd single-threaded callback. */ - struct list_head out_of_order_list; + struct rb_root out_of_order_tree; mempool_t pending_pool; @@ -200,7 +200,7 @@ struct dm_snap_pending_exception { /* A sequence number, it is used for in-order completion. */ sector_t exception_sequence; - struct list_head out_of_order_entry; + struct rb_node out_of_order_node; /* * For writing a complete chunk, bypassing the copy. @@ -1173,7 +1173,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) atomic_set(&s->pending_exceptions_count, 0); s->exception_start_sequence = 0; s->exception_complete_sequence = 0; - INIT_LIST_HEAD(&s->out_of_order_list); + s->out_of_order_tree = RB_ROOT; mutex_init(&s->lock); INIT_LIST_HEAD(&s->list); spin_lock_init(&s->pe_lock); @@ -1539,28 +1539,41 @@ static void copy_callback(int read_err, unsigned long write_err, void *context) pe->copy_error = read_err || write_err; if (pe->exception_sequence == s->exception_complete_sequence) { + struct rb_node *next; + s->exception_complete_sequence++; complete_exception(pe); - while (!list_empty(&s->out_of_order_list)) { - pe = list_entry(s->out_of_order_list.next, - struct dm_snap_pending_exception, out_of_order_entry); + next = rb_first(&s->out_of_order_tree); + while (next) { + pe = rb_entry(next, struct dm_snap_pending_exception, + out_of_order_node); if (pe->exception_sequence != s->exception_complete_sequence) break; + next = rb_next(next); s->exception_complete_sequence++; - list_del(&pe->out_of_order_entry); + rb_erase(&pe->out_of_order_node, &s->out_of_order_tree); complete_exception(pe); + cond_resched(); } } else { - struct list_head *lh; + struct rb_node *parent = NULL; + struct rb_node **p = &s->out_of_order_tree.rb_node; struct dm_snap_pending_exception *pe2; - list_for_each_prev(lh, &s->out_of_order_list) { - pe2 = list_entry(lh, struct dm_snap_pending_exception, out_of_order_entry); - if (pe2->exception_sequence < pe->exception_sequence) - break; + while (*p) { + pe2 = rb_entry(*p, struct dm_snap_pending_exception, out_of_order_node); + parent = *p; + + BUG_ON(pe->exception_sequence == pe2->exception_sequence); + if (pe->exception_sequence < pe2->exception_sequence) + p = &((*p)->rb_left); + else + p = &((*p)->rb_right); } - list_add(&pe->out_of_order_entry, lh); + + rb_link_node(&pe->out_of_order_node, parent, p); + rb_insert_color(&pe->out_of_order_node, &s->out_of_order_tree); } } From c9a5e6a968bd328753b694d19b952068c65dc5e7 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Wed, 8 Aug 2018 20:50:58 -0400 Subject: [PATCH 23/26] dm snapshot: remove stale FIXME in snapshot_map() Commit ae1093be ("dm snapshot: use mutex instead of rw_semaphore") eliminated the need to worry about read vs write locking. So remove a FIXME in snapshot_map() that is concerned about selectively taking a write lock. Signed-off-by: Mike Snitzer --- drivers/md/dm-snap.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 6f72ac7bbf9a..ae4b33d10924 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -1707,8 +1707,6 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio) if (!s->valid) return DM_MAPIO_KILL; - /* FIXME: should only take write lock if we need - * to copy an exception */ mutex_lock(&s->lock); if (!s->valid || (unlikely(s->snapshot_overflowed) && From 5b1fe7bec8a8d0cc547a22e7ddc2bd59acd67de4 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 9 Aug 2018 12:38:28 +0200 Subject: [PATCH 24/26] dm cache metadata: set dirty on all cache blocks after a crash Quoting Documentation/device-mapper/cache.txt: The 'dirty' state for a cache block changes far too frequently for us to keep updating it on the fly. So we treat it as a hint. In normal operation it will be written when the dm device is suspended. If the system crashes all cache blocks will be assumed dirty when restarted. This got broken in commit f177940a8091 ("dm cache metadata: switch to using the new cursor api for loading metadata") in 4.9, which removed the code that consulted cmd->clean_when_opened (CLEAN_SHUTDOWN on-disk flag) when loading cache blocks. This results in data corruption on an unclean shutdown with dirty cache blocks on the fast device. After the crash those blocks are considered clean and may get evicted from the cache at any time. This can be demonstrated by doing a lot of reads to trigger individual evictions, but uncache is more predictable: ### Disable auto-activation in lvm.conf to be able to do uncache in ### time (i.e. see uncache doing flushing) when the fix is applied. # xfs_io -d -c 'pwrite -b 4M -S 0xaa 0 1G' /dev/vdb # vgcreate vg_cache /dev/vdb /dev/vdc # lvcreate -L 1G -n lv_slowdev vg_cache /dev/vdb # lvcreate -L 512M -n lv_cachedev vg_cache /dev/vdc # lvcreate -L 256M -n lv_metadev vg_cache /dev/vdc # lvconvert --type cache-pool --cachemode writeback vg_cache/lv_cachedev --poolmetadata vg_cache/lv_metadev # lvconvert --type cache vg_cache/lv_slowdev --cachepool vg_cache/lv_cachedev # xfs_io -d -c 'pwrite -b 4M -S 0xbb 0 512M' /dev/mapper/vg_cache-lv_slowdev # xfs_io -d -c 'pread -v 254M 512' /dev/mapper/vg_cache-lv_slowdev | head -n 2 0fe00000: bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb ................ 0fe00010: bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb ................ # dmsetup status vg_cache-lv_slowdev 0 2097152 cache 8 27/65536 128 8192/8192 1 100 0 0 0 8192 7065 2 metadata2 writeback 2 migration_threshold 2048 smq 0 rw - ^^^^ 7065 * 64k = 441M yet to be written to the slow device # echo b >/proc/sysrq-trigger # vgchange -ay vg_cache # xfs_io -d -c 'pread -v 254M 512' /dev/mapper/vg_cache-lv_slowdev | head -n 2 0fe00000: bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb ................ 0fe00010: bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb ................ # lvconvert --uncache vg_cache/lv_slowdev Flushing 0 blocks for cache vg_cache/lv_slowdev. Logical volume "lv_cachedev" successfully removed Logical volume vg_cache/lv_slowdev is not cached. # xfs_io -d -c 'pread -v 254M 512' /dev/mapper/vg_cache-lv_slowdev | head -n 2 0fe00000: aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa ................ 0fe00010: aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa ................ This is the case with both v1 and v2 cache pool metatata formats. After applying this patch: # vgchange -ay vg_cache # xfs_io -d -c 'pread -v 254M 512' /dev/mapper/vg_cache-lv_slowdev | head -n 2 0fe00000: bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb ................ 0fe00010: bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb ................ # lvconvert --uncache vg_cache/lv_slowdev Flushing 3724 blocks for cache vg_cache/lv_slowdev. ... Flushing 71 blocks for cache vg_cache/lv_slowdev. Logical volume "lv_cachedev" successfully removed Logical volume vg_cache/lv_slowdev is not cached. # xfs_io -d -c 'pread -v 254M 512' /dev/mapper/vg_cache-lv_slowdev | head -n 2 0fe00000: bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb ................ 0fe00010: bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb ................ Cc: stable@vger.kernel.org Fixes: f177940a8091 ("dm cache metadata: switch to using the new cursor api for loading metadata") Signed-off-by: Ilya Dryomov Signed-off-by: Mike Snitzer --- drivers/md/dm-cache-metadata.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c index 1a449105b007..69dddeab124c 100644 --- a/drivers/md/dm-cache-metadata.c +++ b/drivers/md/dm-cache-metadata.c @@ -1323,6 +1323,7 @@ static int __load_mapping_v1(struct dm_cache_metadata *cmd, dm_oblock_t oblock; unsigned flags; + bool dirty = true; dm_array_cursor_get_value(mapping_cursor, (void **) &mapping_value_le); memcpy(&mapping, mapping_value_le, sizeof(mapping)); @@ -1333,8 +1334,10 @@ static int __load_mapping_v1(struct dm_cache_metadata *cmd, dm_array_cursor_get_value(hint_cursor, (void **) &hint_value_le); memcpy(&hint, hint_value_le, sizeof(hint)); } + if (cmd->clean_when_opened) + dirty = flags & M_DIRTY; - r = fn(context, oblock, to_cblock(cb), flags & M_DIRTY, + r = fn(context, oblock, to_cblock(cb), dirty, le32_to_cpu(hint), hints_valid); if (r) { DMERR("policy couldn't load cache block %llu", @@ -1362,7 +1365,7 @@ static int __load_mapping_v2(struct dm_cache_metadata *cmd, dm_oblock_t oblock; unsigned flags; - bool dirty; + bool dirty = true; dm_array_cursor_get_value(mapping_cursor, (void **) &mapping_value_le); memcpy(&mapping, mapping_value_le, sizeof(mapping)); @@ -1373,8 +1376,9 @@ static int __load_mapping_v2(struct dm_cache_metadata *cmd, dm_array_cursor_get_value(hint_cursor, (void **) &hint_value_le); memcpy(&hint, hint_value_le, sizeof(hint)); } + if (cmd->clean_when_opened) + dirty = dm_bitset_cursor_get_value(dirty_cursor); - dirty = dm_bitset_cursor_get_value(dirty_cursor); r = fn(context, oblock, to_cblock(cb), dirty, le32_to_cpu(hint), hints_valid); if (r) { From bc9e9cf0401f18e33b78d4c8a518661b8346baf7 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 10 Aug 2018 11:23:56 -0400 Subject: [PATCH 25/26] dm crypt: don't decrease device limits dm-crypt should only increase device limits, it should not decrease them. This fixes a bug where the user could creates a crypt device with 1024 sector size on the top of scsi device that had 4096 logical block size. The limit 4096 would be lost and the user could incorrectly send 1024-I/Os to the crypt device. Cc: stable@vger.kernel.org Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-crypt.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index d412bd4b911c..f266c81f396f 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -3061,11 +3061,11 @@ static void crypt_io_hints(struct dm_target *ti, struct queue_limits *limits) */ limits->max_segment_size = PAGE_SIZE; - if (cc->sector_size != (1 << SECTOR_SHIFT)) { - limits->logical_block_size = cc->sector_size; - limits->physical_block_size = cc->sector_size; - blk_limits_io_min(limits, cc->sector_size); - } + limits->logical_block_size = + max_t(unsigned short, limits->logical_block_size, cc->sector_size); + limits->physical_block_size = + max_t(unsigned, limits->physical_block_size, cc->sector_size); + limits->io_min = max_t(unsigned, limits->io_min, cc->sector_size); } static struct target_type crypt_target = { From 1e1132ea21da6d7be92a72195204379c819cb70b Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 16 Aug 2018 12:23:19 -0400 Subject: [PATCH 26/26] dm writecache: fix a crash due to reading past end of dirty_bitmap wc->dirty_bitmap_size is in bytes so must multiply it by 8, not by BITS_PER_LONG, to get number of bitmap_bits. Fixes crash in find_next_bit() that was reported: https://bugzilla.kernel.org/show_bug.cgi?id=200819 Reported-by: edo.rus@gmail.com Fixes: 48debafe4f2f ("dm: add writecache target") Cc: stable@vger.kernel.org # 4.18 Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-writecache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c index e672e1d17bf1..3a28a68f184c 100644 --- a/drivers/md/dm-writecache.c +++ b/drivers/md/dm-writecache.c @@ -457,7 +457,7 @@ static void ssd_commit_flushed(struct dm_writecache *wc) COMPLETION_INITIALIZER_ONSTACK(endio.c), ATOMIC_INIT(1), }; - unsigned bitmap_bits = wc->dirty_bitmap_size * BITS_PER_LONG; + unsigned bitmap_bits = wc->dirty_bitmap_size * 8; unsigned i = 0; while (1) {