From b6d2b054e8baaee53fd2d4854c63cbf0f2c6262a Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 24 Aug 2021 10:05:20 -0700 Subject: [PATCH 1/4] mq-deadline: Fix request accounting The block layer may call the I/O scheduler .finish_request() callback without having called the .insert_requests() callback. Make sure that the mq-deadline I/O statistics are correct if the block layer inserts an I/O request that bypasses the I/O scheduler. This patch prevents that lower priority I/O is delayed longer than necessary for mixed I/O priority workloads. Cc: Niklas Cassel Cc: Damien Le Moal Cc: Hannes Reinecke Reported-by: Niklas Cassel Fixes: 08a9ad8bf607 ("block/mq-deadline: Add cgroup support") Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20210824170520.1659173-1-bvanassche@acm.org Reviewed-by: Niklas Cassel Tested-by: Niklas Cassel Signed-off-by: Jens Axboe --- block/mq-deadline.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/block/mq-deadline.c b/block/mq-deadline.c index a09761cbdf12..18dc8efe9652 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -711,6 +711,7 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, prio = ioprio_class_to_prio[ioprio_class]; dd_count(dd, inserted, prio); + rq->elv.priv[0] = (void *)(uintptr_t)1; if (blk_mq_sched_try_insert_merge(q, rq, &free)) { blk_mq_free_requests(&free); @@ -759,12 +760,10 @@ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx, spin_unlock(&dd->lock); } -/* - * Nothing to do here. This is defined only to ensure that .finish_request - * method is called upon request completion. - */ +/* Callback from inside blk_mq_rq_ctx_init(). */ static void dd_prepare_request(struct request *rq) { + rq->elv.priv[0] = NULL; } /* @@ -791,7 +790,14 @@ static void dd_finish_request(struct request *rq) const enum dd_prio prio = ioprio_class_to_prio[ioprio_class]; struct dd_per_prio *per_prio = &dd->per_prio[prio]; - dd_count(dd, completed, prio); + /* + * The block layer core may call dd_finish_request() without having + * called dd_insert_requests(). Hence only update statistics for + * requests for which dd_insert_requests() has been called. See also + * blk_mq_request_bypass_insert(). + */ + if (rq->elv.priv[0]) + dd_count(dd, completed, prio); if (blk_queue_is_zoned(q)) { unsigned long flags; From 7b05bf771084ff788243b78f51bc2c820730951c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 26 Aug 2021 12:59:44 -0600 Subject: [PATCH 2/4] Revert "block/mq-deadline: Prioritize high-priority requests" This reverts commit fb926032b3209300f9dc454a36b8299582ae545c. Zhen reports that this commit slows down mq-deadline on a 128 thread box, going from 258K IOPS to 170-180K. My testing shows that Optane gen2 IOPS goes from 2.3M IOPS to 1.2M IOPS on a 64 thread box. Looking in detail at the code, the main culprit here is needing to sum percpu counters in the dispatch hot path, leading to very high CPU utilization there. To make matters worse, the code currently needs to sum 2 percpu counters, and it does so in the most naive way of iterating possible CPUs _twice_. Since we're close to release, revert this commit and we can re-do it with regular per-priority counters instead for the 5.15 kernel. Link: https://lore.kernel.org/linux-block/20210826144039.2143-1-thunder.leizhen@huawei.com/ Reported-by: Zhen Lei Signed-off-by: Jens Axboe --- block/mq-deadline.c | 42 +++++------------------------------------- 1 file changed, 5 insertions(+), 37 deletions(-) diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 18dc8efe9652..36920670dccc 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -31,11 +31,6 @@ */ static const int read_expire = HZ / 2; /* max time before a read is submitted. */ static const int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */ -/* - * Time after which to dispatch lower priority requests even if higher - * priority requests are pending. - */ -static const int aging_expire = 10 * HZ; static const int writes_starved = 2; /* max times reads can starve a write */ static const int fifo_batch = 16; /* # of sequential requests treated as one by the above parameters. For throughput. */ @@ -103,7 +98,6 @@ struct deadline_data { int writes_starved; int front_merges; u32 async_depth; - int aging_expire; spinlock_t lock; spinlock_t zone_lock; @@ -369,11 +363,10 @@ deadline_next_request(struct deadline_data *dd, struct dd_per_prio *per_prio, /* * deadline_dispatch_requests selects the best request according to - * read/write expire, fifo_batch, etc and with a start time <= @latest. + * read/write expire, fifo_batch, etc */ static struct request *__dd_dispatch_request(struct deadline_data *dd, - struct dd_per_prio *per_prio, - u64 latest_start_ns) + struct dd_per_prio *per_prio) { struct request *rq, *next_rq; enum dd_data_dir data_dir; @@ -385,8 +378,6 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd, if (!list_empty(&per_prio->dispatch)) { rq = list_first_entry(&per_prio->dispatch, struct request, queuelist); - if (rq->start_time_ns > latest_start_ns) - return NULL; list_del_init(&rq->queuelist); goto done; } @@ -464,8 +455,6 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd, dd->batching = 0; dispatch_request: - if (rq->start_time_ns > latest_start_ns) - return NULL; /* * rq is the selected appropriate request. */ @@ -494,32 +483,15 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd, static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) { struct deadline_data *dd = hctx->queue->elevator->elevator_data; - const u64 now_ns = ktime_get_ns(); - struct request *rq = NULL; + struct request *rq; enum dd_prio prio; spin_lock(&dd->lock); - /* - * Start with dispatching requests whose deadline expired more than - * aging_expire jiffies ago. - */ - for (prio = DD_BE_PRIO; prio <= DD_PRIO_MAX; prio++) { - rq = __dd_dispatch_request(dd, &dd->per_prio[prio], now_ns - - jiffies_to_nsecs(dd->aging_expire)); - if (rq) - goto unlock; - } - /* - * Next, dispatch requests in priority order. Ignore lower priority - * requests if any higher priority requests are pending. - */ for (prio = 0; prio <= DD_PRIO_MAX; prio++) { - rq = __dd_dispatch_request(dd, &dd->per_prio[prio], now_ns); - if (rq || dd_queued(dd, prio)) + rq = __dd_dispatch_request(dd, &dd->per_prio[prio]); + if (rq) break; } - -unlock: spin_unlock(&dd->lock); return rq; @@ -620,7 +592,6 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e) dd->front_merges = 1; dd->last_dir = DD_WRITE; dd->fifo_batch = fifo_batch; - dd->aging_expire = aging_expire; spin_lock_init(&dd->lock); spin_lock_init(&dd->zone_lock); @@ -842,7 +813,6 @@ static ssize_t __FUNC(struct elevator_queue *e, char *page) \ #define SHOW_JIFFIES(__FUNC, __VAR) SHOW_INT(__FUNC, jiffies_to_msecs(__VAR)) SHOW_JIFFIES(deadline_read_expire_show, dd->fifo_expire[DD_READ]); SHOW_JIFFIES(deadline_write_expire_show, dd->fifo_expire[DD_WRITE]); -SHOW_JIFFIES(deadline_aging_expire_show, dd->aging_expire); SHOW_INT(deadline_writes_starved_show, dd->writes_starved); SHOW_INT(deadline_front_merges_show, dd->front_merges); SHOW_INT(deadline_async_depth_show, dd->front_merges); @@ -872,7 +842,6 @@ static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, msecs_to_jiffies) STORE_JIFFIES(deadline_read_expire_store, &dd->fifo_expire[DD_READ], 0, INT_MAX); STORE_JIFFIES(deadline_write_expire_store, &dd->fifo_expire[DD_WRITE], 0, INT_MAX); -STORE_JIFFIES(deadline_aging_expire_store, &dd->aging_expire, 0, INT_MAX); STORE_INT(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX); STORE_INT(deadline_front_merges_store, &dd->front_merges, 0, 1); STORE_INT(deadline_async_depth_store, &dd->front_merges, 1, INT_MAX); @@ -891,7 +860,6 @@ static struct elv_fs_entry deadline_attrs[] = { DD_ATTR(front_merges), DD_ATTR(async_depth), DD_ATTR(fifo_batch), - DD_ATTR(aging_expire), __ATTR_NULL }; From 3375dca0b542c747d29655cf52f7b2741ecebe0e Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 27 Aug 2021 13:00:23 +0300 Subject: [PATCH 3/4] pd: fix a NULL vs IS_ERR() check blk_mq_alloc_disk() returns error pointers, it doesn't return NULL so correct the check. Fixes: 262d431f9000 ("pd: use blk_mq_alloc_disk and blk_cleanup_disk") Signed-off-by: Dan Carpenter Link: https://lore.kernel.org/r/20210827100023.GB9449@kili Signed-off-by: Jens Axboe --- drivers/block/paride/pd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c index 9b3298926356..675327df6aff 100644 --- a/drivers/block/paride/pd.c +++ b/drivers/block/paride/pd.c @@ -892,7 +892,7 @@ static void pd_probe_drive(struct pd_unit *disk) return; p = blk_mq_alloc_disk(&disk->tag_set, disk); - if (!p) { + if (IS_ERR(p)) { blk_mq_free_tag_set(&disk->tag_set); return; } From 222013f9ac30b9cec44301daa8dbd0aae38abffb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 27 Aug 2021 18:32:50 +0200 Subject: [PATCH 4/4] cryptoloop: add a deprecation warning Support for cryptoloop has been officially marked broken and deprecated in favor of dm-crypt (which supports the same broken algorithms if needed) in Linux 2.6.4 (released in March 2004), and support for it has been entirely removed from losetup in util-linux 2.23 (released in April 2013). Add a warning and a deprecation schedule. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210827163250.255325-1-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/Kconfig | 4 ++-- drivers/block/cryptoloop.c | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index 63056cfd4b62..fbb3a558139f 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -213,7 +213,7 @@ config BLK_DEV_LOOP_MIN_COUNT dynamically allocated with the /dev/loop-control interface. config BLK_DEV_CRYPTOLOOP - tristate "Cryptoloop Support" + tristate "Cryptoloop Support (DEPRECATED)" select CRYPTO select CRYPTO_CBC depends on BLK_DEV_LOOP @@ -225,7 +225,7 @@ config BLK_DEV_CRYPTOLOOP WARNING: This device is not safe for journaled file systems like ext3 or Reiserfs. Please use the Device Mapper crypto module instead, which can be configured to be on-disk compatible with the - cryptoloop device. + cryptoloop device. cryptoloop support will be removed in Linux 5.16. source "drivers/block/drbd/Kconfig" diff --git a/drivers/block/cryptoloop.c b/drivers/block/cryptoloop.c index 3cabc335ae74..f0a91faa43a8 100644 --- a/drivers/block/cryptoloop.c +++ b/drivers/block/cryptoloop.c @@ -189,6 +189,8 @@ init_cryptoloop(void) if (rc) printk(KERN_ERR "cryptoloop: loop_register_transfer failed\n"); + else + pr_warn("the cryptoloop driver has been deprecated and will be removed in in Linux 5.16\n"); return rc; }