mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
synced 2024-12-29 09:16:33 +00:00
block: Prevent potential deadlocks in zone write plug error recovery
Zone write plugging for handling writes to zones of a zoned block
device always executes a zone report whenever a write BIO to a zone
fails. The intent of this is to ensure that the tracking of a zone write
pointer is always correct, so that the alignment to a zone write
pointer of write BIOs can be checked on submission and that we can
always correctly emulate zone append operations using regular write
BIOs.
However, this error recovery scheme introduces a potential deadlock if a
device queue freeze is initiated while BIOs are still plugged in a zone
write plug and one of these write operations fails. In such a case, the
disk zone write plug error recovery work is scheduled and executes a
report zone. This in turn can result in a request allocation in the
underlying driver to issue the report zones command to the device. But
with the device queue freeze already started, this allocation will
block, preventing the report zone execution and the continuation of the
processing of the plugged BIOs. As plugged BIOs hold a queue usage
reference, the queue freeze itself will never complete, resulting in a
deadlock.
Avoid this problem by completely removing from the zone write plugging
code the use of report zones operations after a failed write operation,
instead relying on the device user to either execute a report zones,
reset the zone, finish the zone, or give up writing to the device (which
is a fairly common pattern for file systems which degrade to read-only
after write failures). This is not an unreasonable requirement as all
well-behaved applications, FSes and device mapper already use report
zones to recover from write errors whenever possible by comparing the
current position of a zone write pointer with what their assumption
about the position is.
The changes to remove the automatic error recovery are as follows:
- Completely remove the error recovery work and its associated
resources (zone write plug list head, disk error list, and disk
zone_wplugs_work work struct). This also removes the functions
disk_zone_wplug_set_error() and disk_zone_wplug_clear_error().
- Change the BLK_ZONE_WPLUG_ERROR zone write plug flag into
BLK_ZONE_WPLUG_NEED_WP_UPDATE. This new flag is set for a zone write
plug whenever a write operation targeting the zone of the zone write
plug fails. This flag indicates that the zone write pointer offset is
not reliable and that it must be updated when the next report zone,
reset zone, finish zone or disk revalidation is executed.
- Modify blk_zone_write_plug_bio_endio() to set the
BLK_ZONE_WPLUG_NEED_WP_UPDATE flag for the target zone of a failed
write BIO.
- Modify the function disk_zone_wplug_set_wp_offset() to clear this
new flag, thus implementing recovery of a correct write pointer
offset with the reset (all) zone and finish zone operations.
- Modify blkdev_report_zones() to always use the disk_report_zones_cb()
callback so that disk_zone_wplug_sync_wp_offset() can be called for
any zone marked with the BLK_ZONE_WPLUG_NEED_WP_UPDATE flag.
This implements recovery of a correct write pointer offset for zone
write plugs marked with BLK_ZONE_WPLUG_NEED_WP_UPDATE and within
the range of the report zones operation executed by the user.
- Modify blk_revalidate_seq_zone() to call
disk_zone_wplug_sync_wp_offset() for all sequential write required
zones when a zoned block device is revalidated, thus always resolving
any inconsistency between the write pointer offset of zone write
plugs and the actual write pointer position of sequential zones.
Fixes: dd291d77cc
("block: Introduce zone write plugging")
Cc: stable@vger.kernel.org
Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Link: https://lore.kernel.org/r/20241209122357.47838-5-dlemoal@kernel.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
This commit is contained in:
parent
b76b840fd9
commit
fe0418eb9b
@ -41,7 +41,6 @@ static const char *const zone_cond_name[] = {
|
|||||||
/*
|
/*
|
||||||
* Per-zone write plug.
|
* Per-zone write plug.
|
||||||
* @node: hlist_node structure for managing the plug using a hash table.
|
* @node: hlist_node structure for managing the plug using a hash table.
|
||||||
* @link: To list the plug in the zone write plug error list of the disk.
|
|
||||||
* @ref: Zone write plug reference counter. A zone write plug reference is
|
* @ref: Zone write plug reference counter. A zone write plug reference is
|
||||||
* always at least 1 when the plug is hashed in the disk plug hash table.
|
* always at least 1 when the plug is hashed in the disk plug hash table.
|
||||||
* The reference is incremented whenever a new BIO needing plugging is
|
* The reference is incremented whenever a new BIO needing plugging is
|
||||||
@ -63,7 +62,6 @@ static const char *const zone_cond_name[] = {
|
|||||||
*/
|
*/
|
||||||
struct blk_zone_wplug {
|
struct blk_zone_wplug {
|
||||||
struct hlist_node node;
|
struct hlist_node node;
|
||||||
struct list_head link;
|
|
||||||
refcount_t ref;
|
refcount_t ref;
|
||||||
spinlock_t lock;
|
spinlock_t lock;
|
||||||
unsigned int flags;
|
unsigned int flags;
|
||||||
@ -80,8 +78,8 @@ struct blk_zone_wplug {
|
|||||||
* - BLK_ZONE_WPLUG_PLUGGED: Indicates that the zone write plug is plugged,
|
* - BLK_ZONE_WPLUG_PLUGGED: Indicates that the zone write plug is plugged,
|
||||||
* that is, that write BIOs are being throttled due to a write BIO already
|
* that is, that write BIOs are being throttled due to a write BIO already
|
||||||
* being executed or the zone write plug bio list is not empty.
|
* being executed or the zone write plug bio list is not empty.
|
||||||
* - BLK_ZONE_WPLUG_ERROR: Indicates that a write error happened which will be
|
* - BLK_ZONE_WPLUG_NEED_WP_UPDATE: Indicates that we lost track of a zone
|
||||||
* recovered with a report zone to update the zone write pointer offset.
|
* write pointer offset and need to update it.
|
||||||
* - BLK_ZONE_WPLUG_UNHASHED: Indicates that the zone write plug was removed
|
* - BLK_ZONE_WPLUG_UNHASHED: Indicates that the zone write plug was removed
|
||||||
* from the disk hash table and that the initial reference to the zone
|
* from the disk hash table and that the initial reference to the zone
|
||||||
* write plug set when the plug was first added to the hash table has been
|
* write plug set when the plug was first added to the hash table has been
|
||||||
@ -91,11 +89,9 @@ struct blk_zone_wplug {
|
|||||||
* freed once all remaining references from BIOs or functions are dropped.
|
* freed once all remaining references from BIOs or functions are dropped.
|
||||||
*/
|
*/
|
||||||
#define BLK_ZONE_WPLUG_PLUGGED (1U << 0)
|
#define BLK_ZONE_WPLUG_PLUGGED (1U << 0)
|
||||||
#define BLK_ZONE_WPLUG_ERROR (1U << 1)
|
#define BLK_ZONE_WPLUG_NEED_WP_UPDATE (1U << 1)
|
||||||
#define BLK_ZONE_WPLUG_UNHASHED (1U << 2)
|
#define BLK_ZONE_WPLUG_UNHASHED (1U << 2)
|
||||||
|
|
||||||
#define BLK_ZONE_WPLUG_BUSY (BLK_ZONE_WPLUG_PLUGGED | BLK_ZONE_WPLUG_ERROR)
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* blk_zone_cond_str - Return string XXX in BLK_ZONE_COND_XXX.
|
* blk_zone_cond_str - Return string XXX in BLK_ZONE_COND_XXX.
|
||||||
* @zone_cond: BLK_ZONE_COND_XXX.
|
* @zone_cond: BLK_ZONE_COND_XXX.
|
||||||
@ -163,6 +159,11 @@ int blkdev_report_zones(struct block_device *bdev, sector_t sector,
|
|||||||
{
|
{
|
||||||
struct gendisk *disk = bdev->bd_disk;
|
struct gendisk *disk = bdev->bd_disk;
|
||||||
sector_t capacity = get_capacity(disk);
|
sector_t capacity = get_capacity(disk);
|
||||||
|
struct disk_report_zones_cb_args args = {
|
||||||
|
.disk = disk,
|
||||||
|
.user_cb = cb,
|
||||||
|
.user_data = data,
|
||||||
|
};
|
||||||
|
|
||||||
if (!bdev_is_zoned(bdev) || WARN_ON_ONCE(!disk->fops->report_zones))
|
if (!bdev_is_zoned(bdev) || WARN_ON_ONCE(!disk->fops->report_zones))
|
||||||
return -EOPNOTSUPP;
|
return -EOPNOTSUPP;
|
||||||
@ -170,7 +171,8 @@ int blkdev_report_zones(struct block_device *bdev, sector_t sector,
|
|||||||
if (!nr_zones || sector >= capacity)
|
if (!nr_zones || sector >= capacity)
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
return disk->fops->report_zones(disk, sector, nr_zones, cb, data);
|
return disk->fops->report_zones(disk, sector, nr_zones,
|
||||||
|
disk_report_zones_cb, &args);
|
||||||
}
|
}
|
||||||
EXPORT_SYMBOL_GPL(blkdev_report_zones);
|
EXPORT_SYMBOL_GPL(blkdev_report_zones);
|
||||||
|
|
||||||
@ -451,7 +453,7 @@ static inline void disk_put_zone_wplug(struct blk_zone_wplug *zwplug)
|
|||||||
{
|
{
|
||||||
if (refcount_dec_and_test(&zwplug->ref)) {
|
if (refcount_dec_and_test(&zwplug->ref)) {
|
||||||
WARN_ON_ONCE(!bio_list_empty(&zwplug->bio_list));
|
WARN_ON_ONCE(!bio_list_empty(&zwplug->bio_list));
|
||||||
WARN_ON_ONCE(!list_empty(&zwplug->link));
|
WARN_ON_ONCE(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED);
|
||||||
WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_UNHASHED));
|
WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_UNHASHED));
|
||||||
|
|
||||||
call_rcu(&zwplug->rcu_head, disk_free_zone_wplug_rcu);
|
call_rcu(&zwplug->rcu_head, disk_free_zone_wplug_rcu);
|
||||||
@ -465,8 +467,8 @@ static inline bool disk_should_remove_zone_wplug(struct gendisk *disk,
|
|||||||
if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED)
|
if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED)
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
/* If the zone write plug is still busy, it cannot be removed. */
|
/* If the zone write plug is still plugged, it cannot be removed. */
|
||||||
if (zwplug->flags & BLK_ZONE_WPLUG_BUSY)
|
if (zwplug->flags & BLK_ZONE_WPLUG_PLUGGED)
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -549,7 +551,6 @@ static struct blk_zone_wplug *disk_get_and_lock_zone_wplug(struct gendisk *disk,
|
|||||||
return NULL;
|
return NULL;
|
||||||
|
|
||||||
INIT_HLIST_NODE(&zwplug->node);
|
INIT_HLIST_NODE(&zwplug->node);
|
||||||
INIT_LIST_HEAD(&zwplug->link);
|
|
||||||
refcount_set(&zwplug->ref, 2);
|
refcount_set(&zwplug->ref, 2);
|
||||||
spin_lock_init(&zwplug->lock);
|
spin_lock_init(&zwplug->lock);
|
||||||
zwplug->flags = 0;
|
zwplug->flags = 0;
|
||||||
@ -598,115 +599,22 @@ static void disk_zone_wplug_abort(struct blk_zone_wplug *zwplug)
|
|||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Abort (fail) all plugged BIOs of a zone write plug that are not aligned
|
* Set a zone write plug write pointer offset to the specified value.
|
||||||
* with the assumed write pointer location of the zone when the BIO will
|
* This aborts all plugged BIOs, which is fine as this function is called for
|
||||||
* be unplugged.
|
* a zone reset operation, a zone finish operation or if the zone needs a wp
|
||||||
*/
|
* update from a report zone after a write error.
|
||||||
static void disk_zone_wplug_abort_unaligned(struct gendisk *disk,
|
|
||||||
struct blk_zone_wplug *zwplug)
|
|
||||||
{
|
|
||||||
unsigned int wp_offset = zwplug->wp_offset;
|
|
||||||
struct bio_list bl = BIO_EMPTY_LIST;
|
|
||||||
struct bio *bio;
|
|
||||||
|
|
||||||
while ((bio = bio_list_pop(&zwplug->bio_list))) {
|
|
||||||
if (disk_zone_is_full(disk, zwplug->zone_no, wp_offset) ||
|
|
||||||
(bio_op(bio) != REQ_OP_ZONE_APPEND &&
|
|
||||||
bio_offset_from_zone_start(bio) != wp_offset)) {
|
|
||||||
blk_zone_wplug_bio_io_error(zwplug, bio);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
wp_offset += bio_sectors(bio);
|
|
||||||
bio_list_add(&bl, bio);
|
|
||||||
}
|
|
||||||
|
|
||||||
bio_list_merge(&zwplug->bio_list, &bl);
|
|
||||||
}
|
|
||||||
|
|
||||||
static inline void disk_zone_wplug_set_error(struct gendisk *disk,
|
|
||||||
struct blk_zone_wplug *zwplug)
|
|
||||||
{
|
|
||||||
unsigned long flags;
|
|
||||||
|
|
||||||
if (zwplug->flags & BLK_ZONE_WPLUG_ERROR)
|
|
||||||
return;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* At this point, we already have a reference on the zone write plug.
|
|
||||||
* However, since we are going to add the plug to the disk zone write
|
|
||||||
* plugs work list, increase its reference count. This reference will
|
|
||||||
* be dropped in disk_zone_wplugs_work() once the error state is
|
|
||||||
* handled, or in disk_zone_wplug_clear_error() if the zone is reset or
|
|
||||||
* finished.
|
|
||||||
*/
|
|
||||||
zwplug->flags |= BLK_ZONE_WPLUG_ERROR;
|
|
||||||
refcount_inc(&zwplug->ref);
|
|
||||||
|
|
||||||
spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
|
|
||||||
list_add_tail(&zwplug->link, &disk->zone_wplugs_err_list);
|
|
||||||
spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
|
|
||||||
}
|
|
||||||
|
|
||||||
static inline void disk_zone_wplug_clear_error(struct gendisk *disk,
|
|
||||||
struct blk_zone_wplug *zwplug)
|
|
||||||
{
|
|
||||||
unsigned long flags;
|
|
||||||
|
|
||||||
if (!(zwplug->flags & BLK_ZONE_WPLUG_ERROR))
|
|
||||||
return;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* We are racing with the error handling work which drops the reference
|
|
||||||
* on the zone write plug after handling the error state. So remove the
|
|
||||||
* plug from the error list and drop its reference count only if the
|
|
||||||
* error handling has not yet started, that is, if the zone write plug
|
|
||||||
* is still listed.
|
|
||||||
*/
|
|
||||||
spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
|
|
||||||
if (!list_empty(&zwplug->link)) {
|
|
||||||
list_del_init(&zwplug->link);
|
|
||||||
zwplug->flags &= ~BLK_ZONE_WPLUG_ERROR;
|
|
||||||
disk_put_zone_wplug(zwplug);
|
|
||||||
}
|
|
||||||
spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Set a zone write plug write pointer offset to either 0 (zone reset case)
|
|
||||||
* or to the zone size (zone finish case). This aborts all plugged BIOs, which
|
|
||||||
* is fine to do as doing a zone reset or zone finish while writes are in-flight
|
|
||||||
* is a mistake from the user which will most likely cause all plugged BIOs to
|
|
||||||
* fail anyway.
|
|
||||||
*/
|
*/
|
||||||
static void disk_zone_wplug_set_wp_offset(struct gendisk *disk,
|
static void disk_zone_wplug_set_wp_offset(struct gendisk *disk,
|
||||||
struct blk_zone_wplug *zwplug,
|
struct blk_zone_wplug *zwplug,
|
||||||
unsigned int wp_offset)
|
unsigned int wp_offset)
|
||||||
{
|
{
|
||||||
unsigned long flags;
|
lockdep_assert_held(&zwplug->lock);
|
||||||
|
|
||||||
spin_lock_irqsave(&zwplug->lock, flags);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Make sure that a BIO completion or another zone reset or finish
|
|
||||||
* operation has not already removed the plug from the hash table.
|
|
||||||
*/
|
|
||||||
if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) {
|
|
||||||
spin_unlock_irqrestore(&zwplug->lock, flags);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Update the zone write pointer and abort all plugged BIOs. */
|
/* Update the zone write pointer and abort all plugged BIOs. */
|
||||||
|
zwplug->flags &= ~BLK_ZONE_WPLUG_NEED_WP_UPDATE;
|
||||||
zwplug->wp_offset = wp_offset;
|
zwplug->wp_offset = wp_offset;
|
||||||
disk_zone_wplug_abort(zwplug);
|
disk_zone_wplug_abort(zwplug);
|
||||||
|
|
||||||
/*
|
|
||||||
* Updating the write pointer offset puts back the zone
|
|
||||||
* in a good state. So clear the error flag and decrement the
|
|
||||||
* error count if we were in error state.
|
|
||||||
*/
|
|
||||||
disk_zone_wplug_clear_error(disk, zwplug);
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* The zone write plug now has no BIO plugged: remove it from the
|
* The zone write plug now has no BIO plugged: remove it from the
|
||||||
* hash table so that it cannot be seen. The plug will be freed
|
* hash table so that it cannot be seen. The plug will be freed
|
||||||
@ -714,8 +622,6 @@ static void disk_zone_wplug_set_wp_offset(struct gendisk *disk,
|
|||||||
*/
|
*/
|
||||||
if (disk_should_remove_zone_wplug(disk, zwplug))
|
if (disk_should_remove_zone_wplug(disk, zwplug))
|
||||||
disk_remove_zone_wplug(disk, zwplug);
|
disk_remove_zone_wplug(disk, zwplug);
|
||||||
|
|
||||||
spin_unlock_irqrestore(&zwplug->lock, flags);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static unsigned int blk_zone_wp_offset(struct blk_zone *zone)
|
static unsigned int blk_zone_wp_offset(struct blk_zone *zone)
|
||||||
@ -752,7 +658,7 @@ static void disk_zone_wplug_sync_wp_offset(struct gendisk *disk,
|
|||||||
return;
|
return;
|
||||||
|
|
||||||
spin_lock_irqsave(&zwplug->lock, flags);
|
spin_lock_irqsave(&zwplug->lock, flags);
|
||||||
if (zwplug->flags & BLK_ZONE_WPLUG_ERROR)
|
if (zwplug->flags & BLK_ZONE_WPLUG_NEED_WP_UPDATE)
|
||||||
disk_zone_wplug_set_wp_offset(disk, zwplug,
|
disk_zone_wplug_set_wp_offset(disk, zwplug,
|
||||||
blk_zone_wp_offset(zone));
|
blk_zone_wp_offset(zone));
|
||||||
spin_unlock_irqrestore(&zwplug->lock, flags);
|
spin_unlock_irqrestore(&zwplug->lock, flags);
|
||||||
@ -776,6 +682,7 @@ static bool blk_zone_wplug_handle_reset_or_finish(struct bio *bio,
|
|||||||
struct gendisk *disk = bio->bi_bdev->bd_disk;
|
struct gendisk *disk = bio->bi_bdev->bd_disk;
|
||||||
sector_t sector = bio->bi_iter.bi_sector;
|
sector_t sector = bio->bi_iter.bi_sector;
|
||||||
struct blk_zone_wplug *zwplug;
|
struct blk_zone_wplug *zwplug;
|
||||||
|
unsigned long flags;
|
||||||
|
|
||||||
/* Conventional zones cannot be reset nor finished. */
|
/* Conventional zones cannot be reset nor finished. */
|
||||||
if (!bdev_zone_is_seq(bio->bi_bdev, sector)) {
|
if (!bdev_zone_is_seq(bio->bi_bdev, sector)) {
|
||||||
@ -801,7 +708,9 @@ static bool blk_zone_wplug_handle_reset_or_finish(struct bio *bio,
|
|||||||
*/
|
*/
|
||||||
zwplug = disk_get_zone_wplug(disk, sector);
|
zwplug = disk_get_zone_wplug(disk, sector);
|
||||||
if (zwplug) {
|
if (zwplug) {
|
||||||
|
spin_lock_irqsave(&zwplug->lock, flags);
|
||||||
disk_zone_wplug_set_wp_offset(disk, zwplug, wp_offset);
|
disk_zone_wplug_set_wp_offset(disk, zwplug, wp_offset);
|
||||||
|
spin_unlock_irqrestore(&zwplug->lock, flags);
|
||||||
disk_put_zone_wplug(zwplug);
|
disk_put_zone_wplug(zwplug);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -812,6 +721,7 @@ static bool blk_zone_wplug_handle_reset_all(struct bio *bio)
|
|||||||
{
|
{
|
||||||
struct gendisk *disk = bio->bi_bdev->bd_disk;
|
struct gendisk *disk = bio->bi_bdev->bd_disk;
|
||||||
struct blk_zone_wplug *zwplug;
|
struct blk_zone_wplug *zwplug;
|
||||||
|
unsigned long flags;
|
||||||
sector_t sector;
|
sector_t sector;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -823,7 +733,9 @@ static bool blk_zone_wplug_handle_reset_all(struct bio *bio)
|
|||||||
sector += disk->queue->limits.chunk_sectors) {
|
sector += disk->queue->limits.chunk_sectors) {
|
||||||
zwplug = disk_get_zone_wplug(disk, sector);
|
zwplug = disk_get_zone_wplug(disk, sector);
|
||||||
if (zwplug) {
|
if (zwplug) {
|
||||||
|
spin_lock_irqsave(&zwplug->lock, flags);
|
||||||
disk_zone_wplug_set_wp_offset(disk, zwplug, 0);
|
disk_zone_wplug_set_wp_offset(disk, zwplug, 0);
|
||||||
|
spin_unlock_irqrestore(&zwplug->lock, flags);
|
||||||
disk_put_zone_wplug(zwplug);
|
disk_put_zone_wplug(zwplug);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -1005,13 +917,23 @@ static bool blk_zone_wplug_prepare_bio(struct blk_zone_wplug *zwplug,
|
|||||||
{
|
{
|
||||||
struct gendisk *disk = bio->bi_bdev->bd_disk;
|
struct gendisk *disk = bio->bi_bdev->bd_disk;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If we lost track of the zone write pointer due to a write error,
|
||||||
|
* the user must either execute a report zones, reset the zone or finish
|
||||||
|
* the zone to recover a reliable write pointer position. Fail BIOs if the
|
||||||
|
* user did not do that as we cannot handle emulated zone append
|
||||||
|
* otherwise.
|
||||||
|
*/
|
||||||
|
if (zwplug->flags & BLK_ZONE_WPLUG_NEED_WP_UPDATE)
|
||||||
|
return false;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Check that the user is not attempting to write to a full zone.
|
* Check that the user is not attempting to write to a full zone.
|
||||||
* We know such BIO will fail, and that would potentially overflow our
|
* We know such BIO will fail, and that would potentially overflow our
|
||||||
* write pointer offset beyond the end of the zone.
|
* write pointer offset beyond the end of the zone.
|
||||||
*/
|
*/
|
||||||
if (disk_zone_wplug_is_full(disk, zwplug))
|
if (disk_zone_wplug_is_full(disk, zwplug))
|
||||||
goto err;
|
return false;
|
||||||
|
|
||||||
if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
|
if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
|
||||||
/*
|
/*
|
||||||
@ -1030,24 +952,18 @@ static bool blk_zone_wplug_prepare_bio(struct blk_zone_wplug *zwplug,
|
|||||||
bio_set_flag(bio, BIO_EMULATES_ZONE_APPEND);
|
bio_set_flag(bio, BIO_EMULATES_ZONE_APPEND);
|
||||||
} else {
|
} else {
|
||||||
/*
|
/*
|
||||||
* Check for non-sequential writes early because we avoid a
|
* Check for non-sequential writes early as we know that BIOs
|
||||||
* whole lot of error handling trouble if we don't send it off
|
* with a start sector not aligned to the zone write pointer
|
||||||
* to the driver.
|
* will fail.
|
||||||
*/
|
*/
|
||||||
if (bio_offset_from_zone_start(bio) != zwplug->wp_offset)
|
if (bio_offset_from_zone_start(bio) != zwplug->wp_offset)
|
||||||
goto err;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Advance the zone write pointer offset. */
|
/* Advance the zone write pointer offset. */
|
||||||
zwplug->wp_offset += bio_sectors(bio);
|
zwplug->wp_offset += bio_sectors(bio);
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
|
|
||||||
err:
|
|
||||||
/* We detected an invalid write BIO: schedule error recovery. */
|
|
||||||
disk_zone_wplug_set_error(disk, zwplug);
|
|
||||||
kblockd_schedule_work(&disk->zone_wplugs_work);
|
|
||||||
return false;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs)
|
static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs)
|
||||||
@ -1097,20 +1013,20 @@ static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs)
|
|||||||
bio_set_flag(bio, BIO_ZONE_WRITE_PLUGGING);
|
bio_set_flag(bio, BIO_ZONE_WRITE_PLUGGING);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* If the zone is already plugged or has a pending error, add the BIO
|
* If the zone is already plugged, add the BIO to the plug BIO list.
|
||||||
* to the plug BIO list. Do the same for REQ_NOWAIT BIOs to ensure that
|
* Do the same for REQ_NOWAIT BIOs to ensure that we will not see a
|
||||||
* we will not see a BLK_STS_AGAIN failure if we let the BIO execute.
|
* BLK_STS_AGAIN failure if we let the BIO execute.
|
||||||
* Otherwise, plug and let the BIO execute.
|
* Otherwise, plug and let the BIO execute.
|
||||||
*/
|
*/
|
||||||
if (zwplug->flags & BLK_ZONE_WPLUG_BUSY || (bio->bi_opf & REQ_NOWAIT))
|
if ((zwplug->flags & BLK_ZONE_WPLUG_PLUGGED) ||
|
||||||
|
(bio->bi_opf & REQ_NOWAIT))
|
||||||
goto plug;
|
goto plug;
|
||||||
|
|
||||||
/*
|
if (!blk_zone_wplug_prepare_bio(zwplug, bio)) {
|
||||||
* If an error is detected when preparing the BIO, add it to the BIO
|
spin_unlock_irqrestore(&zwplug->lock, flags);
|
||||||
* list so that error recovery can deal with it.
|
bio_io_error(bio);
|
||||||
*/
|
return true;
|
||||||
if (!blk_zone_wplug_prepare_bio(zwplug, bio))
|
}
|
||||||
goto plug;
|
|
||||||
|
|
||||||
zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED;
|
zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED;
|
||||||
|
|
||||||
@ -1210,16 +1126,6 @@ static void disk_zone_wplug_unplug_bio(struct gendisk *disk,
|
|||||||
|
|
||||||
spin_lock_irqsave(&zwplug->lock, flags);
|
spin_lock_irqsave(&zwplug->lock, flags);
|
||||||
|
|
||||||
/*
|
|
||||||
* If we had an error, schedule error recovery. The recovery work
|
|
||||||
* will restart submission of plugged BIOs.
|
|
||||||
*/
|
|
||||||
if (zwplug->flags & BLK_ZONE_WPLUG_ERROR) {
|
|
||||||
spin_unlock_irqrestore(&zwplug->lock, flags);
|
|
||||||
kblockd_schedule_work(&disk->zone_wplugs_work);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Schedule submission of the next plugged BIO if we have one. */
|
/* Schedule submission of the next plugged BIO if we have one. */
|
||||||
if (!bio_list_empty(&zwplug->bio_list)) {
|
if (!bio_list_empty(&zwplug->bio_list)) {
|
||||||
disk_zone_wplug_schedule_bio_work(disk, zwplug);
|
disk_zone_wplug_schedule_bio_work(disk, zwplug);
|
||||||
@ -1262,12 +1168,13 @@ void blk_zone_write_plug_bio_endio(struct bio *bio)
|
|||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* If the BIO failed, mark the plug as having an error to trigger
|
* If the BIO failed, abort all plugged BIOs and mark the plug as
|
||||||
* recovery.
|
* needing a write pointer update.
|
||||||
*/
|
*/
|
||||||
if (bio->bi_status != BLK_STS_OK) {
|
if (bio->bi_status != BLK_STS_OK) {
|
||||||
spin_lock_irqsave(&zwplug->lock, flags);
|
spin_lock_irqsave(&zwplug->lock, flags);
|
||||||
disk_zone_wplug_set_error(disk, zwplug);
|
disk_zone_wplug_abort(zwplug);
|
||||||
|
zwplug->flags |= BLK_ZONE_WPLUG_NEED_WP_UPDATE;
|
||||||
spin_unlock_irqrestore(&zwplug->lock, flags);
|
spin_unlock_irqrestore(&zwplug->lock, flags);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1323,6 +1230,7 @@ static void blk_zone_wplug_bio_work(struct work_struct *work)
|
|||||||
*/
|
*/
|
||||||
spin_lock_irqsave(&zwplug->lock, flags);
|
spin_lock_irqsave(&zwplug->lock, flags);
|
||||||
|
|
||||||
|
again:
|
||||||
bio = bio_list_pop(&zwplug->bio_list);
|
bio = bio_list_pop(&zwplug->bio_list);
|
||||||
if (!bio) {
|
if (!bio) {
|
||||||
zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;
|
zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;
|
||||||
@ -1331,10 +1239,8 @@ static void blk_zone_wplug_bio_work(struct work_struct *work)
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (!blk_zone_wplug_prepare_bio(zwplug, bio)) {
|
if (!blk_zone_wplug_prepare_bio(zwplug, bio)) {
|
||||||
/* Error recovery will decide what to do with the BIO. */
|
blk_zone_wplug_bio_io_error(zwplug, bio);
|
||||||
bio_list_add_head(&zwplug->bio_list, bio);
|
goto again;
|
||||||
spin_unlock_irqrestore(&zwplug->lock, flags);
|
|
||||||
goto put_zwplug;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
spin_unlock_irqrestore(&zwplug->lock, flags);
|
spin_unlock_irqrestore(&zwplug->lock, flags);
|
||||||
@ -1356,97 +1262,6 @@ static void blk_zone_wplug_bio_work(struct work_struct *work)
|
|||||||
disk_put_zone_wplug(zwplug);
|
disk_put_zone_wplug(zwplug);
|
||||||
}
|
}
|
||||||
|
|
||||||
static int blk_zone_wplug_report_zone_cb(struct blk_zone *zone,
|
|
||||||
unsigned int idx, void *data)
|
|
||||||
{
|
|
||||||
struct blk_zone *zonep = data;
|
|
||||||
|
|
||||||
*zonep = *zone;
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
static void disk_zone_wplug_handle_error(struct gendisk *disk,
|
|
||||||
struct blk_zone_wplug *zwplug)
|
|
||||||
{
|
|
||||||
sector_t zone_start_sector =
|
|
||||||
bdev_zone_sectors(disk->part0) * zwplug->zone_no;
|
|
||||||
unsigned int noio_flag;
|
|
||||||
struct blk_zone zone;
|
|
||||||
unsigned long flags;
|
|
||||||
int ret;
|
|
||||||
|
|
||||||
/* Get the current zone information from the device. */
|
|
||||||
noio_flag = memalloc_noio_save();
|
|
||||||
ret = disk->fops->report_zones(disk, zone_start_sector, 1,
|
|
||||||
blk_zone_wplug_report_zone_cb, &zone);
|
|
||||||
memalloc_noio_restore(noio_flag);
|
|
||||||
|
|
||||||
spin_lock_irqsave(&zwplug->lock, flags);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* A zone reset or finish may have cleared the error already. In such
|
|
||||||
* case, do nothing as the report zones may have seen the "old" write
|
|
||||||
* pointer value before the reset/finish operation completed.
|
|
||||||
*/
|
|
||||||
if (!(zwplug->flags & BLK_ZONE_WPLUG_ERROR))
|
|
||||||
goto unlock;
|
|
||||||
|
|
||||||
zwplug->flags &= ~BLK_ZONE_WPLUG_ERROR;
|
|
||||||
|
|
||||||
if (ret != 1) {
|
|
||||||
/*
|
|
||||||
* We failed to get the zone information, meaning that something
|
|
||||||
* is likely really wrong with the device. Abort all remaining
|
|
||||||
* plugged BIOs as otherwise we could endup waiting forever on
|
|
||||||
* plugged BIOs to complete if there is a queue freeze on-going.
|
|
||||||
*/
|
|
||||||
disk_zone_wplug_abort(zwplug);
|
|
||||||
goto unplug;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Update the zone write pointer offset. */
|
|
||||||
zwplug->wp_offset = blk_zone_wp_offset(&zone);
|
|
||||||
disk_zone_wplug_abort_unaligned(disk, zwplug);
|
|
||||||
|
|
||||||
/* Restart BIO submission if we still have any BIO left. */
|
|
||||||
if (!bio_list_empty(&zwplug->bio_list)) {
|
|
||||||
disk_zone_wplug_schedule_bio_work(disk, zwplug);
|
|
||||||
goto unlock;
|
|
||||||
}
|
|
||||||
|
|
||||||
unplug:
|
|
||||||
zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;
|
|
||||||
if (disk_should_remove_zone_wplug(disk, zwplug))
|
|
||||||
disk_remove_zone_wplug(disk, zwplug);
|
|
||||||
|
|
||||||
unlock:
|
|
||||||
spin_unlock_irqrestore(&zwplug->lock, flags);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void disk_zone_wplugs_work(struct work_struct *work)
|
|
||||||
{
|
|
||||||
struct gendisk *disk =
|
|
||||||
container_of(work, struct gendisk, zone_wplugs_work);
|
|
||||||
struct blk_zone_wplug *zwplug;
|
|
||||||
unsigned long flags;
|
|
||||||
|
|
||||||
spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
|
|
||||||
|
|
||||||
while (!list_empty(&disk->zone_wplugs_err_list)) {
|
|
||||||
zwplug = list_first_entry(&disk->zone_wplugs_err_list,
|
|
||||||
struct blk_zone_wplug, link);
|
|
||||||
list_del_init(&zwplug->link);
|
|
||||||
spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
|
|
||||||
|
|
||||||
disk_zone_wplug_handle_error(disk, zwplug);
|
|
||||||
disk_put_zone_wplug(zwplug);
|
|
||||||
|
|
||||||
spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
|
|
||||||
}
|
|
||||||
|
|
||||||
spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
|
|
||||||
}
|
|
||||||
|
|
||||||
static inline unsigned int disk_zone_wplugs_hash_size(struct gendisk *disk)
|
static inline unsigned int disk_zone_wplugs_hash_size(struct gendisk *disk)
|
||||||
{
|
{
|
||||||
return 1U << disk->zone_wplugs_hash_bits;
|
return 1U << disk->zone_wplugs_hash_bits;
|
||||||
@ -1455,8 +1270,6 @@ static inline unsigned int disk_zone_wplugs_hash_size(struct gendisk *disk)
|
|||||||
void disk_init_zone_resources(struct gendisk *disk)
|
void disk_init_zone_resources(struct gendisk *disk)
|
||||||
{
|
{
|
||||||
spin_lock_init(&disk->zone_wplugs_lock);
|
spin_lock_init(&disk->zone_wplugs_lock);
|
||||||
INIT_LIST_HEAD(&disk->zone_wplugs_err_list);
|
|
||||||
INIT_WORK(&disk->zone_wplugs_work, disk_zone_wplugs_work);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -1555,8 +1368,6 @@ void disk_free_zone_resources(struct gendisk *disk)
|
|||||||
if (!disk->zone_wplugs_pool)
|
if (!disk->zone_wplugs_pool)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
cancel_work_sync(&disk->zone_wplugs_work);
|
|
||||||
|
|
||||||
if (disk->zone_wplugs_wq) {
|
if (disk->zone_wplugs_wq) {
|
||||||
destroy_workqueue(disk->zone_wplugs_wq);
|
destroy_workqueue(disk->zone_wplugs_wq);
|
||||||
disk->zone_wplugs_wq = NULL;
|
disk->zone_wplugs_wq = NULL;
|
||||||
@ -1753,6 +1564,8 @@ static int blk_revalidate_seq_zone(struct blk_zone *zone, unsigned int idx,
|
|||||||
if (!disk->zone_wplugs_hash)
|
if (!disk->zone_wplugs_hash)
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
|
disk_zone_wplug_sync_wp_offset(disk, zone);
|
||||||
|
|
||||||
wp_offset = blk_zone_wp_offset(zone);
|
wp_offset = blk_zone_wp_offset(zone);
|
||||||
if (!wp_offset || wp_offset >= zone->capacity)
|
if (!wp_offset || wp_offset >= zone->capacity)
|
||||||
return 0;
|
return 0;
|
||||||
@ -1883,6 +1696,7 @@ int blk_revalidate_disk_zones(struct gendisk *disk)
|
|||||||
memalloc_noio_restore(noio_flag);
|
memalloc_noio_restore(noio_flag);
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
ret = disk->fops->report_zones(disk, 0, UINT_MAX,
|
ret = disk->fops->report_zones(disk, 0, UINT_MAX,
|
||||||
blk_revalidate_zone_cb, &args);
|
blk_revalidate_zone_cb, &args);
|
||||||
if (!ret) {
|
if (!ret) {
|
||||||
|
@ -200,8 +200,6 @@ struct gendisk {
|
|||||||
spinlock_t zone_wplugs_lock;
|
spinlock_t zone_wplugs_lock;
|
||||||
struct mempool_s *zone_wplugs_pool;
|
struct mempool_s *zone_wplugs_pool;
|
||||||
struct hlist_head *zone_wplugs_hash;
|
struct hlist_head *zone_wplugs_hash;
|
||||||
struct list_head zone_wplugs_err_list;
|
|
||||||
struct work_struct zone_wplugs_work;
|
|
||||||
struct workqueue_struct *zone_wplugs_wq;
|
struct workqueue_struct *zone_wplugs_wq;
|
||||||
#endif /* CONFIG_BLK_DEV_ZONED */
|
#endif /* CONFIG_BLK_DEV_ZONED */
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user