btrfs: introduce a new helper to submit write bio for repair

Both scrub and read-repair are utilizing a special repair writes that:

- Only writes back to a single device
  Even for read-repair on RAID56, we only update the corrupted data
  stripe itself, not triggering the full RMW path.

- Requires a valid @mirror_num
  For RAID56 case, only @mirror_num == 1 is valid.
  For non-RAID56 cases, we need @mirror_num to locate our stripe.

- No data csum generation needed

These two call sites still have some differences though:

- Read-repair goes plain bio
  It doesn't need a full btrfs_bio, and goes submit_bio_wait().

- New scrub repair would go btrfs_bio
  To simplify both read and write path.

So here this patch would:

- Introduce a common helper, btrfs_map_repair_block()
  Due to the single device nature, we can use an on-stack
  btrfs_io_stripe to pass device and its physical bytenr.

- Introduce a new interface, btrfs_submit_repair_bio(), for later scrub
  code
  This is for the incoming scrub code.

Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
This commit is contained in:
Qu Wenruo 2023-03-20 10:12:49 +08:00 committed by David Sterba
parent 4317ff0056
commit 4886ff7b50
5 changed files with 132 additions and 44 deletions

View File

@ -735,12 +735,9 @@ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
u64 length, u64 logical, struct page *page, u64 length, u64 logical, struct page *page,
unsigned int pg_offset, int mirror_num) unsigned int pg_offset, int mirror_num)
{ {
struct btrfs_device *dev; struct btrfs_io_stripe smap = { 0 };
struct bio_vec bvec; struct bio_vec bvec;
struct bio bio; struct bio bio;
u64 map_length = 0;
u64 sector;
struct btrfs_io_context *bioc = NULL;
int ret = 0; int ret = 0;
ASSERT(!(fs_info->sb->s_flags & SB_RDONLY)); ASSERT(!(fs_info->sb->s_flags & SB_RDONLY));
@ -749,68 +746,38 @@ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
if (btrfs_repair_one_zone(fs_info, logical)) if (btrfs_repair_one_zone(fs_info, logical))
return 0; return 0;
map_length = length;
/* /*
* Avoid races with device replace and make sure our bioc has devices * Avoid races with device replace and make sure our bioc has devices
* associated to its stripes that don't go away while we are doing the * associated to its stripes that don't go away while we are doing the
* read repair operation. * read repair operation.
*/ */
btrfs_bio_counter_inc_blocked(fs_info); btrfs_bio_counter_inc_blocked(fs_info);
if (btrfs_is_parity_mirror(fs_info, logical, length)) { ret = btrfs_map_repair_block(fs_info, &smap, logical, length, mirror_num);
/* if (ret < 0)
* Note that we don't use BTRFS_MAP_WRITE because it's supposed goto out_counter_dec;
* to update all raid stripes, but here we just want to correct
* bad stripe, thus BTRFS_MAP_READ is abused to only get the bad
* stripe's dev and sector.
*/
ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
&map_length, &bioc, 0);
if (ret)
goto out_counter_dec;
ASSERT(bioc->mirror_num == 1);
} else {
ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical,
&map_length, &bioc, mirror_num);
if (ret)
goto out_counter_dec;
/*
* This happens when dev-replace is also running, and the
* mirror_num indicates the dev-replace target.
*
* In this case, we don't need to do anything, as the read
* error just means the replace progress hasn't reached our
* read range, and later replace routine would handle it well.
*/
if (mirror_num != bioc->mirror_num)
goto out_counter_dec;
}
sector = bioc->stripes[bioc->mirror_num - 1].physical >> 9; if (!smap.dev->bdev ||
dev = bioc->stripes[bioc->mirror_num - 1].dev; !test_bit(BTRFS_DEV_STATE_WRITEABLE, &smap.dev->dev_state)) {
btrfs_put_bioc(bioc);
if (!dev || !dev->bdev ||
!test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
ret = -EIO; ret = -EIO;
goto out_counter_dec; goto out_counter_dec;
} }
bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC); bio_init(&bio, smap.dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC);
bio.bi_iter.bi_sector = sector; bio.bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT;
__bio_add_page(&bio, page, length, pg_offset); __bio_add_page(&bio, page, length, pg_offset);
btrfsic_check_bio(&bio); btrfsic_check_bio(&bio);
ret = submit_bio_wait(&bio); ret = submit_bio_wait(&bio);
if (ret) { if (ret) {
/* try to remap that extent elsewhere? */ /* try to remap that extent elsewhere? */
btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); btrfs_dev_stat_inc_and_print(smap.dev, BTRFS_DEV_STAT_WRITE_ERRS);
goto out_bio_uninit; goto out_bio_uninit;
} }
btrfs_info_rl_in_rcu(fs_info, btrfs_info_rl_in_rcu(fs_info,
"read error corrected: ino %llu off %llu (dev %s sector %llu)", "read error corrected: ino %llu off %llu (dev %s sector %llu)",
ino, start, btrfs_dev_name(dev), sector); ino, start, btrfs_dev_name(smap.dev),
smap.physical >> SECTOR_SHIFT);
ret = 0; ret = 0;
out_bio_uninit: out_bio_uninit:
@ -820,6 +787,45 @@ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
return ret; return ret;
} }
/*
* Submit a btrfs_bio based repair write.
*
* If @dev_replace is true, the write would be submitted to dev-replace target.
*/
void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_replace)
{
struct btrfs_fs_info *fs_info = bbio->fs_info;
u64 logical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
u64 length = bbio->bio.bi_iter.bi_size;
struct btrfs_io_stripe smap = { 0 };
int ret;
ASSERT(fs_info);
ASSERT(mirror_num > 0);
ASSERT(btrfs_op(&bbio->bio) == BTRFS_MAP_WRITE);
ASSERT(!bbio->inode);
btrfs_bio_counter_inc_blocked(fs_info);
ret = btrfs_map_repair_block(fs_info, &smap, logical, length, mirror_num);
if (ret < 0)
goto fail;
if (dev_replace) {
if (btrfs_op(&bbio->bio) == BTRFS_MAP_WRITE && btrfs_is_zoned(fs_info)) {
bbio->bio.bi_opf &= ~REQ_OP_WRITE;
bbio->bio.bi_opf |= REQ_OP_ZONE_APPEND;
}
ASSERT(smap.dev == fs_info->dev_replace.srcdev);
smap.dev = fs_info->dev_replace.tgtdev;
}
__btrfs_submit_bio(&bbio->bio, NULL, &smap, mirror_num);
return;
fail:
btrfs_bio_counter_dec(fs_info);
btrfs_bio_end_io(bbio, errno_to_blk_status(ret));
}
int __init btrfs_bioset_init(void) int __init btrfs_bioset_init(void)
{ {
if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE, if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE,

View File

@ -98,6 +98,7 @@ static inline void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
#define REQ_BTRFS_CGROUP_PUNT REQ_FS_PRIVATE #define REQ_BTRFS_CGROUP_PUNT REQ_FS_PRIVATE
void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num); void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num);
void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_replace);
int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
u64 length, u64 logical, struct page *page, u64 length, u64 logical, struct page *page,
unsigned int pg_offset, int mirror_num); unsigned int pg_offset, int mirror_num);

View File

@ -170,6 +170,11 @@ static inline int nr_data_stripes(const struct map_lookup *map)
return map->num_stripes - btrfs_nr_parity_stripes(map->type); return map->num_stripes - btrfs_nr_parity_stripes(map->type);
} }
static inline int nr_bioc_data_stripes(const struct btrfs_io_context *bioc)
{
return bioc->num_stripes - btrfs_nr_parity_stripes(bioc->map_type);
}
#define RAID5_P_STRIPE ((u64)-2) #define RAID5_P_STRIPE ((u64)-2)
#define RAID6_Q_STRIPE ((u64)-1) #define RAID6_Q_STRIPE ((u64)-1)

View File

@ -8019,3 +8019,76 @@ bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical)
return true; return true;
} }
static void map_raid56_repair_block(struct btrfs_io_context *bioc,
struct btrfs_io_stripe *smap,
u64 logical)
{
int data_stripes = nr_bioc_data_stripes(bioc);
int i;
for (i = 0; i < data_stripes; i++) {
u64 stripe_start = bioc->full_stripe_logical +
(i << BTRFS_STRIPE_LEN_SHIFT);
if (logical >= stripe_start &&
logical < stripe_start + BTRFS_STRIPE_LEN)
break;
}
ASSERT(i < data_stripes);
smap->dev = bioc->stripes[i].dev;
smap->physical = bioc->stripes[i].physical +
((logical - bioc->full_stripe_logical) &
BTRFS_STRIPE_LEN_MASK);
}
/*
* Map a repair write into a single device.
*
* A repair write is triggered by read time repair or scrub, which would only
* update the contents of a single device.
* Not update any other mirrors nor go through RMW path.
*
* Callers should ensure:
*
* - Call btrfs_bio_counter_inc_blocked() first
* - The range does not cross stripe boundary
* - Has a valid @mirror_num passed in.
*/
int btrfs_map_repair_block(struct btrfs_fs_info *fs_info,
struct btrfs_io_stripe *smap, u64 logical,
u32 length, int mirror_num)
{
struct btrfs_io_context *bioc = NULL;
u64 map_length = length;
int mirror_ret = mirror_num;
int ret;
ASSERT(mirror_num > 0);
ret = __btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, &map_length,
&bioc, smap, &mirror_ret, true);
if (ret < 0)
return ret;
/* The map range should not cross stripe boundary. */
ASSERT(map_length >= length);
/* Already mapped to single stripe. */
if (!bioc)
goto out;
/* Map the RAID56 multi-stripe writes to a single one. */
if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
map_raid56_repair_block(bioc, smap, logical);
goto out;
}
ASSERT(mirror_num <= bioc->num_stripes);
smap->dev = bioc->stripes[mirror_num - 1].dev;
smap->physical = bioc->stripes[mirror_num - 1].physical;
out:
btrfs_put_bioc(bioc);
ASSERT(smap->dev);
return 0;
}

View File

@ -587,6 +587,9 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
struct btrfs_io_context **bioc_ret, struct btrfs_io_context **bioc_ret,
struct btrfs_io_stripe *smap, int *mirror_num_ret, struct btrfs_io_stripe *smap, int *mirror_num_ret,
int need_raid_map); int need_raid_map);
int btrfs_map_repair_block(struct btrfs_fs_info *fs_info,
struct btrfs_io_stripe *smap, u64 logical,
u32 length, int mirror_num);
struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
u64 logical, u64 *length_ret, u64 logical, u64 *length_ret,
u32 *num_stripes); u32 *num_stripes);