2018-04-03 19:23:33 +02:00
|
|
|
// SPDX-License-Identifier: GPL-2.0
|
2012-11-05 17:33:06 +01:00
|
|
|
/*
|
|
|
|
* Copyright (C) STRATO AG 2012. All rights reserved.
|
|
|
|
*/
|
2018-04-03 19:23:33 +02:00
|
|
|
|
2012-11-05 17:33:06 +01:00
|
|
|
#include <linux/sched.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/kthread.h>
#include <linux/math64.h>
#include <linux/string.h>
#include "misc.h"
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "async-thread.h"
#include "check-integrity.h"
#include "rcu-string.h"
#include "dev-replace.h"
#include "sysfs.h"
|
2012-11-05 17:33:06 +01:00
|
|
|
|
2020-01-23 15:44:50 +08:00
|
|
|
/*
|
|
|
|
* Device replace overview
|
|
|
|
*
|
|
|
|
* [Objective]
|
|
|
|
* To copy all extents (both new and on-disk) from source device to target
|
|
|
|
* device, while still keeping the filesystem read-write.
|
|
|
|
*
|
|
|
|
* [Method]
|
|
|
|
* There are two main methods involved:
|
|
|
|
*
|
|
|
|
* - Write duplication
|
|
|
|
*
|
|
|
|
* All new writes will be written to both target and source devices, so even
|
|
|
|
* if replace gets canceled, the source device still contains up-to-date data.
|
|
|
|
*
|
|
|
|
* Location: handle_ops_on_dev_replace() from __btrfs_map_block()
|
|
|
|
* Start: btrfs_dev_replace_start()
|
|
|
|
* End: btrfs_dev_replace_finishing()
|
|
|
|
* Content: Latest data/metadata
|
|
|
|
*
|
|
|
|
* - Copy existing extents
|
|
|
|
*
|
|
|
|
* This happens by re-using scrub facility, as scrub also iterates through
|
|
|
|
* existing extents from commit root.
|
|
|
|
*
|
|
|
|
* Location: scrub_write_block_to_dev_replace() from
|
|
|
|
* scrub_block_complete()
|
|
|
|
* Content: Data/meta from commit root.
|
|
|
|
*
|
|
|
|
* Due to the content difference, we need to avoid nocow write when dev-replace
|
|
|
|
* is happening. This is done by marking the block group read-only and waiting
|
|
|
|
* for NOCOW writes.
|
|
|
|
*
|
|
|
|
* After replace is done, the finishing part is done by swapping the target and
|
|
|
|
* source devices.
|
|
|
|
*
|
|
|
|
* Location: btrfs_dev_replace_update_device_in_mapping_tree() from
|
|
|
|
* btrfs_dev_replace_finishing()
|
|
|
|
*/
|
|
|
|
|
2012-11-05 17:33:06 +01:00
|
|
|
/*
 * Forward declarations of file-local helpers.  Their definitions are not part
 * of this section of the file.
 */
static int btrfs_dev_replace_finishing(
		struct btrfs_fs_info *fs_info,
		int scrub_ret);

static void btrfs_dev_replace_update_device_in_mapping_tree(
		struct btrfs_fs_info *fs_info,
		struct btrfs_device *srcdev,
		struct btrfs_device *tgtdev);

static int btrfs_dev_replace_kthread(
		void *data);
|
|
|
|
|
|
|
|
int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
|
|
|
|
{
|
|
|
|
struct btrfs_key key;
|
|
|
|
struct btrfs_root *dev_root = fs_info->dev_root;
|
|
|
|
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
|
|
|
|
struct extent_buffer *eb;
|
|
|
|
int slot;
|
|
|
|
int ret = 0;
|
|
|
|
struct btrfs_path *path = NULL;
|
|
|
|
int item_size;
|
|
|
|
struct btrfs_dev_replace_item *ptr;
|
|
|
|
u64 src_devid;
|
|
|
|
|
|
|
|
path = btrfs_alloc_path();
|
|
|
|
if (!path) {
|
|
|
|
ret = -ENOMEM;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
key.objectid = 0;
|
|
|
|
key.type = BTRFS_DEV_REPLACE_KEY;
|
|
|
|
key.offset = 0;
|
|
|
|
ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
|
|
|
|
if (ret) {
|
|
|
|
no_valid_dev_replace_entry_found:
|
|
|
|
ret = 0;
|
|
|
|
dev_replace->replace_state =
|
2019-08-08 12:32:44 +08:00
|
|
|
BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED;
|
2012-11-05 17:33:06 +01:00
|
|
|
dev_replace->cont_reading_from_srcdev_mode =
|
|
|
|
BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS;
|
|
|
|
dev_replace->time_started = 0;
|
|
|
|
dev_replace->time_stopped = 0;
|
|
|
|
atomic64_set(&dev_replace->num_write_errors, 0);
|
|
|
|
atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
|
|
|
|
dev_replace->cursor_left = 0;
|
|
|
|
dev_replace->committed_cursor_left = 0;
|
|
|
|
dev_replace->cursor_left_last_write_of_item = 0;
|
|
|
|
dev_replace->cursor_right = 0;
|
|
|
|
dev_replace->srcdev = NULL;
|
|
|
|
dev_replace->tgtdev = NULL;
|
|
|
|
dev_replace->is_valid = 0;
|
|
|
|
dev_replace->item_needs_writeback = 0;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
slot = path->slots[0];
|
|
|
|
eb = path->nodes[0];
|
|
|
|
item_size = btrfs_item_size_nr(eb, slot);
|
|
|
|
ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_replace_item);
|
|
|
|
|
|
|
|
if (item_size != sizeof(struct btrfs_dev_replace_item)) {
|
2013-12-20 11:37:06 -05:00
|
|
|
btrfs_warn(fs_info,
|
|
|
|
"dev_replace entry found has unexpected size, ignore entry");
|
2012-11-05 17:33:06 +01:00
|
|
|
goto no_valid_dev_replace_entry_found;
|
|
|
|
}
|
|
|
|
|
|
|
|
src_devid = btrfs_dev_replace_src_devid(eb, ptr);
|
|
|
|
dev_replace->cont_reading_from_srcdev_mode =
|
|
|
|
btrfs_dev_replace_cont_reading_from_srcdev_mode(eb, ptr);
|
|
|
|
dev_replace->replace_state = btrfs_dev_replace_replace_state(eb, ptr);
|
|
|
|
dev_replace->time_started = btrfs_dev_replace_time_started(eb, ptr);
|
|
|
|
dev_replace->time_stopped =
|
|
|
|
btrfs_dev_replace_time_stopped(eb, ptr);
|
|
|
|
atomic64_set(&dev_replace->num_write_errors,
|
|
|
|
btrfs_dev_replace_num_write_errors(eb, ptr));
|
|
|
|
atomic64_set(&dev_replace->num_uncorrectable_read_errors,
|
|
|
|
btrfs_dev_replace_num_uncorrectable_read_errors(eb, ptr));
|
|
|
|
dev_replace->cursor_left = btrfs_dev_replace_cursor_left(eb, ptr);
|
|
|
|
dev_replace->committed_cursor_left = dev_replace->cursor_left;
|
|
|
|
dev_replace->cursor_left_last_write_of_item = dev_replace->cursor_left;
|
|
|
|
dev_replace->cursor_right = btrfs_dev_replace_cursor_right(eb, ptr);
|
|
|
|
dev_replace->is_valid = 1;
|
|
|
|
|
|
|
|
dev_replace->item_needs_writeback = 0;
|
|
|
|
switch (dev_replace->replace_state) {
|
|
|
|
case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
|
|
|
|
case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
|
|
|
|
case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
|
|
|
|
dev_replace->srcdev = NULL;
|
|
|
|
dev_replace->tgtdev = NULL;
|
|
|
|
break;
|
|
|
|
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
|
|
|
|
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
|
2019-01-17 23:32:31 +08:00
|
|
|
dev_replace->srcdev = btrfs_find_device(fs_info->fs_devices,
|
2019-01-19 14:48:55 +08:00
|
|
|
src_devid, NULL, NULL, true);
|
2019-01-17 23:32:31 +08:00
|
|
|
dev_replace->tgtdev = btrfs_find_device(fs_info->fs_devices,
|
2012-11-05 17:33:06 +01:00
|
|
|
BTRFS_DEV_REPLACE_DEVID,
|
2019-01-19 14:48:55 +08:00
|
|
|
NULL, NULL, true);
|
2012-11-05 17:33:06 +01:00
|
|
|
/*
|
|
|
|
* allow 'btrfs dev replace_cancel' if src/tgt device is
|
|
|
|
* missing
|
|
|
|
*/
|
|
|
|
if (!dev_replace->srcdev &&
|
2016-06-22 18:54:23 -04:00
|
|
|
!btrfs_test_opt(fs_info, DEGRADED)) {
|
2012-11-05 17:33:06 +01:00
|
|
|
ret = -EIO;
|
2013-12-20 11:37:06 -05:00
|
|
|
btrfs_warn(fs_info,
|
|
|
|
"cannot mount because device replace operation is ongoing and");
|
|
|
|
btrfs_warn(fs_info,
|
|
|
|
"srcdev (devid %llu) is missing, need to run 'btrfs dev scan'?",
|
|
|
|
src_devid);
|
2012-11-05 17:33:06 +01:00
|
|
|
}
|
|
|
|
if (!dev_replace->tgtdev &&
|
2016-06-22 18:54:23 -04:00
|
|
|
!btrfs_test_opt(fs_info, DEGRADED)) {
|
2012-11-05 17:33:06 +01:00
|
|
|
ret = -EIO;
|
2013-12-20 11:37:06 -05:00
|
|
|
btrfs_warn(fs_info,
|
|
|
|
"cannot mount because device replace operation is ongoing and");
|
|
|
|
btrfs_warn(fs_info,
|
|
|
|
"tgtdev (devid %llu) is missing, need to run 'btrfs dev scan'?",
|
2013-08-20 13:20:08 +02:00
|
|
|
BTRFS_DEV_REPLACE_DEVID);
|
2012-11-05 17:33:06 +01:00
|
|
|
}
|
|
|
|
if (dev_replace->tgtdev) {
|
|
|
|
if (dev_replace->srcdev) {
|
|
|
|
dev_replace->tgtdev->total_bytes =
|
|
|
|
dev_replace->srcdev->total_bytes;
|
|
|
|
dev_replace->tgtdev->disk_total_bytes =
|
|
|
|
dev_replace->srcdev->disk_total_bytes;
|
2014-09-03 21:35:33 +08:00
|
|
|
dev_replace->tgtdev->commit_total_bytes =
|
|
|
|
dev_replace->srcdev->commit_total_bytes;
|
2012-11-05 17:33:06 +01:00
|
|
|
dev_replace->tgtdev->bytes_used =
|
|
|
|
dev_replace->srcdev->bytes_used;
|
2014-09-03 21:35:34 +08:00
|
|
|
dev_replace->tgtdev->commit_bytes_used =
|
|
|
|
dev_replace->srcdev->commit_bytes_used;
|
2012-11-05 17:33:06 +01:00
|
|
|
}
|
2017-12-04 12:54:55 +08:00
|
|
|
set_bit(BTRFS_DEV_STATE_REPLACE_TGT,
|
|
|
|
&dev_replace->tgtdev->dev_state);
|
2018-02-12 23:36:25 +08:00
|
|
|
|
|
|
|
WARN_ON(fs_info->fs_devices->rw_devices == 0);
|
|
|
|
dev_replace->tgtdev->io_width = fs_info->sectorsize;
|
|
|
|
dev_replace->tgtdev->io_align = fs_info->sectorsize;
|
|
|
|
dev_replace->tgtdev->sector_size = fs_info->sectorsize;
|
|
|
|
dev_replace->tgtdev->fs_info = fs_info;
|
|
|
|
set_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
|
|
|
|
&dev_replace->tgtdev->dev_state);
|
2012-11-05 17:33:06 +01:00
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
out:
|
2015-08-19 14:55:00 +09:00
|
|
|
btrfs_free_path(path);
|
2012-11-05 17:33:06 +01:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2018-03-20 16:09:48 +01:00
|
|
|
/*
|
|
|
|
* Initialize a new device for device replace target from a given source dev
|
|
|
|
* and path.
|
|
|
|
*
|
|
|
|
* Return 0 and new device in @device_out, otherwise return < 0
|
|
|
|
*/
|
|
|
|
static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
|
|
|
|
const char *device_path,
|
|
|
|
struct btrfs_device *srcdev,
|
|
|
|
struct btrfs_device **device_out)
|
|
|
|
{
|
|
|
|
struct btrfs_device *device;
|
|
|
|
struct block_device *bdev;
|
|
|
|
struct list_head *devices;
|
|
|
|
struct rcu_string *name;
|
|
|
|
u64 devid = BTRFS_DEV_REPLACE_DEVID;
|
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
*device_out = NULL;
|
|
|
|
if (fs_info->fs_devices->seeding) {
|
|
|
|
btrfs_err(fs_info, "the filesystem is a seed filesystem!");
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
|
|
|
|
fs_info->bdev_holder);
|
|
|
|
if (IS_ERR(bdev)) {
|
|
|
|
btrfs_err(fs_info, "target device %s is invalid!", device_path);
|
|
|
|
return PTR_ERR(bdev);
|
|
|
|
}
|
|
|
|
|
2019-05-14 13:54:38 +03:00
|
|
|
sync_blockdev(bdev);
|
2018-03-20 16:09:48 +01:00
|
|
|
|
|
|
|
devices = &fs_info->fs_devices->devices;
|
|
|
|
list_for_each_entry(device, devices, dev_list) {
|
|
|
|
if (device->bdev == bdev) {
|
|
|
|
btrfs_err(fs_info,
|
|
|
|
"target device is in the filesystem!");
|
|
|
|
ret = -EEXIST;
|
|
|
|
goto error;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if (i_size_read(bdev->bd_inode) <
|
|
|
|
btrfs_device_get_total_bytes(srcdev)) {
|
|
|
|
btrfs_err(fs_info,
|
|
|
|
"target device is smaller than source device!");
|
|
|
|
ret = -EINVAL;
|
|
|
|
goto error;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
device = btrfs_alloc_device(NULL, &devid, NULL);
|
|
|
|
if (IS_ERR(device)) {
|
|
|
|
ret = PTR_ERR(device);
|
|
|
|
goto error;
|
|
|
|
}
|
|
|
|
|
|
|
|
name = rcu_string_strdup(device_path, GFP_KERNEL);
|
|
|
|
if (!name) {
|
|
|
|
btrfs_free_device(device);
|
|
|
|
ret = -ENOMEM;
|
|
|
|
goto error;
|
|
|
|
}
|
|
|
|
rcu_assign_pointer(device->name, name);
|
|
|
|
|
|
|
|
set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
|
|
|
|
device->generation = 0;
|
|
|
|
device->io_width = fs_info->sectorsize;
|
|
|
|
device->io_align = fs_info->sectorsize;
|
|
|
|
device->sector_size = fs_info->sectorsize;
|
|
|
|
device->total_bytes = btrfs_device_get_total_bytes(srcdev);
|
|
|
|
device->disk_total_bytes = btrfs_device_get_disk_total_bytes(srcdev);
|
|
|
|
device->bytes_used = btrfs_device_get_bytes_used(srcdev);
|
|
|
|
device->commit_total_bytes = srcdev->commit_total_bytes;
|
|
|
|
device->commit_bytes_used = device->bytes_used;
|
|
|
|
device->fs_info = fs_info;
|
|
|
|
device->bdev = bdev;
|
|
|
|
set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
|
|
|
|
set_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
|
|
|
|
device->mode = FMODE_EXCL;
|
|
|
|
device->dev_stats_valid = 1;
|
|
|
|
set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
|
|
|
|
device->fs_devices = fs_info->fs_devices;
|
2019-05-14 13:54:39 +03:00
|
|
|
|
|
|
|
mutex_lock(&fs_info->fs_devices->device_list_mutex);
|
2018-03-20 16:09:48 +01:00
|
|
|
list_add(&device->dev_list, &fs_info->fs_devices->devices);
|
|
|
|
fs_info->fs_devices->num_devices++;
|
|
|
|
fs_info->fs_devices->open_devices++;
|
|
|
|
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
|
|
|
|
|
|
|
|
*device_out = device;
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
error:
|
|
|
|
blkdev_put(bdev, FMODE_EXCL);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2012-11-05 17:33:06 +01:00
|
|
|
/*
|
|
|
|
* called from commit_transaction. Writes changed device replace state to
|
|
|
|
* disk.
|
|
|
|
*/
|
2019-03-20 16:51:44 +01:00
|
|
|
int btrfs_run_dev_replace(struct btrfs_trans_handle *trans)
|
2012-11-05 17:33:06 +01:00
|
|
|
{
|
2019-03-20 16:51:44 +01:00
|
|
|
struct btrfs_fs_info *fs_info = trans->fs_info;
|
2012-11-05 17:33:06 +01:00
|
|
|
int ret;
|
|
|
|
struct btrfs_root *dev_root = fs_info->dev_root;
|
|
|
|
struct btrfs_path *path;
|
|
|
|
struct btrfs_key key;
|
|
|
|
struct extent_buffer *eb;
|
|
|
|
struct btrfs_dev_replace_item *ptr;
|
|
|
|
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
|
|
|
|
|
2018-09-07 16:11:23 +02:00
|
|
|
down_read(&dev_replace->rwsem);
|
2012-11-05 17:33:06 +01:00
|
|
|
if (!dev_replace->is_valid ||
|
|
|
|
!dev_replace->item_needs_writeback) {
|
2018-09-07 16:11:23 +02:00
|
|
|
up_read(&dev_replace->rwsem);
|
2012-11-05 17:33:06 +01:00
|
|
|
return 0;
|
|
|
|
}
|
2018-09-07 16:11:23 +02:00
|
|
|
up_read(&dev_replace->rwsem);
|
2012-11-05 17:33:06 +01:00
|
|
|
|
|
|
|
key.objectid = 0;
|
|
|
|
key.type = BTRFS_DEV_REPLACE_KEY;
|
|
|
|
key.offset = 0;
|
|
|
|
|
|
|
|
path = btrfs_alloc_path();
|
|
|
|
if (!path) {
|
|
|
|
ret = -ENOMEM;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
|
|
|
|
if (ret < 0) {
|
2016-09-20 10:05:00 -04:00
|
|
|
btrfs_warn(fs_info,
|
|
|
|
"error %d while searching for dev_replace item!",
|
|
|
|
ret);
|
2012-11-05 17:33:06 +01:00
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (ret == 0 &&
|
|
|
|
btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
|
|
|
|
/*
|
|
|
|
* need to delete old one and insert a new one.
|
|
|
|
* Since no attempt is made to recover any old state, if the
|
|
|
|
* dev_replace state is 'running', the data on the target
|
|
|
|
* drive is lost.
|
|
|
|
* It would be possible to recover the state: just make sure
|
|
|
|
* that the beginning of the item is never changed and always
|
|
|
|
* contains all the essential information. Then read this
|
|
|
|
* minimal set of information and use it as a base for the
|
|
|
|
* new state.
|
|
|
|
*/
|
|
|
|
ret = btrfs_del_item(trans, dev_root, path);
|
|
|
|
if (ret != 0) {
|
2016-09-20 10:05:00 -04:00
|
|
|
btrfs_warn(fs_info,
|
|
|
|
"delete too small dev_replace item failed %d!",
|
|
|
|
ret);
|
2012-11-05 17:33:06 +01:00
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
ret = 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (ret == 1) {
|
|
|
|
/* need to insert a new item */
|
|
|
|
btrfs_release_path(path);
|
|
|
|
ret = btrfs_insert_empty_item(trans, dev_root, path,
|
|
|
|
&key, sizeof(*ptr));
|
|
|
|
if (ret < 0) {
|
2016-09-20 10:05:00 -04:00
|
|
|
btrfs_warn(fs_info,
|
|
|
|
"insert dev_replace item failed %d!", ret);
|
2012-11-05 17:33:06 +01:00
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
eb = path->nodes[0];
|
|
|
|
ptr = btrfs_item_ptr(eb, path->slots[0],
|
|
|
|
struct btrfs_dev_replace_item);
|
|
|
|
|
2018-09-07 16:11:23 +02:00
|
|
|
down_write(&dev_replace->rwsem);
|
2012-11-05 17:33:06 +01:00
|
|
|
if (dev_replace->srcdev)
|
|
|
|
btrfs_set_dev_replace_src_devid(eb, ptr,
|
|
|
|
dev_replace->srcdev->devid);
|
|
|
|
else
|
|
|
|
btrfs_set_dev_replace_src_devid(eb, ptr, (u64)-1);
|
|
|
|
btrfs_set_dev_replace_cont_reading_from_srcdev_mode(eb, ptr,
|
|
|
|
dev_replace->cont_reading_from_srcdev_mode);
|
|
|
|
btrfs_set_dev_replace_replace_state(eb, ptr,
|
|
|
|
dev_replace->replace_state);
|
|
|
|
btrfs_set_dev_replace_time_started(eb, ptr, dev_replace->time_started);
|
|
|
|
btrfs_set_dev_replace_time_stopped(eb, ptr, dev_replace->time_stopped);
|
|
|
|
btrfs_set_dev_replace_num_write_errors(eb, ptr,
|
|
|
|
atomic64_read(&dev_replace->num_write_errors));
|
|
|
|
btrfs_set_dev_replace_num_uncorrectable_read_errors(eb, ptr,
|
|
|
|
atomic64_read(&dev_replace->num_uncorrectable_read_errors));
|
|
|
|
dev_replace->cursor_left_last_write_of_item =
|
|
|
|
dev_replace->cursor_left;
|
|
|
|
btrfs_set_dev_replace_cursor_left(eb, ptr,
|
|
|
|
dev_replace->cursor_left_last_write_of_item);
|
|
|
|
btrfs_set_dev_replace_cursor_right(eb, ptr,
|
|
|
|
dev_replace->cursor_right);
|
|
|
|
dev_replace->item_needs_writeback = 0;
|
2018-09-07 16:11:23 +02:00
|
|
|
up_write(&dev_replace->rwsem);
|
2012-11-05 17:33:06 +01:00
|
|
|
|
|
|
|
btrfs_mark_buffer_dirty(eb);
|
|
|
|
|
|
|
|
out:
|
|
|
|
btrfs_free_path(path);
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2017-11-28 10:43:10 +08:00
|
|
|
static char* btrfs_dev_name(struct btrfs_device *device)
|
|
|
|
{
|
2018-02-24 19:43:56 +08:00
|
|
|
if (!device || test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
|
2017-11-28 10:43:10 +08:00
|
|
|
return "<missing disk>";
|
|
|
|
else
|
|
|
|
return rcu_str_deref(device->name);
|
|
|
|
}
|
|
|
|
|
2018-11-11 22:22:16 +08:00
|
|
|
/*
 * Start replacing a source device with the block device @tgtdev_name.
 *
 * The source device is selected via @srcdevid / @srcdev_name and is refused
 * when it is pinned by an active swapfile.  A running transaction, if any, is
 * committed first, then the target device is created and the in-memory
 * replace state is switched to "started".  That state is committed to disk
 * and the copy itself is carried out by the scrub code over the whole source
 * device, followed by the finishing step.
 *
 * @read_src is stored as the cont_reading_from_srcdev_mode of the replace.
 *
 * Return: the result of the finishing step, with -EINPROGRESS translated to
 * BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS;
 * BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED if a replace is already
 * started or suspended; otherwise a negative errno from one of the setup
 * steps.
 */
static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
		const char *tgtdev_name, u64 srcdevid, const char *srcdev_name,
		int read_src)
{
	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
	struct btrfs_root *dev_root = fs_info->dev_root;
	struct btrfs_trans_handle *trans;
	struct btrfs_device *src = NULL;
	struct btrfs_device *tgt = NULL;
	int ret;

	src = btrfs_find_device_by_devspec(fs_info, srcdevid, srcdev_name);
	if (IS_ERR(src))
		return PTR_ERR(src);

	if (btrfs_pinned_by_swapfile(fs_info, src)) {
		btrfs_warn_in_rcu(fs_info,
	  "cannot replace device %s (devid %llu) due to active swapfile",
				  btrfs_dev_name(src), src->devid);
		return -ETXTBSY;
	}

	/*
	 * Commit the running transaction, if there is one, so that the
	 * commit_total_bytes of all the devices are up to date.  Having no
	 * transaction to attach to (-ENOENT) is fine.
	 */
	trans = btrfs_attach_transaction(dev_root);
	if (IS_ERR(trans)) {
		if (PTR_ERR(trans) != -ENOENT)
			return PTR_ERR(trans);
	} else {
		ret = btrfs_commit_transaction(trans);
		if (ret)
			return ret;
	}

	ret = btrfs_init_dev_replace_tgtdev(fs_info, tgtdev_name, src, &tgt);
	if (ret)
		return ret;

	down_write(&dev_replace->rwsem);
	switch (dev_replace->replace_state) {
	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
		break;
	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
		/* A replace is already in flight; this is not expected here. */
		ASSERT(0);
		ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED;
		up_write(&dev_replace->rwsem);
		goto leave;
	}

	dev_replace->cont_reading_from_srcdev_mode = read_src;
	dev_replace->srcdev = src;
	dev_replace->tgtdev = tgt;

	btrfs_info_in_rcu(fs_info,
		      "dev_replace from %s (devid %llu) to %s started",
		      btrfs_dev_name(src),
		      src->devid,
		      rcu_str_deref(tgt->name));

	/*
	 * Once the state is "started", writes to the source device are
	 * duplicated to the target device as well (refer to
	 * btrfs_map_block()).
	 */
	dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;
	dev_replace->time_started = ktime_get_real_seconds();
	dev_replace->cursor_left = 0;
	dev_replace->committed_cursor_left = 0;
	dev_replace->cursor_left_last_write_of_item = 0;
	dev_replace->cursor_right = 0;
	dev_replace->is_valid = 1;
	dev_replace->item_needs_writeback = 1;
	atomic64_set(&dev_replace->num_write_errors, 0);
	atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
	up_write(&dev_replace->rwsem);

	/* A sysfs failure is reported but does not abort the replace. */
	ret = btrfs_sysfs_add_devices_dir(tgt->fs_devices, tgt);
	if (ret)
		btrfs_err(fs_info, "kobj add dev failed %d", ret);

	btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);

	/* Commit dev_replace state and reserve 1 item for it. */
	trans = btrfs_start_transaction(dev_root, 1);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		/* Roll the in-memory state back before dropping the target. */
		down_write(&dev_replace->rwsem);
		dev_replace->replace_state =
			BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED;
		dev_replace->srcdev = NULL;
		dev_replace->tgtdev = NULL;
		up_write(&dev_replace->rwsem);
		goto leave;
	}

	ret = btrfs_commit_transaction(trans);
	WARN_ON(ret);

	/* The disk copy procedure reuses the scrub code. */
	ret = btrfs_scrub_dev(fs_info, src->devid, 0,
			      btrfs_device_get_total_bytes(src),
			      &dev_replace->scrub_progress, 0, 1);

	ret = btrfs_dev_replace_finishing(fs_info, ret);
	if (ret == -EINPROGRESS)
		ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS;

	return ret;

leave:
	btrfs_destroy_dev_replace_tgtdev(tgt);
	return ret;
}
|
|
|
|
|
2016-06-22 18:54:24 -04:00
|
|
|
int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info,
|
2016-03-24 18:48:14 +08:00
|
|
|
struct btrfs_ioctl_dev_replace_args *args)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
switch (args->start.cont_reading_from_srcdev_mode) {
|
|
|
|
case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS:
|
|
|
|
case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID:
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') ||
|
|
|
|
args->start.tgtdev_name[0] == '\0')
|
|
|
|
return -EINVAL;
|
|
|
|
|
2016-06-22 18:54:24 -04:00
|
|
|
ret = btrfs_dev_replace_start(fs_info, args->start.tgtdev_name,
|
2016-03-24 18:48:14 +08:00
|
|
|
args->start.srcdevid,
|
|
|
|
args->start.srcdev_name,
|
|
|
|
args->start.cont_reading_from_srcdev_mode);
|
|
|
|
args->result = ret;
|
|
|
|
/* don't warn if EINPROGRESS, someone else might be running scrub */
|
2018-11-11 22:22:24 +08:00
|
|
|
if (ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS ||
|
|
|
|
ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR)
|
|
|
|
return 0;
|
2016-03-24 18:48:14 +08:00
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
Btrfs: fix use-after-free in the finishing procedure of the device replace
During device replace test, we hit a null pointer deference (It was very easy
to reproduce it by running xfstests' btrfs/011 on the devices with the virtio
scsi driver). There were two bugs that caused this problem:
- We might allocate new chunks on the replaced device after we updated
the mapping tree. And we forgot to replace the source device in those
mapping of the new chunks.
- We might get the mapping information which including the source device
before the mapping information update. And then submit the bio which was
based on that mapping information after we freed the source device.
For the first bug, we can fix it by doing mapping tree update and source
device remove in the same context of the chunk mutex. The chunk mutex is
used to protect the allocable device list, the above method can avoid
the new chunk allocation, and after we remove the source device, all
the new chunks will be allocated on the new device. So it can fix
the first bug.
For the second bug, we need make sure all flighting bios are finished and
no new bios are produced during we are removing the source device. To fix
this problem, we introduced a global @bio_counter, we not only inc/dec
@bio_counter outsize of map_blocks, but also inc it before submitting bio
and dec @bio_counter when ending bios.
Since Raid56 is a little different and device replace dosen't support raid56
yet, it is not addressed in the patch and I add comments to make sure we will
fix it in the future.
Reported-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com>
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
2014-01-30 16:46:55 +08:00
|
|
|
/*
|
2016-05-19 21:18:45 -04:00
|
|
|
* blocked until all in-flight bios operations are finished.
|
Btrfs: fix use-after-free in the finishing procedure of the device replace
During device replace test, we hit a null pointer deference (It was very easy
to reproduce it by running xfstests' btrfs/011 on the devices with the virtio
scsi driver). There were two bugs that caused this problem:
- We might allocate new chunks on the replaced device after we updated
the mapping tree. And we forgot to replace the source device in those
mapping of the new chunks.
- We might get the mapping information which including the source device
before the mapping information update. And then submit the bio which was
based on that mapping information after we freed the source device.
For the first bug, we can fix it by doing mapping tree update and source
device remove in the same context of the chunk mutex. The chunk mutex is
used to protect the allocable device list, the above method can avoid
the new chunk allocation, and after we remove the source device, all
the new chunks will be allocated on the new device. So it can fix
the first bug.
For the second bug, we need make sure all flighting bios are finished and
no new bios are produced during we are removing the source device. To fix
this problem, we introduced a global @bio_counter, we not only inc/dec
@bio_counter outsize of map_blocks, but also inc it before submitting bio
and dec @bio_counter when ending bios.
Since Raid56 is a little different and device replace dosen't support raid56
yet, it is not addressed in the patch and I add comments to make sure we will
fix it in the future.
Reported-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com>
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
2014-01-30 16:46:55 +08:00
|
|
|
*/
|
|
|
|
static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info)
|
|
|
|
{
|
|
|
|
set_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
|
2018-04-05 01:04:49 +02:00
|
|
|
wait_event(fs_info->dev_replace.replace_wait, !percpu_counter_sum(
|
|
|
|
&fs_info->dev_replace.bio_counter));
|
Btrfs: fix use-after-free in the finishing procedure of the device replace
During device replace test, we hit a null pointer deference (It was very easy
to reproduce it by running xfstests' btrfs/011 on the devices with the virtio
scsi driver). There were two bugs that caused this problem:
- We might allocate new chunks on the replaced device after we updated
the mapping tree. And we forgot to replace the source device in those
mapping of the new chunks.
- We might get the mapping information which including the source device
before the mapping information update. And then submit the bio which was
based on that mapping information after we freed the source device.
For the first bug, we can fix it by doing mapping tree update and source
device remove in the same context of the chunk mutex. The chunk mutex is
used to protect the allocable device list, the above method can avoid
the new chunk allocation, and after we remove the source device, all
the new chunks will be allocated on the new device. So it can fix
the first bug.
For the second bug, we need make sure all flighting bios are finished and
no new bios are produced during we are removing the source device. To fix
this problem, we introduced a global @bio_counter, we not only inc/dec
@bio_counter outsize of map_blocks, but also inc it before submitting bio
and dec @bio_counter when ending bios.
Since Raid56 is a little different and device replace dosen't support raid56
yet, it is not addressed in the patch and I add comments to make sure we will
fix it in the future.
Reported-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com>
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
2014-01-30 16:46:55 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* we have removed target device, it is safe to allow new bios request.
|
|
|
|
*/
|
|
|
|
static void btrfs_rm_dev_replace_unblocked(struct btrfs_fs_info *fs_info)
|
|
|
|
{
|
|
|
|
clear_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
|
2018-04-05 01:04:49 +02:00
|
|
|
wake_up(&fs_info->dev_replace.replace_wait);
|
Btrfs: fix use-after-free in the finishing procedure of the device replace
During device replace test, we hit a null pointer deference (It was very easy
to reproduce it by running xfstests' btrfs/011 on the devices with the virtio
scsi driver). There were two bugs that caused this problem:
- We might allocate new chunks on the replaced device after we updated
the mapping tree. And we forgot to replace the source device in those
mapping of the new chunks.
- We might get the mapping information which including the source device
before the mapping information update. And then submit the bio which was
based on that mapping information after we freed the source device.
For the first bug, we can fix it by doing mapping tree update and source
device remove in the same context of the chunk mutex. The chunk mutex is
used to protect the allocable device list, the above method can avoid
the new chunk allocation, and after we remove the source device, all
the new chunks will be allocated on the new device. So it can fix
the first bug.
For the second bug, we need make sure all flighting bios are finished and
no new bios are produced during we are removing the source device. To fix
this problem, we introduced a global @bio_counter, we not only inc/dec
@bio_counter outsize of map_blocks, but also inc it before submitting bio
and dec @bio_counter when ending bios.
Since Raid56 is a little different and device replace dosen't support raid56
yet, it is not addressed in the patch and I add comments to make sure we will
fix it in the future.
Reported-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com>
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
2014-01-30 16:46:55 +08:00
|
|
|
}
|
|
|
|
|
btrfs: fix filesystem corruption after a device replace
We use a device's allocation state tree to track ranges in a device used
for allocated chunks, and we set ranges in this tree when allocating a new
chunk. However after a device replace operation, we were not setting the
allocated ranges in the new device's allocation state tree, so that tree
is empty after a device replace.
This means that a fitrim operation after a device replace will trim the
device ranges that have allocated chunks and extents, as we trim every
range for which there is not a range marked in the device's allocation
state tree. It is also important during chunk allocation, since the
device's allocation state is used to determine if a range is already
allocated when allocating a new chunk.
This is trivial to reproduce and the following script triggers the bug:
$ cat reproducer.sh
#!/bin/bash
DEV1="/dev/sdg"
DEV2="/dev/sdh"
DEV3="/dev/sdi"
wipefs -a $DEV1 $DEV2 $DEV3 &> /dev/null
# Create a raid1 test fs on 2 devices.
mkfs.btrfs -f -m raid1 -d raid1 $DEV1 $DEV2 > /dev/null
mount $DEV1 /mnt/btrfs
xfs_io -f -c "pwrite -S 0xab 0 10M" /mnt/btrfs/foo
echo "Starting to replace $DEV1 with $DEV3"
btrfs replace start -B $DEV1 $DEV3 /mnt/btrfs
echo
echo "Running fstrim"
fstrim /mnt/btrfs
echo
echo "Unmounting filesystem"
umount /mnt/btrfs
echo "Mounting filesystem in degraded mode using $DEV3 only"
wipefs -a $DEV1 $DEV2 &> /dev/null
mount -o degraded $DEV3 /mnt/btrfs
if [ $? -ne 0 ]; then
dmesg | tail
echo
echo "Failed to mount in degraded mode"
exit 1
fi
echo
echo "File foo data (expected all bytes = 0xab):"
od -A d -t x1 /mnt/btrfs/foo
umount /mnt/btrfs
When running the reproducer:
$ ./replace-test.sh
wrote 10485760/10485760 bytes at offset 0
10 MiB, 2560 ops; 0.0901 sec (110.877 MiB/sec and 28384.5216 ops/sec)
Starting to replace /dev/sdg with /dev/sdi
Running fstrim
Unmounting filesystem
Mounting filesystem in degraded mode using /dev/sdi only
mount: /mnt/btrfs: wrong fs type, bad option, bad superblock on /dev/sdi, missing codepage or helper program, or other error.
[19581.748641] BTRFS info (device sdg): dev_replace from /dev/sdg (devid 1) to /dev/sdi started
[19581.803842] BTRFS info (device sdg): dev_replace from /dev/sdg (devid 1) to /dev/sdi finished
[19582.208293] BTRFS info (device sdi): allowing degraded mounts
[19582.208298] BTRFS info (device sdi): disk space caching is enabled
[19582.208301] BTRFS info (device sdi): has skinny extents
[19582.212853] BTRFS warning (device sdi): devid 2 uuid 1f731f47-e1bb-4f00-bfbb-9e5a0cb4ba9f is missing
[19582.213904] btree_readpage_end_io_hook: 25839 callbacks suppressed
[19582.213907] BTRFS error (device sdi): bad tree block start, want 30490624 have 0
[19582.214780] BTRFS warning (device sdi): failed to read root (objectid=7): -5
[19582.231576] BTRFS error (device sdi): open_ctree failed
Failed to mount in degraded mode
So fix by setting all allocated ranges in the replace target device when
the replace operation is finishing, when we are holding the chunk mutex
and we can not race with new chunk allocations.
A test case for fstests follows soon.
Fixes: 1c11b63eff2a67 ("btrfs: replace pending/pinned chunks lists with io tree")
CC: stable@vger.kernel.org # 5.2+
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-09-23 15:30:16 +01:00
|
|
|
/*
|
|
|
|
* When finishing the device replace, before swapping the source device with the
|
|
|
|
* target device we must update the chunk allocation state in the target device,
|
|
|
|
* as it is empty because replace works by directly copying the chunks and not
|
|
|
|
* through the normal chunk allocation path.
|
|
|
|
*/
|
|
|
|
static int btrfs_set_target_alloc_state(struct btrfs_device *srcdev,
|
|
|
|
struct btrfs_device *tgtdev)
|
|
|
|
{
|
|
|
|
struct extent_state *cached_state = NULL;
|
|
|
|
u64 start = 0;
|
|
|
|
u64 found_start;
|
|
|
|
u64 found_end;
|
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
lockdep_assert_held(&srcdev->fs_info->chunk_mutex);
|
|
|
|
|
|
|
|
while (!find_first_extent_bit(&srcdev->alloc_state, start,
|
|
|
|
&found_start, &found_end,
|
|
|
|
CHUNK_ALLOCATED, &cached_state)) {
|
|
|
|
ret = set_extent_bits(&tgtdev->alloc_state, found_start,
|
|
|
|
found_end, CHUNK_ALLOCATED);
|
|
|
|
if (ret)
|
|
|
|
break;
|
|
|
|
start = found_end + 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
free_extent_state(cached_state);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2012-11-05 17:33:06 +01:00
|
|
|
static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
|
|
|
|
int scrub_ret)
|
|
|
|
{
|
|
|
|
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
|
|
|
|
struct btrfs_device *tgt_device;
|
|
|
|
struct btrfs_device *src_device;
|
|
|
|
struct btrfs_root *root = fs_info->tree_root;
|
|
|
|
u8 uuid_tmp[BTRFS_UUID_SIZE];
|
|
|
|
struct btrfs_trans_handle *trans;
|
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
/* don't allow cancel or unmount to disturb the finishing procedure */
|
|
|
|
mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
|
|
|
|
|
2018-09-07 16:11:23 +02:00
|
|
|
down_read(&dev_replace->rwsem);
|
2012-11-05 17:33:06 +01:00
|
|
|
/* was the operation canceled, or is it finished? */
|
|
|
|
if (dev_replace->replace_state !=
|
|
|
|
BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) {
|
2018-09-07 16:11:23 +02:00
|
|
|
up_read(&dev_replace->rwsem);
|
2012-11-05 17:33:06 +01:00
|
|
|
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
tgt_device = dev_replace->tgtdev;
|
|
|
|
src_device = dev_replace->srcdev;
|
2018-09-07 16:11:23 +02:00
|
|
|
up_read(&dev_replace->rwsem);
|
2012-11-05 17:33:06 +01:00
|
|
|
|
|
|
|
/*
|
|
|
|
* flush all outstanding I/O and inode extent mappings before the
|
|
|
|
* copy operation is declared as being finished
|
|
|
|
*/
|
2020-07-21 10:22:12 -04:00
|
|
|
ret = btrfs_start_delalloc_roots(fs_info, U64_MAX);
|
2013-01-22 10:49:33 +00:00
|
|
|
if (ret) {
|
|
|
|
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
|
|
|
|
return ret;
|
|
|
|
}
|
2017-06-23 09:48:21 -07:00
|
|
|
btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
|
2012-11-05 17:33:06 +01:00
|
|
|
|
2019-05-17 10:44:25 +03:00
|
|
|
/*
|
|
|
|
* We have to use this loop approach because at this point src_device
|
|
|
|
* has to be available for transaction commit to complete, yet new
|
|
|
|
* chunks shouldn't be allocated on the device.
|
|
|
|
*/
|
|
|
|
while (1) {
|
|
|
|
trans = btrfs_start_transaction(root, 0);
|
|
|
|
if (IS_ERR(trans)) {
|
|
|
|
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
|
|
|
|
return PTR_ERR(trans);
|
|
|
|
}
|
|
|
|
ret = btrfs_commit_transaction(trans);
|
|
|
|
WARN_ON(ret);
|
|
|
|
|
|
|
|
/* Prevent write_all_supers() during the finishing procedure */
|
|
|
|
mutex_lock(&fs_info->fs_devices->device_list_mutex);
|
|
|
|
/* Prevent new chunks being allocated on the source device */
|
|
|
|
mutex_lock(&fs_info->chunk_mutex);
|
|
|
|
|
|
|
|
if (!list_empty(&src_device->post_commit_list)) {
|
|
|
|
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
|
|
|
|
mutex_unlock(&fs_info->chunk_mutex);
|
|
|
|
} else {
|
|
|
|
break;
|
|
|
|
}
|
2012-11-05 17:33:06 +01:00
|
|
|
}
|
|
|
|
|
2018-09-07 16:11:23 +02:00
|
|
|
down_write(&dev_replace->rwsem);
|
2012-11-05 17:33:06 +01:00
|
|
|
dev_replace->replace_state =
|
|
|
|
scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
|
|
|
|
: BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED;
|
|
|
|
dev_replace->tgtdev = NULL;
|
|
|
|
dev_replace->srcdev = NULL;
|
2018-06-12 17:18:25 +05:30
|
|
|
dev_replace->time_stopped = ktime_get_real_seconds();
|
2012-11-05 17:33:06 +01:00
|
|
|
dev_replace->item_needs_writeback = 1;
|
|
|
|
|
btrfs: fix filesystem corruption after a device replace
We use a device's allocation state tree to track ranges in a device used
for allocated chunks, and we set ranges in this tree when allocating a new
chunk. However after a device replace operation, we were not setting the
allocated ranges in the new device's allocation state tree, so that tree
is empty after a device replace.
This means that a fitrim operation after a device replace will trim the
device ranges that have allocated chunks and extents, as we trim every
range for which there is not a range marked in the device's allocation
state tree. It is also important during chunk allocation, since the
device's allocation state is used to determine if a range is already
allocated when allocating a new chunk.
This is trivial to reproduce and the following script triggers the bug:
$ cat reproducer.sh
#!/bin/bash
DEV1="/dev/sdg"
DEV2="/dev/sdh"
DEV3="/dev/sdi"
wipefs -a $DEV1 $DEV2 $DEV3 &> /dev/null
# Create a raid1 test fs on 2 devices.
mkfs.btrfs -f -m raid1 -d raid1 $DEV1 $DEV2 > /dev/null
mount $DEV1 /mnt/btrfs
xfs_io -f -c "pwrite -S 0xab 0 10M" /mnt/btrfs/foo
echo "Starting to replace $DEV1 with $DEV3"
btrfs replace start -B $DEV1 $DEV3 /mnt/btrfs
echo
echo "Running fstrim"
fstrim /mnt/btrfs
echo
echo "Unmounting filesystem"
umount /mnt/btrfs
echo "Mounting filesystem in degraded mode using $DEV3 only"
wipefs -a $DEV1 $DEV2 &> /dev/null
mount -o degraded $DEV3 /mnt/btrfs
if [ $? -ne 0 ]; then
dmesg | tail
echo
echo "Failed to mount in degraded mode"
exit 1
fi
echo
echo "File foo data (expected all bytes = 0xab):"
od -A d -t x1 /mnt/btrfs/foo
umount /mnt/btrfs
When running the reproducer:
$ ./replace-test.sh
wrote 10485760/10485760 bytes at offset 0
10 MiB, 2560 ops; 0.0901 sec (110.877 MiB/sec and 28384.5216 ops/sec)
Starting to replace /dev/sdg with /dev/sdi
Running fstrim
Unmounting filesystem
Mounting filesystem in degraded mode using /dev/sdi only
mount: /mnt/btrfs: wrong fs type, bad option, bad superblock on /dev/sdi, missing codepage or helper program, or other error.
[19581.748641] BTRFS info (device sdg): dev_replace from /dev/sdg (devid 1) to /dev/sdi started
[19581.803842] BTRFS info (device sdg): dev_replace from /dev/sdg (devid 1) to /dev/sdi finished
[19582.208293] BTRFS info (device sdi): allowing degraded mounts
[19582.208298] BTRFS info (device sdi): disk space caching is enabled
[19582.208301] BTRFS info (device sdi): has skinny extents
[19582.212853] BTRFS warning (device sdi): devid 2 uuid 1f731f47-e1bb-4f00-bfbb-9e5a0cb4ba9f is missing
[19582.213904] btree_readpage_end_io_hook: 25839 callbacks suppressed
[19582.213907] BTRFS error (device sdi): bad tree block start, want 30490624 have 0
[19582.214780] BTRFS warning (device sdi): failed to read root (objectid=7): -5
[19582.231576] BTRFS error (device sdi): open_ctree failed
Failed to mount in degraded mode
So fix by setting all allocated ranges in the replace target device when
the replace operation is finishing, when we are holding the chunk mutex
and we can not race with new chunk allocations.
A test case for fstests follows soon.
Fixes: 1c11b63eff2a67 ("btrfs: replace pending/pinned chunks lists with io tree")
CC: stable@vger.kernel.org # 5.2+
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-09-23 15:30:16 +01:00
|
|
|
/*
|
|
|
|
* Update allocation state in the new device and replace the old device
|
|
|
|
* with the new one in the mapping tree.
|
|
|
|
*/
|
Btrfs: fix use-after-free in the finishing procedure of the device replace
During device replace testing, we hit a null pointer dereference (it was very
easy to reproduce by running xfstests' btrfs/011 on devices with the virtio
scsi driver). There were two bugs that caused this problem:
- We might allocate new chunks on the replaced device after we updated
the mapping tree. And we forgot to replace the source device in those
mapping of the new chunks.
- We might get the mapping information which including the source device
before the mapping information update. And then submit the bio which was
based on that mapping information after we freed the source device.
For the first bug, we can fix it by doing mapping tree update and source
device remove in the same context of the chunk mutex. The chunk mutex is
used to protect the allocable device list, the above method can avoid
the new chunk allocation, and after we remove the source device, all
the new chunks will be allocated on the new device. So it can fix
the first bug.
For the second bug, we need to make sure all in-flight bios are finished and
no new bios are produced while we are removing the source device. To fix
this problem, we introduced a global @bio_counter; we not only inc/dec
@bio_counter outside of map_blocks, but also inc it before submitting a bio
and dec @bio_counter when ending bios.
Since RAID56 is a little different and device replace doesn't support RAID56
yet, it is not addressed in this patch; comments were added to make sure we
fix it in the future.
Reported-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com>
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
2014-01-30 16:46:55 +08:00
|
|
|
if (!scrub_ret) {
|
btrfs: fix filesystem corruption after a device replace
We use a device's allocation state tree to track ranges in a device used
for allocated chunks, and we set ranges in this tree when allocating a new
chunk. However after a device replace operation, we were not setting the
allocated ranges in the new device's allocation state tree, so that tree
is empty after a device replace.
This means that a fitrim operation after a device replace will trim the
device ranges that have allocated chunks and extents, as we trim every
range for which there is not a range marked in the device's allocation
state tree. It is also important during chunk allocation, since the
device's allocation state is used to determine if a range is already
allocated when allocating a new chunk.
This is trivial to reproduce and the following script triggers the bug:
$ cat reproducer.sh
#!/bin/bash
DEV1="/dev/sdg"
DEV2="/dev/sdh"
DEV3="/dev/sdi"
wipefs -a $DEV1 $DEV2 $DEV3 &> /dev/null
# Create a raid1 test fs on 2 devices.
mkfs.btrfs -f -m raid1 -d raid1 $DEV1 $DEV2 > /dev/null
mount $DEV1 /mnt/btrfs
xfs_io -f -c "pwrite -S 0xab 0 10M" /mnt/btrfs/foo
echo "Starting to replace $DEV1 with $DEV3"
btrfs replace start -B $DEV1 $DEV3 /mnt/btrfs
echo
echo "Running fstrim"
fstrim /mnt/btrfs
echo
echo "Unmounting filesystem"
umount /mnt/btrfs
echo "Mounting filesystem in degraded mode using $DEV3 only"
wipefs -a $DEV1 $DEV2 &> /dev/null
mount -o degraded $DEV3 /mnt/btrfs
if [ $? -ne 0 ]; then
dmesg | tail
echo
echo "Failed to mount in degraded mode"
exit 1
fi
echo
echo "File foo data (expected all bytes = 0xab):"
od -A d -t x1 /mnt/btrfs/foo
umount /mnt/btrfs
When running the reproducer:
$ ./replace-test.sh
wrote 10485760/10485760 bytes at offset 0
10 MiB, 2560 ops; 0.0901 sec (110.877 MiB/sec and 28384.5216 ops/sec)
Starting to replace /dev/sdg with /dev/sdi
Running fstrim
Unmounting filesystem
Mounting filesystem in degraded mode using /dev/sdi only
mount: /mnt/btrfs: wrong fs type, bad option, bad superblock on /dev/sdi, missing codepage or helper program, or other error.
[19581.748641] BTRFS info (device sdg): dev_replace from /dev/sdg (devid 1) to /dev/sdi started
[19581.803842] BTRFS info (device sdg): dev_replace from /dev/sdg (devid 1) to /dev/sdi finished
[19582.208293] BTRFS info (device sdi): allowing degraded mounts
[19582.208298] BTRFS info (device sdi): disk space caching is enabled
[19582.208301] BTRFS info (device sdi): has skinny extents
[19582.212853] BTRFS warning (device sdi): devid 2 uuid 1f731f47-e1bb-4f00-bfbb-9e5a0cb4ba9f is missing
[19582.213904] btree_readpage_end_io_hook: 25839 callbacks suppressed
[19582.213907] BTRFS error (device sdi): bad tree block start, want 30490624 have 0
[19582.214780] BTRFS warning (device sdi): failed to read root (objectid=7): -5
[19582.231576] BTRFS error (device sdi): open_ctree failed
Failed to mount in degraded mode
So fix by setting all allocated ranges in the replace target device when
the replace operation is finishing, when we are holding the chunk mutex
and we can not race with new chunk allocations.
A test case for fstests follows soon.
Fixes: 1c11b63eff2a67 ("btrfs: replace pending/pinned chunks lists with io tree")
CC: stable@vger.kernel.org # 5.2+
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-09-23 15:30:16 +01:00
|
|
|
scrub_ret = btrfs_set_target_alloc_state(src_device, tgt_device);
|
|
|
|
if (scrub_ret)
|
|
|
|
goto error;
|
Btrfs: fix use-after-free in the finishing procedure of the device replace
During device replace testing, we hit a null pointer dereference (it was very
easy to reproduce by running xfstests' btrfs/011 on devices with the virtio
scsi driver). There were two bugs that caused this problem:
- We might allocate new chunks on the replaced device after we updated
the mapping tree. And we forgot to replace the source device in those
mapping of the new chunks.
- We might get the mapping information which including the source device
before the mapping information update. And then submit the bio which was
based on that mapping information after we freed the source device.
For the first bug, we can fix it by doing mapping tree update and source
device remove in the same context of the chunk mutex. The chunk mutex is
used to protect the allocable device list, the above method can avoid
the new chunk allocation, and after we remove the source device, all
the new chunks will be allocated on the new device. So it can fix
the first bug.
For the second bug, we need to make sure all in-flight bios are finished and
no new bios are produced while we are removing the source device. To fix
this problem, we introduced a global @bio_counter; we not only inc/dec
@bio_counter outside of map_blocks, but also inc it before submitting a bio
and dec @bio_counter when ending bios.
Since RAID56 is a little different and device replace doesn't support RAID56
yet, it is not addressed in this patch; comments were added to make sure we
fix it in the future.
Reported-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com>
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
2014-01-30 16:46:55 +08:00
|
|
|
btrfs_dev_replace_update_device_in_mapping_tree(fs_info,
|
|
|
|
src_device,
|
|
|
|
tgt_device);
|
|
|
|
} else {
|
2018-11-20 19:56:16 +08:00
|
|
|
if (scrub_ret != -ECANCELED)
|
|
|
|
btrfs_err_in_rcu(fs_info,
|
2016-06-22 18:54:23 -04:00
|
|
|
"btrfs_scrub_dev(%s, %llu, %s) failed %d",
|
2017-11-28 10:43:10 +08:00
|
|
|
btrfs_dev_name(src_device),
|
2016-06-22 18:54:23 -04:00
|
|
|
src_device->devid,
|
|
|
|
rcu_str_deref(tgt_device->name), scrub_ret);
|
btrfs: fix filesystem corruption after a device replace
We use a device's allocation state tree to track ranges in a device used
for allocated chunks, and we set ranges in this tree when allocating a new
chunk. However after a device replace operation, we were not setting the
allocated ranges in the new device's allocation state tree, so that tree
is empty after a device replace.
This means that a fitrim operation after a device replace will trim the
device ranges that have allocated chunks and extents, as we trim every
range for which there is not a range marked in the device's allocation
state tree. It is also important during chunk allocation, since the
device's allocation state is used to determine if a range is already
allocated when allocating a new chunk.
This is trivial to reproduce and the following script triggers the bug:
$ cat reproducer.sh
#!/bin/bash
DEV1="/dev/sdg"
DEV2="/dev/sdh"
DEV3="/dev/sdi"
wipefs -a $DEV1 $DEV2 $DEV3 &> /dev/null
# Create a raid1 test fs on 2 devices.
mkfs.btrfs -f -m raid1 -d raid1 $DEV1 $DEV2 > /dev/null
mount $DEV1 /mnt/btrfs
xfs_io -f -c "pwrite -S 0xab 0 10M" /mnt/btrfs/foo
echo "Starting to replace $DEV1 with $DEV3"
btrfs replace start -B $DEV1 $DEV3 /mnt/btrfs
echo
echo "Running fstrim"
fstrim /mnt/btrfs
echo
echo "Unmounting filesystem"
umount /mnt/btrfs
echo "Mounting filesystem in degraded mode using $DEV3 only"
wipefs -a $DEV1 $DEV2 &> /dev/null
mount -o degraded $DEV3 /mnt/btrfs
if [ $? -ne 0 ]; then
dmesg | tail
echo
echo "Failed to mount in degraded mode"
exit 1
fi
echo
echo "File foo data (expected all bytes = 0xab):"
od -A d -t x1 /mnt/btrfs/foo
umount /mnt/btrfs
When running the reproducer:
$ ./replace-test.sh
wrote 10485760/10485760 bytes at offset 0
10 MiB, 2560 ops; 0.0901 sec (110.877 MiB/sec and 28384.5216 ops/sec)
Starting to replace /dev/sdg with /dev/sdi
Running fstrim
Unmounting filesystem
Mounting filesystem in degraded mode using /dev/sdi only
mount: /mnt/btrfs: wrong fs type, bad option, bad superblock on /dev/sdi, missing codepage or helper program, or other error.
[19581.748641] BTRFS info (device sdg): dev_replace from /dev/sdg (devid 1) to /dev/sdi started
[19581.803842] BTRFS info (device sdg): dev_replace from /dev/sdg (devid 1) to /dev/sdi finished
[19582.208293] BTRFS info (device sdi): allowing degraded mounts
[19582.208298] BTRFS info (device sdi): disk space caching is enabled
[19582.208301] BTRFS info (device sdi): has skinny extents
[19582.212853] BTRFS warning (device sdi): devid 2 uuid 1f731f47-e1bb-4f00-bfbb-9e5a0cb4ba9f is missing
[19582.213904] btree_readpage_end_io_hook: 25839 callbacks suppressed
[19582.213907] BTRFS error (device sdi): bad tree block start, want 30490624 have 0
[19582.214780] BTRFS warning (device sdi): failed to read root (objectid=7): -5
[19582.231576] BTRFS error (device sdi): open_ctree failed
Failed to mount in degraded mode
So fix by setting all allocated ranges in the replace target device when
the replace operation is finishing, when we are holding the chunk mutex
and we can not race with new chunk allocations.
A test case for fstests follows soon.
Fixes: 1c11b63eff2a67 ("btrfs: replace pending/pinned chunks lists with io tree")
CC: stable@vger.kernel.org # 5.2+
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-09-23 15:30:16 +01:00
|
|
|
error:
|
2018-09-07 16:11:23 +02:00
|
|
|
up_write(&dev_replace->rwsem);
|
2016-06-22 18:54:23 -04:00
|
|
|
mutex_unlock(&fs_info->chunk_mutex);
|
|
|
|
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
|
btrfs: Wait for in-flight bios before freeing target device for raid56
When raid56 dev-replace is cancelled by running scrub, we will free the
target device without waiting for in-flight bios, causing the following
NULL pointer dereference or general protection fault.
BUG: unable to handle kernel NULL pointer dereference at 00000000000005e0
IP: generic_make_request_checks+0x4d/0x610
CPU: 1 PID: 11676 Comm: kworker/u4:14 Tainted: G O 4.11.0-rc2 #72
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.10.2-20170228_101828-anatol 04/01/2014
Workqueue: btrfs-endio-raid56 btrfs_endio_raid56_helper [btrfs]
task: ffff88002875b4c0 task.stack: ffffc90001334000
RIP: 0010:generic_make_request_checks+0x4d/0x610
Call Trace:
? generic_make_request+0xc7/0x360
generic_make_request+0x24/0x360
? generic_make_request+0xc7/0x360
submit_bio+0x64/0x120
? page_in_rbio+0x4d/0x80 [btrfs]
? rbio_orig_end_io+0x80/0x80 [btrfs]
finish_rmw+0x3f4/0x540 [btrfs]
validate_rbio_for_rmw+0x36/0x40 [btrfs]
raid_rmw_end_io+0x7a/0x90 [btrfs]
bio_endio+0x56/0x60
end_workqueue_fn+0x3c/0x40 [btrfs]
btrfs_scrubparity_helper+0xef/0x620 [btrfs]
btrfs_endio_raid56_helper+0xe/0x10 [btrfs]
process_one_work+0x2af/0x720
? process_one_work+0x22b/0x720
worker_thread+0x4b/0x4f0
kthread+0x10f/0x150
? process_one_work+0x720/0x720
? kthread_create_on_node+0x40/0x40
ret_from_fork+0x2e/0x40
RIP: generic_make_request_checks+0x4d/0x610 RSP: ffffc90001337bb8
In btrfs_dev_replace_finishing(), we will call
btrfs_rm_dev_replace_blocked() to wait bios before destroying the target
device when scrub is finished normally.
However when dev-replace is aborted, either due to error or cancelled by
scrub, we didn't wait for bios, this can lead to use-after-free if there
are bios holding the target device.
Furthermore, for raid56 scrub, at least 2 places are calling
btrfs_map_sblock() without protection of bio_counter, leading to the
problem.
This patch fixes the problem:
1) Wait for bio_counter before freeing target device when canceling
replace
2) When calling btrfs_map_sblock() for raid56, use bio_counter to
protect the call.
Cc: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2017-03-29 09:33:21 +08:00
|
|
|
btrfs_rm_dev_replace_blocked(fs_info);
|
2012-11-05 17:33:06 +01:00
|
|
|
if (tgt_device)
|
2018-07-20 19:37:51 +03:00
|
|
|
btrfs_destroy_dev_replace_tgtdev(tgt_device);
|
btrfs: Wait for in-flight bios before freeing target device for raid56
When raid56 dev-replace is cancelled by running scrub, we will free the
target device without waiting for in-flight bios, causing the following
NULL pointer dereference or general protection fault.
BUG: unable to handle kernel NULL pointer dereference at 00000000000005e0
IP: generic_make_request_checks+0x4d/0x610
CPU: 1 PID: 11676 Comm: kworker/u4:14 Tainted: G O 4.11.0-rc2 #72
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.10.2-20170228_101828-anatol 04/01/2014
Workqueue: btrfs-endio-raid56 btrfs_endio_raid56_helper [btrfs]
task: ffff88002875b4c0 task.stack: ffffc90001334000
RIP: 0010:generic_make_request_checks+0x4d/0x610
Call Trace:
? generic_make_request+0xc7/0x360
generic_make_request+0x24/0x360
? generic_make_request+0xc7/0x360
submit_bio+0x64/0x120
? page_in_rbio+0x4d/0x80 [btrfs]
? rbio_orig_end_io+0x80/0x80 [btrfs]
finish_rmw+0x3f4/0x540 [btrfs]
validate_rbio_for_rmw+0x36/0x40 [btrfs]
raid_rmw_end_io+0x7a/0x90 [btrfs]
bio_endio+0x56/0x60
end_workqueue_fn+0x3c/0x40 [btrfs]
btrfs_scrubparity_helper+0xef/0x620 [btrfs]
btrfs_endio_raid56_helper+0xe/0x10 [btrfs]
process_one_work+0x2af/0x720
? process_one_work+0x22b/0x720
worker_thread+0x4b/0x4f0
kthread+0x10f/0x150
? process_one_work+0x720/0x720
? kthread_create_on_node+0x40/0x40
ret_from_fork+0x2e/0x40
RIP: generic_make_request_checks+0x4d/0x610 RSP: ffffc90001337bb8
In btrfs_dev_replace_finishing(), we will call
btrfs_rm_dev_replace_blocked() to wait bios before destroying the target
device when scrub is finished normally.
However when dev-replace is aborted, either due to error or cancelled by
scrub, we didn't wait for bios, this can lead to use-after-free if there
are bios holding the target device.
Furthermore, for raid56 scrub, at least 2 places are calling
btrfs_map_sblock() without protection of bio_counter, leading to the
problem.
This patch fixes the problem:
1) Wait for bio_counter before freeing target device when canceling
replace
2) When calling btrfs_map_sblock() for raid56, use bio_counter to
protect the call.
Cc: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2017-03-29 09:33:21 +08:00
|
|
|
btrfs_rm_dev_replace_unblocked(fs_info);
|
2012-11-05 17:33:06 +01:00
|
|
|
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
|
|
|
|
|
2014-10-13 12:42:12 +08:00
|
|
|
return scrub_ret;
|
2012-11-05 17:33:06 +01:00
|
|
|
}
|
|
|
|
|
2016-06-22 18:54:23 -04:00
|
|
|
btrfs_info_in_rcu(fs_info,
|
|
|
|
"dev_replace from %s (devid %llu) to %s finished",
|
2017-11-28 10:43:10 +08:00
|
|
|
btrfs_dev_name(src_device),
|
2016-06-22 18:54:23 -04:00
|
|
|
src_device->devid,
|
|
|
|
rcu_str_deref(tgt_device->name));
|
2017-12-04 12:54:55 +08:00
|
|
|
clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &tgt_device->dev_state);
|
2012-11-05 17:33:06 +01:00
|
|
|
tgt_device->devid = src_device->devid;
|
|
|
|
src_device->devid = BTRFS_DEV_REPLACE_DEVID;
|
|
|
|
memcpy(uuid_tmp, tgt_device->uuid, sizeof(uuid_tmp));
|
|
|
|
memcpy(tgt_device->uuid, src_device->uuid, sizeof(tgt_device->uuid));
|
|
|
|
memcpy(src_device->uuid, uuid_tmp, sizeof(src_device->uuid));
|
2014-09-03 21:35:38 +08:00
|
|
|
btrfs_device_set_total_bytes(tgt_device, src_device->total_bytes);
|
|
|
|
btrfs_device_set_disk_total_bytes(tgt_device,
|
|
|
|
src_device->disk_total_bytes);
|
|
|
|
btrfs_device_set_bytes_used(tgt_device, src_device->bytes_used);
|
2014-09-03 21:35:34 +08:00
|
|
|
tgt_device->commit_bytes_used = src_device->bytes_used;
|
2016-05-03 17:44:43 +08:00
|
|
|
|
2018-07-20 19:37:50 +03:00
|
|
|
btrfs_assign_next_active_device(src_device, tgt_device);
|
2016-05-03 17:44:43 +08:00
|
|
|
|
2012-11-05 17:33:06 +01:00
|
|
|
list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
|
2014-09-03 21:35:44 +08:00
|
|
|
fs_info->fs_devices->rw_devices++;
|
2012-11-05 17:33:06 +01:00
|
|
|
|
2018-09-07 16:11:23 +02:00
|
|
|
up_write(&dev_replace->rwsem);
|
Btrfs: fix use-after-free in the finishing procedure of the device replace
During device replace testing, we hit a null pointer dereference (it was very
easy to reproduce by running xfstests' btrfs/011 on devices with the virtio
scsi driver). There were two bugs that caused this problem:
- We might allocate new chunks on the replaced device after we updated
the mapping tree. And we forgot to replace the source device in those
mapping of the new chunks.
- We might get the mapping information which including the source device
before the mapping information update. And then submit the bio which was
based on that mapping information after we freed the source device.
For the first bug, we can fix it by doing mapping tree update and source
device remove in the same context of the chunk mutex. The chunk mutex is
used to protect the allocable device list, the above method can avoid
the new chunk allocation, and after we remove the source device, all
the new chunks will be allocated on the new device. So it can fix
the first bug.
For the second bug, we need to make sure all in-flight bios are finished and
no new bios are produced while we are removing the source device. To fix
this problem, we introduced a global @bio_counter; we not only inc/dec
@bio_counter outside of map_blocks, but also inc it before submitting a bio
and dec @bio_counter when ending bios.
Since RAID56 is a little different and device replace doesn't support RAID56
yet, it is not addressed in this patch; comments were added to make sure we
fix it in the future.
Reported-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com>
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
2014-01-30 16:46:55 +08:00
|
|
|
btrfs_rm_dev_replace_blocked(fs_info);
|
|
|
|
|
2018-07-20 19:37:48 +03:00
|
|
|
btrfs_rm_dev_replace_remove_srcdev(src_device);
|
2013-10-02 20:41:01 +03:00
|
|
|
|
Btrfs: fix use-after-free in the finishing procedure of the device replace
During device replace test, we hit a null pointer dereference (it was very easy
to reproduce it by running xfstests' btrfs/011 on the devices with the virtio
scsi driver). There were two bugs that caused this problem:
- We might allocate new chunks on the replaced device after we updated
the mapping tree. And we forgot to replace the source device in the
mappings of those new chunks.
- We might get mapping information that includes the source device
before the mapping information update, and then submit a bio that was
based on that mapping information after we freed the source device.
For the first bug, we can fix it by doing the mapping tree update and the
source device removal in the same context of the chunk mutex. The chunk mutex
is used to protect the allocable device list, so the above method can avoid
the new chunk allocation, and after we remove the source device, all
the new chunks will be allocated on the new device. So it can fix
the first bug.
For the second bug, we need to make sure all in-flight bios are finished and
no new bios are produced while we are removing the source device. To fix
this problem, we introduced a global @bio_counter; we not only inc/dec
@bio_counter outside of map_blocks, but also inc it before submitting a bio
and dec @bio_counter when ending bios.
Since Raid56 is a little different and device replace doesn't support raid56
yet, it is not addressed in the patch and I added comments to make sure we will
fix it in the future.
Reported-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com>
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
2014-01-30 16:46:55 +08:00
|
|
|
btrfs_rm_dev_replace_unblocked(fs_info);
|
|
|
|
|
2018-07-31 16:20:21 +09:00
|
|
|
/*
|
|
|
|
* Increment dev_stats_ccnt so that btrfs_run_dev_stats() will
|
|
|
|
* update on-disk dev stats value during commit transaction
|
|
|
|
*/
|
|
|
|
atomic_inc(&tgt_device->dev_stats_ccnt);
|
|
|
|
|
2012-11-05 17:33:06 +01:00
|
|
|
/*
|
|
|
|
* this is again a consistent state where no dev_replace procedure
|
|
|
|
* is running, the target device is part of the filesystem, the
|
|
|
|
* source device is not part of the filesystem anymore and its 1st
|
|
|
|
* superblock is scratched out so that it is no longer marked to
|
|
|
|
* belong to this filesystem.
|
|
|
|
*/
|
2016-06-22 18:54:23 -04:00
|
|
|
mutex_unlock(&fs_info->chunk_mutex);
|
|
|
|
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
|
2012-11-05 17:33:06 +01:00
|
|
|
|
2014-10-30 16:52:31 +08:00
|
|
|
/* replace the sysfs entry */
|
2020-02-12 17:28:13 +08:00
|
|
|
btrfs_sysfs_remove_devices_dir(fs_info->fs_devices, src_device);
|
2020-01-06 19:38:31 +08:00
|
|
|
btrfs_sysfs_update_devid(tgt_device);
|
2020-08-20 11:18:26 -04:00
|
|
|
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &src_device->dev_state))
|
|
|
|
btrfs_scratch_superblocks(fs_info, src_device->bdev,
|
|
|
|
src_device->name->str);
|
2014-10-30 16:52:31 +08:00
|
|
|
|
2012-11-05 17:33:06 +01:00
|
|
|
/* write back the superblocks */
|
|
|
|
trans = btrfs_start_transaction(root, 0);
|
|
|
|
if (!IS_ERR(trans))
|
2016-09-09 21:39:03 -04:00
|
|
|
btrfs_commit_transaction(trans);
|
2012-11-05 17:33:06 +01:00
|
|
|
|
|
|
|
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
|
|
|
|
|
btrfs: move btrfs_rm_dev_replace_free_srcdev outside of all locks
When closing and freeing the source device we could end up doing our
final blkdev_put() on the bdev, which will grab the bd_mutex. As such
we want to be holding as few locks as possible, so move this call
outside of the dev_replace->lock_finishing_cancel_unmount lock. Since
we're modifying the fs_devices we need to make sure we're holding the
uuid_mutex here, so take that as well.
There's a report from syzbot probably hitting one of the cases where
the bd_mutex and device_list_mutex are taken in the wrong order, however
it's not with device replace, like this patch fixes. As there's no
reproducer available so far, we can't verify the fix.
https://lore.kernel.org/lkml/000000000000fc04d105afcf86d7@google.com/
dashboard link: https://syzkaller.appspot.com/bug?extid=84a0634dc5d21d488419
WARNING: possible circular locking dependency detected
5.9.0-rc5-syzkaller #0 Not tainted
------------------------------------------------------
syz-executor.0/6878 is trying to acquire lock:
ffff88804c17d780 (&bdev->bd_mutex){+.+.}-{3:3}, at: blkdev_put+0x30/0x520 fs/block_dev.c:1804
but task is already holding lock:
ffff8880908cfce0 (&fs_devs->device_list_mutex){+.+.}-{3:3}, at: close_fs_devices.part.0+0x2e/0x800 fs/btrfs/volumes.c:1159
which lock already depends on the new lock.
the existing dependency chain (in reverse order) is:
-> #4 (&fs_devs->device_list_mutex){+.+.}-{3:3}:
__mutex_lock_common kernel/locking/mutex.c:956 [inline]
__mutex_lock+0x134/0x10e0 kernel/locking/mutex.c:1103
btrfs_finish_chunk_alloc+0x281/0xf90 fs/btrfs/volumes.c:5255
btrfs_create_pending_block_groups+0x2f3/0x700 fs/btrfs/block-group.c:2109
__btrfs_end_transaction+0xf5/0x690 fs/btrfs/transaction.c:916
find_free_extent_update_loop fs/btrfs/extent-tree.c:3807 [inline]
find_free_extent+0x23b7/0x2e60 fs/btrfs/extent-tree.c:4127
btrfs_reserve_extent+0x166/0x460 fs/btrfs/extent-tree.c:4206
cow_file_range+0x3de/0x9b0 fs/btrfs/inode.c:1063
btrfs_run_delalloc_range+0x2cf/0x1410 fs/btrfs/inode.c:1838
writepage_delalloc+0x150/0x460 fs/btrfs/extent_io.c:3439
__extent_writepage+0x441/0xd00 fs/btrfs/extent_io.c:3653
extent_write_cache_pages.constprop.0+0x69d/0x1040 fs/btrfs/extent_io.c:4249
extent_writepages+0xcd/0x2b0 fs/btrfs/extent_io.c:4370
do_writepages+0xec/0x290 mm/page-writeback.c:2352
__writeback_single_inode+0x125/0x1400 fs/fs-writeback.c:1461
writeback_sb_inodes+0x53d/0xf40 fs/fs-writeback.c:1721
wb_writeback+0x2ad/0xd40 fs/fs-writeback.c:1894
wb_do_writeback fs/fs-writeback.c:2039 [inline]
wb_workfn+0x2dc/0x13e0 fs/fs-writeback.c:2080
process_one_work+0x94c/0x1670 kernel/workqueue.c:2269
worker_thread+0x64c/0x1120 kernel/workqueue.c:2415
kthread+0x3b5/0x4a0 kernel/kthread.c:292
ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:294
-> #3 (sb_internal#2){.+.+}-{0:0}:
percpu_down_read include/linux/percpu-rwsem.h:51 [inline]
__sb_start_write+0x234/0x470 fs/super.c:1672
sb_start_intwrite include/linux/fs.h:1690 [inline]
start_transaction+0xbe7/0x1170 fs/btrfs/transaction.c:624
find_free_extent_update_loop fs/btrfs/extent-tree.c:3789 [inline]
find_free_extent+0x25e1/0x2e60 fs/btrfs/extent-tree.c:4127
btrfs_reserve_extent+0x166/0x460 fs/btrfs/extent-tree.c:4206
cow_file_range+0x3de/0x9b0 fs/btrfs/inode.c:1063
btrfs_run_delalloc_range+0x2cf/0x1410 fs/btrfs/inode.c:1838
writepage_delalloc+0x150/0x460 fs/btrfs/extent_io.c:3439
__extent_writepage+0x441/0xd00 fs/btrfs/extent_io.c:3653
extent_write_cache_pages.constprop.0+0x69d/0x1040 fs/btrfs/extent_io.c:4249
extent_writepages+0xcd/0x2b0 fs/btrfs/extent_io.c:4370
do_writepages+0xec/0x290 mm/page-writeback.c:2352
__writeback_single_inode+0x125/0x1400 fs/fs-writeback.c:1461
writeback_sb_inodes+0x53d/0xf40 fs/fs-writeback.c:1721
wb_writeback+0x2ad/0xd40 fs/fs-writeback.c:1894
wb_do_writeback fs/fs-writeback.c:2039 [inline]
wb_workfn+0x2dc/0x13e0 fs/fs-writeback.c:2080
process_one_work+0x94c/0x1670 kernel/workqueue.c:2269
worker_thread+0x64c/0x1120 kernel/workqueue.c:2415
kthread+0x3b5/0x4a0 kernel/kthread.c:292
ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:294
-> #2 ((work_completion)(&(&wb->dwork)->work)){+.+.}-{0:0}:
__flush_work+0x60e/0xac0 kernel/workqueue.c:3041
wb_shutdown+0x180/0x220 mm/backing-dev.c:355
bdi_unregister+0x174/0x590 mm/backing-dev.c:872
del_gendisk+0x820/0xa10 block/genhd.c:933
loop_remove drivers/block/loop.c:2192 [inline]
loop_control_ioctl drivers/block/loop.c:2291 [inline]
loop_control_ioctl+0x3b1/0x480 drivers/block/loop.c:2257
vfs_ioctl fs/ioctl.c:48 [inline]
__do_sys_ioctl fs/ioctl.c:753 [inline]
__se_sys_ioctl fs/ioctl.c:739 [inline]
__x64_sys_ioctl+0x193/0x200 fs/ioctl.c:739
do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46
entry_SYSCALL_64_after_hwframe+0x44/0xa9
-> #1 (loop_ctl_mutex){+.+.}-{3:3}:
__mutex_lock_common kernel/locking/mutex.c:956 [inline]
__mutex_lock+0x134/0x10e0 kernel/locking/mutex.c:1103
lo_open+0x19/0xd0 drivers/block/loop.c:1893
__blkdev_get+0x759/0x1aa0 fs/block_dev.c:1507
blkdev_get fs/block_dev.c:1639 [inline]
blkdev_open+0x227/0x300 fs/block_dev.c:1753
do_dentry_open+0x4b9/0x11b0 fs/open.c:817
do_open fs/namei.c:3251 [inline]
path_openat+0x1b9a/0x2730 fs/namei.c:3368
do_filp_open+0x17e/0x3c0 fs/namei.c:3395
do_sys_openat2+0x16d/0x420 fs/open.c:1168
do_sys_open fs/open.c:1184 [inline]
__do_sys_open fs/open.c:1192 [inline]
__se_sys_open fs/open.c:1188 [inline]
__x64_sys_open+0x119/0x1c0 fs/open.c:1188
do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46
entry_SYSCALL_64_after_hwframe+0x44/0xa9
-> #0 (&bdev->bd_mutex){+.+.}-{3:3}:
check_prev_add kernel/locking/lockdep.c:2496 [inline]
check_prevs_add kernel/locking/lockdep.c:2601 [inline]
validate_chain kernel/locking/lockdep.c:3218 [inline]
__lock_acquire+0x2a96/0x5780 kernel/locking/lockdep.c:4426
lock_acquire+0x1f3/0xae0 kernel/locking/lockdep.c:5006
__mutex_lock_common kernel/locking/mutex.c:956 [inline]
__mutex_lock+0x134/0x10e0 kernel/locking/mutex.c:1103
blkdev_put+0x30/0x520 fs/block_dev.c:1804
btrfs_close_bdev fs/btrfs/volumes.c:1117 [inline]
btrfs_close_bdev fs/btrfs/volumes.c:1107 [inline]
btrfs_close_one_device fs/btrfs/volumes.c:1133 [inline]
close_fs_devices.part.0+0x1a4/0x800 fs/btrfs/volumes.c:1161
close_fs_devices fs/btrfs/volumes.c:1193 [inline]
btrfs_close_devices+0x95/0x1f0 fs/btrfs/volumes.c:1179
close_ctree+0x688/0x6cb fs/btrfs/disk-io.c:4149
generic_shutdown_super+0x144/0x370 fs/super.c:464
kill_anon_super+0x36/0x60 fs/super.c:1108
btrfs_kill_super+0x38/0x50 fs/btrfs/super.c:2265
deactivate_locked_super+0x94/0x160 fs/super.c:335
deactivate_super+0xad/0xd0 fs/super.c:366
cleanup_mnt+0x3a3/0x530 fs/namespace.c:1118
task_work_run+0xdd/0x190 kernel/task_work.c:141
tracehook_notify_resume include/linux/tracehook.h:188 [inline]
exit_to_user_mode_loop kernel/entry/common.c:163 [inline]
exit_to_user_mode_prepare+0x1e1/0x200 kernel/entry/common.c:190
syscall_exit_to_user_mode+0x7e/0x2e0 kernel/entry/common.c:265
entry_SYSCALL_64_after_hwframe+0x44/0xa9
other info that might help us debug this:
Chain exists of:
&bdev->bd_mutex --> sb_internal#2 --> &fs_devs->device_list_mutex
Possible unsafe locking scenario:
CPU0 CPU1
---- ----
lock(&fs_devs->device_list_mutex);
lock(sb_internal#2);
lock(&fs_devs->device_list_mutex);
lock(&bdev->bd_mutex);
*** DEADLOCK ***
3 locks held by syz-executor.0/6878:
#0: ffff88809070c0e0 (&type->s_umount_key#70){++++}-{3:3}, at: deactivate_super+0xa5/0xd0 fs/super.c:365
#1: ffffffff8a5b37a8 (uuid_mutex){+.+.}-{3:3}, at: btrfs_close_devices+0x23/0x1f0 fs/btrfs/volumes.c:1178
#2: ffff8880908cfce0 (&fs_devs->device_list_mutex){+.+.}-{3:3}, at: close_fs_devices.part.0+0x2e/0x800 fs/btrfs/volumes.c:1159
stack backtrace:
CPU: 0 PID: 6878 Comm: syz-executor.0 Not tainted 5.9.0-rc5-syzkaller #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
__dump_stack lib/dump_stack.c:77 [inline]
dump_stack+0x198/0x1fd lib/dump_stack.c:118
check_noncircular+0x324/0x3e0 kernel/locking/lockdep.c:1827
check_prev_add kernel/locking/lockdep.c:2496 [inline]
check_prevs_add kernel/locking/lockdep.c:2601 [inline]
validate_chain kernel/locking/lockdep.c:3218 [inline]
__lock_acquire+0x2a96/0x5780 kernel/locking/lockdep.c:4426
lock_acquire+0x1f3/0xae0 kernel/locking/lockdep.c:5006
__mutex_lock_common kernel/locking/mutex.c:956 [inline]
__mutex_lock+0x134/0x10e0 kernel/locking/mutex.c:1103
blkdev_put+0x30/0x520 fs/block_dev.c:1804
btrfs_close_bdev fs/btrfs/volumes.c:1117 [inline]
btrfs_close_bdev fs/btrfs/volumes.c:1107 [inline]
btrfs_close_one_device fs/btrfs/volumes.c:1133 [inline]
close_fs_devices.part.0+0x1a4/0x800 fs/btrfs/volumes.c:1161
close_fs_devices fs/btrfs/volumes.c:1193 [inline]
btrfs_close_devices+0x95/0x1f0 fs/btrfs/volumes.c:1179
close_ctree+0x688/0x6cb fs/btrfs/disk-io.c:4149
generic_shutdown_super+0x144/0x370 fs/super.c:464
kill_anon_super+0x36/0x60 fs/super.c:1108
btrfs_kill_super+0x38/0x50 fs/btrfs/super.c:2265
deactivate_locked_super+0x94/0x160 fs/super.c:335
deactivate_super+0xad/0xd0 fs/super.c:366
cleanup_mnt+0x3a3/0x530 fs/namespace.c:1118
task_work_run+0xdd/0x190 kernel/task_work.c:141
tracehook_notify_resume include/linux/tracehook.h:188 [inline]
exit_to_user_mode_loop kernel/entry/common.c:163 [inline]
exit_to_user_mode_prepare+0x1e1/0x200 kernel/entry/common.c:190
syscall_exit_to_user_mode+0x7e/0x2e0 kernel/entry/common.c:265
entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x460027
RSP: 002b:00007fff59216328 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6
RAX: 0000000000000000 RBX: 0000000000076035 RCX: 0000000000460027
RDX: 0000000000403188 RSI: 0000000000000002 RDI: 00007fff592163d0
RBP: 0000000000000333 R08: 0000000000000000 R09: 000000000000000b
R10: 0000000000000005 R11: 0000000000000246 R12: 00007fff59217460
R13: 0000000002df2a60 R14: 0000000000000000 R15: 00007fff59217460
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
[ add syzbot reference ]
Signed-off-by: David Sterba <dsterba@suse.com>
2020-08-20 11:18:27 -04:00
|
|
|
btrfs_rm_dev_replace_free_srcdev(src_device);
|
|
|
|
|
2012-11-05 17:33:06 +01:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void btrfs_dev_replace_update_device_in_mapping_tree(
|
|
|
|
struct btrfs_fs_info *fs_info,
|
|
|
|
struct btrfs_device *srcdev,
|
|
|
|
struct btrfs_device *tgtdev)
|
|
|
|
{
|
2019-05-17 11:43:17 +02:00
|
|
|
struct extent_map_tree *em_tree = &fs_info->mapping_tree;
|
2012-11-05 17:33:06 +01:00
|
|
|
struct extent_map *em;
|
|
|
|
struct map_lookup *map;
|
|
|
|
u64 start = 0;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
write_lock(&em_tree->lock);
|
|
|
|
do {
|
|
|
|
em = lookup_extent_mapping(em_tree, start, (u64)-1);
|
|
|
|
if (!em)
|
|
|
|
break;
|
2015-06-03 10:55:48 -04:00
|
|
|
map = em->map_lookup;
|
2012-11-05 17:33:06 +01:00
|
|
|
for (i = 0; i < map->num_stripes; i++)
|
|
|
|
if (srcdev == map->stripes[i].dev)
|
|
|
|
map->stripes[i].dev = tgtdev;
|
|
|
|
start = em->start + em->len;
|
|
|
|
free_extent_map(em);
|
|
|
|
} while (start);
|
|
|
|
write_unlock(&em_tree->lock);
|
|
|
|
}
|
|
|
|
|
2017-06-14 16:24:56 +02:00
|
|
|
/*
|
|
|
|
* Read progress of device replace status according to the state and last
|
|
|
|
* stored position. The value format is the same as for
|
|
|
|
* btrfs_dev_replace::progress_1000
|
|
|
|
*/
|
|
|
|
static u64 btrfs_dev_replace_progress(struct btrfs_fs_info *fs_info)
|
|
|
|
{
|
|
|
|
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
|
|
|
|
u64 ret = 0;
|
|
|
|
|
|
|
|
switch (dev_replace->replace_state) {
|
|
|
|
case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
|
|
|
|
case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
|
|
|
|
ret = 0;
|
|
|
|
break;
|
|
|
|
case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
|
|
|
|
ret = 1000;
|
|
|
|
break;
|
|
|
|
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
|
|
|
|
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
|
|
|
|
ret = div64_u64(dev_replace->cursor_left,
|
|
|
|
div_u64(btrfs_device_get_total_bytes(
|
|
|
|
dev_replace->srcdev), 1000));
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2012-11-05 17:33:06 +01:00
|
|
|
void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
|
|
|
|
struct btrfs_ioctl_dev_replace_args *args)
|
|
|
|
{
|
|
|
|
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
|
|
|
|
|
2018-09-07 16:11:23 +02:00
|
|
|
down_read(&dev_replace->rwsem);
|
2012-11-05 17:33:06 +01:00
|
|
|
/* even if !dev_replace_is_valid, the values are good enough for
|
|
|
|
* the replace_status ioctl */
|
|
|
|
args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
|
|
|
|
args->status.replace_state = dev_replace->replace_state;
|
|
|
|
args->status.time_started = dev_replace->time_started;
|
|
|
|
args->status.time_stopped = dev_replace->time_stopped;
|
|
|
|
args->status.num_write_errors =
|
|
|
|
atomic64_read(&dev_replace->num_write_errors);
|
|
|
|
args->status.num_uncorrectable_read_errors =
|
|
|
|
atomic64_read(&dev_replace->num_uncorrectable_read_errors);
|
2017-06-14 16:24:56 +02:00
|
|
|
args->status.progress_1000 = btrfs_dev_replace_progress(fs_info);
|
2018-09-07 16:11:23 +02:00
|
|
|
up_read(&dev_replace->rwsem);
|
2012-11-05 17:33:06 +01:00
|
|
|
}
|
|
|
|
|
2018-02-12 23:33:31 +08:00
|
|
|
int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
|
2012-11-05 17:33:06 +01:00
|
|
|
{
|
|
|
|
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
|
|
|
|
struct btrfs_device *tgt_device = NULL;
|
2018-02-13 11:53:43 +08:00
|
|
|
struct btrfs_device *src_device = NULL;
|
2012-11-05 17:33:06 +01:00
|
|
|
struct btrfs_trans_handle *trans;
|
|
|
|
struct btrfs_root *root = fs_info->tree_root;
|
2018-02-12 23:33:31 +08:00
|
|
|
int result;
|
2012-11-05 17:33:06 +01:00
|
|
|
int ret;
|
|
|
|
|
2017-07-17 08:45:34 +01:00
|
|
|
if (sb_rdonly(fs_info->sb))
|
2013-10-10 20:40:21 +03:00
|
|
|
return -EROFS;
|
|
|
|
|
2012-11-05 17:33:06 +01:00
|
|
|
mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
|
2018-09-07 16:11:23 +02:00
|
|
|
down_write(&dev_replace->rwsem);
|
2012-11-05 17:33:06 +01:00
|
|
|
switch (dev_replace->replace_state) {
|
|
|
|
case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
|
|
|
|
case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
|
|
|
|
case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
|
|
|
|
result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
|
2018-09-07 16:11:23 +02:00
|
|
|
up_write(&dev_replace->rwsem);
|
2018-11-14 13:50:26 +08:00
|
|
|
break;
|
2012-11-05 17:33:06 +01:00
|
|
|
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
|
2018-11-14 13:50:26 +08:00
|
|
|
tgt_device = dev_replace->tgtdev;
|
|
|
|
src_device = dev_replace->srcdev;
|
2018-09-07 16:11:23 +02:00
|
|
|
up_write(&dev_replace->rwsem);
|
2018-11-11 22:22:20 +08:00
|
|
|
ret = btrfs_scrub_cancel(fs_info);
|
|
|
|
if (ret < 0) {
|
|
|
|
result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
|
|
|
|
} else {
|
|
|
|
result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
|
|
|
|
/*
|
|
|
|
* btrfs_dev_replace_finishing() will handle the
|
|
|
|
* cleanup part
|
|
|
|
*/
|
|
|
|
btrfs_info_in_rcu(fs_info,
|
|
|
|
"dev_replace from %s (devid %llu) to %s canceled",
|
|
|
|
btrfs_dev_name(src_device), src_device->devid,
|
|
|
|
btrfs_dev_name(tgt_device));
|
|
|
|
}
|
2018-11-14 13:50:26 +08:00
|
|
|
break;
|
2012-11-05 17:33:06 +01:00
|
|
|
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
|
2018-11-14 13:50:26 +08:00
|
|
|
/*
|
|
|
|
* Scrub doing the replace isn't running so we need to do the
|
|
|
|
* cleanup step of btrfs_dev_replace_finishing() here
|
|
|
|
*/
|
2012-11-05 17:33:06 +01:00
|
|
|
result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
|
|
|
|
tgt_device = dev_replace->tgtdev;
|
2018-02-13 11:53:43 +08:00
|
|
|
src_device = dev_replace->srcdev;
|
2012-11-05 17:33:06 +01:00
|
|
|
dev_replace->tgtdev = NULL;
|
|
|
|
dev_replace->srcdev = NULL;
|
2018-11-14 13:50:26 +08:00
|
|
|
dev_replace->replace_state =
|
|
|
|
BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED;
|
|
|
|
dev_replace->time_stopped = ktime_get_real_seconds();
|
|
|
|
dev_replace->item_needs_writeback = 1;
|
2012-11-05 17:33:06 +01:00
|
|
|
|
2018-09-07 16:11:23 +02:00
|
|
|
up_write(&dev_replace->rwsem);
|
2018-02-13 11:53:43 +08:00
|
|
|
|
2018-11-11 22:22:21 +08:00
|
|
|
/* Scrub for replace must not be running in suspended state */
|
|
|
|
ret = btrfs_scrub_cancel(fs_info);
|
|
|
|
ASSERT(ret != -ENOTCONN);
|
2018-11-14 13:50:26 +08:00
|
|
|
|
|
|
|
trans = btrfs_start_transaction(root, 0);
|
|
|
|
if (IS_ERR(trans)) {
|
|
|
|
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
|
|
|
|
return PTR_ERR(trans);
|
|
|
|
}
|
|
|
|
ret = btrfs_commit_transaction(trans);
|
|
|
|
WARN_ON(ret);
|
2018-02-13 11:53:43 +08:00
|
|
|
|
2018-11-14 13:50:26 +08:00
|
|
|
btrfs_info_in_rcu(fs_info,
|
|
|
|
"suspended dev_replace from %s (devid %llu) to %s canceled",
|
|
|
|
btrfs_dev_name(src_device), src_device->devid,
|
|
|
|
btrfs_dev_name(tgt_device));
|
|
|
|
|
|
|
|
if (tgt_device)
|
|
|
|
btrfs_destroy_dev_replace_tgtdev(tgt_device);
|
|
|
|
break;
|
|
|
|
default:
|
2019-02-11 21:32:10 +03:00
|
|
|
up_write(&dev_replace->rwsem);
|
2018-11-14 13:50:26 +08:00
|
|
|
result = -EINVAL;
|
|
|
|
}
|
2012-11-05 17:33:06 +01:00
|
|
|
|
|
|
|
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
|
|
|
void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info)
|
|
|
|
{
|
|
|
|
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
|
|
|
|
|
|
|
|
mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
|
2018-09-07 16:11:23 +02:00
|
|
|
down_write(&dev_replace->rwsem);
|
|
|
|
|
2012-11-05 17:33:06 +01:00
|
|
|
switch (dev_replace->replace_state) {
|
|
|
|
case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
|
|
|
|
case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
|
|
|
|
case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
|
|
|
|
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
|
|
|
|
break;
|
|
|
|
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
|
|
|
|
dev_replace->replace_state =
|
|
|
|
BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
|
2018-06-12 17:18:25 +05:30
|
|
|
dev_replace->time_stopped = ktime_get_real_seconds();
|
2012-11-05 17:33:06 +01:00
|
|
|
dev_replace->item_needs_writeback = 1;
|
2013-12-20 11:37:06 -05:00
|
|
|
btrfs_info(fs_info, "suspending dev_replace for unmount");
|
2012-11-05 17:33:06 +01:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2018-09-07 16:11:23 +02:00
|
|
|
up_write(&dev_replace->rwsem);
|
2012-11-05 17:33:06 +01:00
|
|
|
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* resume dev_replace procedure that was interrupted by unmount */
|
|
|
|
int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
|
|
|
|
{
|
|
|
|
struct task_struct *task;
|
|
|
|
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
|
|
|
|
|
2018-09-07 16:11:23 +02:00
|
|
|
down_write(&dev_replace->rwsem);
|
|
|
|
|
2012-11-05 17:33:06 +01:00
|
|
|
switch (dev_replace->replace_state) {
|
|
|
|
case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
|
|
|
|
case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
|
|
|
|
case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
|
2018-09-07 16:11:23 +02:00
|
|
|
up_write(&dev_replace->rwsem);
|
2012-11-05 17:33:06 +01:00
|
|
|
return 0;
|
|
|
|
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
|
|
|
|
break;
|
|
|
|
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
|
|
|
|
dev_replace->replace_state =
|
|
|
|
BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (!dev_replace->tgtdev || !dev_replace->tgtdev->bdev) {
|
2013-12-20 11:37:06 -05:00
|
|
|
btrfs_info(fs_info,
|
2016-09-20 10:05:00 -04:00
|
|
|
"cannot continue dev_replace, tgtdev is missing");
|
|
|
|
btrfs_info(fs_info,
|
|
|
|
"you may cancel the operation after 'mount -o degraded'");
|
2018-11-11 22:22:17 +08:00
|
|
|
dev_replace->replace_state =
|
|
|
|
BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
|
2018-09-07 16:11:23 +02:00
|
|
|
up_write(&dev_replace->rwsem);
|
2012-11-05 17:33:06 +01:00
|
|
|
return 0;
|
|
|
|
}
|
2018-09-07 16:11:23 +02:00
|
|
|
up_write(&dev_replace->rwsem);
|
2012-11-05 17:33:06 +01:00
|
|
|
|
2018-03-20 19:51:04 +01:00
|
|
|
/*
|
|
|
|
* This could collide with a paused balance, but the exclusive op logic
|
|
|
|
* should never allow both to start and pause. We don't want to allow
|
|
|
|
* dev-replace to start anyway.
|
|
|
|
*/
|
2020-08-25 10:02:32 -05:00
|
|
|
if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REPLACE)) {
|
2018-09-07 16:11:23 +02:00
|
|
|
down_write(&dev_replace->rwsem);
|
2018-11-11 22:22:18 +08:00
|
|
|
dev_replace->replace_state =
|
|
|
|
BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
|
2018-09-07 16:11:23 +02:00
|
|
|
up_write(&dev_replace->rwsem);
|
2018-03-20 19:51:04 +01:00
|
|
|
btrfs_info(fs_info,
|
|
|
|
"cannot resume dev-replace, other exclusive operation running");
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2012-11-05 17:33:06 +01:00
|
|
|
task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl");
|
2013-07-15 11:20:32 +09:30
|
|
|
return PTR_ERR_OR_ZERO(task);
|
2012-11-05 17:33:06 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
static int btrfs_dev_replace_kthread(void *data)
|
|
|
|
{
|
|
|
|
struct btrfs_fs_info *fs_info = data;
|
|
|
|
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
|
|
|
|
u64 progress;
|
2018-03-20 15:35:50 +01:00
|
|
|
int ret;
|
2012-11-05 17:33:06 +01:00
|
|
|
|
2017-06-14 16:28:42 +02:00
|
|
|
progress = btrfs_dev_replace_progress(fs_info);
|
|
|
|
progress = div_u64(progress, 10);
|
|
|
|
btrfs_info_in_rcu(fs_info,
|
2017-11-28 10:43:10 +08:00
|
|
|
"continuing dev_replace from %s (devid %llu) to target %s @%u%%",
|
|
|
|
btrfs_dev_name(dev_replace->srcdev),
|
2017-06-14 16:28:42 +02:00
|
|
|
dev_replace->srcdev->devid,
|
2017-11-28 10:43:10 +08:00
|
|
|
btrfs_dev_name(dev_replace->tgtdev),
|
2017-06-14 16:28:42 +02:00
|
|
|
(unsigned int)progress);
|
|
|
|
|
2012-11-05 17:33:06 +01:00
|
|
|
ret = btrfs_scrub_dev(fs_info, dev_replace->srcdev->devid,
|
|
|
|
dev_replace->committed_cursor_left,
|
2014-09-03 21:35:38 +08:00
|
|
|
btrfs_device_get_total_bytes(dev_replace->srcdev),
|
2012-11-05 17:33:06 +01:00
|
|
|
&dev_replace->scrub_progress, 0, 1);
|
|
|
|
ret = btrfs_dev_replace_finishing(fs_info, ret);
|
2018-11-20 19:56:15 +08:00
|
|
|
WARN_ON(ret && ret != -ECANCELED);
|
2018-03-20 15:35:50 +01:00
|
|
|
|
2020-08-25 10:02:32 -05:00
|
|
|
btrfs_exclop_finish(fs_info);
|
2012-11-05 17:33:06 +01:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2019-10-01 19:57:39 +02:00
|
|
|
/*
 * Tell whether a device replace is ongoing.
 *
 * Returns 0 when @dev_replace is not valid or its state is NEVER_STARTED,
 * FINISHED or CANCELED, and 1 in every other case.
 */
int __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
{
	if (!dev_replace->is_valid)
		return 0;

	if (dev_replace->replace_state ==
		    BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED ||
	    dev_replace->replace_state ==
		    BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED ||
	    dev_replace->replace_state ==
		    BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED)
		return 0;

	/*
	 * Return true even if tgtdev is missing. This can happen if the
	 * dev_replace procedure is suspended by an umount and then the
	 * tgtdev is missing (or "btrfs dev scan" was not called) and the
	 * filesystem is remounted in degraded state. This does not stop the
	 * dev_replace procedure. It needs to be canceled manually if the
	 * cancellation is wanted.
	 */
	return 1;
}
|
|
|
|
|
Btrfs: fix use-after-free in the finishing procedure of the device replace
During device replace test, we hit a null pointer dereference (it was very easy
to reproduce it by running xfstests' btrfs/011 on the devices with the virtio
scsi driver). There were two bugs that caused this problem:
- We might allocate new chunks on the replaced device after we updated
the mapping tree. And we forgot to replace the source device in the
mappings of those new chunks.
- We might get mapping information that includes the source device
before the mapping information update, and then submit a bio that was
based on that mapping information after we freed the source device.
For the first bug, we can fix it by doing the mapping tree update and the
source device removal in the same context of the chunk mutex. The chunk mutex
is used to protect the allocable device list, so the above method can avoid
the new chunk allocation, and after we remove the source device, all
the new chunks will be allocated on the new device. So it can fix
the first bug.
For the second bug, we need to make sure all in-flight bios are finished and
no new bios are produced while we are removing the source device. To fix
this problem, we introduced a global @bio_counter; we not only inc/dec
@bio_counter outside of map_blocks, but also inc it before submitting a bio
and dec @bio_counter when ending bios.
Since Raid56 is a little different and device replace doesn't support raid56
yet, it is not addressed in the patch and I added comments to make sure we will
fix it in the future.
Reported-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com>
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
2014-01-30 16:46:55 +08:00
|
|
|
void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info)
|
|
|
|
{
|
2018-04-05 01:04:49 +02:00
|
|
|
percpu_counter_inc(&fs_info->dev_replace.bio_counter);
|
Btrfs: fix use-after-free in the finishing procedure of the device replace
During device replace test, we hit a null pointer deference (It was very easy
to reproduce it by running xfstests' btrfs/011 on the devices with the virtio
scsi driver). There were two bugs that caused this problem:
- We might allocate new chunks on the replaced device after we updated
the mapping tree. And we forgot to replace the source device in those
mapping of the new chunks.
- We might get the mapping information which including the source device
before the mapping information update. And then submit the bio which was
based on that mapping information after we freed the source device.
For the first bug, we can fix it by doing mapping tree update and source
device remove in the same context of the chunk mutex. The chunk mutex is
used to protect the allocable device list, the above method can avoid
the new chunk allocation, and after we remove the source device, all
the new chunks will be allocated on the new device. So it can fix
the first bug.
For the second bug, we need make sure all flighting bios are finished and
no new bios are produced during we are removing the source device. To fix
this problem, we introduced a global @bio_counter, we not only inc/dec
@bio_counter outsize of map_blocks, but also inc it before submitting bio
and dec @bio_counter when ending bios.
Since Raid56 is a little different and device replace dosen't support raid56
yet, it is not addressed in the patch and I add comments to make sure we will
fix it in the future.
Reported-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com>
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
2014-01-30 16:46:55 +08:00
|
|
|
}
|
|
|
|
|
2014-11-25 16:39:28 +08:00
|
|
|
/*
 * Subtract @amount from the device-replace bio counter and then poke the
 * replace_wait queue so that a waiter can re-evaluate its condition.
 *
 * NOTE(review): cond_wake_up_nomb() is defined elsewhere; presumably it only
 * wakes when the queue has waiters and adds no memory barrier of its own -
 * confirm against its definition.
 */
void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount)
{
	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;

	percpu_counter_sub(&dev_replace->bio_counter, amount);
	cond_wake_up_nomb(&dev_replace->replace_wait);
}
|
|
|
|
|
|
|
|
void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info)
|
|
|
|
{
|
2015-01-20 15:11:37 +08:00
|
|
|
while (1) {
|
2018-04-05 01:04:49 +02:00
|
|
|
percpu_counter_inc(&fs_info->dev_replace.bio_counter);
|
2015-01-20 15:11:37 +08:00
|
|
|
if (likely(!test_bit(BTRFS_FS_STATE_DEV_REPLACING,
|
|
|
|
&fs_info->fs_state)))
|
|
|
|
break;
|
|
|
|
|
Btrfs: fix use-after-free in the finishing procedure of the device replace
During device replace test, we hit a null pointer deference (It was very easy
to reproduce it by running xfstests' btrfs/011 on the devices with the virtio
scsi driver). There were two bugs that caused this problem:
- We might allocate new chunks on the replaced device after we updated
the mapping tree. And we forgot to replace the source device in those
mapping of the new chunks.
- We might get the mapping information which including the source device
before the mapping information update. And then submit the bio which was
based on that mapping information after we freed the source device.
For the first bug, we can fix it by doing mapping tree update and source
device remove in the same context of the chunk mutex. The chunk mutex is
used to protect the allocable device list, the above method can avoid
the new chunk allocation, and after we remove the source device, all
the new chunks will be allocated on the new device. So it can fix
the first bug.
For the second bug, we need make sure all flighting bios are finished and
no new bios are produced during we are removing the source device. To fix
this problem, we introduced a global @bio_counter, we not only inc/dec
@bio_counter outsize of map_blocks, but also inc it before submitting bio
and dec @bio_counter when ending bios.
Since Raid56 is a little different and device replace dosen't support raid56
yet, it is not addressed in the patch and I add comments to make sure we will
fix it in the future.
Reported-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com>
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
2014-01-30 16:46:55 +08:00
|
|
|
btrfs_bio_counter_dec(fs_info);
|
2018-04-05 01:04:49 +02:00
|
|
|
wait_event(fs_info->dev_replace.replace_wait,
|
Btrfs: fix use-after-free in the finishing procedure of the device replace
During device replace test, we hit a null pointer deference (It was very easy
to reproduce it by running xfstests' btrfs/011 on the devices with the virtio
scsi driver). There were two bugs that caused this problem:
- We might allocate new chunks on the replaced device after we updated
the mapping tree. And we forgot to replace the source device in those
mapping of the new chunks.
- We might get the mapping information which including the source device
before the mapping information update. And then submit the bio which was
based on that mapping information after we freed the source device.
For the first bug, we can fix it by doing mapping tree update and source
device remove in the same context of the chunk mutex. The chunk mutex is
used to protect the allocable device list, the above method can avoid
the new chunk allocation, and after we remove the source device, all
the new chunks will be allocated on the new device. So it can fix
the first bug.
For the second bug, we need make sure all flighting bios are finished and
no new bios are produced during we are removing the source device. To fix
this problem, we introduced a global @bio_counter, we not only inc/dec
@bio_counter outsize of map_blocks, but also inc it before submitting bio
and dec @bio_counter when ending bios.
Since Raid56 is a little different and device replace dosen't support raid56
yet, it is not addressed in the patch and I add comments to make sure we will
fix it in the future.
Reported-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com>
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
2014-01-30 16:46:55 +08:00
|
|
|
!test_bit(BTRFS_FS_STATE_DEV_REPLACING,
|
|
|
|
&fs_info->fs_state));
|
|
|
|
}
|
|
|
|
}
|