2018-04-03 19:23:33 +02:00
|
|
|
// SPDX-License-Identifier: GPL-2.0
|
2012-06-28 18:03:02 +02:00
|
|
|
/*
|
|
|
|
* Copyright (C) 2011 STRATO. All rights reserved.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <linux/sched.h>
|
|
|
|
#include <linux/pagemap.h>
|
|
|
|
#include <linux/writeback.h>
|
|
|
|
#include <linux/blkdev.h>
|
|
|
|
#include <linux/rbtree.h>
|
|
|
|
#include <linux/slab.h>
|
|
|
|
#include <linux/workqueue.h>
|
2013-01-29 06:04:50 +00:00
|
|
|
#include <linux/btrfs.h>
|
2012-06-28 18:03:02 +02:00
|
|
|
|
|
|
|
#include "ctree.h"
|
|
|
|
#include "transaction.h"
|
|
|
|
#include "disk-io.h"
|
|
|
|
#include "locking.h"
|
|
|
|
#include "ulist.h"
|
|
|
|
#include "backref.h"
|
2013-04-25 16:04:51 +00:00
|
|
|
#include "extent_io.h"
|
2014-05-13 17:30:47 -07:00
|
|
|
#include "qgroup.h"
|
2019-06-20 15:37:44 -04:00
|
|
|
#include "block-group.h"
|
2020-06-28 13:07:15 +08:00
|
|
|
#include "sysfs.h"
|
2015-04-17 10:23:16 +08:00
|
|
|
|
2012-06-28 18:03:02 +02:00
|
|
|
/* TODO XXX FIXME
|
|
|
|
* - subvol delete -> delete when ref goes to 0? delete limits also?
|
|
|
|
* - reorganize keys
|
|
|
|
* - compressed
|
|
|
|
* - sync
|
|
|
|
* - copy also limits on subvol creation
|
|
|
|
* - limit
|
2018-11-28 12:05:13 +01:00
|
|
|
* - caches for ulists
|
2012-06-28 18:03:02 +02:00
|
|
|
* - performance benchmarks
|
|
|
|
* - check all ioctl parameters
|
|
|
|
*/
|
|
|
|
|
2017-12-12 15:34:24 +08:00
|
|
|
/*
|
|
|
|
* Helpers to access qgroup reservation
|
|
|
|
*
|
|
|
|
* Callers should ensure the lock context and type are valid
|
|
|
|
*/
|
|
|
|
|
|
|
|
static u64 qgroup_rsv_total(const struct btrfs_qgroup *qgroup)
|
|
|
|
{
|
|
|
|
u64 ret = 0;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++)
|
|
|
|
ret += qgroup->rsv.values[i];
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
#ifdef CONFIG_BTRFS_DEBUG
|
|
|
|
static const char *qgroup_rsv_type_str(enum btrfs_qgroup_rsv_type type)
|
|
|
|
{
|
|
|
|
if (type == BTRFS_QGROUP_RSV_DATA)
|
|
|
|
return "data";
|
btrfs: qgroup: Split meta rsv type into meta_prealloc and meta_pertrans
Btrfs uses 2 different methods to reseve metadata qgroup space.
1) Reserve at btrfs_start_transaction() time
This is quite straightforward, caller will use the trans handler
allocated to modify b-trees.
In this case, reserved metadata should be kept until qgroup numbers
are updated.
2) Reserve by using block_rsv first, and later btrfs_join_transaction()
This is more complicated, caller will reserve space using block_rsv
first, and then later call btrfs_join_transaction() to get a trans
handle.
In this case, before we modify trees, the reserved space can be
modified on demand, and after btrfs_join_transaction(), such reserved
space should also be kept until qgroup numbers are updated.
Since these two types behave differently, split the original "META"
reservation type into 2 sub-types:
META_PERTRANS:
For above case 1)
META_PREALLOC:
For reservations that happened before btrfs_join_transaction() of
case 2)
NOTE: This patch will only convert existing qgroup meta reservation
callers according to its situation, not ensuring all callers are at
correct timing.
Such fix will be added in later patches.
Signed-off-by: Qu Wenruo <wqu@suse.com>
[ update comments ]
Signed-off-by: David Sterba <dsterba@suse.com>
2017-12-12 15:34:29 +08:00
|
|
|
if (type == BTRFS_QGROUP_RSV_META_PERTRANS)
|
|
|
|
return "meta_pertrans";
|
|
|
|
if (type == BTRFS_QGROUP_RSV_META_PREALLOC)
|
|
|
|
return "meta_prealloc";
|
2017-12-12 15:34:24 +08:00
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2017-12-12 15:34:27 +08:00
|
|
|
static void qgroup_rsv_add(struct btrfs_fs_info *fs_info,
|
|
|
|
struct btrfs_qgroup *qgroup, u64 num_bytes,
|
2017-12-12 15:34:24 +08:00
|
|
|
enum btrfs_qgroup_rsv_type type)
|
|
|
|
{
|
2017-12-12 15:34:27 +08:00
|
|
|
trace_qgroup_update_reserve(fs_info, qgroup, num_bytes, type);
|
2017-12-12 15:34:24 +08:00
|
|
|
qgroup->rsv.values[type] += num_bytes;
|
|
|
|
}
|
|
|
|
|
2017-12-12 15:34:27 +08:00
|
|
|
static void qgroup_rsv_release(struct btrfs_fs_info *fs_info,
|
|
|
|
struct btrfs_qgroup *qgroup, u64 num_bytes,
|
2017-12-12 15:34:24 +08:00
|
|
|
enum btrfs_qgroup_rsv_type type)
|
|
|
|
{
|
2017-12-12 15:34:27 +08:00
|
|
|
trace_qgroup_update_reserve(fs_info, qgroup, -(s64)num_bytes, type);
|
2017-12-12 15:34:24 +08:00
|
|
|
if (qgroup->rsv.values[type] >= num_bytes) {
|
|
|
|
qgroup->rsv.values[type] -= num_bytes;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
#ifdef CONFIG_BTRFS_DEBUG
|
|
|
|
WARN_RATELIMIT(1,
|
|
|
|
"qgroup %llu %s reserved space underflow, have %llu to free %llu",
|
|
|
|
qgroup->qgroupid, qgroup_rsv_type_str(type),
|
|
|
|
qgroup->rsv.values[type], num_bytes);
|
|
|
|
#endif
|
|
|
|
qgroup->rsv.values[type] = 0;
|
|
|
|
}
|
|
|
|
|
2017-12-12 15:34:27 +08:00
|
|
|
static void qgroup_rsv_add_by_qgroup(struct btrfs_fs_info *fs_info,
|
|
|
|
struct btrfs_qgroup *dest,
|
|
|
|
struct btrfs_qgroup *src)
|
2017-12-12 15:34:24 +08:00
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++)
|
2017-12-12 15:34:27 +08:00
|
|
|
qgroup_rsv_add(fs_info, dest, src->rsv.values[i], i);
|
2017-12-12 15:34:24 +08:00
|
|
|
}
|
|
|
|
|
2017-12-12 15:34:27 +08:00
|
|
|
static void qgroup_rsv_release_by_qgroup(struct btrfs_fs_info *fs_info,
|
|
|
|
struct btrfs_qgroup *dest,
|
2017-12-12 15:34:24 +08:00
|
|
|
struct btrfs_qgroup *src)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++)
|
2017-12-12 15:34:27 +08:00
|
|
|
qgroup_rsv_release(fs_info, dest, src->rsv.values[i], i);
|
2017-12-12 15:34:24 +08:00
|
|
|
}
|
|
|
|
|
2015-03-12 16:10:13 +08:00
|
|
|
static void btrfs_qgroup_update_old_refcnt(struct btrfs_qgroup *qg, u64 seq,
|
|
|
|
int mod)
|
|
|
|
{
|
|
|
|
if (qg->old_refcnt < seq)
|
|
|
|
qg->old_refcnt = seq;
|
|
|
|
qg->old_refcnt += mod;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void btrfs_qgroup_update_new_refcnt(struct btrfs_qgroup *qg, u64 seq,
|
|
|
|
int mod)
|
|
|
|
{
|
|
|
|
if (qg->new_refcnt < seq)
|
|
|
|
qg->new_refcnt = seq;
|
|
|
|
qg->new_refcnt += mod;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline u64 btrfs_qgroup_get_old_refcnt(struct btrfs_qgroup *qg, u64 seq)
|
|
|
|
{
|
|
|
|
if (qg->old_refcnt < seq)
|
|
|
|
return 0;
|
|
|
|
return qg->old_refcnt - seq;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline u64 btrfs_qgroup_get_new_refcnt(struct btrfs_qgroup *qg, u64 seq)
|
|
|
|
{
|
|
|
|
if (qg->new_refcnt < seq)
|
|
|
|
return 0;
|
|
|
|
return qg->new_refcnt - seq;
|
|
|
|
}
|
|
|
|
|
2012-06-28 18:03:02 +02:00
|
|
|
/*
|
|
|
|
* glue structure to represent the relations between qgroups.
|
|
|
|
*/
|
|
|
|
struct btrfs_qgroup_list {
|
|
|
|
struct list_head next_group;
|
|
|
|
struct list_head next_member;
|
|
|
|
struct btrfs_qgroup *group;
|
|
|
|
struct btrfs_qgroup *member;
|
|
|
|
};
|
|
|
|
|
2016-10-26 16:23:50 +02:00
|
|
|
static inline u64 qgroup_to_aux(struct btrfs_qgroup *qg)
|
|
|
|
{
|
|
|
|
return (u64)(uintptr_t)qg;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline struct btrfs_qgroup* unode_aux_to_qgroup(struct ulist_node *n)
|
|
|
|
{
|
|
|
|
return (struct btrfs_qgroup *)(uintptr_t)n->aux;
|
|
|
|
}
|
2014-05-13 17:30:47 -07:00
|
|
|
|
Btrfs: fix qgroup rescan resume on mount
When called during mount, we cannot start the rescan worker thread until
open_ctree is done. This commit restuctures the qgroup rescan internals to
enable a clean deferral of the rescan resume operation.
First of all, the struct qgroup_rescan is removed, saving us a malloc and
some initialization synchronizations problems. Its only element (the worker
struct) now lives within fs_info just as the rest of the rescan code.
Then setting up a rescan worker is split into several reusable stages.
Currently we have three different rescan startup scenarios:
(A) rescan ioctl
(B) rescan resume by mount
(C) rescan by quota enable
Each case needs its own combination of the four following steps:
(1) set the progress [A, C: zero; B: state of umount]
(2) commit the transaction [A]
(3) set the counters [A, C: zero; B: state of umount]
(4) start worker [A, B, C]
qgroup_rescan_init does step (1). There's no extra function added to commit
a transaction, we've got that already. qgroup_rescan_zero_tracking does
step (3). Step (4) is nothing more than a call to the generic
btrfs_queue_worker.
We also get rid of a double check for the rescan progress during
btrfs_qgroup_account_ref, which is no longer required due to having step 2
from the list above.
As a side effect, this commit prepares to move the rescan start code from
btrfs_run_qgroups (which is run during commit) to a less time critical
section.
Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2013-05-28 15:47:24 +00:00
|
|
|
static int
|
|
|
|
qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
|
|
|
|
int init_flags);
|
|
|
|
static void qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info);
|
2013-04-25 16:04:51 +00:00
|
|
|
|
2013-04-07 10:50:17 +00:00
|
|
|
/* must be called with qgroup_ioctl_lock held */
|
2012-06-28 18:03:02 +02:00
|
|
|
static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info,
|
|
|
|
u64 qgroupid)
|
|
|
|
{
|
|
|
|
struct rb_node *n = fs_info->qgroup_tree.rb_node;
|
|
|
|
struct btrfs_qgroup *qgroup;
|
|
|
|
|
|
|
|
while (n) {
|
|
|
|
qgroup = rb_entry(n, struct btrfs_qgroup, node);
|
|
|
|
if (qgroup->qgroupid < qgroupid)
|
|
|
|
n = n->rb_left;
|
|
|
|
else if (qgroup->qgroupid > qgroupid)
|
|
|
|
n = n->rb_right;
|
|
|
|
else
|
|
|
|
return qgroup;
|
|
|
|
}
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* must be called with qgroup_lock held */
|
|
|
|
static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info,
|
|
|
|
u64 qgroupid)
|
|
|
|
{
|
|
|
|
struct rb_node **p = &fs_info->qgroup_tree.rb_node;
|
|
|
|
struct rb_node *parent = NULL;
|
|
|
|
struct btrfs_qgroup *qgroup;
|
|
|
|
|
|
|
|
while (*p) {
|
|
|
|
parent = *p;
|
|
|
|
qgroup = rb_entry(parent, struct btrfs_qgroup, node);
|
|
|
|
|
|
|
|
if (qgroup->qgroupid < qgroupid)
|
|
|
|
p = &(*p)->rb_left;
|
|
|
|
else if (qgroup->qgroupid > qgroupid)
|
|
|
|
p = &(*p)->rb_right;
|
|
|
|
else
|
|
|
|
return qgroup;
|
|
|
|
}
|
|
|
|
|
|
|
|
qgroup = kzalloc(sizeof(*qgroup), GFP_ATOMIC);
|
|
|
|
if (!qgroup)
|
|
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
|
|
|
|
qgroup->qgroupid = qgroupid;
|
|
|
|
INIT_LIST_HEAD(&qgroup->groups);
|
|
|
|
INIT_LIST_HEAD(&qgroup->members);
|
|
|
|
INIT_LIST_HEAD(&qgroup->dirty);
|
|
|
|
|
|
|
|
rb_link_node(&qgroup->node, parent, p);
|
|
|
|
rb_insert_color(&qgroup->node, &fs_info->qgroup_tree);
|
|
|
|
|
|
|
|
return qgroup;
|
|
|
|
}
|
|
|
|
|
2020-06-28 13:07:15 +08:00
|
|
|
static void __del_qgroup_rb(struct btrfs_fs_info *fs_info,
|
|
|
|
struct btrfs_qgroup *qgroup)
|
2012-06-28 18:03:02 +02:00
|
|
|
{
|
|
|
|
struct btrfs_qgroup_list *list;
|
|
|
|
|
2020-06-28 13:07:15 +08:00
|
|
|
btrfs_sysfs_del_one_qgroup(fs_info, qgroup);
|
2012-06-28 18:03:02 +02:00
|
|
|
list_del(&qgroup->dirty);
|
|
|
|
while (!list_empty(&qgroup->groups)) {
|
|
|
|
list = list_first_entry(&qgroup->groups,
|
|
|
|
struct btrfs_qgroup_list, next_group);
|
|
|
|
list_del(&list->next_group);
|
|
|
|
list_del(&list->next_member);
|
|
|
|
kfree(list);
|
|
|
|
}
|
|
|
|
|
|
|
|
while (!list_empty(&qgroup->members)) {
|
|
|
|
list = list_first_entry(&qgroup->members,
|
|
|
|
struct btrfs_qgroup_list, next_member);
|
|
|
|
list_del(&list->next_group);
|
|
|
|
list_del(&list->next_member);
|
|
|
|
kfree(list);
|
|
|
|
}
|
|
|
|
kfree(qgroup);
|
2013-08-14 09:13:36 +08:00
|
|
|
}
|
2012-06-28 18:03:02 +02:00
|
|
|
|
2013-08-14 09:13:36 +08:00
|
|
|
/* must be called with qgroup_lock held */
|
|
|
|
static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid)
|
|
|
|
{
|
|
|
|
struct btrfs_qgroup *qgroup = find_qgroup_rb(fs_info, qgroupid);
|
|
|
|
|
|
|
|
if (!qgroup)
|
|
|
|
return -ENOENT;
|
|
|
|
|
|
|
|
rb_erase(&qgroup->node, &fs_info->qgroup_tree);
|
2020-06-28 13:07:15 +08:00
|
|
|
__del_qgroup_rb(fs_info, qgroup);
|
2012-06-28 18:03:02 +02:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* must be called with qgroup_lock held */
|
|
|
|
static int add_relation_rb(struct btrfs_fs_info *fs_info,
|
|
|
|
u64 memberid, u64 parentid)
|
|
|
|
{
|
|
|
|
struct btrfs_qgroup *member;
|
|
|
|
struct btrfs_qgroup *parent;
|
|
|
|
struct btrfs_qgroup_list *list;
|
|
|
|
|
|
|
|
member = find_qgroup_rb(fs_info, memberid);
|
|
|
|
parent = find_qgroup_rb(fs_info, parentid);
|
|
|
|
if (!member || !parent)
|
|
|
|
return -ENOENT;
|
|
|
|
|
|
|
|
list = kzalloc(sizeof(*list), GFP_ATOMIC);
|
|
|
|
if (!list)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
list->group = parent;
|
|
|
|
list->member = member;
|
|
|
|
list_add_tail(&list->next_group, &member->groups);
|
|
|
|
list_add_tail(&list->next_member, &parent->members);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* must be called with qgroup_lock held */
|
|
|
|
static int del_relation_rb(struct btrfs_fs_info *fs_info,
|
|
|
|
u64 memberid, u64 parentid)
|
|
|
|
{
|
|
|
|
struct btrfs_qgroup *member;
|
|
|
|
struct btrfs_qgroup *parent;
|
|
|
|
struct btrfs_qgroup_list *list;
|
|
|
|
|
|
|
|
member = find_qgroup_rb(fs_info, memberid);
|
|
|
|
parent = find_qgroup_rb(fs_info, parentid);
|
|
|
|
if (!member || !parent)
|
|
|
|
return -ENOENT;
|
|
|
|
|
|
|
|
list_for_each_entry(list, &member->groups, next_group) {
|
|
|
|
if (list->group == parent) {
|
|
|
|
list_del(&list->next_group);
|
|
|
|
list_del(&list->next_member);
|
|
|
|
kfree(list);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return -ENOENT;
|
|
|
|
}
|
|
|
|
|
2014-05-07 17:06:09 -04:00
|
|
|
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
|
|
|
|
int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
|
|
|
|
u64 rfer, u64 excl)
|
|
|
|
{
|
|
|
|
struct btrfs_qgroup *qgroup;
|
|
|
|
|
|
|
|
qgroup = find_qgroup_rb(fs_info, qgroupid);
|
|
|
|
if (!qgroup)
|
|
|
|
return -EINVAL;
|
|
|
|
if (qgroup->rfer != rfer || qgroup->excl != excl)
|
|
|
|
return -EINVAL;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2012-06-28 18:03:02 +02:00
|
|
|
/*
|
|
|
|
* The full config is read in one go, only called from open_ctree()
|
|
|
|
* It doesn't use any locking, as at this point we're still single-threaded
|
|
|
|
*/
|
|
|
|
int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
|
|
|
|
{
|
|
|
|
struct btrfs_key key;
|
|
|
|
struct btrfs_key found_key;
|
|
|
|
struct btrfs_root *quota_root = fs_info->quota_root;
|
|
|
|
struct btrfs_path *path = NULL;
|
|
|
|
struct extent_buffer *l;
|
|
|
|
int slot;
|
|
|
|
int ret = 0;
|
|
|
|
u64 flags = 0;
|
Btrfs: fix qgroup rescan resume on mount
When called during mount, we cannot start the rescan worker thread until
open_ctree is done. This commit restuctures the qgroup rescan internals to
enable a clean deferral of the rescan resume operation.
First of all, the struct qgroup_rescan is removed, saving us a malloc and
some initialization synchronizations problems. Its only element (the worker
struct) now lives within fs_info just as the rest of the rescan code.
Then setting up a rescan worker is split into several reusable stages.
Currently we have three different rescan startup scenarios:
(A) rescan ioctl
(B) rescan resume by mount
(C) rescan by quota enable
Each case needs its own combination of the four following steps:
(1) set the progress [A, C: zero; B: state of umount]
(2) commit the transaction [A]
(3) set the counters [A, C: zero; B: state of umount]
(4) start worker [A, B, C]
qgroup_rescan_init does step (1). There's no extra function added to commit
a transaction, we've got that already. qgroup_rescan_zero_tracking does
step (3). Step (4) is nothing more than a call to the generic
btrfs_queue_worker.
We also get rid of a double check for the rescan progress during
btrfs_qgroup_account_ref, which is no longer required due to having step 2
from the list above.
As a side effect, this commit prepares to move the rescan start code from
btrfs_run_qgroups (which is run during commit) to a less time critical
section.
Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2013-05-28 15:47:24 +00:00
|
|
|
u64 rescan_progress = 0;
|
2012-06-28 18:03:02 +02:00
|
|
|
|
2016-09-02 15:40:02 -04:00
|
|
|
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
|
2012-06-28 18:03:02 +02:00
|
|
|
return 0;
|
|
|
|
|
2017-02-13 12:10:20 +01:00
|
|
|
fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL);
|
2013-05-06 11:03:27 +00:00
|
|
|
if (!fs_info->qgroup_ulist) {
|
|
|
|
ret = -ENOMEM;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2012-06-28 18:03:02 +02:00
|
|
|
path = btrfs_alloc_path();
|
|
|
|
if (!path) {
|
|
|
|
ret = -ENOMEM;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2020-06-28 13:07:15 +08:00
|
|
|
ret = btrfs_sysfs_add_qgroups(fs_info);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
2012-06-28 18:03:02 +02:00
|
|
|
/* default this to quota off, in case no status key is found */
|
|
|
|
fs_info->qgroup_flags = 0;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* pass 1: read status, all qgroup infos and limits
|
|
|
|
*/
|
|
|
|
key.objectid = 0;
|
|
|
|
key.type = 0;
|
|
|
|
key.offset = 0;
|
|
|
|
ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 1);
|
|
|
|
if (ret)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
while (1) {
|
|
|
|
struct btrfs_qgroup *qgroup;
|
|
|
|
|
|
|
|
slot = path->slots[0];
|
|
|
|
l = path->nodes[0];
|
|
|
|
btrfs_item_key_to_cpu(l, &found_key, slot);
|
|
|
|
|
|
|
|
if (found_key.type == BTRFS_QGROUP_STATUS_KEY) {
|
|
|
|
struct btrfs_qgroup_status_item *ptr;
|
|
|
|
|
|
|
|
ptr = btrfs_item_ptr(l, slot,
|
|
|
|
struct btrfs_qgroup_status_item);
|
|
|
|
|
|
|
|
if (btrfs_qgroup_status_version(l, ptr) !=
|
|
|
|
BTRFS_QGROUP_STATUS_VERSION) {
|
2013-12-20 11:37:06 -05:00
|
|
|
btrfs_err(fs_info,
|
|
|
|
"old qgroup version, quota disabled");
|
2012-06-28 18:03:02 +02:00
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
if (btrfs_qgroup_status_generation(l, ptr) !=
|
|
|
|
fs_info->generation) {
|
|
|
|
flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
|
2013-12-20 11:37:06 -05:00
|
|
|
btrfs_err(fs_info,
|
2016-09-20 10:05:00 -04:00
|
|
|
"qgroup generation mismatch, marked as inconsistent");
|
2012-06-28 18:03:02 +02:00
|
|
|
}
|
|
|
|
fs_info->qgroup_flags = btrfs_qgroup_status_flags(l,
|
|
|
|
ptr);
|
Btrfs: fix qgroup rescan resume on mount
When called during mount, we cannot start the rescan worker thread until
open_ctree is done. This commit restuctures the qgroup rescan internals to
enable a clean deferral of the rescan resume operation.
First of all, the struct qgroup_rescan is removed, saving us a malloc and
some initialization synchronizations problems. Its only element (the worker
struct) now lives within fs_info just as the rest of the rescan code.
Then setting up a rescan worker is split into several reusable stages.
Currently we have three different rescan startup scenarios:
(A) rescan ioctl
(B) rescan resume by mount
(C) rescan by quota enable
Each case needs its own combination of the four following steps:
(1) set the progress [A, C: zero; B: state of umount]
(2) commit the transaction [A]
(3) set the counters [A, C: zero; B: state of umount]
(4) start worker [A, B, C]
qgroup_rescan_init does step (1). There's no extra function added to commit
a transaction, we've got that already. qgroup_rescan_zero_tracking does
step (3). Step (4) is nothing more than a call to the generic
btrfs_queue_worker.
We also get rid of a double check for the rescan progress during
btrfs_qgroup_account_ref, which is no longer required due to having step 2
from the list above.
As a side effect, this commit prepares to move the rescan start code from
btrfs_run_qgroups (which is run during commit) to a less time critical
section.
Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2013-05-28 15:47:24 +00:00
|
|
|
rescan_progress = btrfs_qgroup_status_rescan(l, ptr);
|
2012-06-28 18:03:02 +02:00
|
|
|
goto next1;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (found_key.type != BTRFS_QGROUP_INFO_KEY &&
|
|
|
|
found_key.type != BTRFS_QGROUP_LIMIT_KEY)
|
|
|
|
goto next1;
|
|
|
|
|
|
|
|
qgroup = find_qgroup_rb(fs_info, found_key.offset);
|
|
|
|
if ((qgroup && found_key.type == BTRFS_QGROUP_INFO_KEY) ||
|
|
|
|
(!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY)) {
|
2015-07-06 15:38:11 +02:00
|
|
|
btrfs_err(fs_info, "inconsistent qgroup config");
|
2012-06-28 18:03:02 +02:00
|
|
|
flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
|
|
|
|
}
|
|
|
|
if (!qgroup) {
|
|
|
|
qgroup = add_qgroup_rb(fs_info, found_key.offset);
|
|
|
|
if (IS_ERR(qgroup)) {
|
|
|
|
ret = PTR_ERR(qgroup);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
}
|
2020-06-28 13:07:15 +08:00
|
|
|
ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
|
2012-06-28 18:03:02 +02:00
|
|
|
switch (found_key.type) {
|
|
|
|
case BTRFS_QGROUP_INFO_KEY: {
|
|
|
|
struct btrfs_qgroup_info_item *ptr;
|
|
|
|
|
|
|
|
ptr = btrfs_item_ptr(l, slot,
|
|
|
|
struct btrfs_qgroup_info_item);
|
|
|
|
qgroup->rfer = btrfs_qgroup_info_rfer(l, ptr);
|
|
|
|
qgroup->rfer_cmpr = btrfs_qgroup_info_rfer_cmpr(l, ptr);
|
|
|
|
qgroup->excl = btrfs_qgroup_info_excl(l, ptr);
|
|
|
|
qgroup->excl_cmpr = btrfs_qgroup_info_excl_cmpr(l, ptr);
|
|
|
|
/* generation currently unused */
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
case BTRFS_QGROUP_LIMIT_KEY: {
|
|
|
|
struct btrfs_qgroup_limit_item *ptr;
|
|
|
|
|
|
|
|
ptr = btrfs_item_ptr(l, slot,
|
|
|
|
struct btrfs_qgroup_limit_item);
|
|
|
|
qgroup->lim_flags = btrfs_qgroup_limit_flags(l, ptr);
|
|
|
|
qgroup->max_rfer = btrfs_qgroup_limit_max_rfer(l, ptr);
|
|
|
|
qgroup->max_excl = btrfs_qgroup_limit_max_excl(l, ptr);
|
|
|
|
qgroup->rsv_rfer = btrfs_qgroup_limit_rsv_rfer(l, ptr);
|
|
|
|
qgroup->rsv_excl = btrfs_qgroup_limit_rsv_excl(l, ptr);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
next1:
|
|
|
|
ret = btrfs_next_item(quota_root, path);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
if (ret)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
btrfs_release_path(path);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* pass 2: read all qgroup relations
|
|
|
|
*/
|
|
|
|
key.objectid = 0;
|
|
|
|
key.type = BTRFS_QGROUP_RELATION_KEY;
|
|
|
|
key.offset = 0;
|
|
|
|
ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 0);
|
|
|
|
if (ret)
|
|
|
|
goto out;
|
|
|
|
while (1) {
|
|
|
|
slot = path->slots[0];
|
|
|
|
l = path->nodes[0];
|
|
|
|
btrfs_item_key_to_cpu(l, &found_key, slot);
|
|
|
|
|
|
|
|
if (found_key.type != BTRFS_QGROUP_RELATION_KEY)
|
|
|
|
goto next2;
|
|
|
|
|
|
|
|
if (found_key.objectid > found_key.offset) {
|
|
|
|
/* parent <- member, not needed to build config */
|
|
|
|
/* FIXME should we omit the key completely? */
|
|
|
|
goto next2;
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = add_relation_rb(fs_info, found_key.objectid,
|
|
|
|
found_key.offset);
|
2013-01-17 01:22:08 -07:00
|
|
|
if (ret == -ENOENT) {
|
2013-12-20 11:37:06 -05:00
|
|
|
btrfs_warn(fs_info,
|
|
|
|
"orphan qgroup relation 0x%llx->0x%llx",
|
2013-08-20 13:20:07 +02:00
|
|
|
found_key.objectid, found_key.offset);
|
2013-01-17 01:22:08 -07:00
|
|
|
ret = 0; /* ignore the error */
|
|
|
|
}
|
2012-06-28 18:03:02 +02:00
|
|
|
if (ret)
|
|
|
|
goto out;
|
|
|
|
next2:
|
|
|
|
ret = btrfs_next_item(quota_root, path);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
if (ret)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
out:
|
|
|
|
fs_info->qgroup_flags |= flags;
|
2016-09-02 15:40:02 -04:00
|
|
|
if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))
|
|
|
|
clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
|
|
|
|
else if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN &&
|
|
|
|
ret >= 0)
|
Btrfs: fix qgroup rescan resume on mount
When called during mount, we cannot start the rescan worker thread until
open_ctree is done. This commit restuctures the qgroup rescan internals to
enable a clean deferral of the rescan resume operation.
First of all, the struct qgroup_rescan is removed, saving us a malloc and
some initialization synchronizations problems. Its only element (the worker
struct) now lives within fs_info just as the rest of the rescan code.
Then setting up a rescan worker is split into several reusable stages.
Currently we have three different rescan startup scenarios:
(A) rescan ioctl
(B) rescan resume by mount
(C) rescan by quota enable
Each case needs its own combination of the four following steps:
(1) set the progress [A, C: zero; B: state of umount]
(2) commit the transaction [A]
(3) set the counters [A, C: zero; B: state of umount]
(4) start worker [A, B, C]
qgroup_rescan_init does step (1). There's no extra function added to commit
a transaction, we've got that already. qgroup_rescan_zero_tracking does
step (3). Step (4) is nothing more than a call to the generic
btrfs_queue_worker.
We also get rid of a double check for the rescan progress during
btrfs_qgroup_account_ref, which is no longer required due to having step 2
from the list above.
As a side effect, this commit prepares to move the rescan start code from
btrfs_run_qgroups (which is run during commit) to a less time critical
section.
Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2013-05-28 15:47:24 +00:00
|
|
|
ret = qgroup_rescan_init(fs_info, rescan_progress, 0);
|
2012-06-28 18:03:02 +02:00
|
|
|
btrfs_free_path(path);
|
|
|
|
|
2013-05-28 15:47:23 +00:00
|
|
|
if (ret < 0) {
|
2013-05-06 11:03:27 +00:00
|
|
|
ulist_free(fs_info->qgroup_ulist);
|
2013-05-28 15:47:23 +00:00
|
|
|
fs_info->qgroup_ulist = NULL;
|
Btrfs: fix qgroup rescan resume on mount
When called during mount, we cannot start the rescan worker thread until
open_ctree is done. This commit restuctures the qgroup rescan internals to
enable a clean deferral of the rescan resume operation.
First of all, the struct qgroup_rescan is removed, saving us a malloc and
some initialization synchronizations problems. Its only element (the worker
struct) now lives within fs_info just as the rest of the rescan code.
Then setting up a rescan worker is split into several reusable stages.
Currently we have three different rescan startup scenarios:
(A) rescan ioctl
(B) rescan resume by mount
(C) rescan by quota enable
Each case needs its own combination of the four following steps:
(1) set the progress [A, C: zero; B: state of umount]
(2) commit the transaction [A]
(3) set the counters [A, C: zero; B: state of umount]
(4) start worker [A, B, C]
qgroup_rescan_init does step (1). There's no extra function added to commit
a transaction, we've got that already. qgroup_rescan_zero_tracking does
step (3). Step (4) is nothing more than a call to the generic
btrfs_queue_worker.
We also get rid of a double check for the rescan progress during
btrfs_qgroup_account_ref, which is no longer required due to having step 2
from the list above.
As a side effect, this commit prepares to move the rescan start code from
btrfs_run_qgroups (which is run during commit) to a less time critical
section.
Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2013-05-28 15:47:24 +00:00
|
|
|
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
|
2020-06-28 13:07:15 +08:00
|
|
|
btrfs_sysfs_del_qgroups(fs_info);
|
2013-05-28 15:47:23 +00:00
|
|
|
}
|
2013-05-06 11:03:27 +00:00
|
|
|
|
2012-06-28 18:03:02 +02:00
|
|
|
return ret < 0 ? ret : 0;
|
|
|
|
}
|
|
|
|
|
2020-06-10 09:04:44 +08:00
|
|
|
/*
|
|
|
|
* Called in close_ctree() when quota is still enabled. This verifies we don't
|
|
|
|
* leak some reserved space.
|
|
|
|
*
|
|
|
|
* Return false if no reserved space is left.
|
|
|
|
* Return true if some reserved space is leaked.
|
|
|
|
*/
|
|
|
|
bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info)
|
|
|
|
{
|
|
|
|
struct rb_node *node;
|
|
|
|
bool ret = false;
|
|
|
|
|
|
|
|
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
|
|
|
|
return ret;
|
|
|
|
/*
|
|
|
|
* Since we're unmounting, there is no race and no need to grab qgroup
|
|
|
|
* lock. And here we don't go post-order to provide a more user
|
|
|
|
* friendly sorted result.
|
|
|
|
*/
|
|
|
|
for (node = rb_first(&fs_info->qgroup_tree); node; node = rb_next(node)) {
|
|
|
|
struct btrfs_qgroup *qgroup;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
qgroup = rb_entry(node, struct btrfs_qgroup, node);
|
|
|
|
for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++) {
|
|
|
|
if (qgroup->rsv.values[i]) {
|
|
|
|
ret = true;
|
|
|
|
btrfs_warn(fs_info,
|
2020-06-28 13:07:14 +08:00
|
|
|
"qgroup %hu/%llu has unreleased space, type %d rsv %llu",
|
2020-06-10 09:04:44 +08:00
|
|
|
btrfs_qgroup_level(qgroup->qgroupid),
|
|
|
|
btrfs_qgroup_subvolid(qgroup->qgroupid),
|
|
|
|
i, qgroup->rsv.values[i]);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2012-06-28 18:03:02 +02:00
|
|
|
/*
|
2013-08-14 09:13:37 +08:00
|
|
|
* This is called from close_ctree() or open_ctree() or btrfs_quota_disable(),
|
|
|
|
* first two are in single-threaded paths.And for the third one, we have set
|
|
|
|
* quota_root to be null with qgroup_lock held before, so it is safe to clean
|
|
|
|
* up the in-memory structures without qgroup_lock held.
|
2012-06-28 18:03:02 +02:00
|
|
|
*/
|
|
|
|
void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)
|
|
|
|
{
|
|
|
|
struct rb_node *n;
|
|
|
|
struct btrfs_qgroup *qgroup;
|
|
|
|
|
|
|
|
while ((n = rb_first(&fs_info->qgroup_tree))) {
|
|
|
|
qgroup = rb_entry(n, struct btrfs_qgroup, node);
|
|
|
|
rb_erase(n, &fs_info->qgroup_tree);
|
2020-06-28 13:07:15 +08:00
|
|
|
__del_qgroup_rb(fs_info, qgroup);
|
2012-06-28 18:03:02 +02:00
|
|
|
}
|
2013-07-13 21:02:54 +08:00
|
|
|
/*
|
2018-11-28 12:05:13 +01:00
|
|
|
* We call btrfs_free_qgroup_config() when unmounting
|
2016-05-19 21:18:45 -04:00
|
|
|
* filesystem and disabling quota, so we set qgroup_ulist
|
2013-07-13 21:02:54 +08:00
|
|
|
* to be null here to avoid double free.
|
|
|
|
*/
|
2013-05-06 11:03:27 +00:00
|
|
|
ulist_free(fs_info->qgroup_ulist);
|
2013-07-13 21:02:54 +08:00
|
|
|
fs_info->qgroup_ulist = NULL;
|
2020-06-28 13:07:15 +08:00
|
|
|
btrfs_sysfs_del_qgroups(fs_info);
|
2012-06-28 18:03:02 +02:00
|
|
|
}
|
|
|
|
|
2018-07-18 14:45:24 +08:00
|
|
|
static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src,
|
|
|
|
u64 dst)
|
2012-06-28 18:03:02 +02:00
|
|
|
{
|
|
|
|
int ret;
|
2018-07-18 14:45:24 +08:00
|
|
|
struct btrfs_root *quota_root = trans->fs_info->quota_root;
|
2012-06-28 18:03:02 +02:00
|
|
|
struct btrfs_path *path;
|
|
|
|
struct btrfs_key key;
|
|
|
|
|
|
|
|
path = btrfs_alloc_path();
|
|
|
|
if (!path)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
key.objectid = src;
|
|
|
|
key.type = BTRFS_QGROUP_RELATION_KEY;
|
|
|
|
key.offset = dst;
|
|
|
|
|
|
|
|
ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 0);
|
|
|
|
|
|
|
|
btrfs_mark_buffer_dirty(path->nodes[0]);
|
|
|
|
|
|
|
|
btrfs_free_path(path);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2018-07-18 14:45:25 +08:00
|
|
|
static int del_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src,
|
|
|
|
u64 dst)
|
2012-06-28 18:03:02 +02:00
|
|
|
{
|
|
|
|
int ret;
|
2018-07-18 14:45:25 +08:00
|
|
|
struct btrfs_root *quota_root = trans->fs_info->quota_root;
|
2012-06-28 18:03:02 +02:00
|
|
|
struct btrfs_path *path;
|
|
|
|
struct btrfs_key key;
|
|
|
|
|
|
|
|
path = btrfs_alloc_path();
|
|
|
|
if (!path)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
key.objectid = src;
|
|
|
|
key.type = BTRFS_QGROUP_RELATION_KEY;
|
|
|
|
key.offset = dst;
|
|
|
|
|
|
|
|
ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
if (ret > 0) {
|
|
|
|
ret = -ENOENT;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = btrfs_del_item(trans, quota_root, path);
|
|
|
|
out:
|
|
|
|
btrfs_free_path(path);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int add_qgroup_item(struct btrfs_trans_handle *trans,
|
|
|
|
struct btrfs_root *quota_root, u64 qgroupid)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
struct btrfs_path *path;
|
|
|
|
struct btrfs_qgroup_info_item *qgroup_info;
|
|
|
|
struct btrfs_qgroup_limit_item *qgroup_limit;
|
|
|
|
struct extent_buffer *leaf;
|
|
|
|
struct btrfs_key key;
|
|
|
|
|
2016-06-21 09:52:41 -04:00
|
|
|
if (btrfs_is_testing(quota_root->fs_info))
|
2014-05-07 17:06:09 -04:00
|
|
|
return 0;
|
2014-09-29 23:53:21 +02:00
|
|
|
|
2012-06-28 18:03:02 +02:00
|
|
|
path = btrfs_alloc_path();
|
|
|
|
if (!path)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
key.objectid = 0;
|
|
|
|
key.type = BTRFS_QGROUP_INFO_KEY;
|
|
|
|
key.offset = qgroupid;
|
|
|
|
|
2014-08-18 14:01:17 -07:00
|
|
|
/*
|
|
|
|
* Avoid a transaction abort by catching -EEXIST here. In that
|
|
|
|
* case, we proceed by re-initializing the existing structure
|
|
|
|
* on disk.
|
|
|
|
*/
|
|
|
|
|
2012-06-28 18:03:02 +02:00
|
|
|
ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
|
|
|
|
sizeof(*qgroup_info));
|
2014-08-18 14:01:17 -07:00
|
|
|
if (ret && ret != -EEXIST)
|
2012-06-28 18:03:02 +02:00
|
|
|
goto out;
|
|
|
|
|
|
|
|
leaf = path->nodes[0];
|
|
|
|
qgroup_info = btrfs_item_ptr(leaf, path->slots[0],
|
|
|
|
struct btrfs_qgroup_info_item);
|
|
|
|
btrfs_set_qgroup_info_generation(leaf, qgroup_info, trans->transid);
|
|
|
|
btrfs_set_qgroup_info_rfer(leaf, qgroup_info, 0);
|
|
|
|
btrfs_set_qgroup_info_rfer_cmpr(leaf, qgroup_info, 0);
|
|
|
|
btrfs_set_qgroup_info_excl(leaf, qgroup_info, 0);
|
|
|
|
btrfs_set_qgroup_info_excl_cmpr(leaf, qgroup_info, 0);
|
|
|
|
|
|
|
|
btrfs_mark_buffer_dirty(leaf);
|
|
|
|
|
|
|
|
btrfs_release_path(path);
|
|
|
|
|
|
|
|
key.type = BTRFS_QGROUP_LIMIT_KEY;
|
|
|
|
ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
|
|
|
|
sizeof(*qgroup_limit));
|
2014-08-18 14:01:17 -07:00
|
|
|
if (ret && ret != -EEXIST)
|
2012-06-28 18:03:02 +02:00
|
|
|
goto out;
|
|
|
|
|
|
|
|
leaf = path->nodes[0];
|
|
|
|
qgroup_limit = btrfs_item_ptr(leaf, path->slots[0],
|
|
|
|
struct btrfs_qgroup_limit_item);
|
|
|
|
btrfs_set_qgroup_limit_flags(leaf, qgroup_limit, 0);
|
|
|
|
btrfs_set_qgroup_limit_max_rfer(leaf, qgroup_limit, 0);
|
|
|
|
btrfs_set_qgroup_limit_max_excl(leaf, qgroup_limit, 0);
|
|
|
|
btrfs_set_qgroup_limit_rsv_rfer(leaf, qgroup_limit, 0);
|
|
|
|
btrfs_set_qgroup_limit_rsv_excl(leaf, qgroup_limit, 0);
|
|
|
|
|
|
|
|
btrfs_mark_buffer_dirty(leaf);
|
|
|
|
|
|
|
|
ret = 0;
|
|
|
|
out:
|
|
|
|
btrfs_free_path(path);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2018-07-18 14:45:26 +08:00
|
|
|
static int del_qgroup_item(struct btrfs_trans_handle *trans, u64 qgroupid)
|
2012-06-28 18:03:02 +02:00
|
|
|
{
|
|
|
|
int ret;
|
2018-07-18 14:45:26 +08:00
|
|
|
struct btrfs_root *quota_root = trans->fs_info->quota_root;
|
2012-06-28 18:03:02 +02:00
|
|
|
struct btrfs_path *path;
|
|
|
|
struct btrfs_key key;
|
|
|
|
|
|
|
|
path = btrfs_alloc_path();
|
|
|
|
if (!path)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
key.objectid = 0;
|
|
|
|
key.type = BTRFS_QGROUP_INFO_KEY;
|
|
|
|
key.offset = qgroupid;
|
|
|
|
ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
if (ret > 0) {
|
|
|
|
ret = -ENOENT;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = btrfs_del_item(trans, quota_root, path);
|
|
|
|
if (ret)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
btrfs_release_path(path);
|
|
|
|
|
|
|
|
key.type = BTRFS_QGROUP_LIMIT_KEY;
|
|
|
|
ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
if (ret > 0) {
|
|
|
|
ret = -ENOENT;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = btrfs_del_item(trans, quota_root, path);
|
|
|
|
|
|
|
|
out:
|
|
|
|
btrfs_free_path(path);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int update_qgroup_limit_item(struct btrfs_trans_handle *trans,
|
2014-11-20 21:01:41 -05:00
|
|
|
struct btrfs_qgroup *qgroup)
|
2012-06-28 18:03:02 +02:00
|
|
|
{
|
2018-07-18 14:45:27 +08:00
|
|
|
struct btrfs_root *quota_root = trans->fs_info->quota_root;
|
2012-06-28 18:03:02 +02:00
|
|
|
struct btrfs_path *path;
|
|
|
|
struct btrfs_key key;
|
|
|
|
struct extent_buffer *l;
|
|
|
|
struct btrfs_qgroup_limit_item *qgroup_limit;
|
|
|
|
int ret;
|
|
|
|
int slot;
|
|
|
|
|
|
|
|
key.objectid = 0;
|
|
|
|
key.type = BTRFS_QGROUP_LIMIT_KEY;
|
2014-11-20 21:01:41 -05:00
|
|
|
key.offset = qgroup->qgroupid;
|
2012-06-28 18:03:02 +02:00
|
|
|
|
|
|
|
path = btrfs_alloc_path();
|
2013-02-27 11:20:56 +00:00
|
|
|
if (!path)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
2018-07-18 14:45:27 +08:00
|
|
|
ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1);
|
2012-06-28 18:03:02 +02:00
|
|
|
if (ret > 0)
|
|
|
|
ret = -ENOENT;
|
|
|
|
|
|
|
|
if (ret)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
l = path->nodes[0];
|
|
|
|
slot = path->slots[0];
|
2013-11-04 22:34:29 +01:00
|
|
|
qgroup_limit = btrfs_item_ptr(l, slot, struct btrfs_qgroup_limit_item);
|
2014-11-20 21:01:41 -05:00
|
|
|
btrfs_set_qgroup_limit_flags(l, qgroup_limit, qgroup->lim_flags);
|
|
|
|
btrfs_set_qgroup_limit_max_rfer(l, qgroup_limit, qgroup->max_rfer);
|
|
|
|
btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, qgroup->max_excl);
|
|
|
|
btrfs_set_qgroup_limit_rsv_rfer(l, qgroup_limit, qgroup->rsv_rfer);
|
|
|
|
btrfs_set_qgroup_limit_rsv_excl(l, qgroup_limit, qgroup->rsv_excl);
|
2012-06-28 18:03:02 +02:00
|
|
|
|
|
|
|
btrfs_mark_buffer_dirty(l);
|
|
|
|
|
|
|
|
out:
|
|
|
|
btrfs_free_path(path);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int update_qgroup_info_item(struct btrfs_trans_handle *trans,
|
|
|
|
struct btrfs_qgroup *qgroup)
|
|
|
|
{
|
2018-07-18 14:45:28 +08:00
|
|
|
struct btrfs_fs_info *fs_info = trans->fs_info;
|
|
|
|
struct btrfs_root *quota_root = fs_info->quota_root;
|
2012-06-28 18:03:02 +02:00
|
|
|
struct btrfs_path *path;
|
|
|
|
struct btrfs_key key;
|
|
|
|
struct extent_buffer *l;
|
|
|
|
struct btrfs_qgroup_info_item *qgroup_info;
|
|
|
|
int ret;
|
|
|
|
int slot;
|
|
|
|
|
2018-07-18 14:45:28 +08:00
|
|
|
if (btrfs_is_testing(fs_info))
|
2014-05-07 17:06:09 -04:00
|
|
|
return 0;
|
2014-09-29 23:53:21 +02:00
|
|
|
|
2012-06-28 18:03:02 +02:00
|
|
|
key.objectid = 0;
|
|
|
|
key.type = BTRFS_QGROUP_INFO_KEY;
|
|
|
|
key.offset = qgroup->qgroupid;
|
|
|
|
|
|
|
|
path = btrfs_alloc_path();
|
2013-02-27 11:20:56 +00:00
|
|
|
if (!path)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
2018-07-18 14:45:28 +08:00
|
|
|
ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1);
|
2012-06-28 18:03:02 +02:00
|
|
|
if (ret > 0)
|
|
|
|
ret = -ENOENT;
|
|
|
|
|
|
|
|
if (ret)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
l = path->nodes[0];
|
|
|
|
slot = path->slots[0];
|
2013-11-04 22:34:29 +01:00
|
|
|
qgroup_info = btrfs_item_ptr(l, slot, struct btrfs_qgroup_info_item);
|
2012-06-28 18:03:02 +02:00
|
|
|
btrfs_set_qgroup_info_generation(l, qgroup_info, trans->transid);
|
|
|
|
btrfs_set_qgroup_info_rfer(l, qgroup_info, qgroup->rfer);
|
|
|
|
btrfs_set_qgroup_info_rfer_cmpr(l, qgroup_info, qgroup->rfer_cmpr);
|
|
|
|
btrfs_set_qgroup_info_excl(l, qgroup_info, qgroup->excl);
|
|
|
|
btrfs_set_qgroup_info_excl_cmpr(l, qgroup_info, qgroup->excl_cmpr);
|
|
|
|
|
|
|
|
btrfs_mark_buffer_dirty(l);
|
|
|
|
|
|
|
|
out:
|
|
|
|
btrfs_free_path(path);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2018-07-18 14:45:29 +08:00
|
|
|
static int update_qgroup_status_item(struct btrfs_trans_handle *trans)
|
2012-06-28 18:03:02 +02:00
|
|
|
{
|
2018-07-18 14:45:29 +08:00
|
|
|
struct btrfs_fs_info *fs_info = trans->fs_info;
|
|
|
|
struct btrfs_root *quota_root = fs_info->quota_root;
|
2012-06-28 18:03:02 +02:00
|
|
|
struct btrfs_path *path;
|
|
|
|
struct btrfs_key key;
|
|
|
|
struct extent_buffer *l;
|
|
|
|
struct btrfs_qgroup_status_item *ptr;
|
|
|
|
int ret;
|
|
|
|
int slot;
|
|
|
|
|
|
|
|
key.objectid = 0;
|
|
|
|
key.type = BTRFS_QGROUP_STATUS_KEY;
|
|
|
|
key.offset = 0;
|
|
|
|
|
|
|
|
path = btrfs_alloc_path();
|
2013-02-27 11:20:56 +00:00
|
|
|
if (!path)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
2018-07-18 14:45:29 +08:00
|
|
|
ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1);
|
2012-06-28 18:03:02 +02:00
|
|
|
if (ret > 0)
|
|
|
|
ret = -ENOENT;
|
|
|
|
|
|
|
|
if (ret)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
l = path->nodes[0];
|
|
|
|
slot = path->slots[0];
|
|
|
|
ptr = btrfs_item_ptr(l, slot, struct btrfs_qgroup_status_item);
|
|
|
|
btrfs_set_qgroup_status_flags(l, ptr, fs_info->qgroup_flags);
|
|
|
|
btrfs_set_qgroup_status_generation(l, ptr, trans->transid);
|
2013-04-25 16:04:51 +00:00
|
|
|
btrfs_set_qgroup_status_rescan(l, ptr,
|
|
|
|
fs_info->qgroup_rescan_progress.objectid);
|
2012-06-28 18:03:02 +02:00
|
|
|
|
|
|
|
btrfs_mark_buffer_dirty(l);
|
|
|
|
|
|
|
|
out:
|
|
|
|
btrfs_free_path(path);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* called with qgroup_lock held
|
|
|
|
*/
|
|
|
|
static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans,
|
|
|
|
struct btrfs_root *root)
|
|
|
|
{
|
|
|
|
struct btrfs_path *path;
|
|
|
|
struct btrfs_key key;
|
2013-02-27 11:16:57 +00:00
|
|
|
struct extent_buffer *leaf = NULL;
|
2012-06-28 18:03:02 +02:00
|
|
|
int ret;
|
2013-02-27 11:16:57 +00:00
|
|
|
int nr = 0;
|
2012-06-28 18:03:02 +02:00
|
|
|
|
|
|
|
path = btrfs_alloc_path();
|
|
|
|
if (!path)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
2013-02-27 11:16:57 +00:00
|
|
|
path->leave_spinning = 1;
|
|
|
|
|
|
|
|
key.objectid = 0;
|
|
|
|
key.offset = 0;
|
|
|
|
key.type = 0;
|
2012-06-28 18:03:02 +02:00
|
|
|
|
2013-02-27 11:16:57 +00:00
|
|
|
while (1) {
|
2012-06-28 18:03:02 +02:00
|
|
|
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
|
2013-02-27 11:16:57 +00:00
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
leaf = path->nodes[0];
|
|
|
|
nr = btrfs_header_nritems(leaf);
|
|
|
|
if (!nr)
|
2012-06-28 18:03:02 +02:00
|
|
|
break;
|
2013-02-27 11:16:57 +00:00
|
|
|
/*
|
|
|
|
* delete the leaf one by one
|
|
|
|
* since the whole tree is going
|
|
|
|
* to be deleted.
|
|
|
|
*/
|
|
|
|
path->slots[0] = 0;
|
|
|
|
ret = btrfs_del_items(trans, root, path, 0, nr);
|
2012-06-28 18:03:02 +02:00
|
|
|
if (ret)
|
|
|
|
goto out;
|
2013-02-27 11:16:57 +00:00
|
|
|
|
2012-06-28 18:03:02 +02:00
|
|
|
btrfs_release_path(path);
|
|
|
|
}
|
|
|
|
ret = 0;
|
|
|
|
out:
|
|
|
|
btrfs_free_path(path);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2018-07-05 14:50:48 +03:00
|
|
|
int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
|
2012-06-28 18:03:02 +02:00
|
|
|
{
|
|
|
|
struct btrfs_root *quota_root;
|
2013-04-07 10:24:57 +00:00
|
|
|
struct btrfs_root *tree_root = fs_info->tree_root;
|
2012-06-28 18:03:02 +02:00
|
|
|
struct btrfs_path *path = NULL;
|
|
|
|
struct btrfs_qgroup_status_item *ptr;
|
|
|
|
struct extent_buffer *leaf;
|
|
|
|
struct btrfs_key key;
|
2013-04-07 10:24:57 +00:00
|
|
|
struct btrfs_key found_key;
|
|
|
|
struct btrfs_qgroup *qgroup = NULL;
|
2018-07-05 14:50:48 +03:00
|
|
|
struct btrfs_trans_handle *trans = NULL;
|
2012-06-28 18:03:02 +02:00
|
|
|
int ret = 0;
|
2013-04-07 10:24:57 +00:00
|
|
|
int slot;
|
2012-06-28 18:03:02 +02:00
|
|
|
|
2013-04-07 10:50:16 +00:00
|
|
|
mutex_lock(&fs_info->qgroup_ioctl_lock);
|
2018-01-31 10:52:04 +02:00
|
|
|
if (fs_info->quota_root)
|
2012-06-28 18:03:02 +02:00
|
|
|
goto out;
|
|
|
|
|
2018-12-19 19:47:37 +01:00
|
|
|
fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL);
|
|
|
|
if (!fs_info->qgroup_ulist) {
|
|
|
|
ret = -ENOMEM;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2020-06-28 13:07:15 +08:00
|
|
|
ret = btrfs_sysfs_add_qgroups(fs_info);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
2018-07-05 14:50:48 +03:00
|
|
|
/*
|
|
|
|
* 1 for quota root item
|
|
|
|
* 1 for BTRFS_QGROUP_STATUS item
|
|
|
|
*
|
|
|
|
* Yet we also need 2*n items for a QGROUP_INFO/QGROUP_LIMIT items
|
|
|
|
* per subvolume. However those are not currently reserved since it
|
|
|
|
* would be a lot of overkill.
|
|
|
|
*/
|
|
|
|
trans = btrfs_start_transaction(tree_root, 2);
|
|
|
|
if (IS_ERR(trans)) {
|
|
|
|
ret = PTR_ERR(trans);
|
|
|
|
trans = NULL;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2012-06-28 18:03:02 +02:00
|
|
|
/*
|
|
|
|
* initially create the quota tree
|
|
|
|
*/
|
2019-03-20 13:20:49 +01:00
|
|
|
quota_root = btrfs_create_tree(trans, BTRFS_QUOTA_TREE_OBJECTID);
|
2012-06-28 18:03:02 +02:00
|
|
|
if (IS_ERR(quota_root)) {
|
|
|
|
ret = PTR_ERR(quota_root);
|
2018-07-05 14:50:48 +03:00
|
|
|
btrfs_abort_transaction(trans, ret);
|
2012-06-28 18:03:02 +02:00
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
path = btrfs_alloc_path();
|
2012-10-16 05:44:21 +00:00
|
|
|
if (!path) {
|
|
|
|
ret = -ENOMEM;
|
2018-07-05 14:50:48 +03:00
|
|
|
btrfs_abort_transaction(trans, ret);
|
2012-10-16 05:44:21 +00:00
|
|
|
goto out_free_root;
|
|
|
|
}
|
2012-06-28 18:03:02 +02:00
|
|
|
|
|
|
|
key.objectid = 0;
|
|
|
|
key.type = BTRFS_QGROUP_STATUS_KEY;
|
|
|
|
key.offset = 0;
|
|
|
|
|
|
|
|
ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
|
|
|
|
sizeof(*ptr));
|
2018-07-05 14:50:48 +03:00
|
|
|
if (ret) {
|
|
|
|
btrfs_abort_transaction(trans, ret);
|
2012-10-16 05:44:21 +00:00
|
|
|
goto out_free_path;
|
2018-07-05 14:50:48 +03:00
|
|
|
}
|
2012-06-28 18:03:02 +02:00
|
|
|
|
|
|
|
leaf = path->nodes[0];
|
|
|
|
ptr = btrfs_item_ptr(leaf, path->slots[0],
|
|
|
|
struct btrfs_qgroup_status_item);
|
|
|
|
btrfs_set_qgroup_status_generation(leaf, ptr, trans->transid);
|
|
|
|
btrfs_set_qgroup_status_version(leaf, ptr, BTRFS_QGROUP_STATUS_VERSION);
|
|
|
|
fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON |
|
|
|
|
BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
|
|
|
|
btrfs_set_qgroup_status_flags(leaf, ptr, fs_info->qgroup_flags);
|
2013-04-25 16:04:51 +00:00
|
|
|
btrfs_set_qgroup_status_rescan(leaf, ptr, 0);
|
2012-06-28 18:03:02 +02:00
|
|
|
|
|
|
|
btrfs_mark_buffer_dirty(leaf);
|
|
|
|
|
2013-04-07 10:24:57 +00:00
|
|
|
key.objectid = 0;
|
|
|
|
key.type = BTRFS_ROOT_REF_KEY;
|
|
|
|
key.offset = 0;
|
|
|
|
|
|
|
|
btrfs_release_path(path);
|
|
|
|
ret = btrfs_search_slot_for_read(tree_root, &key, path, 1, 0);
|
|
|
|
if (ret > 0)
|
|
|
|
goto out_add_root;
|
2018-07-05 14:50:48 +03:00
|
|
|
if (ret < 0) {
|
|
|
|
btrfs_abort_transaction(trans, ret);
|
2013-04-07 10:24:57 +00:00
|
|
|
goto out_free_path;
|
2018-07-05 14:50:48 +03:00
|
|
|
}
|
2013-04-07 10:24:57 +00:00
|
|
|
|
|
|
|
while (1) {
|
|
|
|
slot = path->slots[0];
|
|
|
|
leaf = path->nodes[0];
|
|
|
|
btrfs_item_key_to_cpu(leaf, &found_key, slot);
|
|
|
|
|
|
|
|
if (found_key.type == BTRFS_ROOT_REF_KEY) {
|
btrfs: drop the path before adding qgroup items when enabling qgroups
When enabling qgroups we walk the tree_root and then add a qgroup item
for every root that we have. This creates a lock dependency on the
tree_root and qgroup_root, which results in the following lockdep splat
(with tree locks using rwsem), eg. in tests btrfs/017 or btrfs/022:
======================================================
WARNING: possible circular locking dependency detected
5.9.0-default+ #1299 Not tainted
------------------------------------------------------
btrfs/24552 is trying to acquire lock:
ffff9142dfc5f630 (btrfs-quota-00){++++}-{3:3}, at: __btrfs_tree_read_lock+0x35/0x1c0 [btrfs]
but task is already holding lock:
ffff9142dfc5d0b0 (btrfs-root-00){++++}-{3:3}, at: __btrfs_tree_read_lock+0x35/0x1c0 [btrfs]
which lock already depends on the new lock.
the existing dependency chain (in reverse order) is:
-> #1 (btrfs-root-00){++++}-{3:3}:
__lock_acquire+0x3fb/0x730
lock_acquire.part.0+0x6a/0x130
down_read_nested+0x46/0x130
__btrfs_tree_read_lock+0x35/0x1c0 [btrfs]
__btrfs_read_lock_root_node+0x3a/0x50 [btrfs]
btrfs_search_slot_get_root+0x11d/0x290 [btrfs]
btrfs_search_slot+0xc3/0x9f0 [btrfs]
btrfs_insert_item+0x6e/0x140 [btrfs]
btrfs_create_tree+0x1cb/0x240 [btrfs]
btrfs_quota_enable+0xcd/0x790 [btrfs]
btrfs_ioctl_quota_ctl+0xc9/0xe0 [btrfs]
__x64_sys_ioctl+0x83/0xa0
do_syscall_64+0x2d/0x70
entry_SYSCALL_64_after_hwframe+0x44/0xa9
-> #0 (btrfs-quota-00){++++}-{3:3}:
check_prev_add+0x91/0xc30
validate_chain+0x491/0x750
__lock_acquire+0x3fb/0x730
lock_acquire.part.0+0x6a/0x130
down_read_nested+0x46/0x130
__btrfs_tree_read_lock+0x35/0x1c0 [btrfs]
__btrfs_read_lock_root_node+0x3a/0x50 [btrfs]
btrfs_search_slot_get_root+0x11d/0x290 [btrfs]
btrfs_search_slot+0xc3/0x9f0 [btrfs]
btrfs_insert_empty_items+0x58/0xa0 [btrfs]
add_qgroup_item.part.0+0x72/0x210 [btrfs]
btrfs_quota_enable+0x3bb/0x790 [btrfs]
btrfs_ioctl_quota_ctl+0xc9/0xe0 [btrfs]
__x64_sys_ioctl+0x83/0xa0
do_syscall_64+0x2d/0x70
entry_SYSCALL_64_after_hwframe+0x44/0xa9
other info that might help us debug this:
Possible unsafe locking scenario:
CPU0 CPU1
---- ----
lock(btrfs-root-00);
lock(btrfs-quota-00);
lock(btrfs-root-00);
lock(btrfs-quota-00);
*** DEADLOCK ***
5 locks held by btrfs/24552:
#0: ffff9142df431478 (sb_writers#10){.+.+}-{0:0}, at: mnt_want_write_file+0x22/0xa0
#1: ffff9142f9b10cc0 (&fs_info->subvol_sem){++++}-{3:3}, at: btrfs_ioctl_quota_ctl+0x7b/0xe0 [btrfs]
#2: ffff9142f9b11a08 (&fs_info->qgroup_ioctl_lock){+.+.}-{3:3}, at: btrfs_quota_enable+0x3b/0x790 [btrfs]
#3: ffff9142df431698 (sb_internal#2){.+.+}-{0:0}, at: start_transaction+0x406/0x510 [btrfs]
#4: ffff9142dfc5d0b0 (btrfs-root-00){++++}-{3:3}, at: __btrfs_tree_read_lock+0x35/0x1c0 [btrfs]
stack backtrace:
CPU: 1 PID: 24552 Comm: btrfs Not tainted 5.9.0-default+ #1299
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba527-rebuilt.opensuse.org 04/01/2014
Call Trace:
dump_stack+0x77/0x97
check_noncircular+0xf3/0x110
check_prev_add+0x91/0xc30
validate_chain+0x491/0x750
__lock_acquire+0x3fb/0x730
lock_acquire.part.0+0x6a/0x130
? __btrfs_tree_read_lock+0x35/0x1c0 [btrfs]
? lock_acquire+0xc4/0x140
? __btrfs_tree_read_lock+0x35/0x1c0 [btrfs]
down_read_nested+0x46/0x130
? __btrfs_tree_read_lock+0x35/0x1c0 [btrfs]
__btrfs_tree_read_lock+0x35/0x1c0 [btrfs]
? btrfs_root_node+0xd9/0x200 [btrfs]
__btrfs_read_lock_root_node+0x3a/0x50 [btrfs]
btrfs_search_slot_get_root+0x11d/0x290 [btrfs]
btrfs_search_slot+0xc3/0x9f0 [btrfs]
btrfs_insert_empty_items+0x58/0xa0 [btrfs]
add_qgroup_item.part.0+0x72/0x210 [btrfs]
btrfs_quota_enable+0x3bb/0x790 [btrfs]
btrfs_ioctl_quota_ctl+0xc9/0xe0 [btrfs]
__x64_sys_ioctl+0x83/0xa0
do_syscall_64+0x2d/0x70
entry_SYSCALL_64_after_hwframe+0x44/0xa9
Fix this by dropping the path whenever we find a root item, add the
qgroup item, and then re-lookup the root item we found and continue
processing roots.
Reported-by: David Sterba <dsterba@suse.com>
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-10-19 16:02:29 -04:00
|
|
|
|
|
|
|
/* Release locks on tree_root before we access quota_root */
|
|
|
|
btrfs_release_path(path);
|
|
|
|
|
2013-04-07 10:24:57 +00:00
|
|
|
ret = add_qgroup_item(trans, quota_root,
|
|
|
|
found_key.offset);
|
2018-07-05 14:50:48 +03:00
|
|
|
if (ret) {
|
|
|
|
btrfs_abort_transaction(trans, ret);
|
2013-04-07 10:24:57 +00:00
|
|
|
goto out_free_path;
|
2018-07-05 14:50:48 +03:00
|
|
|
}
|
2013-04-07 10:24:57 +00:00
|
|
|
|
|
|
|
qgroup = add_qgroup_rb(fs_info, found_key.offset);
|
|
|
|
if (IS_ERR(qgroup)) {
|
|
|
|
ret = PTR_ERR(qgroup);
|
2018-07-05 14:50:48 +03:00
|
|
|
btrfs_abort_transaction(trans, ret);
|
2013-04-07 10:24:57 +00:00
|
|
|
goto out_free_path;
|
|
|
|
}
|
2020-06-28 13:07:15 +08:00
|
|
|
ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
|
|
|
|
if (ret < 0) {
|
|
|
|
btrfs_abort_transaction(trans, ret);
|
|
|
|
goto out_free_path;
|
|
|
|
}
|
btrfs: drop the path before adding qgroup items when enabling qgroups
When enabling qgroups we walk the tree_root and then add a qgroup item
for every root that we have. This creates a lock dependency on the
tree_root and qgroup_root, which results in the following lockdep splat
(with tree locks using rwsem), eg. in tests btrfs/017 or btrfs/022:
======================================================
WARNING: possible circular locking dependency detected
5.9.0-default+ #1299 Not tainted
------------------------------------------------------
btrfs/24552 is trying to acquire lock:
ffff9142dfc5f630 (btrfs-quota-00){++++}-{3:3}, at: __btrfs_tree_read_lock+0x35/0x1c0 [btrfs]
but task is already holding lock:
ffff9142dfc5d0b0 (btrfs-root-00){++++}-{3:3}, at: __btrfs_tree_read_lock+0x35/0x1c0 [btrfs]
which lock already depends on the new lock.
the existing dependency chain (in reverse order) is:
-> #1 (btrfs-root-00){++++}-{3:3}:
__lock_acquire+0x3fb/0x730
lock_acquire.part.0+0x6a/0x130
down_read_nested+0x46/0x130
__btrfs_tree_read_lock+0x35/0x1c0 [btrfs]
__btrfs_read_lock_root_node+0x3a/0x50 [btrfs]
btrfs_search_slot_get_root+0x11d/0x290 [btrfs]
btrfs_search_slot+0xc3/0x9f0 [btrfs]
btrfs_insert_item+0x6e/0x140 [btrfs]
btrfs_create_tree+0x1cb/0x240 [btrfs]
btrfs_quota_enable+0xcd/0x790 [btrfs]
btrfs_ioctl_quota_ctl+0xc9/0xe0 [btrfs]
__x64_sys_ioctl+0x83/0xa0
do_syscall_64+0x2d/0x70
entry_SYSCALL_64_after_hwframe+0x44/0xa9
-> #0 (btrfs-quota-00){++++}-{3:3}:
check_prev_add+0x91/0xc30
validate_chain+0x491/0x750
__lock_acquire+0x3fb/0x730
lock_acquire.part.0+0x6a/0x130
down_read_nested+0x46/0x130
__btrfs_tree_read_lock+0x35/0x1c0 [btrfs]
__btrfs_read_lock_root_node+0x3a/0x50 [btrfs]
btrfs_search_slot_get_root+0x11d/0x290 [btrfs]
btrfs_search_slot+0xc3/0x9f0 [btrfs]
btrfs_insert_empty_items+0x58/0xa0 [btrfs]
add_qgroup_item.part.0+0x72/0x210 [btrfs]
btrfs_quota_enable+0x3bb/0x790 [btrfs]
btrfs_ioctl_quota_ctl+0xc9/0xe0 [btrfs]
__x64_sys_ioctl+0x83/0xa0
do_syscall_64+0x2d/0x70
entry_SYSCALL_64_after_hwframe+0x44/0xa9
other info that might help us debug this:
Possible unsafe locking scenario:
CPU0 CPU1
---- ----
lock(btrfs-root-00);
lock(btrfs-quota-00);
lock(btrfs-root-00);
lock(btrfs-quota-00);
*** DEADLOCK ***
5 locks held by btrfs/24552:
#0: ffff9142df431478 (sb_writers#10){.+.+}-{0:0}, at: mnt_want_write_file+0x22/0xa0
#1: ffff9142f9b10cc0 (&fs_info->subvol_sem){++++}-{3:3}, at: btrfs_ioctl_quota_ctl+0x7b/0xe0 [btrfs]
#2: ffff9142f9b11a08 (&fs_info->qgroup_ioctl_lock){+.+.}-{3:3}, at: btrfs_quota_enable+0x3b/0x790 [btrfs]
#3: ffff9142df431698 (sb_internal#2){.+.+}-{0:0}, at: start_transaction+0x406/0x510 [btrfs]
#4: ffff9142dfc5d0b0 (btrfs-root-00){++++}-{3:3}, at: __btrfs_tree_read_lock+0x35/0x1c0 [btrfs]
stack backtrace:
CPU: 1 PID: 24552 Comm: btrfs Not tainted 5.9.0-default+ #1299
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba527-rebuilt.opensuse.org 04/01/2014
Call Trace:
dump_stack+0x77/0x97
check_noncircular+0xf3/0x110
check_prev_add+0x91/0xc30
validate_chain+0x491/0x750
__lock_acquire+0x3fb/0x730
lock_acquire.part.0+0x6a/0x130
? __btrfs_tree_read_lock+0x35/0x1c0 [btrfs]
? lock_acquire+0xc4/0x140
? __btrfs_tree_read_lock+0x35/0x1c0 [btrfs]
down_read_nested+0x46/0x130
? __btrfs_tree_read_lock+0x35/0x1c0 [btrfs]
__btrfs_tree_read_lock+0x35/0x1c0 [btrfs]
? btrfs_root_node+0xd9/0x200 [btrfs]
__btrfs_read_lock_root_node+0x3a/0x50 [btrfs]
btrfs_search_slot_get_root+0x11d/0x290 [btrfs]
btrfs_search_slot+0xc3/0x9f0 [btrfs]
btrfs_insert_empty_items+0x58/0xa0 [btrfs]
add_qgroup_item.part.0+0x72/0x210 [btrfs]
btrfs_quota_enable+0x3bb/0x790 [btrfs]
btrfs_ioctl_quota_ctl+0xc9/0xe0 [btrfs]
__x64_sys_ioctl+0x83/0xa0
do_syscall_64+0x2d/0x70
entry_SYSCALL_64_after_hwframe+0x44/0xa9
Fix this by dropping the path whenever we find a root item, add the
qgroup item, and then re-lookup the root item we found and continue
processing roots.
Reported-by: David Sterba <dsterba@suse.com>
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-10-19 16:02:29 -04:00
|
|
|
ret = btrfs_search_slot_for_read(tree_root, &found_key,
|
|
|
|
path, 1, 0);
|
|
|
|
if (ret < 0) {
|
|
|
|
btrfs_abort_transaction(trans, ret);
|
|
|
|
goto out_free_path;
|
|
|
|
}
|
|
|
|
if (ret > 0) {
|
|
|
|
/*
|
|
|
|
* Shouldn't happen, but in case it does we
|
|
|
|
* don't need to do the btrfs_next_item, just
|
|
|
|
* continue.
|
|
|
|
*/
|
|
|
|
continue;
|
|
|
|
}
|
2013-04-07 10:24:57 +00:00
|
|
|
}
|
|
|
|
ret = btrfs_next_item(tree_root, path);
|
2018-07-05 14:50:48 +03:00
|
|
|
if (ret < 0) {
|
|
|
|
btrfs_abort_transaction(trans, ret);
|
2013-04-07 10:24:57 +00:00
|
|
|
goto out_free_path;
|
2018-07-05 14:50:48 +03:00
|
|
|
}
|
2013-04-07 10:24:57 +00:00
|
|
|
if (ret)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
out_add_root:
|
|
|
|
btrfs_release_path(path);
|
|
|
|
ret = add_qgroup_item(trans, quota_root, BTRFS_FS_TREE_OBJECTID);
|
2018-07-05 14:50:48 +03:00
|
|
|
if (ret) {
|
|
|
|
btrfs_abort_transaction(trans, ret);
|
2013-04-07 10:24:57 +00:00
|
|
|
goto out_free_path;
|
2018-07-05 14:50:48 +03:00
|
|
|
}
|
2013-04-07 10:24:57 +00:00
|
|
|
|
|
|
|
qgroup = add_qgroup_rb(fs_info, BTRFS_FS_TREE_OBJECTID);
|
|
|
|
if (IS_ERR(qgroup)) {
|
|
|
|
ret = PTR_ERR(qgroup);
|
2018-07-05 14:50:48 +03:00
|
|
|
btrfs_abort_transaction(trans, ret);
|
2013-04-07 10:24:57 +00:00
|
|
|
goto out_free_path;
|
|
|
|
}
|
2020-06-28 13:07:15 +08:00
|
|
|
ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
|
|
|
|
if (ret < 0) {
|
|
|
|
btrfs_abort_transaction(trans, ret);
|
|
|
|
goto out_free_path;
|
|
|
|
}
|
2018-07-05 14:50:48 +03:00
|
|
|
|
|
|
|
ret = btrfs_commit_transaction(trans);
|
2018-08-20 11:25:33 +03:00
|
|
|
trans = NULL;
|
|
|
|
if (ret)
|
2018-07-05 14:50:48 +03:00
|
|
|
goto out_free_path;
|
|
|
|
|
2018-11-19 14:15:36 +00:00
|
|
|
/*
|
|
|
|
* Set quota enabled flag after committing the transaction, to avoid
|
|
|
|
* deadlocks on fs_info->qgroup_ioctl_lock with concurrent snapshot
|
|
|
|
* creation.
|
|
|
|
*/
|
|
|
|
spin_lock(&fs_info->qgroup_lock);
|
|
|
|
fs_info->quota_root = quota_root;
|
|
|
|
set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
|
|
|
|
spin_unlock(&fs_info->qgroup_lock);
|
|
|
|
|
2018-01-31 10:52:04 +02:00
|
|
|
ret = qgroup_rescan_init(fs_info, 0, 1);
|
|
|
|
if (!ret) {
|
|
|
|
qgroup_rescan_zero_tracking(fs_info);
|
2020-02-07 13:38:20 +08:00
|
|
|
fs_info->qgroup_rescan_running = true;
|
2018-01-31 10:52:04 +02:00
|
|
|
btrfs_queue_work(fs_info->qgroup_rescan_workers,
|
|
|
|
&fs_info->qgroup_rescan_work);
|
|
|
|
}
|
|
|
|
|
2012-10-16 05:44:21 +00:00
|
|
|
out_free_path:
|
2012-06-28 18:03:02 +02:00
|
|
|
btrfs_free_path(path);
|
2012-10-16 05:44:21 +00:00
|
|
|
out_free_root:
|
2020-02-14 16:11:42 -05:00
|
|
|
if (ret)
|
2020-01-24 09:33:01 -05:00
|
|
|
btrfs_put_root(quota_root);
|
2012-10-16 05:44:21 +00:00
|
|
|
out:
|
2013-05-28 15:47:23 +00:00
|
|
|
if (ret) {
|
2013-05-06 11:03:27 +00:00
|
|
|
ulist_free(fs_info->qgroup_ulist);
|
2013-05-28 15:47:23 +00:00
|
|
|
fs_info->qgroup_ulist = NULL;
|
2018-07-05 14:50:48 +03:00
|
|
|
if (trans)
|
|
|
|
btrfs_end_transaction(trans);
|
2020-06-28 13:07:15 +08:00
|
|
|
btrfs_sysfs_del_qgroups(fs_info);
|
2013-05-28 15:47:23 +00:00
|
|
|
}
|
2013-04-07 10:50:16 +00:00
|
|
|
mutex_unlock(&fs_info->qgroup_ioctl_lock);
|
2012-06-28 18:03:02 +02:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2018-07-05 14:50:48 +03:00
|
|
|
int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
|
2012-06-28 18:03:02 +02:00
|
|
|
{
|
|
|
|
struct btrfs_root *quota_root;
|
2018-07-05 14:50:48 +03:00
|
|
|
struct btrfs_trans_handle *trans = NULL;
|
2012-06-28 18:03:02 +02:00
|
|
|
int ret = 0;
|
|
|
|
|
2013-04-07 10:50:16 +00:00
|
|
|
mutex_lock(&fs_info->qgroup_ioctl_lock);
|
2013-04-07 10:50:17 +00:00
|
|
|
if (!fs_info->quota_root)
|
2013-04-07 10:50:16 +00:00
|
|
|
goto out;
|
2018-07-05 14:50:48 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* 1 For the root item
|
|
|
|
*
|
|
|
|
* We should also reserve enough items for the quota tree deletion in
|
|
|
|
* btrfs_clean_quota_tree but this is not done.
|
|
|
|
*/
|
|
|
|
trans = btrfs_start_transaction(fs_info->tree_root, 1);
|
|
|
|
if (IS_ERR(trans)) {
|
|
|
|
ret = PTR_ERR(trans);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2016-09-02 15:40:02 -04:00
|
|
|
clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
|
2016-08-08 22:08:06 -04:00
|
|
|
btrfs_qgroup_wait_for_completion(fs_info, false);
|
2015-11-06 10:36:42 -08:00
|
|
|
spin_lock(&fs_info->qgroup_lock);
|
2012-06-28 18:03:02 +02:00
|
|
|
quota_root = fs_info->quota_root;
|
|
|
|
fs_info->quota_root = NULL;
|
2015-02-27 16:24:26 +08:00
|
|
|
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
|
2012-06-28 18:03:02 +02:00
|
|
|
spin_unlock(&fs_info->qgroup_lock);
|
|
|
|
|
2013-08-14 09:13:37 +08:00
|
|
|
btrfs_free_qgroup_config(fs_info);
|
|
|
|
|
2012-06-28 18:03:02 +02:00
|
|
|
ret = btrfs_clean_quota_tree(trans, quota_root);
|
2018-07-05 14:50:48 +03:00
|
|
|
if (ret) {
|
|
|
|
btrfs_abort_transaction(trans, ret);
|
|
|
|
goto end_trans;
|
|
|
|
}
|
2012-06-28 18:03:02 +02:00
|
|
|
|
2018-08-01 11:32:27 +08:00
|
|
|
ret = btrfs_del_root(trans, "a_root->root_key);
|
2018-07-05 14:50:48 +03:00
|
|
|
if (ret) {
|
|
|
|
btrfs_abort_transaction(trans, ret);
|
|
|
|
goto end_trans;
|
|
|
|
}
|
2012-06-28 18:03:02 +02:00
|
|
|
|
|
|
|
list_del("a_root->dirty_list);
|
|
|
|
|
|
|
|
btrfs_tree_lock(quota_root->node);
|
2019-03-20 14:30:02 +01:00
|
|
|
btrfs_clean_tree_block(quota_root->node);
|
2012-06-28 18:03:02 +02:00
|
|
|
btrfs_tree_unlock(quota_root->node);
|
|
|
|
btrfs_free_tree_block(trans, quota_root, quota_root->node, 0, 1);
|
|
|
|
|
2020-01-24 09:33:01 -05:00
|
|
|
btrfs_put_root(quota_root);
|
2018-07-05 14:50:48 +03:00
|
|
|
|
|
|
|
end_trans:
|
|
|
|
ret = btrfs_end_transaction(trans);
|
2012-06-28 18:03:02 +02:00
|
|
|
out:
|
2013-04-07 10:50:16 +00:00
|
|
|
mutex_unlock(&fs_info->qgroup_ioctl_lock);
|
2012-06-28 18:03:02 +02:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2013-04-25 16:04:51 +00:00
|
|
|
static void qgroup_dirty(struct btrfs_fs_info *fs_info,
|
|
|
|
struct btrfs_qgroup *qgroup)
|
2012-06-28 18:03:02 +02:00
|
|
|
{
|
2013-04-25 16:04:51 +00:00
|
|
|
if (list_empty(&qgroup->dirty))
|
|
|
|
list_add(&qgroup->dirty, &fs_info->dirty_qgroups);
|
2012-06-28 18:03:02 +02:00
|
|
|
}
|
|
|
|
|
2015-02-27 16:24:27 +08:00
|
|
|
/*
|
btrfs: qgroup: Fix wrong qgroup reservation update for relationship modification
When modifying qgroup relationship, for qgroup which only owns exclusive
extents, we will go through quick update path.
In this path, we will add/subtract exclusive and reference number for
parent qgroup, since the source (child) qgroup only has exclusive
extents, destination (parent) qgroup will also own or lose those extents
exclusively.
The same should be the same for reservation, since later reservation
adding/releasing will also affect parent qgroup, without the reservation
carried from child, parent will underflow reservation or have dead
reservation which will never be freed.
However original code doesn't do the same thing for reservation.
It handles qgroup reservation quite differently:
It removes qgroup reservation, as it's allocating space from the
reserved qgroup for relationship adding.
But does nothing for qgroup reservation if we're removing a qgroup
relationship.
According to the original code, it looks just like because we're adding
qgroup->rfer, the code assumes we're writing new data, so it's follows
the normal write routine, by reducing qgroup->reserved and adding
qgroup->rfer/excl.
This old behavior is wrong, and should be fixed to follow the same
excl/rfer behavior.
Just fix it by using the correct behavior described above.
Fixes: 31193213f1f9 ("Btrfs: qgroup: Introduce a may_use to account space_info->bytes_may_use.")
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2017-12-12 15:34:26 +08:00
|
|
|
* The easy accounting, we're updating qgroup relationship whose child qgroup
|
|
|
|
* only has exclusive extents.
|
|
|
|
*
|
2018-11-28 12:05:13 +01:00
|
|
|
* In this case, all exclusive extents will also be exclusive for parent, so
|
btrfs: qgroup: Fix wrong qgroup reservation update for relationship modification
When modifying qgroup relationship, for qgroup which only owns exclusive
extents, we will go through quick update path.
In this path, we will add/subtract exclusive and reference number for
parent qgroup, since the source (child) qgroup only has exclusive
extents, destination (parent) qgroup will also own or lose those extents
exclusively.
The same should be the same for reservation, since later reservation
adding/releasing will also affect parent qgroup, without the reservation
carried from child, parent will underflow reservation or have dead
reservation which will never be freed.
However original code doesn't do the same thing for reservation.
It handles qgroup reservation quite differently:
It removes qgroup reservation, as it's allocating space from the
reserved qgroup for relationship adding.
But does nothing for qgroup reservation if we're removing a qgroup
relationship.
According to the original code, it looks just like because we're adding
qgroup->rfer, the code assumes we're writing new data, so it's follows
the normal write routine, by reducing qgroup->reserved and adding
qgroup->rfer/excl.
This old behavior is wrong, and should be fixed to follow the same
excl/rfer behavior.
Just fix it by using the correct behavior described above.
Fixes: 31193213f1f9 ("Btrfs: qgroup: Introduce a may_use to account space_info->bytes_may_use.")
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2017-12-12 15:34:26 +08:00
|
|
|
* excl/rfer just get added/removed.
|
|
|
|
*
|
|
|
|
* So is qgroup reservation space, which should also be added/removed to
|
|
|
|
* parent.
|
|
|
|
* Or when child tries to release reservation space, parent will underflow its
|
|
|
|
* reservation (for relationship adding case).
|
2015-02-27 16:24:27 +08:00
|
|
|
*
|
|
|
|
* Caller should hold fs_info->qgroup_lock.
|
|
|
|
*/
|
|
|
|
static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
|
|
|
|
struct ulist *tmp, u64 ref_root,
|
btrfs: qgroup: Fix wrong qgroup reservation update for relationship modification
When modifying qgroup relationship, for qgroup which only owns exclusive
extents, we will go through quick update path.
In this path, we will add/subtract exclusive and reference number for
parent qgroup, since the source (child) qgroup only has exclusive
extents, destination (parent) qgroup will also own or lose those extents
exclusively.
The same should be the same for reservation, since later reservation
adding/releasing will also affect parent qgroup, without the reservation
carried from child, parent will underflow reservation or have dead
reservation which will never be freed.
However original code doesn't do the same thing for reservation.
It handles qgroup reservation quite differently:
It removes qgroup reservation, as it's allocating space from the
reserved qgroup for relationship adding.
But does nothing for qgroup reservation if we're removing a qgroup
relationship.
According to the original code, it looks just like because we're adding
qgroup->rfer, the code assumes we're writing new data, so it's follows
the normal write routine, by reducing qgroup->reserved and adding
qgroup->rfer/excl.
This old behavior is wrong, and should be fixed to follow the same
excl/rfer behavior.
Just fix it by using the correct behavior described above.
Fixes: 31193213f1f9 ("Btrfs: qgroup: Introduce a may_use to account space_info->bytes_may_use.")
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2017-12-12 15:34:26 +08:00
|
|
|
struct btrfs_qgroup *src, int sign)
|
2015-02-27 16:24:27 +08:00
|
|
|
{
|
|
|
|
struct btrfs_qgroup *qgroup;
|
|
|
|
struct btrfs_qgroup_list *glist;
|
|
|
|
struct ulist_node *unode;
|
|
|
|
struct ulist_iterator uiter;
|
btrfs: qgroup: Fix wrong qgroup reservation update for relationship modification
When modifying qgroup relationship, for qgroup which only owns exclusive
extents, we will go through quick update path.
In this path, we will add/subtract exclusive and reference number for
parent qgroup, since the source (child) qgroup only has exclusive
extents, destination (parent) qgroup will also own or lose those extents
exclusively.
The same should be the same for reservation, since later reservation
adding/releasing will also affect parent qgroup, without the reservation
carried from child, parent will underflow reservation or have dead
reservation which will never be freed.
However original code doesn't do the same thing for reservation.
It handles qgroup reservation quite differently:
It removes qgroup reservation, as it's allocating space from the
reserved qgroup for relationship adding.
But does nothing for qgroup reservation if we're removing a qgroup
relationship.
According to the original code, it looks just like because we're adding
qgroup->rfer, the code assumes we're writing new data, so it's follows
the normal write routine, by reducing qgroup->reserved and adding
qgroup->rfer/excl.
This old behavior is wrong, and should be fixed to follow the same
excl/rfer behavior.
Just fix it by using the correct behavior described above.
Fixes: 31193213f1f9 ("Btrfs: qgroup: Introduce a may_use to account space_info->bytes_may_use.")
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2017-12-12 15:34:26 +08:00
|
|
|
u64 num_bytes = src->excl;
|
2015-02-27 16:24:27 +08:00
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
qgroup = find_qgroup_rb(fs_info, ref_root);
|
|
|
|
if (!qgroup)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
qgroup->rfer += sign * num_bytes;
|
|
|
|
qgroup->rfer_cmpr += sign * num_bytes;
|
|
|
|
|
|
|
|
WARN_ON(sign < 0 && qgroup->excl < num_bytes);
|
|
|
|
qgroup->excl += sign * num_bytes;
|
|
|
|
qgroup->excl_cmpr += sign * num_bytes;
|
btrfs: qgroup: Fix wrong qgroup reservation update for relationship modification
When modifying qgroup relationship, for qgroup which only owns exclusive
extents, we will go through quick update path.
In this path, we will add/subtract exclusive and reference number for
parent qgroup, since the source (child) qgroup only has exclusive
extents, destination (parent) qgroup will also own or lose those extents
exclusively.
The same should be the same for reservation, since later reservation
adding/releasing will also affect parent qgroup, without the reservation
carried from child, parent will underflow reservation or have dead
reservation which will never be freed.
However original code doesn't do the same thing for reservation.
It handles qgroup reservation quite differently:
It removes qgroup reservation, as it's allocating space from the
reserved qgroup for relationship adding.
But does nothing for qgroup reservation if we're removing a qgroup
relationship.
According to the original code, it looks just like because we're adding
qgroup->rfer, the code assumes we're writing new data, so it's follows
the normal write routine, by reducing qgroup->reserved and adding
qgroup->rfer/excl.
This old behavior is wrong, and should be fixed to follow the same
excl/rfer behavior.
Just fix it by using the correct behavior described above.
Fixes: 31193213f1f9 ("Btrfs: qgroup: Introduce a may_use to account space_info->bytes_may_use.")
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2017-12-12 15:34:26 +08:00
|
|
|
|
|
|
|
if (sign > 0)
|
2017-12-12 15:34:27 +08:00
|
|
|
qgroup_rsv_add_by_qgroup(fs_info, qgroup, src);
|
btrfs: qgroup: Fix wrong qgroup reservation update for relationship modification
When modifying qgroup relationship, for qgroup which only owns exclusive
extents, we will go through quick update path.
In this path, we will add/subtract exclusive and reference number for
parent qgroup, since the source (child) qgroup only has exclusive
extents, destination (parent) qgroup will also own or lose those extents
exclusively.
The same should be the same for reservation, since later reservation
adding/releasing will also affect parent qgroup, without the reservation
carried from child, parent will underflow reservation or have dead
reservation which will never be freed.
However original code doesn't do the same thing for reservation.
It handles qgroup reservation quite differently:
It removes qgroup reservation, as it's allocating space from the
reserved qgroup for relationship adding.
But does nothing for qgroup reservation if we're removing a qgroup
relationship.
According to the original code, it looks just like because we're adding
qgroup->rfer, the code assumes we're writing new data, so it's follows
the normal write routine, by reducing qgroup->reserved and adding
qgroup->rfer/excl.
This old behavior is wrong, and should be fixed to follow the same
excl/rfer behavior.
Just fix it by using the correct behavior described above.
Fixes: 31193213f1f9 ("Btrfs: qgroup: Introduce a may_use to account space_info->bytes_may_use.")
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2017-12-12 15:34:26 +08:00
|
|
|
else
|
2017-12-12 15:34:27 +08:00
|
|
|
qgroup_rsv_release_by_qgroup(fs_info, qgroup, src);
|
2015-02-27 16:24:27 +08:00
|
|
|
|
|
|
|
qgroup_dirty(fs_info, qgroup);
|
|
|
|
|
|
|
|
/* Get all of the parent groups that contain this qgroup */
|
|
|
|
list_for_each_entry(glist, &qgroup->groups, next_group) {
|
|
|
|
ret = ulist_add(tmp, glist->group->qgroupid,
|
2016-10-26 16:23:50 +02:00
|
|
|
qgroup_to_aux(glist->group), GFP_ATOMIC);
|
2015-02-27 16:24:27 +08:00
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Iterate all of the parents and adjust their reference counts */
|
|
|
|
ULIST_ITER_INIT(&uiter);
|
|
|
|
while ((unode = ulist_next(tmp, &uiter))) {
|
2016-10-26 16:23:50 +02:00
|
|
|
qgroup = unode_aux_to_qgroup(unode);
|
2015-02-27 16:24:27 +08:00
|
|
|
qgroup->rfer += sign * num_bytes;
|
|
|
|
qgroup->rfer_cmpr += sign * num_bytes;
|
|
|
|
WARN_ON(sign < 0 && qgroup->excl < num_bytes);
|
|
|
|
qgroup->excl += sign * num_bytes;
|
btrfs: qgroup: Fix wrong qgroup reservation update for relationship modification
When modifying qgroup relationship, for qgroup which only owns exclusive
extents, we will go through quick update path.
In this path, we will add/subtract exclusive and reference number for
parent qgroup, since the source (child) qgroup only has exclusive
extents, destination (parent) qgroup will also own or lose those extents
exclusively.
The same should be the same for reservation, since later reservation
adding/releasing will also affect parent qgroup, without the reservation
carried from child, parent will underflow reservation or have dead
reservation which will never be freed.
However original code doesn't do the same thing for reservation.
It handles qgroup reservation quite differently:
It removes qgroup reservation, as it's allocating space from the
reserved qgroup for relationship adding.
But does nothing for qgroup reservation if we're removing a qgroup
relationship.
According to the original code, it looks just like because we're adding
qgroup->rfer, the code assumes we're writing new data, so it's follows
the normal write routine, by reducing qgroup->reserved and adding
qgroup->rfer/excl.
This old behavior is wrong, and should be fixed to follow the same
excl/rfer behavior.
Just fix it by using the correct behavior described above.
Fixes: 31193213f1f9 ("Btrfs: qgroup: Introduce a may_use to account space_info->bytes_may_use.")
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2017-12-12 15:34:26 +08:00
|
|
|
if (sign > 0)
|
2017-12-12 15:34:27 +08:00
|
|
|
qgroup_rsv_add_by_qgroup(fs_info, qgroup, src);
|
btrfs: qgroup: Fix wrong qgroup reservation update for relationship modification
When modifying qgroup relationship, for qgroup which only owns exclusive
extents, we will go through quick update path.
In this path, we will add/subtract exclusive and reference number for
parent qgroup, since the source (child) qgroup only has exclusive
extents, destination (parent) qgroup will also own or lose those extents
exclusively.
The same should be the same for reservation, since later reservation
adding/releasing will also affect parent qgroup, without the reservation
carried from child, parent will underflow reservation or have dead
reservation which will never be freed.
However original code doesn't do the same thing for reservation.
It handles qgroup reservation quite differently:
It removes qgroup reservation, as it's allocating space from the
reserved qgroup for relationship adding.
But does nothing for qgroup reservation if we're removing a qgroup
relationship.
According to the original code, it looks just like because we're adding
qgroup->rfer, the code assumes we're writing new data, so it's follows
the normal write routine, by reducing qgroup->reserved and adding
qgroup->rfer/excl.
This old behavior is wrong, and should be fixed to follow the same
excl/rfer behavior.
Just fix it by using the correct behavior described above.
Fixes: 31193213f1f9 ("Btrfs: qgroup: Introduce a may_use to account space_info->bytes_may_use.")
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2017-12-12 15:34:26 +08:00
|
|
|
else
|
2017-12-12 15:34:27 +08:00
|
|
|
qgroup_rsv_release_by_qgroup(fs_info, qgroup, src);
|
2015-02-27 16:24:27 +08:00
|
|
|
qgroup->excl_cmpr += sign * num_bytes;
|
|
|
|
qgroup_dirty(fs_info, qgroup);
|
|
|
|
|
|
|
|
/* Add any parents of the parents */
|
|
|
|
list_for_each_entry(glist, &qgroup->groups, next_group) {
|
|
|
|
ret = ulist_add(tmp, glist->group->qgroupid,
|
2016-10-26 16:23:50 +02:00
|
|
|
qgroup_to_aux(glist->group), GFP_ATOMIC);
|
2015-02-27 16:24:27 +08:00
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
ret = 0;
|
|
|
|
out:
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Quick path for updating qgroup with only excl refs.
|
|
|
|
*
|
|
|
|
* In that case, just update all parent will be enough.
|
|
|
|
* Or we needs to do a full rescan.
|
|
|
|
* Caller should also hold fs_info->qgroup_lock.
|
|
|
|
*
|
|
|
|
* Return 0 for quick update, return >0 for need to full rescan
|
|
|
|
* and mark INCONSISTENT flag.
|
|
|
|
* Return < 0 for other error.
|
|
|
|
*/
|
|
|
|
static int quick_update_accounting(struct btrfs_fs_info *fs_info,
|
|
|
|
struct ulist *tmp, u64 src, u64 dst,
|
|
|
|
int sign)
|
|
|
|
{
|
|
|
|
struct btrfs_qgroup *qgroup;
|
|
|
|
int ret = 1;
|
|
|
|
int err = 0;
|
|
|
|
|
|
|
|
qgroup = find_qgroup_rb(fs_info, src);
|
|
|
|
if (!qgroup)
|
|
|
|
goto out;
|
|
|
|
if (qgroup->excl == qgroup->rfer) {
|
|
|
|
ret = 0;
|
|
|
|
err = __qgroup_excl_accounting(fs_info, tmp, dst,
|
btrfs: qgroup: Fix wrong qgroup reservation update for relationship modification
When modifying qgroup relationship, for qgroup which only owns exclusive
extents, we will go through quick update path.
In this path, we will add/subtract exclusive and reference number for
parent qgroup, since the source (child) qgroup only has exclusive
extents, destination (parent) qgroup will also own or lose those extents
exclusively.
The same should be the same for reservation, since later reservation
adding/releasing will also affect parent qgroup, without the reservation
carried from child, parent will underflow reservation or have dead
reservation which will never be freed.
However original code doesn't do the same thing for reservation.
It handles qgroup reservation quite differently:
It removes qgroup reservation, as it's allocating space from the
reserved qgroup for relationship adding.
But does nothing for qgroup reservation if we're removing a qgroup
relationship.
According to the original code, it looks just like because we're adding
qgroup->rfer, the code assumes we're writing new data, so it's follows
the normal write routine, by reducing qgroup->reserved and adding
qgroup->rfer/excl.
This old behavior is wrong, and should be fixed to follow the same
excl/rfer behavior.
Just fix it by using the correct behavior described above.
Fixes: 31193213f1f9 ("Btrfs: qgroup: Introduce a may_use to account space_info->bytes_may_use.")
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2017-12-12 15:34:26 +08:00
|
|
|
qgroup, sign);
|
2015-02-27 16:24:27 +08:00
|
|
|
if (err < 0) {
|
|
|
|
ret = err;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
out:
|
|
|
|
if (ret)
|
|
|
|
fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2018-07-18 14:45:30 +08:00
|
|
|
int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
|
|
|
|
u64 dst)
|
2012-06-28 18:03:02 +02:00
|
|
|
{
|
2018-07-18 14:45:30 +08:00
|
|
|
struct btrfs_fs_info *fs_info = trans->fs_info;
|
2013-04-07 10:50:18 +00:00
|
|
|
struct btrfs_qgroup *parent;
|
|
|
|
struct btrfs_qgroup *member;
|
2013-04-17 14:49:51 +00:00
|
|
|
struct btrfs_qgroup_list *list;
|
2015-02-27 16:24:27 +08:00
|
|
|
struct ulist *tmp;
|
2012-06-28 18:03:02 +02:00
|
|
|
int ret = 0;
|
|
|
|
|
2015-02-27 16:24:22 +08:00
|
|
|
/* Check the level of src and dst first */
|
|
|
|
if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst))
|
|
|
|
return -EINVAL;
|
|
|
|
|
2017-02-13 12:41:02 +01:00
|
|
|
tmp = ulist_alloc(GFP_KERNEL);
|
2015-05-02 17:19:55 +02:00
|
|
|
if (!tmp)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
2013-04-07 10:50:16 +00:00
|
|
|
mutex_lock(&fs_info->qgroup_ioctl_lock);
|
2019-11-25 21:58:50 -03:00
|
|
|
if (!fs_info->quota_root) {
|
2019-11-25 21:58:51 -03:00
|
|
|
ret = -ENOTCONN;
|
2013-04-07 10:50:16 +00:00
|
|
|
goto out;
|
|
|
|
}
|
2013-04-07 10:50:18 +00:00
|
|
|
member = find_qgroup_rb(fs_info, src);
|
|
|
|
parent = find_qgroup_rb(fs_info, dst);
|
|
|
|
if (!member || !parent) {
|
|
|
|
ret = -EINVAL;
|
|
|
|
goto out;
|
|
|
|
}
|
2012-06-28 18:03:02 +02:00
|
|
|
|
2013-04-17 14:49:51 +00:00
|
|
|
/* check if such qgroup relation exist firstly */
|
|
|
|
list_for_each_entry(list, &member->groups, next_group) {
|
|
|
|
if (list->group == parent) {
|
|
|
|
ret = -EEXIST;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-07-18 14:45:24 +08:00
|
|
|
ret = add_qgroup_relation_item(trans, src, dst);
|
2012-06-28 18:03:02 +02:00
|
|
|
if (ret)
|
2013-04-07 10:50:16 +00:00
|
|
|
goto out;
|
2012-06-28 18:03:02 +02:00
|
|
|
|
2018-07-18 14:45:24 +08:00
|
|
|
ret = add_qgroup_relation_item(trans, dst, src);
|
2012-06-28 18:03:02 +02:00
|
|
|
if (ret) {
|
2018-07-18 14:45:25 +08:00
|
|
|
del_qgroup_relation_item(trans, src, dst);
|
2013-04-07 10:50:16 +00:00
|
|
|
goto out;
|
2012-06-28 18:03:02 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
spin_lock(&fs_info->qgroup_lock);
|
2016-06-22 18:54:23 -04:00
|
|
|
ret = add_relation_rb(fs_info, src, dst);
|
2015-02-27 16:24:27 +08:00
|
|
|
if (ret < 0) {
|
|
|
|
spin_unlock(&fs_info->qgroup_lock);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
ret = quick_update_accounting(fs_info, tmp, src, dst, 1);
|
2012-06-28 18:03:02 +02:00
|
|
|
spin_unlock(&fs_info->qgroup_lock);
|
2013-04-07 10:50:16 +00:00
|
|
|
out:
|
|
|
|
mutex_unlock(&fs_info->qgroup_ioctl_lock);
|
2015-02-27 16:24:27 +08:00
|
|
|
ulist_free(tmp);
|
2012-06-28 18:03:02 +02:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2018-07-18 14:45:31 +08:00
|
|
|
static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
|
|
|
|
u64 dst)
|
2012-06-28 18:03:02 +02:00
|
|
|
{
|
2018-07-18 14:45:31 +08:00
|
|
|
struct btrfs_fs_info *fs_info = trans->fs_info;
|
2013-04-17 14:49:51 +00:00
|
|
|
struct btrfs_qgroup *parent;
|
|
|
|
struct btrfs_qgroup *member;
|
|
|
|
struct btrfs_qgroup_list *list;
|
2015-02-27 16:24:27 +08:00
|
|
|
struct ulist *tmp;
|
btrfs: qgroup: Try our best to delete qgroup relations
When we try to delete qgroups, we're pretty cautious, we make sure both
qgroups exist and there is a relationship between them, then try to
delete the relation.
This behavior is OK, but the problem is we need to two relation items,
and if we failed the first item deletion, we error out, leaving the
other relation item in qgroup tree.
Sometimes the error from del_qgroup_relation_item() could just be
-ENOENT, thus we can ignore that error and continue without any problem.
Further more, such cautious behavior makes qgroup relation deletion
impossible for orphan relation items.
This patch will enhance __del_qgroup_relation():
- If both qgroups and their relation items exist
Go the regular deletion routine and update their accounting if needed.
- If any qgroup or relation item doesn't exist
Then we still try to delete the orphan items anyway, but don't trigger
the accounting update.
By this, we try our best to remove relation items, and can handle orphan
relation items properly, while still keep the existing behavior for good
qgroup tree.
Reported-by: Andrei Borzenkov <arvidjaar@gmail.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-08-06 22:05:07 +08:00
|
|
|
bool found = false;
|
2012-06-28 18:03:02 +02:00
|
|
|
int ret = 0;
|
btrfs: qgroup: Try our best to delete qgroup relations
When we try to delete qgroups, we're pretty cautious, we make sure both
qgroups exist and there is a relationship between them, then try to
delete the relation.
This behavior is OK, but the problem is we need to two relation items,
and if we failed the first item deletion, we error out, leaving the
other relation item in qgroup tree.
Sometimes the error from del_qgroup_relation_item() could just be
-ENOENT, thus we can ignore that error and continue without any problem.
Further more, such cautious behavior makes qgroup relation deletion
impossible for orphan relation items.
This patch will enhance __del_qgroup_relation():
- If both qgroups and their relation items exist
Go the regular deletion routine and update their accounting if needed.
- If any qgroup or relation item doesn't exist
Then we still try to delete the orphan items anyway, but don't trigger
the accounting update.
By this, we try our best to remove relation items, and can handle orphan
relation items properly, while still keep the existing behavior for good
qgroup tree.
Reported-by: Andrei Borzenkov <arvidjaar@gmail.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-08-06 22:05:07 +08:00
|
|
|
int ret2;
|
2012-06-28 18:03:02 +02:00
|
|
|
|
2017-02-13 12:41:02 +01:00
|
|
|
tmp = ulist_alloc(GFP_KERNEL);
|
2015-02-27 16:24:27 +08:00
|
|
|
if (!tmp)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
2019-11-25 21:58:50 -03:00
|
|
|
if (!fs_info->quota_root) {
|
2019-11-25 21:58:51 -03:00
|
|
|
ret = -ENOTCONN;
|
2013-04-07 10:50:16 +00:00
|
|
|
goto out;
|
|
|
|
}
|
2012-06-28 18:03:02 +02:00
|
|
|
|
2013-04-17 14:49:51 +00:00
|
|
|
member = find_qgroup_rb(fs_info, src);
|
|
|
|
parent = find_qgroup_rb(fs_info, dst);
|
btrfs: qgroup: Try our best to delete qgroup relations
When we try to delete qgroups, we're pretty cautious, we make sure both
qgroups exist and there is a relationship between them, then try to
delete the relation.
This behavior is OK, but the problem is we need to two relation items,
and if we failed the first item deletion, we error out, leaving the
other relation item in qgroup tree.
Sometimes the error from del_qgroup_relation_item() could just be
-ENOENT, thus we can ignore that error and continue without any problem.
Further more, such cautious behavior makes qgroup relation deletion
impossible for orphan relation items.
This patch will enhance __del_qgroup_relation():
- If both qgroups and their relation items exist
Go the regular deletion routine and update their accounting if needed.
- If any qgroup or relation item doesn't exist
Then we still try to delete the orphan items anyway, but don't trigger
the accounting update.
By this, we try our best to remove relation items, and can handle orphan
relation items properly, while still keep the existing behavior for good
qgroup tree.
Reported-by: Andrei Borzenkov <arvidjaar@gmail.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-08-06 22:05:07 +08:00
|
|
|
/*
|
|
|
|
* The parent/member pair doesn't exist, then try to delete the dead
|
|
|
|
* relation items only.
|
|
|
|
*/
|
|
|
|
if (!member || !parent)
|
|
|
|
goto delete_item;
|
2013-04-17 14:49:51 +00:00
|
|
|
|
|
|
|
/* check if such qgroup relation exist firstly */
|
|
|
|
list_for_each_entry(list, &member->groups, next_group) {
|
btrfs: qgroup: Try our best to delete qgroup relations
When we try to delete qgroups, we're pretty cautious, we make sure both
qgroups exist and there is a relationship between them, then try to
delete the relation.
This behavior is OK, but the problem is we need to two relation items,
and if we failed the first item deletion, we error out, leaving the
other relation item in qgroup tree.
Sometimes the error from del_qgroup_relation_item() could just be
-ENOENT, thus we can ignore that error and continue without any problem.
Further more, such cautious behavior makes qgroup relation deletion
impossible for orphan relation items.
This patch will enhance __del_qgroup_relation():
- If both qgroups and their relation items exist
Go the regular deletion routine and update their accounting if needed.
- If any qgroup or relation item doesn't exist
Then we still try to delete the orphan items anyway, but don't trigger
the accounting update.
By this, we try our best to remove relation items, and can handle orphan
relation items properly, while still keep the existing behavior for good
qgroup tree.
Reported-by: Andrei Borzenkov <arvidjaar@gmail.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-08-06 22:05:07 +08:00
|
|
|
if (list->group == parent) {
|
|
|
|
found = true;
|
|
|
|
break;
|
|
|
|
}
|
2013-04-17 14:49:51 +00:00
|
|
|
}
|
btrfs: qgroup: Try our best to delete qgroup relations
When we try to delete qgroups, we're pretty cautious, we make sure both
qgroups exist and there is a relationship between them, then try to
delete the relation.
This behavior is OK, but the problem is we need to two relation items,
and if we failed the first item deletion, we error out, leaving the
other relation item in qgroup tree.
Sometimes the error from del_qgroup_relation_item() could just be
-ENOENT, thus we can ignore that error and continue without any problem.
Further more, such cautious behavior makes qgroup relation deletion
impossible for orphan relation items.
This patch will enhance __del_qgroup_relation():
- If both qgroups and their relation items exist
Go the regular deletion routine and update their accounting if needed.
- If any qgroup or relation item doesn't exist
Then we still try to delete the orphan items anyway, but don't trigger
the accounting update.
By this, we try our best to remove relation items, and can handle orphan
relation items properly, while still keep the existing behavior for good
qgroup tree.
Reported-by: Andrei Borzenkov <arvidjaar@gmail.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-08-06 22:05:07 +08:00
|
|
|
|
|
|
|
delete_item:
|
2018-07-18 14:45:25 +08:00
|
|
|
ret = del_qgroup_relation_item(trans, src, dst);
|
btrfs: qgroup: Try our best to delete qgroup relations
When we try to delete qgroups, we're pretty cautious, we make sure both
qgroups exist and there is a relationship between them, then try to
delete the relation.
This behavior is OK, but the problem is we need to two relation items,
and if we failed the first item deletion, we error out, leaving the
other relation item in qgroup tree.
Sometimes the error from del_qgroup_relation_item() could just be
-ENOENT, thus we can ignore that error and continue without any problem.
Further more, such cautious behavior makes qgroup relation deletion
impossible for orphan relation items.
This patch will enhance __del_qgroup_relation():
- If both qgroups and their relation items exist
Go the regular deletion routine and update their accounting if needed.
- If any qgroup or relation item doesn't exist
Then we still try to delete the orphan items anyway, but don't trigger
the accounting update.
By this, we try our best to remove relation items, and can handle orphan
relation items properly, while still keep the existing behavior for good
qgroup tree.
Reported-by: Andrei Borzenkov <arvidjaar@gmail.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-08-06 22:05:07 +08:00
|
|
|
if (ret < 0 && ret != -ENOENT)
|
|
|
|
goto out;
|
|
|
|
ret2 = del_qgroup_relation_item(trans, dst, src);
|
|
|
|
if (ret2 < 0 && ret2 != -ENOENT)
|
|
|
|
goto out;
|
2012-06-28 18:03:02 +02:00
|
|
|
|
btrfs: qgroup: Try our best to delete qgroup relations
When we try to delete qgroups, we're pretty cautious, we make sure both
qgroups exist and there is a relationship between them, then try to
delete the relation.
This behavior is OK, but the problem is we need to two relation items,
and if we failed the first item deletion, we error out, leaving the
other relation item in qgroup tree.
Sometimes the error from del_qgroup_relation_item() could just be
-ENOENT, thus we can ignore that error and continue without any problem.
Further more, such cautious behavior makes qgroup relation deletion
impossible for orphan relation items.
This patch will enhance __del_qgroup_relation():
- If both qgroups and their relation items exist
Go the regular deletion routine and update their accounting if needed.
- If any qgroup or relation item doesn't exist
Then we still try to delete the orphan items anyway, but don't trigger
the accounting update.
By this, we try our best to remove relation items, and can handle orphan
relation items properly, while still keep the existing behavior for good
qgroup tree.
Reported-by: Andrei Borzenkov <arvidjaar@gmail.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-08-06 22:05:07 +08:00
|
|
|
/* At least one deletion succeeded, return 0 */
|
|
|
|
if (!ret || !ret2)
|
|
|
|
ret = 0;
|
|
|
|
|
|
|
|
if (found) {
|
|
|
|
spin_lock(&fs_info->qgroup_lock);
|
|
|
|
del_relation_rb(fs_info, src, dst);
|
|
|
|
ret = quick_update_accounting(fs_info, tmp, src, dst, -1);
|
|
|
|
spin_unlock(&fs_info->qgroup_lock);
|
|
|
|
}
|
2013-04-07 10:50:16 +00:00
|
|
|
out:
|
2015-02-27 16:24:27 +08:00
|
|
|
ulist_free(tmp);
|
2014-11-24 10:27:09 -05:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2018-07-18 14:45:32 +08:00
|
|
|
int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
|
|
|
|
u64 dst)
|
2014-11-24 10:27:09 -05:00
|
|
|
{
|
2018-07-18 14:45:32 +08:00
|
|
|
struct btrfs_fs_info *fs_info = trans->fs_info;
|
2014-11-24 10:27:09 -05:00
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
mutex_lock(&fs_info->qgroup_ioctl_lock);
|
2018-07-18 14:45:31 +08:00
|
|
|
ret = __del_qgroup_relation(trans, src, dst);
|
2013-04-07 10:50:16 +00:00
|
|
|
mutex_unlock(&fs_info->qgroup_ioctl_lock);
|
2014-11-24 10:27:09 -05:00
|
|
|
|
2012-06-28 18:03:02 +02:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2018-07-18 14:45:33 +08:00
|
|
|
int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
|
2012-06-28 18:03:02 +02:00
|
|
|
{
|
2018-07-18 14:45:33 +08:00
|
|
|
struct btrfs_fs_info *fs_info = trans->fs_info;
|
2012-06-28 18:03:02 +02:00
|
|
|
struct btrfs_root *quota_root;
|
|
|
|
struct btrfs_qgroup *qgroup;
|
|
|
|
int ret = 0;
|
|
|
|
|
2013-04-07 10:50:16 +00:00
|
|
|
mutex_lock(&fs_info->qgroup_ioctl_lock);
|
2019-11-25 21:58:50 -03:00
|
|
|
if (!fs_info->quota_root) {
|
2019-11-25 21:58:51 -03:00
|
|
|
ret = -ENOTCONN;
|
2013-04-07 10:50:16 +00:00
|
|
|
goto out;
|
|
|
|
}
|
2019-11-25 21:58:50 -03:00
|
|
|
quota_root = fs_info->quota_root;
|
2013-04-17 14:49:51 +00:00
|
|
|
qgroup = find_qgroup_rb(fs_info, qgroupid);
|
|
|
|
if (qgroup) {
|
|
|
|
ret = -EEXIST;
|
|
|
|
goto out;
|
|
|
|
}
|
2012-06-28 18:03:02 +02:00
|
|
|
|
|
|
|
ret = add_qgroup_item(trans, quota_root, qgroupid);
|
2013-04-17 14:49:51 +00:00
|
|
|
if (ret)
|
|
|
|
goto out;
|
2012-06-28 18:03:02 +02:00
|
|
|
|
|
|
|
spin_lock(&fs_info->qgroup_lock);
|
|
|
|
qgroup = add_qgroup_rb(fs_info, qgroupid);
|
|
|
|
spin_unlock(&fs_info->qgroup_lock);
|
|
|
|
|
2020-06-28 13:07:15 +08:00
|
|
|
if (IS_ERR(qgroup)) {
|
2012-06-28 18:03:02 +02:00
|
|
|
ret = PTR_ERR(qgroup);
|
2020-06-28 13:07:15 +08:00
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
|
2013-04-07 10:50:16 +00:00
|
|
|
out:
|
|
|
|
mutex_unlock(&fs_info->qgroup_ioctl_lock);
|
2012-06-28 18:03:02 +02:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2018-07-18 14:45:34 +08:00
|
|
|
int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
|
2012-06-28 18:03:02 +02:00
|
|
|
{
|
2018-07-18 14:45:34 +08:00
|
|
|
struct btrfs_fs_info *fs_info = trans->fs_info;
|
2013-01-17 01:22:09 -07:00
|
|
|
struct btrfs_qgroup *qgroup;
|
2014-11-24 10:27:09 -05:00
|
|
|
struct btrfs_qgroup_list *list;
|
2012-06-28 18:03:02 +02:00
|
|
|
int ret = 0;
|
|
|
|
|
2013-04-07 10:50:16 +00:00
|
|
|
mutex_lock(&fs_info->qgroup_ioctl_lock);
|
2019-11-25 21:58:50 -03:00
|
|
|
if (!fs_info->quota_root) {
|
2019-11-25 21:58:51 -03:00
|
|
|
ret = -ENOTCONN;
|
2013-04-07 10:50:16 +00:00
|
|
|
goto out;
|
|
|
|
}
|
2012-06-28 18:03:02 +02:00
|
|
|
|
2013-01-17 01:22:09 -07:00
|
|
|
qgroup = find_qgroup_rb(fs_info, qgroupid);
|
2013-04-17 14:49:51 +00:00
|
|
|
if (!qgroup) {
|
|
|
|
ret = -ENOENT;
|
|
|
|
goto out;
|
2013-01-17 01:22:09 -07:00
|
|
|
}
|
2018-10-11 13:42:56 +08:00
|
|
|
|
|
|
|
/* Check if there are no children of this qgroup */
|
|
|
|
if (!list_empty(&qgroup->members)) {
|
|
|
|
ret = -EBUSY;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2018-07-18 14:45:26 +08:00
|
|
|
ret = del_qgroup_item(trans, qgroupid);
|
2017-09-17 09:02:29 +00:00
|
|
|
if (ret && ret != -ENOENT)
|
|
|
|
goto out;
|
2012-06-28 18:03:02 +02:00
|
|
|
|
2014-11-24 10:27:09 -05:00
|
|
|
while (!list_empty(&qgroup->groups)) {
|
|
|
|
list = list_first_entry(&qgroup->groups,
|
|
|
|
struct btrfs_qgroup_list, next_group);
|
2018-07-18 14:45:31 +08:00
|
|
|
ret = __del_qgroup_relation(trans, qgroupid,
|
|
|
|
list->group->qgroupid);
|
2014-11-24 10:27:09 -05:00
|
|
|
if (ret)
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2012-06-28 18:03:02 +02:00
|
|
|
spin_lock(&fs_info->qgroup_lock);
|
2016-06-22 18:54:23 -04:00
|
|
|
del_qgroup_rb(fs_info, qgroupid);
|
2012-06-28 18:03:02 +02:00
|
|
|
spin_unlock(&fs_info->qgroup_lock);
|
2013-04-07 10:50:16 +00:00
|
|
|
out:
|
|
|
|
mutex_unlock(&fs_info->qgroup_ioctl_lock);
|
2012-06-28 18:03:02 +02:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2018-07-18 14:45:35 +08:00
|
|
|
int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid,
|
2012-06-28 18:03:02 +02:00
|
|
|
struct btrfs_qgroup_limit *limit)
|
|
|
|
{
|
2018-07-18 14:45:35 +08:00
|
|
|
struct btrfs_fs_info *fs_info = trans->fs_info;
|
2012-06-28 18:03:02 +02:00
|
|
|
struct btrfs_qgroup *qgroup;
|
|
|
|
int ret = 0;
|
2015-06-03 14:57:32 +08:00
|
|
|
/* Sometimes we would want to clear the limit on this qgroup.
|
|
|
|
* To meet this requirement, we treat the -1 as a special value
|
|
|
|
* which tell kernel to clear the limit on this qgroup.
|
|
|
|
*/
|
|
|
|
const u64 CLEAR_VALUE = -1;
|
2012-06-28 18:03:02 +02:00
|
|
|
|
2013-04-07 10:50:16 +00:00
|
|
|
mutex_lock(&fs_info->qgroup_ioctl_lock);
|
2019-11-25 21:58:50 -03:00
|
|
|
if (!fs_info->quota_root) {
|
2019-11-25 21:58:51 -03:00
|
|
|
ret = -ENOTCONN;
|
2013-04-07 10:50:16 +00:00
|
|
|
goto out;
|
|
|
|
}
|
2012-06-28 18:03:02 +02:00
|
|
|
|
2013-04-07 10:50:20 +00:00
|
|
|
qgroup = find_qgroup_rb(fs_info, qgroupid);
|
|
|
|
if (!qgroup) {
|
|
|
|
ret = -ENOENT;
|
|
|
|
goto out;
|
|
|
|
}
|
2012-06-28 18:03:02 +02:00
|
|
|
|
2013-04-07 10:50:17 +00:00
|
|
|
spin_lock(&fs_info->qgroup_lock);
|
2015-06-03 14:57:32 +08:00
|
|
|
if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_RFER) {
|
|
|
|
if (limit->max_rfer == CLEAR_VALUE) {
|
|
|
|
qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_MAX_RFER;
|
|
|
|
limit->flags &= ~BTRFS_QGROUP_LIMIT_MAX_RFER;
|
|
|
|
qgroup->max_rfer = 0;
|
|
|
|
} else {
|
|
|
|
qgroup->max_rfer = limit->max_rfer;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) {
|
|
|
|
if (limit->max_excl == CLEAR_VALUE) {
|
|
|
|
qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_MAX_EXCL;
|
|
|
|
limit->flags &= ~BTRFS_QGROUP_LIMIT_MAX_EXCL;
|
|
|
|
qgroup->max_excl = 0;
|
|
|
|
} else {
|
|
|
|
qgroup->max_excl = limit->max_excl;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_RFER) {
|
|
|
|
if (limit->rsv_rfer == CLEAR_VALUE) {
|
|
|
|
qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER;
|
|
|
|
limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER;
|
|
|
|
qgroup->rsv_rfer = 0;
|
|
|
|
} else {
|
|
|
|
qgroup->rsv_rfer = limit->rsv_rfer;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_EXCL) {
|
|
|
|
if (limit->rsv_excl == CLEAR_VALUE) {
|
|
|
|
qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL;
|
|
|
|
limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL;
|
|
|
|
qgroup->rsv_excl = 0;
|
|
|
|
} else {
|
|
|
|
qgroup->rsv_excl = limit->rsv_excl;
|
|
|
|
}
|
|
|
|
}
|
2015-02-06 11:06:25 -05:00
|
|
|
qgroup->lim_flags |= limit->flags;
|
|
|
|
|
2012-06-28 18:03:02 +02:00
|
|
|
spin_unlock(&fs_info->qgroup_lock);
|
2014-11-20 21:01:41 -05:00
|
|
|
|
2018-07-18 14:45:27 +08:00
|
|
|
ret = update_qgroup_limit_item(trans, qgroup);
|
2014-11-20 21:01:41 -05:00
|
|
|
if (ret) {
|
|
|
|
fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
|
|
|
|
btrfs_info(fs_info, "unable to update quota limit for %llu",
|
|
|
|
qgroupid);
|
|
|
|
}
|
|
|
|
|
2013-04-07 10:50:16 +00:00
|
|
|
out:
|
|
|
|
mutex_unlock(&fs_info->qgroup_ioctl_lock);
|
2012-06-28 18:03:02 +02:00
|
|
|
return ret;
|
|
|
|
}
|
2014-07-17 12:39:01 -07:00
|
|
|
|
2016-10-18 09:31:27 +08:00
|
|
|
int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
|
btrfs: qgroup: Refactor btrfs_qgroup_insert_dirty_extent()
Refactor btrfs_qgroup_insert_dirty_extent() function, to two functions:
1. btrfs_qgroup_insert_dirty_extent_nolock()
Almost the same with original code.
For delayed_ref usage, which has delayed refs locked.
Change the return value type to int, since caller never needs the
pointer, but only needs to know if they need to free the allocated
memory.
2. btrfs_qgroup_insert_dirty_extent()
The more encapsulated version.
Will do the delayed_refs lock, memory allocation, quota enabled check
and other things.
The original design is to keep exported functions to minimal, but since
more btrfs hacks exposed, like replacing path in balance, we need to
record dirty extents manually, so we have to add such functions.
Also, add comment for both functions, to info developers how to keep
qgroup correct when doing hacks.
Cc: Mark Fasheh <mfasheh@suse.de>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-and-Tested-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2016-08-15 10:36:50 +08:00
|
|
|
struct btrfs_delayed_ref_root *delayed_refs,
|
|
|
|
struct btrfs_qgroup_extent_record *record)
|
2015-04-16 14:34:17 +08:00
|
|
|
{
|
|
|
|
struct rb_node **p = &delayed_refs->dirty_extent_root.rb_node;
|
|
|
|
struct rb_node *parent_node = NULL;
|
|
|
|
struct btrfs_qgroup_extent_record *entry;
|
|
|
|
u64 bytenr = record->bytenr;
|
|
|
|
|
2018-03-16 02:21:22 +01:00
|
|
|
lockdep_assert_held(&delayed_refs->lock);
|
2016-10-18 09:31:27 +08:00
|
|
|
trace_btrfs_qgroup_trace_extent(fs_info, record);
|
2015-11-05 14:38:00 -08:00
|
|
|
|
2015-04-16 14:34:17 +08:00
|
|
|
while (*p) {
|
|
|
|
parent_node = *p;
|
|
|
|
entry = rb_entry(parent_node, struct btrfs_qgroup_extent_record,
|
|
|
|
node);
|
btrfs: qgroup: Move reserved data accounting from btrfs_delayed_ref_head to btrfs_qgroup_extent_record
[BUG]
Btrfs/139 will fail with a high probability if the testing machine (VM)
has only 2G RAM.
Resulting the final write success while it should fail due to EDQUOT,
and the fs will have quota exceeding the limit by 16K.
The simplified reproducer will be: (needs a 2G ram VM)
$ mkfs.btrfs -f $dev
$ mount $dev $mnt
$ btrfs subv create $mnt/subv
$ btrfs quota enable $mnt
$ btrfs quota rescan -w $mnt
$ btrfs qgroup limit -e 1G $mnt/subv
$ for i in $(seq -w 1 8); do
xfs_io -f -c "pwrite 0 128M" $mnt/subv/file_$i > /dev/null
echo "file $i written" > /dev/kmsg
done
$ sync
$ btrfs qgroup show -pcre --raw $mnt
The last pwrite will not trigger EDQUOT and final 'qgroup show' will
show something like:
qgroupid rfer excl max_rfer max_excl parent child
-------- ---- ---- -------- -------- ------ -----
0/5 16384 16384 none none --- ---
0/256 1073758208 1073758208 none 1073741824 --- ---
And 1073758208 is larger than
> 1073741824.
[CAUSE]
It's a bug in btrfs qgroup data reserved space management.
For quota limit, we must ensure that:
reserved (data + metadata) + rfer/excl <= limit
Since rfer/excl is only updated at transaction commmit time, reserved
space needs to be taken special care.
One important part of reserved space is data, and for a new data extent
written to disk, we still need to take the reserved space until
rfer/excl numbers get updated.
Originally when an ordered extent finishes, we migrate the reserved
qgroup data space from extent_io tree to delayed ref head of the data
extent, expecting delayed ref will only be cleaned up at commit
transaction time.
However for small RAM machine, due to memory pressure dirty pages can be
flushed back to disk without committing a transaction.
The related events will be something like:
file 1 written
btrfs_finish_ordered_io: ino=258 ordered offset=0 len=54947840
btrfs_finish_ordered_io: ino=258 ordered offset=54947840 len=5636096
btrfs_finish_ordered_io: ino=258 ordered offset=61153280 len=57344
btrfs_finish_ordered_io: ino=258 ordered offset=61210624 len=8192
btrfs_finish_ordered_io: ino=258 ordered offset=60583936 len=569344
cleanup_ref_head: num_bytes=54947840
cleanup_ref_head: num_bytes=5636096
cleanup_ref_head: num_bytes=569344
cleanup_ref_head: num_bytes=57344
cleanup_ref_head: num_bytes=8192
^^^^^^^^^^^^^^^^ This will free qgroup data reserved space
file 2 written
...
file 8 written
cleanup_ref_head: num_bytes=8192
...
btrfs_commit_transaction <<< the only transaction committed during
the test
When file 2 is written, we have already freed 128M reserved qgroup data
space for ino 258. Thus later write won't trigger EDQUOT.
This allows us to write more data beyond qgroup limit.
In my 2G ram VM, it could reach about 1.2G before hitting EDQUOT.
[FIX]
By moving reserved qgroup data space from btrfs_delayed_ref_head to
btrfs_qgroup_extent_record, we can ensure that reserved qgroup data
space won't be freed half way before commit transaction, thus fix the
problem.
Fixes: f64d5ca86821 ("btrfs: delayed_ref: Add new function to record reserved space into delayed ref")
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-01-23 15:15:12 +08:00
|
|
|
if (bytenr < entry->bytenr) {
|
2015-04-16 14:34:17 +08:00
|
|
|
p = &(*p)->rb_left;
|
btrfs: qgroup: Move reserved data accounting from btrfs_delayed_ref_head to btrfs_qgroup_extent_record
[BUG]
Btrfs/139 will fail with a high probability if the testing machine (VM)
has only 2G RAM.
Resulting the final write success while it should fail due to EDQUOT,
and the fs will have quota exceeding the limit by 16K.
The simplified reproducer will be: (needs a 2G ram VM)
$ mkfs.btrfs -f $dev
$ mount $dev $mnt
$ btrfs subv create $mnt/subv
$ btrfs quota enable $mnt
$ btrfs quota rescan -w $mnt
$ btrfs qgroup limit -e 1G $mnt/subv
$ for i in $(seq -w 1 8); do
xfs_io -f -c "pwrite 0 128M" $mnt/subv/file_$i > /dev/null
echo "file $i written" > /dev/kmsg
done
$ sync
$ btrfs qgroup show -pcre --raw $mnt
The last pwrite will not trigger EDQUOT and final 'qgroup show' will
show something like:
qgroupid rfer excl max_rfer max_excl parent child
-------- ---- ---- -------- -------- ------ -----
0/5 16384 16384 none none --- ---
0/256 1073758208 1073758208 none 1073741824 --- ---
And 1073758208 is larger than
> 1073741824.
[CAUSE]
It's a bug in btrfs qgroup data reserved space management.
For quota limit, we must ensure that:
reserved (data + metadata) + rfer/excl <= limit
Since rfer/excl is only updated at transaction commmit time, reserved
space needs to be taken special care.
One important part of reserved space is data, and for a new data extent
written to disk, we still need to take the reserved space until
rfer/excl numbers get updated.
Originally when an ordered extent finishes, we migrate the reserved
qgroup data space from extent_io tree to delayed ref head of the data
extent, expecting delayed ref will only be cleaned up at commit
transaction time.
However for small RAM machine, due to memory pressure dirty pages can be
flushed back to disk without committing a transaction.
The related events will be something like:
file 1 written
btrfs_finish_ordered_io: ino=258 ordered offset=0 len=54947840
btrfs_finish_ordered_io: ino=258 ordered offset=54947840 len=5636096
btrfs_finish_ordered_io: ino=258 ordered offset=61153280 len=57344
btrfs_finish_ordered_io: ino=258 ordered offset=61210624 len=8192
btrfs_finish_ordered_io: ino=258 ordered offset=60583936 len=569344
cleanup_ref_head: num_bytes=54947840
cleanup_ref_head: num_bytes=5636096
cleanup_ref_head: num_bytes=569344
cleanup_ref_head: num_bytes=57344
cleanup_ref_head: num_bytes=8192
^^^^^^^^^^^^^^^^ This will free qgroup data reserved space
file 2 written
...
file 8 written
cleanup_ref_head: num_bytes=8192
...
btrfs_commit_transaction <<< the only transaction committed during
the test
When file 2 is written, we have already freed 128M reserved qgroup data
space for ino 258. Thus later write won't trigger EDQUOT.
This allows us to write more data beyond qgroup limit.
In my 2G ram VM, it could reach about 1.2G before hitting EDQUOT.
[FIX]
By moving reserved qgroup data space from btrfs_delayed_ref_head to
btrfs_qgroup_extent_record, we can ensure that reserved qgroup data
space won't be freed half way before commit transaction, thus fix the
problem.
Fixes: f64d5ca86821 ("btrfs: delayed_ref: Add new function to record reserved space into delayed ref")
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-01-23 15:15:12 +08:00
|
|
|
} else if (bytenr > entry->bytenr) {
|
2015-04-16 14:34:17 +08:00
|
|
|
p = &(*p)->rb_right;
|
btrfs: qgroup: Move reserved data accounting from btrfs_delayed_ref_head to btrfs_qgroup_extent_record
[BUG]
Btrfs/139 will fail with a high probability if the testing machine (VM)
has only 2G RAM.
Resulting the final write success while it should fail due to EDQUOT,
and the fs will have quota exceeding the limit by 16K.
The simplified reproducer will be: (needs a 2G ram VM)
$ mkfs.btrfs -f $dev
$ mount $dev $mnt
$ btrfs subv create $mnt/subv
$ btrfs quota enable $mnt
$ btrfs quota rescan -w $mnt
$ btrfs qgroup limit -e 1G $mnt/subv
$ for i in $(seq -w 1 8); do
xfs_io -f -c "pwrite 0 128M" $mnt/subv/file_$i > /dev/null
echo "file $i written" > /dev/kmsg
done
$ sync
$ btrfs qgroup show -pcre --raw $mnt
The last pwrite will not trigger EDQUOT and final 'qgroup show' will
show something like:
qgroupid rfer excl max_rfer max_excl parent child
-------- ---- ---- -------- -------- ------ -----
0/5 16384 16384 none none --- ---
0/256 1073758208 1073758208 none 1073741824 --- ---
And 1073758208 is larger than
> 1073741824.
[CAUSE]
It's a bug in btrfs qgroup data reserved space management.
For quota limit, we must ensure that:
reserved (data + metadata) + rfer/excl <= limit
Since rfer/excl is only updated at transaction commmit time, reserved
space needs to be taken special care.
One important part of reserved space is data, and for a new data extent
written to disk, we still need to take the reserved space until
rfer/excl numbers get updated.
Originally when an ordered extent finishes, we migrate the reserved
qgroup data space from extent_io tree to delayed ref head of the data
extent, expecting delayed ref will only be cleaned up at commit
transaction time.
However for small RAM machine, due to memory pressure dirty pages can be
flushed back to disk without committing a transaction.
The related events will be something like:
file 1 written
btrfs_finish_ordered_io: ino=258 ordered offset=0 len=54947840
btrfs_finish_ordered_io: ino=258 ordered offset=54947840 len=5636096
btrfs_finish_ordered_io: ino=258 ordered offset=61153280 len=57344
btrfs_finish_ordered_io: ino=258 ordered offset=61210624 len=8192
btrfs_finish_ordered_io: ino=258 ordered offset=60583936 len=569344
cleanup_ref_head: num_bytes=54947840
cleanup_ref_head: num_bytes=5636096
cleanup_ref_head: num_bytes=569344
cleanup_ref_head: num_bytes=57344
cleanup_ref_head: num_bytes=8192
^^^^^^^^^^^^^^^^ This will free qgroup data reserved space
file 2 written
...
file 8 written
cleanup_ref_head: num_bytes=8192
...
btrfs_commit_transaction <<< the only transaction committed during
the test
When file 2 is written, we have already freed 128M reserved qgroup data
space for ino 258. Thus later write won't trigger EDQUOT.
This allows us to write more data beyond qgroup limit.
In my 2G ram VM, it could reach about 1.2G before hitting EDQUOT.
[FIX]
By moving reserved qgroup data space from btrfs_delayed_ref_head to
btrfs_qgroup_extent_record, we can ensure that reserved qgroup data
space won't be freed half way before commit transaction, thus fix the
problem.
Fixes: f64d5ca86821 ("btrfs: delayed_ref: Add new function to record reserved space into delayed ref")
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-01-23 15:15:12 +08:00
|
|
|
} else {
|
|
|
|
if (record->data_rsv && !entry->data_rsv) {
|
|
|
|
entry->data_rsv = record->data_rsv;
|
|
|
|
entry->data_rsv_refroot =
|
|
|
|
record->data_rsv_refroot;
|
|
|
|
}
|
btrfs: qgroup: Refactor btrfs_qgroup_insert_dirty_extent()
Refactor btrfs_qgroup_insert_dirty_extent() function, to two functions:
1. btrfs_qgroup_insert_dirty_extent_nolock()
Almost the same with original code.
For delayed_ref usage, which has delayed refs locked.
Change the return value type to int, since caller never needs the
pointer, but only needs to know if they need to free the allocated
memory.
2. btrfs_qgroup_insert_dirty_extent()
The more encapsulated version.
Will do the delayed_refs lock, memory allocation, quota enabled check
and other things.
The original design is to keep exported functions to minimal, but since
more btrfs hacks exposed, like replacing path in balance, we need to
record dirty extents manually, so we have to add such functions.
Also, add comment for both functions, to info developers how to keep
qgroup correct when doing hacks.
Cc: Mark Fasheh <mfasheh@suse.de>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-and-Tested-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2016-08-15 10:36:50 +08:00
|
|
|
return 1;
|
btrfs: qgroup: Move reserved data accounting from btrfs_delayed_ref_head to btrfs_qgroup_extent_record
[BUG]
Btrfs/139 will fail with a high probability if the testing machine (VM)
has only 2G RAM.
Resulting the final write success while it should fail due to EDQUOT,
and the fs will have quota exceeding the limit by 16K.
The simplified reproducer will be: (needs a 2G ram VM)
$ mkfs.btrfs -f $dev
$ mount $dev $mnt
$ btrfs subv create $mnt/subv
$ btrfs quota enable $mnt
$ btrfs quota rescan -w $mnt
$ btrfs qgroup limit -e 1G $mnt/subv
$ for i in $(seq -w 1 8); do
xfs_io -f -c "pwrite 0 128M" $mnt/subv/file_$i > /dev/null
echo "file $i written" > /dev/kmsg
done
$ sync
$ btrfs qgroup show -pcre --raw $mnt
The last pwrite will not trigger EDQUOT and final 'qgroup show' will
show something like:
qgroupid rfer excl max_rfer max_excl parent child
-------- ---- ---- -------- -------- ------ -----
0/5 16384 16384 none none --- ---
0/256 1073758208 1073758208 none 1073741824 --- ---
And 1073758208 is larger than
> 1073741824.
[CAUSE]
It's a bug in btrfs qgroup data reserved space management.
For quota limit, we must ensure that:
reserved (data + metadata) + rfer/excl <= limit
Since rfer/excl is only updated at transaction commmit time, reserved
space needs to be taken special care.
One important part of reserved space is data, and for a new data extent
written to disk, we still need to take the reserved space until
rfer/excl numbers get updated.
Originally when an ordered extent finishes, we migrate the reserved
qgroup data space from extent_io tree to delayed ref head of the data
extent, expecting delayed ref will only be cleaned up at commit
transaction time.
However for small RAM machine, due to memory pressure dirty pages can be
flushed back to disk without committing a transaction.
The related events will be something like:
file 1 written
btrfs_finish_ordered_io: ino=258 ordered offset=0 len=54947840
btrfs_finish_ordered_io: ino=258 ordered offset=54947840 len=5636096
btrfs_finish_ordered_io: ino=258 ordered offset=61153280 len=57344
btrfs_finish_ordered_io: ino=258 ordered offset=61210624 len=8192
btrfs_finish_ordered_io: ino=258 ordered offset=60583936 len=569344
cleanup_ref_head: num_bytes=54947840
cleanup_ref_head: num_bytes=5636096
cleanup_ref_head: num_bytes=569344
cleanup_ref_head: num_bytes=57344
cleanup_ref_head: num_bytes=8192
^^^^^^^^^^^^^^^^ This will free qgroup data reserved space
file 2 written
...
file 8 written
cleanup_ref_head: num_bytes=8192
...
btrfs_commit_transaction <<< the only transaction committed during
the test
When file 2 is written, we have already freed 128M reserved qgroup data
space for ino 258. Thus later write won't trigger EDQUOT.
This allows us to write more data beyond qgroup limit.
In my 2G ram VM, it could reach about 1.2G before hitting EDQUOT.
[FIX]
By moving reserved qgroup data space from btrfs_delayed_ref_head to
btrfs_qgroup_extent_record, we can ensure that reserved qgroup data
space won't be freed half way before commit transaction, thus fix the
problem.
Fixes: f64d5ca86821 ("btrfs: delayed_ref: Add new function to record reserved space into delayed ref")
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-01-23 15:15:12 +08:00
|
|
|
}
|
2015-04-16 14:34:17 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
rb_link_node(&record->node, parent_node, p);
|
|
|
|
rb_insert_color(&record->node, &delayed_refs->dirty_extent_root);
|
btrfs: qgroup: Refactor btrfs_qgroup_insert_dirty_extent()
Refactor btrfs_qgroup_insert_dirty_extent() function, to two functions:
1. btrfs_qgroup_insert_dirty_extent_nolock()
Almost the same with original code.
For delayed_ref usage, which has delayed refs locked.
Change the return value type to int, since caller never needs the
pointer, but only needs to know if they need to free the allocated
memory.
2. btrfs_qgroup_insert_dirty_extent()
The more encapsulated version.
Will do the delayed_refs lock, memory allocation, quota enabled check
and other things.
The original design is to keep exported functions to minimal, but since
more btrfs hacks exposed, like replacing path in balance, we need to
record dirty extents manually, so we have to add such functions.
Also, add comment for both functions, to info developers how to keep
qgroup correct when doing hacks.
Cc: Mark Fasheh <mfasheh@suse.de>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-and-Tested-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2016-08-15 10:36:50 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2017-02-15 10:43:03 +08:00
|
|
|
int btrfs_qgroup_trace_extent_post(struct btrfs_fs_info *fs_info,
|
|
|
|
struct btrfs_qgroup_extent_record *qrecord)
|
|
|
|
{
|
|
|
|
struct ulist *old_root;
|
|
|
|
u64 bytenr = qrecord->bytenr;
|
|
|
|
int ret;
|
|
|
|
|
btrfs: add a flag to iterate_inodes_from_logical to find all extent refs for uncompressed extents
The LOGICAL_INO ioctl provides a backward mapping from extent bytenr and
offset (encoded as a single logical address) to a list of extent refs.
LOGICAL_INO complements TREE_SEARCH, which provides the forward mapping
(extent ref -> extent bytenr and offset, or logical address). These are
useful capabilities for programs that manipulate extents and extent
references from userspace (e.g. dedup and defrag utilities).
When the extents are uncompressed (and not encrypted and not other),
check_extent_in_eb performs filtering of the extent refs to remove any
extent refs which do not contain the same extent offset as the 'logical'
parameter's extent offset. This prevents LOGICAL_INO from returning
references to more than a single block.
To find the set of extent references to an uncompressed extent from [a, b),
userspace has to run a loop like this pseudocode:
for (i = a; i < b; ++i)
extent_ref_set += LOGICAL_INO(i);
At each iteration of the loop (up to 32768 iterations for a 128M extent),
data we are interested in is collected in the kernel, then deleted by
the filter in check_extent_in_eb.
When the extents are compressed (or encrypted or other), the 'logical'
parameter must be an extent bytenr (the 'a' parameter in the loop).
No filtering by extent offset is done (or possible?) so the result is
the complete set of extent refs for the entire extent. This removes
the need for the loop, since we get all the extent refs in one call.
Add an 'ignore_offset' argument to iterate_inodes_from_logical,
[...several levels of function call graph...], and check_extent_in_eb, so
that we can disable the extent offset filtering for uncompressed extents.
This flag can be set by an improved version of the LOGICAL_INO ioctl to
get either behavior as desired.
There is no functional change in this patch. The new flag is always
false.
Signed-off-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org>
Reviewed-by: David Sterba <dsterba@suse.com>
[ minor coding style fixes ]
Signed-off-by: David Sterba <dsterba@suse.com>
2017-09-22 13:58:45 -04:00
|
|
|
ret = btrfs_find_all_roots(NULL, fs_info, bytenr, 0, &old_root, false);
|
2018-01-29 15:53:01 +02:00
|
|
|
if (ret < 0) {
|
|
|
|
fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
|
|
|
|
btrfs_warn(fs_info,
|
|
|
|
"error accounting new delayed refs extent (err code: %d), quota inconsistent",
|
|
|
|
ret);
|
|
|
|
return 0;
|
|
|
|
}
|
2017-02-15 10:43:03 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Here we don't need to get the lock of
|
|
|
|
* trans->transaction->delayed_refs, since inserted qrecord won't
|
|
|
|
* be deleted, only qrecord->node may be modified (new qrecord insert)
|
|
|
|
*
|
|
|
|
* So modifying qrecord->old_roots is safe here
|
|
|
|
*/
|
|
|
|
qrecord->old_roots = old_root;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2018-07-18 16:28:03 +08:00
|
|
|
int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
|
|
|
|
u64 num_bytes, gfp_t gfp_flag)
|
btrfs: qgroup: Refactor btrfs_qgroup_insert_dirty_extent()
Refactor btrfs_qgroup_insert_dirty_extent() function, to two functions:
1. btrfs_qgroup_insert_dirty_extent_nolock()
Almost the same with original code.
For delayed_ref usage, which has delayed refs locked.
Change the return value type to int, since caller never needs the
pointer, but only needs to know if they need to free the allocated
memory.
2. btrfs_qgroup_insert_dirty_extent()
The more encapsulated version.
Will do the delayed_refs lock, memory allocation, quota enabled check
and other things.
The original design is to keep exported functions to minimal, but since
more btrfs hacks exposed, like replacing path in balance, we need to
record dirty extents manually, so we have to add such functions.
Also, add comment for both functions, to info developers how to keep
qgroup correct when doing hacks.
Cc: Mark Fasheh <mfasheh@suse.de>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-and-Tested-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2016-08-15 10:36:50 +08:00
|
|
|
{
|
2018-07-18 16:28:03 +08:00
|
|
|
struct btrfs_fs_info *fs_info = trans->fs_info;
|
btrfs: qgroup: Refactor btrfs_qgroup_insert_dirty_extent()
Refactor btrfs_qgroup_insert_dirty_extent() function, to two functions:
1. btrfs_qgroup_insert_dirty_extent_nolock()
Almost the same with original code.
For delayed_ref usage, which has delayed refs locked.
Change the return value type to int, since caller never needs the
pointer, but only needs to know if they need to free the allocated
memory.
2. btrfs_qgroup_insert_dirty_extent()
The more encapsulated version.
Will do the delayed_refs lock, memory allocation, quota enabled check
and other things.
The original design is to keep exported functions to minimal, but since
more btrfs hacks exposed, like replacing path in balance, we need to
record dirty extents manually, so we have to add such functions.
Also, add comment for both functions, to info developers how to keep
qgroup correct when doing hacks.
Cc: Mark Fasheh <mfasheh@suse.de>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-and-Tested-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2016-08-15 10:36:50 +08:00
|
|
|
struct btrfs_qgroup_extent_record *record;
|
|
|
|
struct btrfs_delayed_ref_root *delayed_refs;
|
|
|
|
int ret;
|
|
|
|
|
2016-09-02 15:40:02 -04:00
|
|
|
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)
|
|
|
|
|| bytenr == 0 || num_bytes == 0)
|
btrfs: qgroup: Refactor btrfs_qgroup_insert_dirty_extent()
Refactor btrfs_qgroup_insert_dirty_extent() function, to two functions:
1. btrfs_qgroup_insert_dirty_extent_nolock()
Almost the same with original code.
For delayed_ref usage, which has delayed refs locked.
Change the return value type to int, since caller never needs the
pointer, but only needs to know if they need to free the allocated
memory.
2. btrfs_qgroup_insert_dirty_extent()
The more encapsulated version.
Will do the delayed_refs lock, memory allocation, quota enabled check
and other things.
The original design is to keep exported functions to minimal, but since
more btrfs hacks exposed, like replacing path in balance, we need to
record dirty extents manually, so we have to add such functions.
Also, add comment for both functions, to info developers how to keep
qgroup correct when doing hacks.
Cc: Mark Fasheh <mfasheh@suse.de>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-and-Tested-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2016-08-15 10:36:50 +08:00
|
|
|
return 0;
|
btrfs: qgroup: Move reserved data accounting from btrfs_delayed_ref_head to btrfs_qgroup_extent_record
[BUG]
Btrfs/139 will fail with a high probability if the testing machine (VM)
has only 2G RAM.
Resulting the final write success while it should fail due to EDQUOT,
and the fs will have quota exceeding the limit by 16K.
The simplified reproducer will be: (needs a 2G ram VM)
$ mkfs.btrfs -f $dev
$ mount $dev $mnt
$ btrfs subv create $mnt/subv
$ btrfs quota enable $mnt
$ btrfs quota rescan -w $mnt
$ btrfs qgroup limit -e 1G $mnt/subv
$ for i in $(seq -w 1 8); do
xfs_io -f -c "pwrite 0 128M" $mnt/subv/file_$i > /dev/null
echo "file $i written" > /dev/kmsg
done
$ sync
$ btrfs qgroup show -pcre --raw $mnt
The last pwrite will not trigger EDQUOT and final 'qgroup show' will
show something like:
qgroupid rfer excl max_rfer max_excl parent child
-------- ---- ---- -------- -------- ------ -----
0/5 16384 16384 none none --- ---
0/256 1073758208 1073758208 none 1073741824 --- ---
And 1073758208 is larger than
> 1073741824.
[CAUSE]
It's a bug in btrfs qgroup data reserved space management.
For quota limit, we must ensure that:
reserved (data + metadata) + rfer/excl <= limit
Since rfer/excl is only updated at transaction commmit time, reserved
space needs to be taken special care.
One important part of reserved space is data, and for a new data extent
written to disk, we still need to take the reserved space until
rfer/excl numbers get updated.
Originally when an ordered extent finishes, we migrate the reserved
qgroup data space from extent_io tree to delayed ref head of the data
extent, expecting delayed ref will only be cleaned up at commit
transaction time.
However for small RAM machine, due to memory pressure dirty pages can be
flushed back to disk without committing a transaction.
The related events will be something like:
file 1 written
btrfs_finish_ordered_io: ino=258 ordered offset=0 len=54947840
btrfs_finish_ordered_io: ino=258 ordered offset=54947840 len=5636096
btrfs_finish_ordered_io: ino=258 ordered offset=61153280 len=57344
btrfs_finish_ordered_io: ino=258 ordered offset=61210624 len=8192
btrfs_finish_ordered_io: ino=258 ordered offset=60583936 len=569344
cleanup_ref_head: num_bytes=54947840
cleanup_ref_head: num_bytes=5636096
cleanup_ref_head: num_bytes=569344
cleanup_ref_head: num_bytes=57344
cleanup_ref_head: num_bytes=8192
^^^^^^^^^^^^^^^^ This will free qgroup data reserved space
file 2 written
...
file 8 written
cleanup_ref_head: num_bytes=8192
...
btrfs_commit_transaction <<< the only transaction committed during
the test
When file 2 is written, we have already freed 128M reserved qgroup data
space for ino 258. Thus later write won't trigger EDQUOT.
This allows us to write more data beyond qgroup limit.
In my 2G ram VM, it could reach about 1.2G before hitting EDQUOT.
[FIX]
By moving reserved qgroup data space from btrfs_delayed_ref_head to
btrfs_qgroup_extent_record, we can ensure that reserved qgroup data
space won't be freed half way before commit transaction, thus fix the
problem.
Fixes: f64d5ca86821 ("btrfs: delayed_ref: Add new function to record reserved space into delayed ref")
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-01-23 15:15:12 +08:00
|
|
|
record = kzalloc(sizeof(*record), gfp_flag);
|
btrfs: qgroup: Refactor btrfs_qgroup_insert_dirty_extent()
Refactor btrfs_qgroup_insert_dirty_extent() function, to two functions:
1. btrfs_qgroup_insert_dirty_extent_nolock()
Almost the same with original code.
For delayed_ref usage, which has delayed refs locked.
Change the return value type to int, since caller never needs the
pointer, but only needs to know if they need to free the allocated
memory.
2. btrfs_qgroup_insert_dirty_extent()
The more encapsulated version.
Will do the delayed_refs lock, memory allocation, quota enabled check
and other things.
The original design is to keep exported functions to minimal, but since
more btrfs hacks exposed, like replacing path in balance, we need to
record dirty extents manually, so we have to add such functions.
Also, add comment for both functions, to info developers how to keep
qgroup correct when doing hacks.
Cc: Mark Fasheh <mfasheh@suse.de>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-and-Tested-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2016-08-15 10:36:50 +08:00
|
|
|
if (!record)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
delayed_refs = &trans->transaction->delayed_refs;
|
|
|
|
record->bytenr = bytenr;
|
|
|
|
record->num_bytes = num_bytes;
|
|
|
|
record->old_roots = NULL;
|
|
|
|
|
|
|
|
spin_lock(&delayed_refs->lock);
|
2016-06-22 18:54:24 -04:00
|
|
|
ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, record);
|
btrfs: qgroup: Refactor btrfs_qgroup_insert_dirty_extent()
Refactor btrfs_qgroup_insert_dirty_extent() function, to two functions:
1. btrfs_qgroup_insert_dirty_extent_nolock()
Almost the same with original code.
For delayed_ref usage, which has delayed refs locked.
Change the return value type to int, since caller never needs the
pointer, but only needs to know if they need to free the allocated
memory.
2. btrfs_qgroup_insert_dirty_extent()
The more encapsulated version.
Will do the delayed_refs lock, memory allocation, quota enabled check
and other things.
The original design is to keep exported functions to minimal, but since
more btrfs hacks exposed, like replacing path in balance, we need to
record dirty extents manually, so we have to add such functions.
Also, add comment for both functions, to info developers how to keep
qgroup correct when doing hacks.
Cc: Mark Fasheh <mfasheh@suse.de>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-and-Tested-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2016-08-15 10:36:50 +08:00
|
|
|
spin_unlock(&delayed_refs->lock);
|
2017-02-15 10:43:03 +08:00
|
|
|
if (ret > 0) {
|
btrfs: qgroup: Refactor btrfs_qgroup_insert_dirty_extent()
Refactor btrfs_qgroup_insert_dirty_extent() function, to two functions:
1. btrfs_qgroup_insert_dirty_extent_nolock()
Almost the same with original code.
For delayed_ref usage, which has delayed refs locked.
Change the return value type to int, since caller never needs the
pointer, but only needs to know if they need to free the allocated
memory.
2. btrfs_qgroup_insert_dirty_extent()
The more encapsulated version.
Will do the delayed_refs lock, memory allocation, quota enabled check
and other things.
The original design is to keep exported functions to minimal, but since
more btrfs hacks exposed, like replacing path in balance, we need to
record dirty extents manually, so we have to add such functions.
Also, add comment for both functions, to info developers how to keep
qgroup correct when doing hacks.
Cc: Mark Fasheh <mfasheh@suse.de>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-and-Tested-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2016-08-15 10:36:50 +08:00
|
|
|
kfree(record);
|
2017-02-15 10:43:03 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
return btrfs_qgroup_trace_extent_post(fs_info, record);
|
2015-04-16 14:34:17 +08:00
|
|
|
}
|
|
|
|
|
2016-10-18 09:31:28 +08:00
|
|
|
int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
|
|
|
|
struct extent_buffer *eb)
|
|
|
|
{
|
2018-07-18 14:45:37 +08:00
|
|
|
struct btrfs_fs_info *fs_info = trans->fs_info;
|
2016-10-18 09:31:28 +08:00
|
|
|
int nr = btrfs_header_nritems(eb);
|
|
|
|
int i, extent_type, ret;
|
|
|
|
struct btrfs_key key;
|
|
|
|
struct btrfs_file_extent_item *fi;
|
|
|
|
u64 bytenr, num_bytes;
|
|
|
|
|
|
|
|
/* We can be called directly from walk_up_proc() */
|
2016-06-22 18:54:23 -04:00
|
|
|
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
|
2016-10-18 09:31:28 +08:00
|
|
|
return 0;
|
|
|
|
|
|
|
|
for (i = 0; i < nr; i++) {
|
|
|
|
btrfs_item_key_to_cpu(eb, &key, i);
|
|
|
|
|
|
|
|
if (key.type != BTRFS_EXTENT_DATA_KEY)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
|
|
|
|
/* filter out non qgroup-accountable extents */
|
|
|
|
extent_type = btrfs_file_extent_type(eb, fi);
|
|
|
|
|
|
|
|
if (extent_type == BTRFS_FILE_EXTENT_INLINE)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
bytenr = btrfs_file_extent_disk_bytenr(eb, fi);
|
|
|
|
if (!bytenr)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi);
|
|
|
|
|
2018-07-18 16:28:03 +08:00
|
|
|
ret = btrfs_qgroup_trace_extent(trans, bytenr, num_bytes,
|
|
|
|
GFP_NOFS);
|
2016-10-18 09:31:28 +08:00
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
}
|
2017-06-20 08:15:26 -04:00
|
|
|
cond_resched();
|
2016-10-18 09:31:28 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Walk up the tree from the bottom, freeing leaves and any interior
|
|
|
|
* nodes which have had all slots visited. If a node (leaf or
|
|
|
|
* interior) is freed, the node above it will have it's slot
|
|
|
|
* incremented. The root node will never be freed.
|
|
|
|
*
|
|
|
|
* At the end of this function, we should have a path which has all
|
|
|
|
* slots incremented to the next position for a search. If we need to
|
|
|
|
* read a new node it will be NULL and the node above it will have the
|
|
|
|
* correct slot selected for a later read.
|
|
|
|
*
|
|
|
|
* If we increment the root nodes slot counter past the number of
|
|
|
|
* elements, 1 is returned to signal completion of the search.
|
|
|
|
*/
|
2017-02-10 20:30:23 +01:00
|
|
|
static int adjust_slots_upwards(struct btrfs_path *path, int root_level)
|
2016-10-18 09:31:28 +08:00
|
|
|
{
|
|
|
|
int level = 0;
|
|
|
|
int nr, slot;
|
|
|
|
struct extent_buffer *eb;
|
|
|
|
|
|
|
|
if (root_level == 0)
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
while (level <= root_level) {
|
|
|
|
eb = path->nodes[level];
|
|
|
|
nr = btrfs_header_nritems(eb);
|
|
|
|
path->slots[level]++;
|
|
|
|
slot = path->slots[level];
|
|
|
|
if (slot >= nr || level == 0) {
|
|
|
|
/*
|
|
|
|
* Don't free the root - we will detect this
|
|
|
|
* condition after our loop and return a
|
|
|
|
* positive value for caller to stop walking the tree.
|
|
|
|
*/
|
|
|
|
if (level != root_level) {
|
|
|
|
btrfs_tree_unlock_rw(eb, path->locks[level]);
|
|
|
|
path->locks[level] = 0;
|
|
|
|
|
|
|
|
free_extent_buffer(eb);
|
|
|
|
path->nodes[level] = NULL;
|
|
|
|
path->slots[level] = 0;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* We have a valid slot to walk back down
|
|
|
|
* from. Stop here so caller can process these
|
|
|
|
* new nodes.
|
|
|
|
*/
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
level++;
|
|
|
|
}
|
|
|
|
|
|
|
|
eb = path->nodes[root_level];
|
|
|
|
if (path->slots[root_level] >= btrfs_header_nritems(eb))
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2018-09-27 14:42:30 +08:00
|
|
|
/*
|
|
|
|
* Helper function to trace a subtree tree block swap.
|
|
|
|
*
|
|
|
|
* The swap will happen in highest tree block, but there may be a lot of
|
|
|
|
* tree blocks involved.
|
|
|
|
*
|
|
|
|
* For example:
|
|
|
|
* OO = Old tree blocks
|
|
|
|
* NN = New tree blocks allocated during balance
|
|
|
|
*
|
|
|
|
* File tree (257) Reloc tree for 257
|
|
|
|
* L2 OO NN
|
|
|
|
* / \ / \
|
|
|
|
* L1 OO OO (a) OO NN (a)
|
|
|
|
* / \ / \ / \ / \
|
|
|
|
* L0 OO OO OO OO OO OO NN NN
|
|
|
|
* (b) (c) (b) (c)
|
|
|
|
*
|
|
|
|
* When calling qgroup_trace_extent_swap(), we will pass:
|
|
|
|
* @src_eb = OO(a)
|
|
|
|
* @dst_path = [ nodes[1] = NN(a), nodes[0] = NN(c) ]
|
|
|
|
* @dst_level = 0
|
|
|
|
* @root_level = 1
|
|
|
|
*
|
|
|
|
* In that case, qgroup_trace_extent_swap() will search from OO(a) to
|
|
|
|
* reach OO(c), then mark both OO(c) and NN(c) as qgroup dirty.
|
|
|
|
*
|
|
|
|
* The main work of qgroup_trace_extent_swap() can be split into 3 parts:
|
|
|
|
*
|
|
|
|
* 1) Tree search from @src_eb
|
|
|
|
* It should acts as a simplified btrfs_search_slot().
|
|
|
|
* The key for search can be extracted from @dst_path->nodes[dst_level]
|
|
|
|
* (first key).
|
|
|
|
*
|
|
|
|
* 2) Mark the final tree blocks in @src_path and @dst_path qgroup dirty
|
|
|
|
* NOTE: In above case, OO(a) and NN(a) won't be marked qgroup dirty.
|
2018-11-28 12:05:13 +01:00
|
|
|
* They should be marked during previous (@dst_level = 1) iteration.
|
2018-09-27 14:42:30 +08:00
|
|
|
*
|
|
|
|
* 3) Mark file extents in leaves dirty
|
|
|
|
* We don't have good way to pick out new file extents only.
|
|
|
|
* So we still follow the old method by scanning all file extents in
|
|
|
|
* the leave.
|
|
|
|
*
|
2018-11-28 12:05:13 +01:00
|
|
|
* This function can free us from keeping two paths, thus later we only need
|
2018-09-27 14:42:30 +08:00
|
|
|
* to care about how to iterate all new tree blocks in reloc tree.
|
|
|
|
*/
|
|
|
|
static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans,
|
|
|
|
struct extent_buffer *src_eb,
|
|
|
|
struct btrfs_path *dst_path,
|
btrfs: qgroup: Only trace data extents in leaves if we're relocating data block group
For qgroup_trace_extent_swap(), if we find one leaf that needs to be
traced, we will also iterate all file extents and trace them.
This is OK if we're relocating data block groups, but if we're
relocating metadata block groups, balance code itself has ensured that
both subtree of file tree and reloc tree contain the same contents.
That's to say, if we're relocating metadata block groups, all file
extents in reloc and file tree should match, thus no need to trace them.
This should reduce the total number of dirty extents processed in metadata
block group balance.
[[Benchmark]] (with all previous enhancement)
Hardware:
VM 4G vRAM, 8 vCPUs,
disk is using 'unsafe' cache mode,
backing device is SAMSUNG 850 evo SSD.
Host has 16G ram.
Mkfs parameter:
--nodesize 4K (To bump up tree size)
Initial subvolume contents:
4G data copied from /usr and /lib.
(With enough regular small files)
Snapshots:
16 snapshots of the original subvolume.
each snapshot has 3 random files modified.
balance parameter:
-m
So the content should be pretty similar to a real world root fs layout.
| v4.19-rc1 | w/ patchset | diff (*)
---------------------------------------------------------------
relocated extents | 22929 | 22851 | -0.3%
qgroup dirty extents | 227757 | 140886 | -38.1%
time (sys) | 65.253s | 37.464s | -42.6%
time (real) | 74.032s | 44.722s | -39.6%
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2018-09-27 14:42:35 +08:00
|
|
|
int dst_level, int root_level,
|
|
|
|
bool trace_leaf)
|
2018-09-27 14:42:30 +08:00
|
|
|
{
|
|
|
|
struct btrfs_key key;
|
|
|
|
struct btrfs_path *src_path;
|
|
|
|
struct btrfs_fs_info *fs_info = trans->fs_info;
|
|
|
|
u32 nodesize = fs_info->nodesize;
|
|
|
|
int cur_level = root_level;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
BUG_ON(dst_level > root_level);
|
|
|
|
/* Level mismatch */
|
|
|
|
if (btrfs_header_level(src_eb) != root_level)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
src_path = btrfs_alloc_path();
|
|
|
|
if (!src_path) {
|
|
|
|
ret = -ENOMEM;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (dst_level)
|
|
|
|
btrfs_node_key_to_cpu(dst_path->nodes[dst_level], &key, 0);
|
|
|
|
else
|
|
|
|
btrfs_item_key_to_cpu(dst_path->nodes[dst_level], &key, 0);
|
|
|
|
|
|
|
|
/* For src_path */
|
2019-10-08 13:28:47 +02:00
|
|
|
atomic_inc(&src_eb->refs);
|
2018-09-27 14:42:30 +08:00
|
|
|
src_path->nodes[root_level] = src_eb;
|
|
|
|
src_path->slots[root_level] = dst_path->slots[root_level];
|
|
|
|
src_path->locks[root_level] = 0;
|
|
|
|
|
|
|
|
/* A simplified version of btrfs_search_slot() */
|
|
|
|
while (cur_level >= dst_level) {
|
|
|
|
struct btrfs_key src_key;
|
|
|
|
struct btrfs_key dst_key;
|
|
|
|
|
|
|
|
if (src_path->nodes[cur_level] == NULL) {
|
|
|
|
struct btrfs_key first_key;
|
|
|
|
struct extent_buffer *eb;
|
|
|
|
int parent_slot;
|
|
|
|
u64 child_gen;
|
|
|
|
u64 child_bytenr;
|
|
|
|
|
|
|
|
eb = src_path->nodes[cur_level + 1];
|
|
|
|
parent_slot = src_path->slots[cur_level + 1];
|
|
|
|
child_bytenr = btrfs_node_blockptr(eb, parent_slot);
|
|
|
|
child_gen = btrfs_node_ptr_generation(eb, parent_slot);
|
|
|
|
btrfs_node_key_to_cpu(eb, &first_key, parent_slot);
|
|
|
|
|
|
|
|
eb = read_tree_block(fs_info, child_bytenr, child_gen,
|
|
|
|
cur_level, &first_key);
|
|
|
|
if (IS_ERR(eb)) {
|
|
|
|
ret = PTR_ERR(eb);
|
|
|
|
goto out;
|
|
|
|
} else if (!extent_buffer_uptodate(eb)) {
|
|
|
|
free_extent_buffer(eb);
|
|
|
|
ret = -EIO;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
src_path->nodes[cur_level] = eb;
|
|
|
|
|
|
|
|
btrfs_tree_read_lock(eb);
|
2018-04-04 02:00:17 +02:00
|
|
|
btrfs_set_lock_blocking_read(eb);
|
2018-09-27 14:42:30 +08:00
|
|
|
src_path->locks[cur_level] = BTRFS_READ_LOCK_BLOCKING;
|
|
|
|
}
|
|
|
|
|
|
|
|
src_path->slots[cur_level] = dst_path->slots[cur_level];
|
|
|
|
if (cur_level) {
|
|
|
|
btrfs_node_key_to_cpu(dst_path->nodes[cur_level],
|
|
|
|
&dst_key, dst_path->slots[cur_level]);
|
|
|
|
btrfs_node_key_to_cpu(src_path->nodes[cur_level],
|
|
|
|
&src_key, src_path->slots[cur_level]);
|
|
|
|
} else {
|
|
|
|
btrfs_item_key_to_cpu(dst_path->nodes[cur_level],
|
|
|
|
&dst_key, dst_path->slots[cur_level]);
|
|
|
|
btrfs_item_key_to_cpu(src_path->nodes[cur_level],
|
|
|
|
&src_key, src_path->slots[cur_level]);
|
|
|
|
}
|
|
|
|
/* Content mismatch, something went wrong */
|
|
|
|
if (btrfs_comp_cpu_keys(&dst_key, &src_key)) {
|
|
|
|
ret = -ENOENT;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
cur_level--;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Now both @dst_path and @src_path have been populated, record the tree
|
|
|
|
* blocks for qgroup accounting.
|
|
|
|
*/
|
|
|
|
ret = btrfs_qgroup_trace_extent(trans, src_path->nodes[dst_level]->start,
|
|
|
|
nodesize, GFP_NOFS);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
ret = btrfs_qgroup_trace_extent(trans,
|
|
|
|
dst_path->nodes[dst_level]->start,
|
|
|
|
nodesize, GFP_NOFS);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
/* Record leaf file extents */
|
btrfs: qgroup: Only trace data extents in leaves if we're relocating data block group
For qgroup_trace_extent_swap(), if we find one leaf that needs to be
traced, we will also iterate all file extents and trace them.
This is OK if we're relocating data block groups, but if we're
relocating metadata block groups, balance code itself has ensured that
both subtree of file tree and reloc tree contain the same contents.
That's to say, if we're relocating metadata block groups, all file
extents in reloc and file tree should match, thus no need to trace them.
This should reduce the total number of dirty extents processed in metadata
block group balance.
[[Benchmark]] (with all previous enhancement)
Hardware:
VM 4G vRAM, 8 vCPUs,
disk is using 'unsafe' cache mode,
backing device is SAMSUNG 850 evo SSD.
Host has 16G ram.
Mkfs parameter:
--nodesize 4K (To bump up tree size)
Initial subvolume contents:
4G data copied from /usr and /lib.
(With enough regular small files)
Snapshots:
16 snapshots of the original subvolume.
each snapshot has 3 random files modified.
balance parameter:
-m
So the content should be pretty similar to a real world root fs layout.
| v4.19-rc1 | w/ patchset | diff (*)
---------------------------------------------------------------
relocated extents | 22929 | 22851 | -0.3%
qgroup dirty extents | 227757 | 140886 | -38.1%
time (sys) | 65.253s | 37.464s | -42.6%
time (real) | 74.032s | 44.722s | -39.6%
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2018-09-27 14:42:35 +08:00
|
|
|
if (dst_level == 0 && trace_leaf) {
|
2018-09-27 14:42:30 +08:00
|
|
|
ret = btrfs_qgroup_trace_leaf_items(trans, src_path->nodes[0]);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
ret = btrfs_qgroup_trace_leaf_items(trans, dst_path->nodes[0]);
|
|
|
|
}
|
|
|
|
out:
|
|
|
|
btrfs_free_path(src_path);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2018-09-27 14:42:31 +08:00
|
|
|
/*
|
|
|
|
* Helper function to do recursive generation-aware depth-first search, to
|
|
|
|
* locate all new tree blocks in a subtree of reloc tree.
|
|
|
|
*
|
|
|
|
* E.g. (OO = Old tree blocks, NN = New tree blocks, whose gen == last_snapshot)
|
|
|
|
* reloc tree
|
|
|
|
* L2 NN (a)
|
|
|
|
* / \
|
|
|
|
* L1 OO NN (b)
|
|
|
|
* / \ / \
|
|
|
|
* L0 OO OO OO NN
|
|
|
|
* (c) (d)
|
|
|
|
* If we pass:
|
|
|
|
* @dst_path = [ nodes[1] = NN(b), nodes[0] = NULL ],
|
|
|
|
* @cur_level = 1
|
|
|
|
* @root_level = 1
|
|
|
|
*
|
|
|
|
* We will iterate through tree blocks NN(b), NN(d) and info qgroup to trace
|
|
|
|
* above tree blocks along with their counter parts in file tree.
|
2018-11-28 12:05:13 +01:00
|
|
|
* While during search, old tree blocks OO(c) will be skipped as tree block swap
|
2018-09-27 14:42:31 +08:00
|
|
|
* won't affect OO(c).
|
|
|
|
*/
|
|
|
|
static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans,
|
|
|
|
struct extent_buffer *src_eb,
|
|
|
|
struct btrfs_path *dst_path,
|
|
|
|
int cur_level, int root_level,
|
btrfs: qgroup: Only trace data extents in leaves if we're relocating data block group
For qgroup_trace_extent_swap(), if we find one leaf that needs to be
traced, we will also iterate all file extents and trace them.
This is OK if we're relocating data block groups, but if we're
relocating metadata block groups, balance code itself has ensured that
both subtree of file tree and reloc tree contain the same contents.
That's to say, if we're relocating metadata block groups, all file
extents in reloc and file tree should match, thus no need to trace them.
This should reduce the total number of dirty extents processed in metadata
block group balance.
[[Benchmark]] (with all previous enhancement)
Hardware:
VM 4G vRAM, 8 vCPUs,
disk is using 'unsafe' cache mode,
backing device is SAMSUNG 850 evo SSD.
Host has 16G ram.
Mkfs parameter:
--nodesize 4K (To bump up tree size)
Initial subvolume contents:
4G data copied from /usr and /lib.
(With enough regular small files)
Snapshots:
16 snapshots of the original subvolume.
each snapshot has 3 random files modified.
balance parameter:
-m
So the content should be pretty similar to a real world root fs layout.
| v4.19-rc1 | w/ patchset | diff (*)
---------------------------------------------------------------
relocated extents | 22929 | 22851 | -0.3%
qgroup dirty extents | 227757 | 140886 | -38.1%
time (sys) | 65.253s | 37.464s | -42.6%
time (real) | 74.032s | 44.722s | -39.6%
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2018-09-27 14:42:35 +08:00
|
|
|
u64 last_snapshot, bool trace_leaf)
|
2018-09-27 14:42:31 +08:00
|
|
|
{
|
|
|
|
struct btrfs_fs_info *fs_info = trans->fs_info;
|
|
|
|
struct extent_buffer *eb;
|
|
|
|
bool need_cleanup = false;
|
|
|
|
int ret = 0;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
/* Level sanity check */
|
2019-03-18 17:45:19 +02:00
|
|
|
if (cur_level < 0 || cur_level >= BTRFS_MAX_LEVEL - 1 ||
|
|
|
|
root_level < 0 || root_level >= BTRFS_MAX_LEVEL - 1 ||
|
2018-09-27 14:42:31 +08:00
|
|
|
root_level < cur_level) {
|
|
|
|
btrfs_err_rl(fs_info,
|
|
|
|
"%s: bad levels, cur_level=%d root_level=%d",
|
|
|
|
__func__, cur_level, root_level);
|
|
|
|
return -EUCLEAN;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Read the tree block if needed */
|
|
|
|
if (dst_path->nodes[cur_level] == NULL) {
|
|
|
|
struct btrfs_key first_key;
|
|
|
|
int parent_slot;
|
|
|
|
u64 child_gen;
|
|
|
|
u64 child_bytenr;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* dst_path->nodes[root_level] must be initialized before
|
|
|
|
* calling this function.
|
|
|
|
*/
|
|
|
|
if (cur_level == root_level) {
|
|
|
|
btrfs_err_rl(fs_info,
|
|
|
|
"%s: dst_path->nodes[%d] not initialized, root_level=%d cur_level=%d",
|
|
|
|
__func__, root_level, root_level, cur_level);
|
|
|
|
return -EUCLEAN;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We need to get child blockptr/gen from parent before we can
|
|
|
|
* read it.
|
|
|
|
*/
|
|
|
|
eb = dst_path->nodes[cur_level + 1];
|
|
|
|
parent_slot = dst_path->slots[cur_level + 1];
|
|
|
|
child_bytenr = btrfs_node_blockptr(eb, parent_slot);
|
|
|
|
child_gen = btrfs_node_ptr_generation(eb, parent_slot);
|
|
|
|
btrfs_node_key_to_cpu(eb, &first_key, parent_slot);
|
|
|
|
|
|
|
|
/* This node is old, no need to trace */
|
|
|
|
if (child_gen < last_snapshot)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
eb = read_tree_block(fs_info, child_bytenr, child_gen,
|
|
|
|
cur_level, &first_key);
|
|
|
|
if (IS_ERR(eb)) {
|
|
|
|
ret = PTR_ERR(eb);
|
|
|
|
goto out;
|
|
|
|
} else if (!extent_buffer_uptodate(eb)) {
|
|
|
|
free_extent_buffer(eb);
|
|
|
|
ret = -EIO;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
dst_path->nodes[cur_level] = eb;
|
|
|
|
dst_path->slots[cur_level] = 0;
|
|
|
|
|
|
|
|
btrfs_tree_read_lock(eb);
|
2018-04-04 02:00:17 +02:00
|
|
|
btrfs_set_lock_blocking_read(eb);
|
2018-09-27 14:42:31 +08:00
|
|
|
dst_path->locks[cur_level] = BTRFS_READ_LOCK_BLOCKING;
|
|
|
|
need_cleanup = true;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Now record this tree block and its counter part for qgroups */
|
|
|
|
ret = qgroup_trace_extent_swap(trans, src_eb, dst_path, cur_level,
|
btrfs: qgroup: Only trace data extents in leaves if we're relocating data block group
For qgroup_trace_extent_swap(), if we find one leaf that needs to be
traced, we will also iterate all file extents and trace them.
This is OK if we're relocating data block groups, but if we're
relocating metadata block groups, balance code itself has ensured that
both subtree of file tree and reloc tree contain the same contents.
That's to say, if we're relocating metadata block groups, all file
extents in reloc and file tree should match, thus no need to trace them.
This should reduce the total number of dirty extents processed in metadata
block group balance.
[[Benchmark]] (with all previous enhancement)
Hardware:
VM 4G vRAM, 8 vCPUs,
disk is using 'unsafe' cache mode,
backing device is SAMSUNG 850 evo SSD.
Host has 16G ram.
Mkfs parameter:
--nodesize 4K (To bump up tree size)
Initial subvolume contents:
4G data copied from /usr and /lib.
(With enough regular small files)
Snapshots:
16 snapshots of the original subvolume.
each snapshot has 3 random files modified.
balance parameter:
-m
So the content should be pretty similar to a real world root fs layout.
| v4.19-rc1 | w/ patchset | diff (*)
---------------------------------------------------------------
relocated extents | 22929 | 22851 | -0.3%
qgroup dirty extents | 227757 | 140886 | -38.1%
time (sys) | 65.253s | 37.464s | -42.6%
time (real) | 74.032s | 44.722s | -39.6%
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2018-09-27 14:42:35 +08:00
|
|
|
root_level, trace_leaf);
|
2018-09-27 14:42:31 +08:00
|
|
|
if (ret < 0)
|
|
|
|
goto cleanup;
|
|
|
|
|
|
|
|
eb = dst_path->nodes[cur_level];
|
|
|
|
|
|
|
|
if (cur_level > 0) {
|
|
|
|
/* Iterate all child tree blocks */
|
|
|
|
for (i = 0; i < btrfs_header_nritems(eb); i++) {
|
|
|
|
/* Skip old tree blocks as they won't be swapped */
|
|
|
|
if (btrfs_node_ptr_generation(eb, i) < last_snapshot)
|
|
|
|
continue;
|
|
|
|
dst_path->slots[cur_level] = i;
|
|
|
|
|
|
|
|
/* Recursive call (at most 7 times) */
|
|
|
|
ret = qgroup_trace_new_subtree_blocks(trans, src_eb,
|
|
|
|
dst_path, cur_level - 1, root_level,
|
btrfs: qgroup: Only trace data extents in leaves if we're relocating data block group
For qgroup_trace_extent_swap(), if we find one leaf that needs to be
traced, we will also iterate all file extents and trace them.
This is OK if we're relocating data block groups, but if we're
relocating metadata block groups, balance code itself has ensured that
both subtree of file tree and reloc tree contain the same contents.
That's to say, if we're relocating metadata block groups, all file
extents in reloc and file tree should match, thus no need to trace them.
This should reduce the total number of dirty extents processed in metadata
block group balance.
[[Benchmark]] (with all previous enhancement)
Hardware:
VM 4G vRAM, 8 vCPUs,
disk is using 'unsafe' cache mode,
backing device is SAMSUNG 850 evo SSD.
Host has 16G ram.
Mkfs parameter:
--nodesize 4K (To bump up tree size)
Initial subvolume contents:
4G data copied from /usr and /lib.
(With enough regular small files)
Snapshots:
16 snapshots of the original subvolume.
each snapshot has 3 random files modified.
balance parameter:
-m
So the content should be pretty similar to a real world root fs layout.
| v4.19-rc1 | w/ patchset | diff (*)
---------------------------------------------------------------
relocated extents | 22929 | 22851 | -0.3%
qgroup dirty extents | 227757 | 140886 | -38.1%
time (sys) | 65.253s | 37.464s | -42.6%
time (real) | 74.032s | 44.722s | -39.6%
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2018-09-27 14:42:35 +08:00
|
|
|
last_snapshot, trace_leaf);
|
2018-09-27 14:42:31 +08:00
|
|
|
if (ret < 0)
|
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
cleanup:
|
|
|
|
if (need_cleanup) {
|
|
|
|
/* Clean up */
|
|
|
|
btrfs_tree_unlock_rw(dst_path->nodes[cur_level],
|
|
|
|
dst_path->locks[cur_level]);
|
|
|
|
free_extent_buffer(dst_path->nodes[cur_level]);
|
|
|
|
dst_path->nodes[cur_level] = NULL;
|
|
|
|
dst_path->slots[cur_level] = 0;
|
|
|
|
dst_path->locks[cur_level] = 0;
|
|
|
|
}
|
|
|
|
out:
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2019-01-23 15:15:15 +08:00
|
|
|
static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
|
|
|
|
struct extent_buffer *src_eb,
|
|
|
|
struct extent_buffer *dst_eb,
|
|
|
|
u64 last_snapshot, bool trace_leaf)
|
|
|
|
{
|
|
|
|
struct btrfs_fs_info *fs_info = trans->fs_info;
|
|
|
|
struct btrfs_path *dst_path = NULL;
|
|
|
|
int level;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
/* Wrong parameter order */
|
|
|
|
if (btrfs_header_generation(src_eb) > btrfs_header_generation(dst_eb)) {
|
|
|
|
btrfs_err_rl(fs_info,
|
|
|
|
"%s: bad parameter order, src_gen=%llu dst_gen=%llu", __func__,
|
|
|
|
btrfs_header_generation(src_eb),
|
|
|
|
btrfs_header_generation(dst_eb));
|
|
|
|
return -EUCLEAN;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!extent_buffer_uptodate(src_eb) || !extent_buffer_uptodate(dst_eb)) {
|
|
|
|
ret = -EIO;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
level = btrfs_header_level(dst_eb);
|
|
|
|
dst_path = btrfs_alloc_path();
|
|
|
|
if (!dst_path) {
|
|
|
|
ret = -ENOMEM;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
/* For dst_path */
|
2019-10-08 13:28:47 +02:00
|
|
|
atomic_inc(&dst_eb->refs);
|
2019-01-23 15:15:15 +08:00
|
|
|
dst_path->nodes[level] = dst_eb;
|
|
|
|
dst_path->slots[level] = 0;
|
|
|
|
dst_path->locks[level] = 0;
|
|
|
|
|
|
|
|
/* Do the generation aware breadth-first search */
|
|
|
|
ret = qgroup_trace_new_subtree_blocks(trans, src_eb, dst_path, level,
|
|
|
|
level, last_snapshot, trace_leaf);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
ret = 0;
|
|
|
|
|
|
|
|
out:
|
|
|
|
btrfs_free_path(dst_path);
|
|
|
|
if (ret < 0)
|
|
|
|
fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2016-10-18 09:31:28 +08:00
|
|
|
int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
|
|
|
|
struct extent_buffer *root_eb,
|
|
|
|
u64 root_gen, int root_level)
|
|
|
|
{
|
2018-07-18 14:45:38 +08:00
|
|
|
struct btrfs_fs_info *fs_info = trans->fs_info;
|
2016-10-18 09:31:28 +08:00
|
|
|
int ret = 0;
|
|
|
|
int level;
|
|
|
|
struct extent_buffer *eb = root_eb;
|
|
|
|
struct btrfs_path *path = NULL;
|
|
|
|
|
2017-07-12 09:42:19 +03:00
|
|
|
BUG_ON(root_level < 0 || root_level >= BTRFS_MAX_LEVEL);
|
2016-10-18 09:31:28 +08:00
|
|
|
BUG_ON(root_eb == NULL);
|
|
|
|
|
2016-06-22 18:54:23 -04:00
|
|
|
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
|
2016-10-18 09:31:28 +08:00
|
|
|
return 0;
|
|
|
|
|
|
|
|
if (!extent_buffer_uptodate(root_eb)) {
|
2018-03-29 09:08:11 +08:00
|
|
|
ret = btrfs_read_buffer(root_eb, root_gen, root_level, NULL);
|
2016-10-18 09:31:28 +08:00
|
|
|
if (ret)
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (root_level == 0) {
|
2018-07-18 14:45:37 +08:00
|
|
|
ret = btrfs_qgroup_trace_leaf_items(trans, root_eb);
|
2016-10-18 09:31:28 +08:00
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
path = btrfs_alloc_path();
|
|
|
|
if (!path)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Walk down the tree. Missing extent blocks are filled in as
|
|
|
|
* we go. Metadata is accounted every time we read a new
|
|
|
|
* extent block.
|
|
|
|
*
|
|
|
|
* When we reach a leaf, we account for file extent items in it,
|
|
|
|
* walk back up the tree (adjusting slot pointers as we go)
|
|
|
|
* and restart the search process.
|
|
|
|
*/
|
2019-10-08 13:28:47 +02:00
|
|
|
atomic_inc(&root_eb->refs); /* For path */
|
2016-10-18 09:31:28 +08:00
|
|
|
path->nodes[root_level] = root_eb;
|
|
|
|
path->slots[root_level] = 0;
|
|
|
|
path->locks[root_level] = 0; /* so release_path doesn't try to unlock */
|
|
|
|
walk_down:
|
|
|
|
level = root_level;
|
|
|
|
while (level >= 0) {
|
|
|
|
if (path->nodes[level] == NULL) {
|
2018-03-29 09:08:11 +08:00
|
|
|
struct btrfs_key first_key;
|
2016-10-18 09:31:28 +08:00
|
|
|
int parent_slot;
|
|
|
|
u64 child_gen;
|
|
|
|
u64 child_bytenr;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We need to get child blockptr/gen from parent before
|
|
|
|
* we can read it.
|
|
|
|
*/
|
|
|
|
eb = path->nodes[level + 1];
|
|
|
|
parent_slot = path->slots[level + 1];
|
|
|
|
child_bytenr = btrfs_node_blockptr(eb, parent_slot);
|
|
|
|
child_gen = btrfs_node_ptr_generation(eb, parent_slot);
|
2018-03-29 09:08:11 +08:00
|
|
|
btrfs_node_key_to_cpu(eb, &first_key, parent_slot);
|
2016-10-18 09:31:28 +08:00
|
|
|
|
2018-03-29 09:08:11 +08:00
|
|
|
eb = read_tree_block(fs_info, child_bytenr, child_gen,
|
|
|
|
level, &first_key);
|
2016-10-18 09:31:28 +08:00
|
|
|
if (IS_ERR(eb)) {
|
|
|
|
ret = PTR_ERR(eb);
|
|
|
|
goto out;
|
|
|
|
} else if (!extent_buffer_uptodate(eb)) {
|
|
|
|
free_extent_buffer(eb);
|
|
|
|
ret = -EIO;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
path->nodes[level] = eb;
|
|
|
|
path->slots[level] = 0;
|
|
|
|
|
|
|
|
btrfs_tree_read_lock(eb);
|
2018-04-04 02:00:17 +02:00
|
|
|
btrfs_set_lock_blocking_read(eb);
|
2016-10-18 09:31:28 +08:00
|
|
|
path->locks[level] = BTRFS_READ_LOCK_BLOCKING;
|
|
|
|
|
2018-07-18 16:28:03 +08:00
|
|
|
ret = btrfs_qgroup_trace_extent(trans, child_bytenr,
|
2016-06-22 18:54:23 -04:00
|
|
|
fs_info->nodesize,
|
|
|
|
GFP_NOFS);
|
2016-10-18 09:31:28 +08:00
|
|
|
if (ret)
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (level == 0) {
|
2018-07-18 14:45:37 +08:00
|
|
|
ret = btrfs_qgroup_trace_leaf_items(trans,
|
|
|
|
path->nodes[level]);
|
2016-10-18 09:31:28 +08:00
|
|
|
if (ret)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
/* Nonzero return here means we completed our search */
|
2017-02-10 20:30:23 +01:00
|
|
|
ret = adjust_slots_upwards(path, root_level);
|
2016-10-18 09:31:28 +08:00
|
|
|
if (ret)
|
|
|
|
break;
|
|
|
|
|
|
|
|
/* Restart search with new slots */
|
|
|
|
goto walk_down;
|
|
|
|
}
|
|
|
|
|
|
|
|
level--;
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = 0;
|
|
|
|
out:
|
|
|
|
btrfs_free_path(path);
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2015-04-12 16:52:34 +08:00
|
|
|
#define UPDATE_NEW 0
|
|
|
|
#define UPDATE_OLD 1
|
|
|
|
/*
|
|
|
|
* Walk all of the roots that points to the bytenr and adjust their refcnts.
|
|
|
|
*/
|
|
|
|
static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info,
|
|
|
|
struct ulist *roots, struct ulist *tmp,
|
|
|
|
struct ulist *qgroups, u64 seq, int update_old)
|
|
|
|
{
|
|
|
|
struct ulist_node *unode;
|
|
|
|
struct ulist_iterator uiter;
|
|
|
|
struct ulist_node *tmp_unode;
|
|
|
|
struct ulist_iterator tmp_uiter;
|
|
|
|
struct btrfs_qgroup *qg;
|
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
if (!roots)
|
|
|
|
return 0;
|
|
|
|
ULIST_ITER_INIT(&uiter);
|
|
|
|
while ((unode = ulist_next(roots, &uiter))) {
|
|
|
|
qg = find_qgroup_rb(fs_info, unode->val);
|
|
|
|
if (!qg)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
ulist_reinit(tmp);
|
2016-10-26 16:23:50 +02:00
|
|
|
ret = ulist_add(qgroups, qg->qgroupid, qgroup_to_aux(qg),
|
2015-04-12 16:52:34 +08:00
|
|
|
GFP_ATOMIC);
|
|
|
|
if (ret < 0)
|
|
|
|
return ret;
|
2016-10-26 16:23:50 +02:00
|
|
|
ret = ulist_add(tmp, qg->qgroupid, qgroup_to_aux(qg), GFP_ATOMIC);
|
2015-04-12 16:52:34 +08:00
|
|
|
if (ret < 0)
|
|
|
|
return ret;
|
|
|
|
ULIST_ITER_INIT(&tmp_uiter);
|
|
|
|
while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
|
|
|
|
struct btrfs_qgroup_list *glist;
|
|
|
|
|
2016-10-26 16:23:50 +02:00
|
|
|
qg = unode_aux_to_qgroup(tmp_unode);
|
2015-04-12 16:52:34 +08:00
|
|
|
if (update_old)
|
|
|
|
btrfs_qgroup_update_old_refcnt(qg, seq, 1);
|
|
|
|
else
|
|
|
|
btrfs_qgroup_update_new_refcnt(qg, seq, 1);
|
|
|
|
list_for_each_entry(glist, &qg->groups, next_group) {
|
|
|
|
ret = ulist_add(qgroups, glist->group->qgroupid,
|
2016-10-26 16:23:50 +02:00
|
|
|
qgroup_to_aux(glist->group),
|
2015-04-12 16:52:34 +08:00
|
|
|
GFP_ATOMIC);
|
|
|
|
if (ret < 0)
|
|
|
|
return ret;
|
|
|
|
ret = ulist_add(tmp, glist->group->qgroupid,
|
2016-10-26 16:23:50 +02:00
|
|
|
qgroup_to_aux(glist->group),
|
2015-04-12 16:52:34 +08:00
|
|
|
GFP_ATOMIC);
|
|
|
|
if (ret < 0)
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2015-04-12 16:59:57 +08:00
|
|
|
/*
|
|
|
|
* Update qgroup rfer/excl counters.
|
|
|
|
* Rfer update is easy, codes can explain themselves.
|
2015-04-17 10:23:16 +08:00
|
|
|
*
|
2020-08-04 19:48:34 -07:00
|
|
|
* Excl update is tricky, the update is split into 2 parts.
|
2015-04-12 16:59:57 +08:00
|
|
|
* Part 1: Possible exclusive <-> sharing detect:
|
|
|
|
* | A | !A |
|
|
|
|
* -------------------------------------
|
|
|
|
* B | * | - |
|
|
|
|
* -------------------------------------
|
|
|
|
* !B | + | ** |
|
|
|
|
* -------------------------------------
|
|
|
|
*
|
|
|
|
* Conditions:
|
|
|
|
* A: cur_old_roots < nr_old_roots (not exclusive before)
|
|
|
|
* !A: cur_old_roots == nr_old_roots (possible exclusive before)
|
|
|
|
* B: cur_new_roots < nr_new_roots (not exclusive now)
|
2016-05-19 21:18:45 -04:00
|
|
|
* !B: cur_new_roots == nr_new_roots (possible exclusive now)
|
2015-04-12 16:59:57 +08:00
|
|
|
*
|
|
|
|
* Results:
|
|
|
|
* +: Possible sharing -> exclusive -: Possible exclusive -> sharing
|
|
|
|
* *: Definitely not changed. **: Possible unchanged.
|
|
|
|
*
|
|
|
|
* For !A and !B condition, the exception is cur_old/new_roots == 0 case.
|
|
|
|
*
|
|
|
|
* To make the logic clear, we first use condition A and B to split
|
|
|
|
* combination into 4 results.
|
|
|
|
*
|
|
|
|
* Then, for result "+" and "-", check old/new_roots == 0 case, as in them
|
|
|
|
* only on variant maybe 0.
|
|
|
|
*
|
|
|
|
* Lastly, check result **, since there are 2 variants maybe 0, split them
|
|
|
|
* again(2x2).
|
|
|
|
* But this time we don't need to consider other things, the codes and logic
|
|
|
|
* is easy to understand now.
|
|
|
|
*/
|
|
|
|
static int qgroup_update_counters(struct btrfs_fs_info *fs_info,
|
|
|
|
struct ulist *qgroups,
|
|
|
|
u64 nr_old_roots,
|
|
|
|
u64 nr_new_roots,
|
|
|
|
u64 num_bytes, u64 seq)
|
|
|
|
{
|
|
|
|
struct ulist_node *unode;
|
|
|
|
struct ulist_iterator uiter;
|
|
|
|
struct btrfs_qgroup *qg;
|
|
|
|
u64 cur_new_count, cur_old_count;
|
|
|
|
|
|
|
|
ULIST_ITER_INIT(&uiter);
|
|
|
|
while ((unode = ulist_next(qgroups, &uiter))) {
|
|
|
|
bool dirty = false;
|
|
|
|
|
2016-10-26 16:23:50 +02:00
|
|
|
qg = unode_aux_to_qgroup(unode);
|
2015-04-12 16:59:57 +08:00
|
|
|
cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq);
|
|
|
|
cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq);
|
|
|
|
|
2018-04-30 15:04:44 +08:00
|
|
|
trace_qgroup_update_counters(fs_info, qg, cur_old_count,
|
|
|
|
cur_new_count);
|
2016-03-29 17:19:55 -07:00
|
|
|
|
2015-04-12 16:59:57 +08:00
|
|
|
/* Rfer update part */
|
|
|
|
if (cur_old_count == 0 && cur_new_count > 0) {
|
|
|
|
qg->rfer += num_bytes;
|
|
|
|
qg->rfer_cmpr += num_bytes;
|
|
|
|
dirty = true;
|
|
|
|
}
|
|
|
|
if (cur_old_count > 0 && cur_new_count == 0) {
|
|
|
|
qg->rfer -= num_bytes;
|
|
|
|
qg->rfer_cmpr -= num_bytes;
|
|
|
|
dirty = true;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Excl update part */
|
|
|
|
/* Exclusive/none -> shared case */
|
|
|
|
if (cur_old_count == nr_old_roots &&
|
|
|
|
cur_new_count < nr_new_roots) {
|
|
|
|
/* Exclusive -> shared */
|
|
|
|
if (cur_old_count != 0) {
|
|
|
|
qg->excl -= num_bytes;
|
|
|
|
qg->excl_cmpr -= num_bytes;
|
|
|
|
dirty = true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Shared -> exclusive/none case */
|
|
|
|
if (cur_old_count < nr_old_roots &&
|
|
|
|
cur_new_count == nr_new_roots) {
|
|
|
|
/* Shared->exclusive */
|
|
|
|
if (cur_new_count != 0) {
|
|
|
|
qg->excl += num_bytes;
|
|
|
|
qg->excl_cmpr += num_bytes;
|
|
|
|
dirty = true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Exclusive/none -> exclusive/none case */
|
|
|
|
if (cur_old_count == nr_old_roots &&
|
|
|
|
cur_new_count == nr_new_roots) {
|
|
|
|
if (cur_old_count == 0) {
|
|
|
|
/* None -> exclusive/none */
|
|
|
|
|
|
|
|
if (cur_new_count != 0) {
|
|
|
|
/* None -> exclusive */
|
|
|
|
qg->excl += num_bytes;
|
|
|
|
qg->excl_cmpr += num_bytes;
|
|
|
|
dirty = true;
|
|
|
|
}
|
|
|
|
/* None -> none, nothing changed */
|
|
|
|
} else {
|
|
|
|
/* Exclusive -> exclusive/none */
|
|
|
|
|
|
|
|
if (cur_new_count == 0) {
|
|
|
|
/* Exclusive -> none */
|
|
|
|
qg->excl -= num_bytes;
|
|
|
|
qg->excl_cmpr -= num_bytes;
|
|
|
|
dirty = true;
|
|
|
|
}
|
|
|
|
/* Exclusive -> exclusive, nothing changed */
|
|
|
|
}
|
|
|
|
}
|
2015-08-03 14:44:29 +08:00
|
|
|
|
2015-04-12 16:59:57 +08:00
|
|
|
if (dirty)
|
|
|
|
qgroup_dirty(fs_info, qg);
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2017-02-27 15:10:34 +08:00
|
|
|
/*
|
|
|
|
* Check if the @roots potentially is a list of fs tree roots
|
|
|
|
*
|
|
|
|
* Return 0 for definitely not a fs/subvol tree roots ulist
|
|
|
|
* Return 1 for possible fs/subvol tree roots in the list (considering an empty
|
|
|
|
* one as well)
|
|
|
|
*/
|
|
|
|
static int maybe_fs_roots(struct ulist *roots)
|
|
|
|
{
|
|
|
|
struct ulist_node *unode;
|
|
|
|
struct ulist_iterator uiter;
|
|
|
|
|
|
|
|
/* Empty one, still possible for fs roots */
|
|
|
|
if (!roots || roots->nnodes == 0)
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
ULIST_ITER_INIT(&uiter);
|
|
|
|
unode = ulist_next(roots, &uiter);
|
|
|
|
if (!unode)
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If it contains fs tree roots, then it must belong to fs/subvol
|
|
|
|
* trees.
|
|
|
|
* If it contains a non-fs tree, it won't be shared with fs/subvol trees.
|
|
|
|
*/
|
|
|
|
return is_fstree(unode->val);
|
|
|
|
}
|
|
|
|
|
2018-07-18 14:45:39 +08:00
|
|
|
int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
|
|
|
|
u64 num_bytes, struct ulist *old_roots,
|
|
|
|
struct ulist *new_roots)
|
2015-04-16 15:37:33 +08:00
|
|
|
{
|
2018-07-18 14:45:39 +08:00
|
|
|
struct btrfs_fs_info *fs_info = trans->fs_info;
|
2015-04-16 15:37:33 +08:00
|
|
|
struct ulist *qgroups = NULL;
|
|
|
|
struct ulist *tmp = NULL;
|
|
|
|
u64 seq;
|
|
|
|
u64 nr_new_roots = 0;
|
|
|
|
u64 nr_old_roots = 0;
|
|
|
|
int ret = 0;
|
|
|
|
|
2020-01-08 21:07:32 +09:00
|
|
|
/*
|
|
|
|
* If quotas get disabled meanwhile, the resouces need to be freed and
|
|
|
|
* we can't just exit here.
|
|
|
|
*/
|
2017-02-13 14:05:24 +01:00
|
|
|
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
|
2020-01-08 21:07:32 +09:00
|
|
|
goto out_free;
|
2017-02-13 14:05:24 +01:00
|
|
|
|
2017-02-27 15:10:34 +08:00
|
|
|
if (new_roots) {
|
|
|
|
if (!maybe_fs_roots(new_roots))
|
|
|
|
goto out_free;
|
2015-04-16 15:37:33 +08:00
|
|
|
nr_new_roots = new_roots->nnodes;
|
2017-02-27 15:10:34 +08:00
|
|
|
}
|
|
|
|
if (old_roots) {
|
|
|
|
if (!maybe_fs_roots(old_roots))
|
|
|
|
goto out_free;
|
2015-04-16 15:37:33 +08:00
|
|
|
nr_old_roots = old_roots->nnodes;
|
2017-02-27 15:10:34 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Quick exit, either not fs tree roots, or won't affect any qgroup */
|
|
|
|
if (nr_old_roots == 0 && nr_new_roots == 0)
|
|
|
|
goto out_free;
|
2015-04-16 15:37:33 +08:00
|
|
|
|
|
|
|
BUG_ON(!fs_info->quota_root);
|
|
|
|
|
2018-05-03 09:59:02 +08:00
|
|
|
trace_btrfs_qgroup_account_extent(fs_info, trans->transid, bytenr,
|
|
|
|
num_bytes, nr_old_roots, nr_new_roots);
|
2016-03-29 17:19:55 -07:00
|
|
|
|
2015-04-16 15:37:33 +08:00
|
|
|
qgroups = ulist_alloc(GFP_NOFS);
|
|
|
|
if (!qgroups) {
|
|
|
|
ret = -ENOMEM;
|
|
|
|
goto out_free;
|
|
|
|
}
|
|
|
|
tmp = ulist_alloc(GFP_NOFS);
|
|
|
|
if (!tmp) {
|
|
|
|
ret = -ENOMEM;
|
|
|
|
goto out_free;
|
|
|
|
}
|
|
|
|
|
|
|
|
mutex_lock(&fs_info->qgroup_rescan_lock);
|
|
|
|
if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
|
|
|
|
if (fs_info->qgroup_rescan_progress.objectid <= bytenr) {
|
|
|
|
mutex_unlock(&fs_info->qgroup_rescan_lock);
|
|
|
|
ret = 0;
|
|
|
|
goto out_free;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
mutex_unlock(&fs_info->qgroup_rescan_lock);
|
|
|
|
|
|
|
|
spin_lock(&fs_info->qgroup_lock);
|
|
|
|
seq = fs_info->qgroup_seq;
|
|
|
|
|
|
|
|
/* Update old refcnts using old_roots */
|
|
|
|
ret = qgroup_update_refcnt(fs_info, old_roots, tmp, qgroups, seq,
|
|
|
|
UPDATE_OLD);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
/* Update new refcnts using new_roots */
|
|
|
|
ret = qgroup_update_refcnt(fs_info, new_roots, tmp, qgroups, seq,
|
|
|
|
UPDATE_NEW);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
qgroup_update_counters(fs_info, qgroups, nr_old_roots, nr_new_roots,
|
|
|
|
num_bytes, seq);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Bump qgroup_seq to avoid seq overlap
|
|
|
|
*/
|
|
|
|
fs_info->qgroup_seq += max(nr_old_roots, nr_new_roots) + 1;
|
|
|
|
out:
|
|
|
|
spin_unlock(&fs_info->qgroup_lock);
|
|
|
|
out_free:
|
|
|
|
ulist_free(tmp);
|
|
|
|
ulist_free(qgroups);
|
|
|
|
ulist_free(old_roots);
|
|
|
|
ulist_free(new_roots);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2018-03-15 16:00:25 +02:00
|
|
|
int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
|
2015-04-16 15:37:33 +08:00
|
|
|
{
|
2018-03-15 16:00:25 +02:00
|
|
|
struct btrfs_fs_info *fs_info = trans->fs_info;
|
2015-04-16 15:37:33 +08:00
|
|
|
struct btrfs_qgroup_extent_record *record;
|
|
|
|
struct btrfs_delayed_ref_root *delayed_refs;
|
|
|
|
struct ulist *new_roots = NULL;
|
|
|
|
struct rb_node *node;
|
2018-09-27 14:42:29 +08:00
|
|
|
u64 num_dirty_extents = 0;
|
2015-04-20 09:53:50 +08:00
|
|
|
u64 qgroup_to_skip;
|
2015-04-16 15:37:33 +08:00
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
delayed_refs = &trans->transaction->delayed_refs;
|
2015-04-20 09:53:50 +08:00
|
|
|
qgroup_to_skip = delayed_refs->qgroup_to_skip;
|
2015-04-16 15:37:33 +08:00
|
|
|
while ((node = rb_first(&delayed_refs->dirty_extent_root))) {
|
|
|
|
record = rb_entry(node, struct btrfs_qgroup_extent_record,
|
|
|
|
node);
|
|
|
|
|
2018-09-27 14:42:29 +08:00
|
|
|
num_dirty_extents++;
|
2016-06-09 17:27:55 -04:00
|
|
|
trace_btrfs_qgroup_account_extents(fs_info, record);
|
2016-03-29 17:19:55 -07:00
|
|
|
|
2015-04-16 15:37:33 +08:00
|
|
|
if (!ret) {
|
2017-02-27 15:10:35 +08:00
|
|
|
/*
|
|
|
|
* Old roots should be searched when inserting qgroup
|
|
|
|
* extent record
|
|
|
|
*/
|
|
|
|
if (WARN_ON(!record->old_roots)) {
|
|
|
|
/* Search commit root to find old_roots */
|
|
|
|
ret = btrfs_find_all_roots(NULL, fs_info,
|
|
|
|
record->bytenr, 0,
|
btrfs: add a flag to iterate_inodes_from_logical to find all extent refs for uncompressed extents
The LOGICAL_INO ioctl provides a backward mapping from extent bytenr and
offset (encoded as a single logical address) to a list of extent refs.
LOGICAL_INO complements TREE_SEARCH, which provides the forward mapping
(extent ref -> extent bytenr and offset, or logical address). These are
useful capabilities for programs that manipulate extents and extent
references from userspace (e.g. dedup and defrag utilities).
When the extents are uncompressed (and not encrypted and not other),
check_extent_in_eb performs filtering of the extent refs to remove any
extent refs which do not contain the same extent offset as the 'logical'
parameter's extent offset. This prevents LOGICAL_INO from returning
references to more than a single block.
To find the set of extent references to an uncompressed extent from [a, b),
userspace has to run a loop like this pseudocode:
for (i = a; i < b; ++i)
extent_ref_set += LOGICAL_INO(i);
At each iteration of the loop (up to 32768 iterations for a 128M extent),
data we are interested in is collected in the kernel, then deleted by
the filter in check_extent_in_eb.
When the extents are compressed (or encrypted or other), the 'logical'
parameter must be an extent bytenr (the 'a' parameter in the loop).
No filtering by extent offset is done (or possible?) so the result is
the complete set of extent refs for the entire extent. This removes
the need for the loop, since we get all the extent refs in one call.
Add an 'ignore_offset' argument to iterate_inodes_from_logical,
[...several levels of function call graph...], and check_extent_in_eb, so
that we can disable the extent offset filtering for uncompressed extents.
This flag can be set by an improved version of the LOGICAL_INO ioctl to
get either behavior as desired.
There is no functional change in this patch. The new flag is always
false.
Signed-off-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org>
Reviewed-by: David Sterba <dsterba@suse.com>
[ minor coding style fixes ]
Signed-off-by: David Sterba <dsterba@suse.com>
2017-09-22 13:58:45 -04:00
|
|
|
&record->old_roots, false);
|
2017-02-27 15:10:35 +08:00
|
|
|
if (ret < 0)
|
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
|
btrfs: qgroup: Move reserved data accounting from btrfs_delayed_ref_head to btrfs_qgroup_extent_record
[BUG]
Btrfs/139 will fail with a high probability if the testing machine (VM)
has only 2G RAM.
Resulting the final write success while it should fail due to EDQUOT,
and the fs will have quota exceeding the limit by 16K.
The simplified reproducer will be: (needs a 2G ram VM)
$ mkfs.btrfs -f $dev
$ mount $dev $mnt
$ btrfs subv create $mnt/subv
$ btrfs quota enable $mnt
$ btrfs quota rescan -w $mnt
$ btrfs qgroup limit -e 1G $mnt/subv
$ for i in $(seq -w 1 8); do
xfs_io -f -c "pwrite 0 128M" $mnt/subv/file_$i > /dev/null
echo "file $i written" > /dev/kmsg
done
$ sync
$ btrfs qgroup show -pcre --raw $mnt
The last pwrite will not trigger EDQUOT and final 'qgroup show' will
show something like:
qgroupid rfer excl max_rfer max_excl parent child
-------- ---- ---- -------- -------- ------ -----
0/5 16384 16384 none none --- ---
0/256 1073758208 1073758208 none 1073741824 --- ---
And 1073758208 is larger than
> 1073741824.
[CAUSE]
It's a bug in btrfs qgroup data reserved space management.
For quota limit, we must ensure that:
reserved (data + metadata) + rfer/excl <= limit
Since rfer/excl is only updated at transaction commmit time, reserved
space needs to be taken special care.
One important part of reserved space is data, and for a new data extent
written to disk, we still need to take the reserved space until
rfer/excl numbers get updated.
Originally when an ordered extent finishes, we migrate the reserved
qgroup data space from extent_io tree to delayed ref head of the data
extent, expecting delayed ref will only be cleaned up at commit
transaction time.
However for small RAM machine, due to memory pressure dirty pages can be
flushed back to disk without committing a transaction.
The related events will be something like:
file 1 written
btrfs_finish_ordered_io: ino=258 ordered offset=0 len=54947840
btrfs_finish_ordered_io: ino=258 ordered offset=54947840 len=5636096
btrfs_finish_ordered_io: ino=258 ordered offset=61153280 len=57344
btrfs_finish_ordered_io: ino=258 ordered offset=61210624 len=8192
btrfs_finish_ordered_io: ino=258 ordered offset=60583936 len=569344
cleanup_ref_head: num_bytes=54947840
cleanup_ref_head: num_bytes=5636096
cleanup_ref_head: num_bytes=569344
cleanup_ref_head: num_bytes=57344
cleanup_ref_head: num_bytes=8192
^^^^^^^^^^^^^^^^ This will free qgroup data reserved space
file 2 written
...
file 8 written
cleanup_ref_head: num_bytes=8192
...
btrfs_commit_transaction <<< the only transaction committed during
the test
When file 2 is written, we have already freed 128M reserved qgroup data
space for ino 258. Thus later write won't trigger EDQUOT.
This allows us to write more data beyond qgroup limit.
In my 2G ram VM, it could reach about 1.2G before hitting EDQUOT.
[FIX]
By moving reserved qgroup data space from btrfs_delayed_ref_head to
btrfs_qgroup_extent_record, we can ensure that reserved qgroup data
space won't be freed half way before commit transaction, thus fix the
problem.
Fixes: f64d5ca86821 ("btrfs: delayed_ref: Add new function to record reserved space into delayed ref")
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-01-23 15:15:12 +08:00
|
|
|
/* Free the reserved data space */
|
|
|
|
btrfs_qgroup_free_refroot(fs_info,
|
|
|
|
record->data_rsv_refroot,
|
|
|
|
record->data_rsv,
|
|
|
|
BTRFS_QGROUP_RSV_DATA);
|
2015-04-16 15:37:33 +08:00
|
|
|
/*
|
2017-03-16 10:04:34 -06:00
|
|
|
* Use SEQ_LAST as time_seq to do special search, which
|
2015-04-16 15:37:33 +08:00
|
|
|
* doesn't lock tree or delayed_refs and search current
|
|
|
|
* root. It's safe inside commit_transaction().
|
|
|
|
*/
|
|
|
|
ret = btrfs_find_all_roots(trans, fs_info,
|
btrfs: add a flag to iterate_inodes_from_logical to find all extent refs for uncompressed extents
The LOGICAL_INO ioctl provides a backward mapping from extent bytenr and
offset (encoded as a single logical address) to a list of extent refs.
LOGICAL_INO complements TREE_SEARCH, which provides the forward mapping
(extent ref -> extent bytenr and offset, or logical address). These are
useful capabilities for programs that manipulate extents and extent
references from userspace (e.g. dedup and defrag utilities).
When the extents are uncompressed (and not encrypted and not other),
check_extent_in_eb performs filtering of the extent refs to remove any
extent refs which do not contain the same extent offset as the 'logical'
parameter's extent offset. This prevents LOGICAL_INO from returning
references to more than a single block.
To find the set of extent references to an uncompressed extent from [a, b),
userspace has to run a loop like this pseudocode:
for (i = a; i < b; ++i)
extent_ref_set += LOGICAL_INO(i);
At each iteration of the loop (up to 32768 iterations for a 128M extent),
data we are interested in is collected in the kernel, then deleted by
the filter in check_extent_in_eb.
When the extents are compressed (or encrypted or other), the 'logical'
parameter must be an extent bytenr (the 'a' parameter in the loop).
No filtering by extent offset is done (or possible?) so the result is
the complete set of extent refs for the entire extent. This removes
the need for the loop, since we get all the extent refs in one call.
Add an 'ignore_offset' argument to iterate_inodes_from_logical,
[...several levels of function call graph...], and check_extent_in_eb, so
that we can disable the extent offset filtering for uncompressed extents.
This flag can be set by an improved version of the LOGICAL_INO ioctl to
get either behavior as desired.
There is no functional change in this patch. The new flag is always
false.
Signed-off-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org>
Reviewed-by: David Sterba <dsterba@suse.com>
[ minor coding style fixes ]
Signed-off-by: David Sterba <dsterba@suse.com>
2017-09-22 13:58:45 -04:00
|
|
|
record->bytenr, SEQ_LAST, &new_roots, false);
|
2015-04-16 15:37:33 +08:00
|
|
|
if (ret < 0)
|
|
|
|
goto cleanup;
|
2017-02-27 15:10:35 +08:00
|
|
|
if (qgroup_to_skip) {
|
2015-04-20 09:53:50 +08:00
|
|
|
ulist_del(new_roots, qgroup_to_skip, 0);
|
2017-02-27 15:10:35 +08:00
|
|
|
ulist_del(record->old_roots, qgroup_to_skip,
|
|
|
|
0);
|
|
|
|
}
|
2018-07-18 14:45:39 +08:00
|
|
|
ret = btrfs_qgroup_account_extent(trans, record->bytenr,
|
|
|
|
record->num_bytes,
|
|
|
|
record->old_roots,
|
|
|
|
new_roots);
|
2015-04-16 15:37:33 +08:00
|
|
|
record->old_roots = NULL;
|
|
|
|
new_roots = NULL;
|
|
|
|
}
|
|
|
|
cleanup:
|
|
|
|
ulist_free(record->old_roots);
|
|
|
|
ulist_free(new_roots);
|
|
|
|
new_roots = NULL;
|
|
|
|
rb_erase(node, &delayed_refs->dirty_extent_root);
|
|
|
|
kfree(record);
|
|
|
|
|
|
|
|
}
|
2018-09-27 14:42:29 +08:00
|
|
|
trace_qgroup_num_dirty_extents(fs_info, trans->transid,
|
|
|
|
num_dirty_extents);
|
2015-04-16 15:37:33 +08:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2012-06-28 18:03:02 +02:00
|
|
|
/*
|
|
|
|
* called from commit_transaction. Writes all changed qgroups to disk.
|
|
|
|
*/
|
2018-07-18 14:45:40 +08:00
|
|
|
int btrfs_run_qgroups(struct btrfs_trans_handle *trans)
|
2012-06-28 18:03:02 +02:00
|
|
|
{
|
2018-07-18 14:45:40 +08:00
|
|
|
struct btrfs_fs_info *fs_info = trans->fs_info;
|
2012-06-28 18:03:02 +02:00
|
|
|
int ret = 0;
|
|
|
|
|
2019-11-25 21:58:50 -03:00
|
|
|
if (!fs_info->quota_root)
|
2018-01-31 10:52:04 +02:00
|
|
|
return ret;
|
2012-06-28 18:03:02 +02:00
|
|
|
|
|
|
|
spin_lock(&fs_info->qgroup_lock);
|
|
|
|
while (!list_empty(&fs_info->dirty_qgroups)) {
|
|
|
|
struct btrfs_qgroup *qgroup;
|
|
|
|
qgroup = list_first_entry(&fs_info->dirty_qgroups,
|
|
|
|
struct btrfs_qgroup, dirty);
|
|
|
|
list_del_init(&qgroup->dirty);
|
|
|
|
spin_unlock(&fs_info->qgroup_lock);
|
2018-07-18 14:45:28 +08:00
|
|
|
ret = update_qgroup_info_item(trans, qgroup);
|
2014-11-20 21:04:56 -05:00
|
|
|
if (ret)
|
|
|
|
fs_info->qgroup_flags |=
|
|
|
|
BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
|
2018-07-18 14:45:27 +08:00
|
|
|
ret = update_qgroup_limit_item(trans, qgroup);
|
2012-06-28 18:03:02 +02:00
|
|
|
if (ret)
|
|
|
|
fs_info->qgroup_flags |=
|
|
|
|
BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
|
|
|
|
spin_lock(&fs_info->qgroup_lock);
|
|
|
|
}
|
2016-09-02 15:40:02 -04:00
|
|
|
if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
|
2012-06-28 18:03:02 +02:00
|
|
|
fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_ON;
|
|
|
|
else
|
|
|
|
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
|
|
|
|
spin_unlock(&fs_info->qgroup_lock);
|
|
|
|
|
2018-07-18 14:45:29 +08:00
|
|
|
ret = update_qgroup_status_item(trans);
|
2012-06-28 18:03:02 +02:00
|
|
|
if (ret)
|
|
|
|
fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2016-05-19 21:18:45 -04:00
|
|
|
* Copy the accounting information between qgroups. This is necessary
|
2016-03-30 17:57:48 -07:00
|
|
|
* when a snapshot or a subvolume is created. Throwing an error will
|
|
|
|
* cause a transaction abort so we take extra care here to only error
|
|
|
|
* when a readonly fs is a reasonable outcome.
|
2012-06-28 18:03:02 +02:00
|
|
|
*/
|
2018-07-18 14:45:41 +08:00
|
|
|
int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
|
|
|
|
u64 objectid, struct btrfs_qgroup_inherit *inherit)
|
2012-06-28 18:03:02 +02:00
|
|
|
{
|
|
|
|
int ret = 0;
|
|
|
|
int i;
|
|
|
|
u64 *i_qgroups;
|
btrfs: qgroup: Don't hold qgroup_ioctl_lock in btrfs_qgroup_inherit()
[BUG]
Lockdep will report the following circular locking dependency:
WARNING: possible circular locking dependency detected
5.2.0-rc2-custom #24 Tainted: G O
------------------------------------------------------
btrfs/8631 is trying to acquire lock:
000000002536438c (&fs_info->qgroup_ioctl_lock#2){+.+.}, at: btrfs_qgroup_inherit+0x40/0x620 [btrfs]
but task is already holding lock:
000000003d52cc23 (&fs_info->tree_log_mutex){+.+.}, at: create_pending_snapshot+0x8b6/0xe60 [btrfs]
which lock already depends on the new lock.
the existing dependency chain (in reverse order) is:
-> #2 (&fs_info->tree_log_mutex){+.+.}:
__mutex_lock+0x76/0x940
mutex_lock_nested+0x1b/0x20
btrfs_commit_transaction+0x475/0xa00 [btrfs]
btrfs_commit_super+0x71/0x80 [btrfs]
close_ctree+0x2bd/0x320 [btrfs]
btrfs_put_super+0x15/0x20 [btrfs]
generic_shutdown_super+0x72/0x110
kill_anon_super+0x18/0x30
btrfs_kill_super+0x16/0xa0 [btrfs]
deactivate_locked_super+0x3a/0x80
deactivate_super+0x51/0x60
cleanup_mnt+0x3f/0x80
__cleanup_mnt+0x12/0x20
task_work_run+0x94/0xb0
exit_to_usermode_loop+0xd8/0xe0
do_syscall_64+0x210/0x240
entry_SYSCALL_64_after_hwframe+0x49/0xbe
-> #1 (&fs_info->reloc_mutex){+.+.}:
__mutex_lock+0x76/0x940
mutex_lock_nested+0x1b/0x20
btrfs_commit_transaction+0x40d/0xa00 [btrfs]
btrfs_quota_enable+0x2da/0x730 [btrfs]
btrfs_ioctl+0x2691/0x2b40 [btrfs]
do_vfs_ioctl+0xa9/0x6d0
ksys_ioctl+0x67/0x90
__x64_sys_ioctl+0x1a/0x20
do_syscall_64+0x65/0x240
entry_SYSCALL_64_after_hwframe+0x49/0xbe
-> #0 (&fs_info->qgroup_ioctl_lock#2){+.+.}:
lock_acquire+0xa7/0x190
__mutex_lock+0x76/0x940
mutex_lock_nested+0x1b/0x20
btrfs_qgroup_inherit+0x40/0x620 [btrfs]
create_pending_snapshot+0x9d7/0xe60 [btrfs]
create_pending_snapshots+0x94/0xb0 [btrfs]
btrfs_commit_transaction+0x415/0xa00 [btrfs]
btrfs_mksubvol+0x496/0x4e0 [btrfs]
btrfs_ioctl_snap_create_transid+0x174/0x180 [btrfs]
btrfs_ioctl_snap_create_v2+0x11c/0x180 [btrfs]
btrfs_ioctl+0xa90/0x2b40 [btrfs]
do_vfs_ioctl+0xa9/0x6d0
ksys_ioctl+0x67/0x90
__x64_sys_ioctl+0x1a/0x20
do_syscall_64+0x65/0x240
entry_SYSCALL_64_after_hwframe+0x49/0xbe
other info that might help us debug this:
Chain exists of:
&fs_info->qgroup_ioctl_lock#2 --> &fs_info->reloc_mutex --> &fs_info->tree_log_mutex
Possible unsafe locking scenario:
CPU0 CPU1
---- ----
lock(&fs_info->tree_log_mutex);
lock(&fs_info->reloc_mutex);
lock(&fs_info->tree_log_mutex);
lock(&fs_info->qgroup_ioctl_lock#2);
*** DEADLOCK ***
6 locks held by btrfs/8631:
#0: 00000000ed8f23f6 (sb_writers#12){.+.+}, at: mnt_want_write_file+0x28/0x60
#1: 000000009fb1597a (&type->i_mutex_dir_key#10/1){+.+.}, at: btrfs_mksubvol+0x70/0x4e0 [btrfs]
#2: 0000000088c5ad88 (&fs_info->subvol_sem){++++}, at: btrfs_mksubvol+0x128/0x4e0 [btrfs]
#3: 000000009606fc3e (sb_internal#2){.+.+}, at: start_transaction+0x37a/0x520 [btrfs]
#4: 00000000f82bbdf5 (&fs_info->reloc_mutex){+.+.}, at: btrfs_commit_transaction+0x40d/0xa00 [btrfs]
#5: 000000003d52cc23 (&fs_info->tree_log_mutex){+.+.}, at: create_pending_snapshot+0x8b6/0xe60 [btrfs]
[CAUSE]
Due to the delayed subvolume creation, we need to call
btrfs_qgroup_inherit() inside commit transaction code, with a lot of
other mutex hold.
This hell of lock chain can lead to above problem.
[FIX]
On the other hand, we don't really need to hold qgroup_ioctl_lock if
we're in the context of create_pending_snapshot().
As in that context, we're the only one being able to modify qgroup.
All other qgroup functions which needs qgroup_ioctl_lock are either
holding a transaction handle, or will start a new transaction:
Functions will start a new transaction():
* btrfs_quota_enable()
* btrfs_quota_disable()
Functions hold a transaction handler:
* btrfs_add_qgroup_relation()
* btrfs_del_qgroup_relation()
* btrfs_create_qgroup()
* btrfs_remove_qgroup()
* btrfs_limit_qgroup()
* btrfs_qgroup_inherit() call inside create_subvol()
So we have a higher level protection provided by transaction, thus we
don't need to always hold qgroup_ioctl_lock in btrfs_qgroup_inherit().
Only the btrfs_qgroup_inherit() call in create_subvol() needs to hold
qgroup_ioctl_lock, while the btrfs_qgroup_inherit() call in
create_pending_snapshot() is already protected by transaction.
So the fix is to detect the context by checking
trans->transaction->state.
If we're at TRANS_STATE_COMMIT_DOING, then we're in commit transaction
context and no need to get the mutex.
Reported-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-06-13 17:31:24 +08:00
|
|
|
bool committing = false;
|
2018-07-18 14:45:41 +08:00
|
|
|
struct btrfs_fs_info *fs_info = trans->fs_info;
|
2018-11-19 16:20:34 +00:00
|
|
|
struct btrfs_root *quota_root;
|
2012-06-28 18:03:02 +02:00
|
|
|
struct btrfs_qgroup *srcgroup;
|
|
|
|
struct btrfs_qgroup *dstgroup;
|
btrfs: qgroup: mark qgroup inconsistent if we're inherting snapshot to a new qgroup
[BUG]
For the following operation, qgroup is guaranteed to be screwed up due
to snapshot adding to a new qgroup:
# mkfs.btrfs -f $dev
# mount $dev $mnt
# btrfs qgroup en $mnt
# btrfs subv create $mnt/src
# xfs_io -f -c "pwrite 0 1m" $mnt/src/file
# sync
# btrfs qgroup create 1/0 $mnt/src
# btrfs subv snapshot -i 1/0 $mnt/src $mnt/snapshot
# btrfs qgroup show -prce $mnt/src
qgroupid rfer excl max_rfer max_excl parent child
-------- ---- ---- -------- -------- ------ -----
0/5 16.00KiB 16.00KiB none none --- ---
0/257 1.02MiB 16.00KiB none none --- ---
0/258 1.02MiB 16.00KiB none none 1/0 ---
1/0 0.00B 0.00B none none --- 0/258
^^^^^^^^^^^^^^^^^^^^
[CAUSE]
The problem is in btrfs_qgroup_inherit(), we don't have good enough
check to determine if the new relation would break the existing
accounting.
Unlike btrfs_add_qgroup_relation(), which has proper check to determine
if we can do quick update without a rescan, in btrfs_qgroup_inherit() we
can even assign a snapshot to multiple qgroups.
[FIX]
Fix it by manually marking qgroup inconsistent for snapshot inheritance.
For subvolume creation, since all its extents are exclusively owned, we
don't need to rescan.
In theory, we should call relation check like quick_update_accounting()
when doing qgroup inheritance and inform user about qgroup accounting
inconsistency.
But we don't have good mechanism to relay that back to the user in the
snapshot creation context, thus we can only silently mark the qgroup
inconsistent.
Anyway, user shouldn't use qgroup inheritance during snapshot creation,
and should add qgroup relationship after snapshot creation by 'btrfs
qgroup assign', which has a much better UI to inform user about qgroup
inconsistent and kick in rescan automatically.
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-04-02 14:37:35 +08:00
|
|
|
bool need_rescan = false;
|
2012-06-28 18:03:02 +02:00
|
|
|
u32 level_size = 0;
|
2013-04-07 10:50:19 +00:00
|
|
|
u64 nums;
|
2012-06-28 18:03:02 +02:00
|
|
|
|
btrfs: qgroup: Don't hold qgroup_ioctl_lock in btrfs_qgroup_inherit()
[BUG]
Lockdep will report the following circular locking dependency:
WARNING: possible circular locking dependency detected
5.2.0-rc2-custom #24 Tainted: G O
------------------------------------------------------
btrfs/8631 is trying to acquire lock:
000000002536438c (&fs_info->qgroup_ioctl_lock#2){+.+.}, at: btrfs_qgroup_inherit+0x40/0x620 [btrfs]
but task is already holding lock:
000000003d52cc23 (&fs_info->tree_log_mutex){+.+.}, at: create_pending_snapshot+0x8b6/0xe60 [btrfs]
which lock already depends on the new lock.
the existing dependency chain (in reverse order) is:
-> #2 (&fs_info->tree_log_mutex){+.+.}:
__mutex_lock+0x76/0x940
mutex_lock_nested+0x1b/0x20
btrfs_commit_transaction+0x475/0xa00 [btrfs]
btrfs_commit_super+0x71/0x80 [btrfs]
close_ctree+0x2bd/0x320 [btrfs]
btrfs_put_super+0x15/0x20 [btrfs]
generic_shutdown_super+0x72/0x110
kill_anon_super+0x18/0x30
btrfs_kill_super+0x16/0xa0 [btrfs]
deactivate_locked_super+0x3a/0x80
deactivate_super+0x51/0x60
cleanup_mnt+0x3f/0x80
__cleanup_mnt+0x12/0x20
task_work_run+0x94/0xb0
exit_to_usermode_loop+0xd8/0xe0
do_syscall_64+0x210/0x240
entry_SYSCALL_64_after_hwframe+0x49/0xbe
-> #1 (&fs_info->reloc_mutex){+.+.}:
__mutex_lock+0x76/0x940
mutex_lock_nested+0x1b/0x20
btrfs_commit_transaction+0x40d/0xa00 [btrfs]
btrfs_quota_enable+0x2da/0x730 [btrfs]
btrfs_ioctl+0x2691/0x2b40 [btrfs]
do_vfs_ioctl+0xa9/0x6d0
ksys_ioctl+0x67/0x90
__x64_sys_ioctl+0x1a/0x20
do_syscall_64+0x65/0x240
entry_SYSCALL_64_after_hwframe+0x49/0xbe
-> #0 (&fs_info->qgroup_ioctl_lock#2){+.+.}:
lock_acquire+0xa7/0x190
__mutex_lock+0x76/0x940
mutex_lock_nested+0x1b/0x20
btrfs_qgroup_inherit+0x40/0x620 [btrfs]
create_pending_snapshot+0x9d7/0xe60 [btrfs]
create_pending_snapshots+0x94/0xb0 [btrfs]
btrfs_commit_transaction+0x415/0xa00 [btrfs]
btrfs_mksubvol+0x496/0x4e0 [btrfs]
btrfs_ioctl_snap_create_transid+0x174/0x180 [btrfs]
btrfs_ioctl_snap_create_v2+0x11c/0x180 [btrfs]
btrfs_ioctl+0xa90/0x2b40 [btrfs]
do_vfs_ioctl+0xa9/0x6d0
ksys_ioctl+0x67/0x90
__x64_sys_ioctl+0x1a/0x20
do_syscall_64+0x65/0x240
entry_SYSCALL_64_after_hwframe+0x49/0xbe
other info that might help us debug this:
Chain exists of:
&fs_info->qgroup_ioctl_lock#2 --> &fs_info->reloc_mutex --> &fs_info->tree_log_mutex
Possible unsafe locking scenario:
CPU0 CPU1
---- ----
lock(&fs_info->tree_log_mutex);
lock(&fs_info->reloc_mutex);
lock(&fs_info->tree_log_mutex);
lock(&fs_info->qgroup_ioctl_lock#2);
*** DEADLOCK ***
6 locks held by btrfs/8631:
#0: 00000000ed8f23f6 (sb_writers#12){.+.+}, at: mnt_want_write_file+0x28/0x60
#1: 000000009fb1597a (&type->i_mutex_dir_key#10/1){+.+.}, at: btrfs_mksubvol+0x70/0x4e0 [btrfs]
#2: 0000000088c5ad88 (&fs_info->subvol_sem){++++}, at: btrfs_mksubvol+0x128/0x4e0 [btrfs]
#3: 000000009606fc3e (sb_internal#2){.+.+}, at: start_transaction+0x37a/0x520 [btrfs]
#4: 00000000f82bbdf5 (&fs_info->reloc_mutex){+.+.}, at: btrfs_commit_transaction+0x40d/0xa00 [btrfs]
#5: 000000003d52cc23 (&fs_info->tree_log_mutex){+.+.}, at: create_pending_snapshot+0x8b6/0xe60 [btrfs]
[CAUSE]
Due to the delayed subvolume creation, we need to call
btrfs_qgroup_inherit() inside commit transaction code, with a lot of
other mutex hold.
This hell of lock chain can lead to above problem.
[FIX]
On the other hand, we don't really need to hold qgroup_ioctl_lock if
we're in the context of create_pending_snapshot().
As in that context, we're the only one being able to modify qgroup.
All other qgroup functions which needs qgroup_ioctl_lock are either
holding a transaction handle, or will start a new transaction:
Functions will start a new transaction():
* btrfs_quota_enable()
* btrfs_quota_disable()
Functions hold a transaction handler:
* btrfs_add_qgroup_relation()
* btrfs_del_qgroup_relation()
* btrfs_create_qgroup()
* btrfs_remove_qgroup()
* btrfs_limit_qgroup()
* btrfs_qgroup_inherit() call inside create_subvol()
So we have a higher level protection provided by transaction, thus we
don't need to always hold qgroup_ioctl_lock in btrfs_qgroup_inherit().
Only the btrfs_qgroup_inherit() call in create_subvol() needs to hold
qgroup_ioctl_lock, while the btrfs_qgroup_inherit() call in
create_pending_snapshot() is already protected by transaction.
So the fix is to detect the context by checking
trans->transaction->state.
If we're at TRANS_STATE_COMMIT_DOING, then we're in commit transaction
context and no need to get the mutex.
Reported-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-06-13 17:31:24 +08:00
|
|
|
/*
|
|
|
|
* There are only two callers of this function.
|
|
|
|
*
|
|
|
|
* One in create_subvol() in the ioctl context, which needs to hold
|
|
|
|
* the qgroup_ioctl_lock.
|
|
|
|
*
|
|
|
|
* The other one in create_pending_snapshot() where no other qgroup
|
|
|
|
* code can modify the fs as they all need to either start a new trans
|
|
|
|
* or hold a trans handler, thus we don't need to hold
|
|
|
|
* qgroup_ioctl_lock.
|
|
|
|
* This would avoid long and complex lock chain and make lockdep happy.
|
|
|
|
*/
|
|
|
|
spin_lock(&fs_info->trans_lock);
|
|
|
|
if (trans->transaction->state == TRANS_STATE_COMMIT_DOING)
|
|
|
|
committing = true;
|
|
|
|
spin_unlock(&fs_info->trans_lock);
|
|
|
|
|
|
|
|
if (!committing)
|
|
|
|
mutex_lock(&fs_info->qgroup_ioctl_lock);
|
2016-09-02 15:40:02 -04:00
|
|
|
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
|
2013-04-07 10:50:16 +00:00
|
|
|
goto out;
|
2012-06-28 18:03:02 +02:00
|
|
|
|
2018-11-19 16:20:34 +00:00
|
|
|
quota_root = fs_info->quota_root;
|
2013-04-07 10:50:16 +00:00
|
|
|
if (!quota_root) {
|
|
|
|
ret = -EINVAL;
|
|
|
|
goto out;
|
|
|
|
}
|
2012-06-28 18:03:02 +02:00
|
|
|
|
2013-04-07 10:50:19 +00:00
|
|
|
if (inherit) {
|
|
|
|
i_qgroups = (u64 *)(inherit + 1);
|
|
|
|
nums = inherit->num_qgroups + 2 * inherit->num_ref_copies +
|
|
|
|
2 * inherit->num_excl_copies;
|
|
|
|
for (i = 0; i < nums; ++i) {
|
|
|
|
srcgroup = find_qgroup_rb(fs_info, *i_qgroups);
|
2014-11-11 07:18:22 -05:00
|
|
|
|
2016-03-30 17:57:48 -07:00
|
|
|
/*
|
|
|
|
* Zero out invalid groups so we can ignore
|
|
|
|
* them later.
|
|
|
|
*/
|
|
|
|
if (!srcgroup ||
|
|
|
|
((srcgroup->qgroupid >> 48) <= (objectid >> 48)))
|
|
|
|
*i_qgroups = 0ULL;
|
|
|
|
|
2013-04-07 10:50:19 +00:00
|
|
|
++i_qgroups;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2012-06-28 18:03:02 +02:00
|
|
|
/*
|
|
|
|
* create a tracking group for the subvol itself
|
|
|
|
*/
|
|
|
|
ret = add_qgroup_item(trans, quota_root, objectid);
|
|
|
|
if (ret)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* add qgroup to all inherited groups
|
|
|
|
*/
|
|
|
|
if (inherit) {
|
|
|
|
i_qgroups = (u64 *)(inherit + 1);
|
2016-03-30 17:57:48 -07:00
|
|
|
for (i = 0; i < inherit->num_qgroups; ++i, ++i_qgroups) {
|
|
|
|
if (*i_qgroups == 0)
|
|
|
|
continue;
|
2018-07-18 14:45:24 +08:00
|
|
|
ret = add_qgroup_relation_item(trans, objectid,
|
|
|
|
*i_qgroups);
|
2016-03-30 17:57:48 -07:00
|
|
|
if (ret && ret != -EEXIST)
|
2012-06-28 18:03:02 +02:00
|
|
|
goto out;
|
2018-07-18 14:45:24 +08:00
|
|
|
ret = add_qgroup_relation_item(trans, *i_qgroups,
|
|
|
|
objectid);
|
2016-03-30 17:57:48 -07:00
|
|
|
if (ret && ret != -EEXIST)
|
2012-06-28 18:03:02 +02:00
|
|
|
goto out;
|
|
|
|
}
|
2016-03-30 17:57:48 -07:00
|
|
|
ret = 0;
|
2012-06-28 18:03:02 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
spin_lock(&fs_info->qgroup_lock);
|
|
|
|
|
|
|
|
dstgroup = add_qgroup_rb(fs_info, objectid);
|
2012-07-30 02:15:43 -06:00
|
|
|
if (IS_ERR(dstgroup)) {
|
|
|
|
ret = PTR_ERR(dstgroup);
|
2012-06-28 18:03:02 +02:00
|
|
|
goto unlock;
|
2012-07-30 02:15:43 -06:00
|
|
|
}
|
2012-06-28 18:03:02 +02:00
|
|
|
|
2014-11-20 20:58:34 -05:00
|
|
|
if (inherit && inherit->flags & BTRFS_QGROUP_INHERIT_SET_LIMITS) {
|
|
|
|
dstgroup->lim_flags = inherit->lim.flags;
|
|
|
|
dstgroup->max_rfer = inherit->lim.max_rfer;
|
|
|
|
dstgroup->max_excl = inherit->lim.max_excl;
|
|
|
|
dstgroup->rsv_rfer = inherit->lim.rsv_rfer;
|
|
|
|
dstgroup->rsv_excl = inherit->lim.rsv_excl;
|
2014-11-20 21:01:41 -05:00
|
|
|
|
2018-07-18 14:45:27 +08:00
|
|
|
ret = update_qgroup_limit_item(trans, dstgroup);
|
2014-11-20 21:01:41 -05:00
|
|
|
if (ret) {
|
|
|
|
fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
|
2016-09-20 10:05:00 -04:00
|
|
|
btrfs_info(fs_info,
|
|
|
|
"unable to update quota limit for %llu",
|
|
|
|
dstgroup->qgroupid);
|
2014-11-20 21:01:41 -05:00
|
|
|
goto unlock;
|
|
|
|
}
|
2014-11-20 20:58:34 -05:00
|
|
|
}
|
|
|
|
|
2012-06-28 18:03:02 +02:00
|
|
|
if (srcid) {
|
|
|
|
srcgroup = find_qgroup_rb(fs_info, srcid);
|
2012-09-14 20:06:30 -04:00
|
|
|
if (!srcgroup)
|
2012-06-28 18:03:02 +02:00
|
|
|
goto unlock;
|
2014-05-13 17:30:47 -07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* We call inherit after we clone the root in order to make sure
|
|
|
|
* our counts don't go crazy, so at this point the only
|
|
|
|
* difference between the two roots should be the root node.
|
|
|
|
*/
|
2018-07-17 16:58:22 +08:00
|
|
|
level_size = fs_info->nodesize;
|
2014-05-13 17:30:47 -07:00
|
|
|
dstgroup->rfer = srcgroup->rfer;
|
|
|
|
dstgroup->rfer_cmpr = srcgroup->rfer_cmpr;
|
|
|
|
dstgroup->excl = level_size;
|
|
|
|
dstgroup->excl_cmpr = level_size;
|
2012-06-28 18:03:02 +02:00
|
|
|
srcgroup->excl = level_size;
|
|
|
|
srcgroup->excl_cmpr = level_size;
|
2014-11-20 20:14:38 -05:00
|
|
|
|
|
|
|
/* inherit the limit info */
|
|
|
|
dstgroup->lim_flags = srcgroup->lim_flags;
|
|
|
|
dstgroup->max_rfer = srcgroup->max_rfer;
|
|
|
|
dstgroup->max_excl = srcgroup->max_excl;
|
|
|
|
dstgroup->rsv_rfer = srcgroup->rsv_rfer;
|
|
|
|
dstgroup->rsv_excl = srcgroup->rsv_excl;
|
|
|
|
|
2012-06-28 18:03:02 +02:00
|
|
|
qgroup_dirty(fs_info, dstgroup);
|
|
|
|
qgroup_dirty(fs_info, srcgroup);
|
|
|
|
}
|
|
|
|
|
2012-09-14 20:06:30 -04:00
|
|
|
if (!inherit)
|
2012-06-28 18:03:02 +02:00
|
|
|
goto unlock;
|
|
|
|
|
|
|
|
i_qgroups = (u64 *)(inherit + 1);
|
|
|
|
for (i = 0; i < inherit->num_qgroups; ++i) {
|
2016-03-30 17:57:48 -07:00
|
|
|
if (*i_qgroups) {
|
2016-06-22 18:54:23 -04:00
|
|
|
ret = add_relation_rb(fs_info, objectid, *i_qgroups);
|
2016-03-30 17:57:48 -07:00
|
|
|
if (ret)
|
|
|
|
goto unlock;
|
|
|
|
}
|
2012-06-28 18:03:02 +02:00
|
|
|
++i_qgroups;
|
btrfs: qgroup: mark qgroup inconsistent if we're inherting snapshot to a new qgroup
[BUG]
For the following operation, qgroup is guaranteed to be screwed up due
to snapshot adding to a new qgroup:
# mkfs.btrfs -f $dev
# mount $dev $mnt
# btrfs qgroup en $mnt
# btrfs subv create $mnt/src
# xfs_io -f -c "pwrite 0 1m" $mnt/src/file
# sync
# btrfs qgroup create 1/0 $mnt/src
# btrfs subv snapshot -i 1/0 $mnt/src $mnt/snapshot
# btrfs qgroup show -prce $mnt/src
qgroupid rfer excl max_rfer max_excl parent child
-------- ---- ---- -------- -------- ------ -----
0/5 16.00KiB 16.00KiB none none --- ---
0/257 1.02MiB 16.00KiB none none --- ---
0/258 1.02MiB 16.00KiB none none 1/0 ---
1/0 0.00B 0.00B none none --- 0/258
^^^^^^^^^^^^^^^^^^^^
[CAUSE]
The problem is in btrfs_qgroup_inherit(), we don't have good enough
check to determine if the new relation would break the existing
accounting.
Unlike btrfs_add_qgroup_relation(), which has proper check to determine
if we can do quick update without a rescan, in btrfs_qgroup_inherit() we
can even assign a snapshot to multiple qgroups.
[FIX]
Fix it by manually marking qgroup inconsistent for snapshot inheritance.
For subvolume creation, since all its extents are exclusively owned, we
don't need to rescan.
In theory, we should call relation check like quick_update_accounting()
when doing qgroup inheritance and inform user about qgroup accounting
inconsistency.
But we don't have good mechanism to relay that back to the user in the
snapshot creation context, thus we can only silently mark the qgroup
inconsistent.
Anyway, user shouldn't use qgroup inheritance during snapshot creation,
and should add qgroup relationship after snapshot creation by 'btrfs
qgroup assign', which has a much better UI to inform user about qgroup
inconsistent and kick in rescan automatically.
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-04-02 14:37:35 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If we're doing a snapshot, and adding the snapshot to a new
|
|
|
|
* qgroup, the numbers are guaranteed to be incorrect.
|
|
|
|
*/
|
|
|
|
if (srcid)
|
|
|
|
need_rescan = true;
|
2012-06-28 18:03:02 +02:00
|
|
|
}
|
|
|
|
|
2016-03-30 17:57:48 -07:00
|
|
|
for (i = 0; i < inherit->num_ref_copies; ++i, i_qgroups += 2) {
|
2012-06-28 18:03:02 +02:00
|
|
|
struct btrfs_qgroup *src;
|
|
|
|
struct btrfs_qgroup *dst;
|
|
|
|
|
2016-03-30 17:57:48 -07:00
|
|
|
if (!i_qgroups[0] || !i_qgroups[1])
|
|
|
|
continue;
|
|
|
|
|
2012-06-28 18:03:02 +02:00
|
|
|
src = find_qgroup_rb(fs_info, i_qgroups[0]);
|
|
|
|
dst = find_qgroup_rb(fs_info, i_qgroups[1]);
|
|
|
|
|
|
|
|
if (!src || !dst) {
|
|
|
|
ret = -EINVAL;
|
|
|
|
goto unlock;
|
|
|
|
}
|
|
|
|
|
|
|
|
dst->rfer = src->rfer - level_size;
|
|
|
|
dst->rfer_cmpr = src->rfer_cmpr - level_size;
|
btrfs: qgroup: mark qgroup inconsistent if we're inherting snapshot to a new qgroup
[BUG]
For the following operation, qgroup is guaranteed to be screwed up due
to snapshot adding to a new qgroup:
# mkfs.btrfs -f $dev
# mount $dev $mnt
# btrfs qgroup en $mnt
# btrfs subv create $mnt/src
# xfs_io -f -c "pwrite 0 1m" $mnt/src/file
# sync
# btrfs qgroup create 1/0 $mnt/src
# btrfs subv snapshot -i 1/0 $mnt/src $mnt/snapshot
# btrfs qgroup show -prce $mnt/src
qgroupid rfer excl max_rfer max_excl parent child
-------- ---- ---- -------- -------- ------ -----
0/5 16.00KiB 16.00KiB none none --- ---
0/257 1.02MiB 16.00KiB none none --- ---
0/258 1.02MiB 16.00KiB none none 1/0 ---
1/0 0.00B 0.00B none none --- 0/258
^^^^^^^^^^^^^^^^^^^^
[CAUSE]
The problem is in btrfs_qgroup_inherit(), we don't have good enough
check to determine if the new relation would break the existing
accounting.
Unlike btrfs_add_qgroup_relation(), which has proper check to determine
if we can do quick update without a rescan, in btrfs_qgroup_inherit() we
can even assign a snapshot to multiple qgroups.
[FIX]
Fix it by manually marking qgroup inconsistent for snapshot inheritance.
For subvolume creation, since all its extents are exclusively owned, we
don't need to rescan.
In theory, we should call relation check like quick_update_accounting()
when doing qgroup inheritance and inform user about qgroup accounting
inconsistency.
But we don't have good mechanism to relay that back to the user in the
snapshot creation context, thus we can only silently mark the qgroup
inconsistent.
Anyway, user shouldn't use qgroup inheritance during snapshot creation,
and should add qgroup relationship after snapshot creation by 'btrfs
qgroup assign', which has a much better UI to inform user about qgroup
inconsistent and kick in rescan automatically.
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-04-02 14:37:35 +08:00
|
|
|
|
|
|
|
/* Manually tweaking numbers certainly needs a rescan */
|
|
|
|
need_rescan = true;
|
2012-06-28 18:03:02 +02:00
|
|
|
}
|
2016-03-30 17:57:48 -07:00
|
|
|
for (i = 0; i < inherit->num_excl_copies; ++i, i_qgroups += 2) {
|
2012-06-28 18:03:02 +02:00
|
|
|
struct btrfs_qgroup *src;
|
|
|
|
struct btrfs_qgroup *dst;
|
|
|
|
|
2016-03-30 17:57:48 -07:00
|
|
|
if (!i_qgroups[0] || !i_qgroups[1])
|
|
|
|
continue;
|
|
|
|
|
2012-06-28 18:03:02 +02:00
|
|
|
src = find_qgroup_rb(fs_info, i_qgroups[0]);
|
|
|
|
dst = find_qgroup_rb(fs_info, i_qgroups[1]);
|
|
|
|
|
|
|
|
if (!src || !dst) {
|
|
|
|
ret = -EINVAL;
|
|
|
|
goto unlock;
|
|
|
|
}
|
|
|
|
|
|
|
|
dst->excl = src->excl + level_size;
|
|
|
|
dst->excl_cmpr = src->excl_cmpr + level_size;
|
btrfs: qgroup: mark qgroup inconsistent if we're inherting snapshot to a new qgroup
[BUG]
For the following operation, qgroup is guaranteed to be screwed up due
to snapshot adding to a new qgroup:
# mkfs.btrfs -f $dev
# mount $dev $mnt
# btrfs qgroup en $mnt
# btrfs subv create $mnt/src
# xfs_io -f -c "pwrite 0 1m" $mnt/src/file
# sync
# btrfs qgroup create 1/0 $mnt/src
# btrfs subv snapshot -i 1/0 $mnt/src $mnt/snapshot
# btrfs qgroup show -prce $mnt/src
qgroupid rfer excl max_rfer max_excl parent child
-------- ---- ---- -------- -------- ------ -----
0/5 16.00KiB 16.00KiB none none --- ---
0/257 1.02MiB 16.00KiB none none --- ---
0/258 1.02MiB 16.00KiB none none 1/0 ---
1/0 0.00B 0.00B none none --- 0/258
^^^^^^^^^^^^^^^^^^^^
[CAUSE]
The problem is in btrfs_qgroup_inherit(), we don't have good enough
check to determine if the new relation would break the existing
accounting.
Unlike btrfs_add_qgroup_relation(), which has proper check to determine
if we can do quick update without a rescan, in btrfs_qgroup_inherit() we
can even assign a snapshot to multiple qgroups.
[FIX]
Fix it by manually marking qgroup inconsistent for snapshot inheritance.
For subvolume creation, since all its extents are exclusively owned, we
don't need to rescan.
In theory, we should call relation check like quick_update_accounting()
when doing qgroup inheritance and inform user about qgroup accounting
inconsistency.
But we don't have good mechanism to relay that back to the user in the
snapshot creation context, thus we can only silently mark the qgroup
inconsistent.
Anyway, user shouldn't use qgroup inheritance during snapshot creation,
and should add qgroup relationship after snapshot creation by 'btrfs
qgroup assign', which has a much better UI to inform user about qgroup
inconsistent and kick in rescan automatically.
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-04-02 14:37:35 +08:00
|
|
|
need_rescan = true;
|
2012-06-28 18:03:02 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
unlock:
|
|
|
|
spin_unlock(&fs_info->qgroup_lock);
|
2020-06-28 13:07:15 +08:00
|
|
|
if (!ret)
|
|
|
|
ret = btrfs_sysfs_add_one_qgroup(fs_info, dstgroup);
|
2012-06-28 18:03:02 +02:00
|
|
|
out:
|
btrfs: qgroup: Don't hold qgroup_ioctl_lock in btrfs_qgroup_inherit()
[BUG]
Lockdep will report the following circular locking dependency:
WARNING: possible circular locking dependency detected
5.2.0-rc2-custom #24 Tainted: G O
------------------------------------------------------
btrfs/8631 is trying to acquire lock:
000000002536438c (&fs_info->qgroup_ioctl_lock#2){+.+.}, at: btrfs_qgroup_inherit+0x40/0x620 [btrfs]
but task is already holding lock:
000000003d52cc23 (&fs_info->tree_log_mutex){+.+.}, at: create_pending_snapshot+0x8b6/0xe60 [btrfs]
which lock already depends on the new lock.
the existing dependency chain (in reverse order) is:
-> #2 (&fs_info->tree_log_mutex){+.+.}:
__mutex_lock+0x76/0x940
mutex_lock_nested+0x1b/0x20
btrfs_commit_transaction+0x475/0xa00 [btrfs]
btrfs_commit_super+0x71/0x80 [btrfs]
close_ctree+0x2bd/0x320 [btrfs]
btrfs_put_super+0x15/0x20 [btrfs]
generic_shutdown_super+0x72/0x110
kill_anon_super+0x18/0x30
btrfs_kill_super+0x16/0xa0 [btrfs]
deactivate_locked_super+0x3a/0x80
deactivate_super+0x51/0x60
cleanup_mnt+0x3f/0x80
__cleanup_mnt+0x12/0x20
task_work_run+0x94/0xb0
exit_to_usermode_loop+0xd8/0xe0
do_syscall_64+0x210/0x240
entry_SYSCALL_64_after_hwframe+0x49/0xbe
-> #1 (&fs_info->reloc_mutex){+.+.}:
__mutex_lock+0x76/0x940
mutex_lock_nested+0x1b/0x20
btrfs_commit_transaction+0x40d/0xa00 [btrfs]
btrfs_quota_enable+0x2da/0x730 [btrfs]
btrfs_ioctl+0x2691/0x2b40 [btrfs]
do_vfs_ioctl+0xa9/0x6d0
ksys_ioctl+0x67/0x90
__x64_sys_ioctl+0x1a/0x20
do_syscall_64+0x65/0x240
entry_SYSCALL_64_after_hwframe+0x49/0xbe
-> #0 (&fs_info->qgroup_ioctl_lock#2){+.+.}:
lock_acquire+0xa7/0x190
__mutex_lock+0x76/0x940
mutex_lock_nested+0x1b/0x20
btrfs_qgroup_inherit+0x40/0x620 [btrfs]
create_pending_snapshot+0x9d7/0xe60 [btrfs]
create_pending_snapshots+0x94/0xb0 [btrfs]
btrfs_commit_transaction+0x415/0xa00 [btrfs]
btrfs_mksubvol+0x496/0x4e0 [btrfs]
btrfs_ioctl_snap_create_transid+0x174/0x180 [btrfs]
btrfs_ioctl_snap_create_v2+0x11c/0x180 [btrfs]
btrfs_ioctl+0xa90/0x2b40 [btrfs]
do_vfs_ioctl+0xa9/0x6d0
ksys_ioctl+0x67/0x90
__x64_sys_ioctl+0x1a/0x20
do_syscall_64+0x65/0x240
entry_SYSCALL_64_after_hwframe+0x49/0xbe
other info that might help us debug this:
Chain exists of:
&fs_info->qgroup_ioctl_lock#2 --> &fs_info->reloc_mutex --> &fs_info->tree_log_mutex
Possible unsafe locking scenario:
CPU0 CPU1
---- ----
lock(&fs_info->tree_log_mutex);
lock(&fs_info->reloc_mutex);
lock(&fs_info->tree_log_mutex);
lock(&fs_info->qgroup_ioctl_lock#2);
*** DEADLOCK ***
6 locks held by btrfs/8631:
#0: 00000000ed8f23f6 (sb_writers#12){.+.+}, at: mnt_want_write_file+0x28/0x60
#1: 000000009fb1597a (&type->i_mutex_dir_key#10/1){+.+.}, at: btrfs_mksubvol+0x70/0x4e0 [btrfs]
#2: 0000000088c5ad88 (&fs_info->subvol_sem){++++}, at: btrfs_mksubvol+0x128/0x4e0 [btrfs]
#3: 000000009606fc3e (sb_internal#2){.+.+}, at: start_transaction+0x37a/0x520 [btrfs]
#4: 00000000f82bbdf5 (&fs_info->reloc_mutex){+.+.}, at: btrfs_commit_transaction+0x40d/0xa00 [btrfs]
#5: 000000003d52cc23 (&fs_info->tree_log_mutex){+.+.}, at: create_pending_snapshot+0x8b6/0xe60 [btrfs]
[CAUSE]
Due to the delayed subvolume creation, we need to call
btrfs_qgroup_inherit() inside commit transaction code, with a lot of
other mutex hold.
This hell of lock chain can lead to above problem.
[FIX]
On the other hand, we don't really need to hold qgroup_ioctl_lock if
we're in the context of create_pending_snapshot().
As in that context, we're the only one being able to modify qgroup.
All other qgroup functions which needs qgroup_ioctl_lock are either
holding a transaction handle, or will start a new transaction:
Functions will start a new transaction():
* btrfs_quota_enable()
* btrfs_quota_disable()
Functions hold a transaction handler:
* btrfs_add_qgroup_relation()
* btrfs_del_qgroup_relation()
* btrfs_create_qgroup()
* btrfs_remove_qgroup()
* btrfs_limit_qgroup()
* btrfs_qgroup_inherit() call inside create_subvol()
So we have a higher level protection provided by transaction, thus we
don't need to always hold qgroup_ioctl_lock in btrfs_qgroup_inherit().
Only the btrfs_qgroup_inherit() call in create_subvol() needs to hold
qgroup_ioctl_lock, while the btrfs_qgroup_inherit() call in
create_pending_snapshot() is already protected by transaction.
So the fix is to detect the context by checking
trans->transaction->state.
If we're at TRANS_STATE_COMMIT_DOING, then we're in commit transaction
context and no need to get the mutex.
Reported-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-06-13 17:31:24 +08:00
|
|
|
if (!committing)
|
|
|
|
mutex_unlock(&fs_info->qgroup_ioctl_lock);
|
btrfs: qgroup: mark qgroup inconsistent if we're inherting snapshot to a new qgroup
[BUG]
For the following operation, qgroup is guaranteed to be screwed up due
to snapshot adding to a new qgroup:
# mkfs.btrfs -f $dev
# mount $dev $mnt
# btrfs qgroup en $mnt
# btrfs subv create $mnt/src
# xfs_io -f -c "pwrite 0 1m" $mnt/src/file
# sync
# btrfs qgroup create 1/0 $mnt/src
# btrfs subv snapshot -i 1/0 $mnt/src $mnt/snapshot
# btrfs qgroup show -prce $mnt/src
qgroupid rfer excl max_rfer max_excl parent child
-------- ---- ---- -------- -------- ------ -----
0/5 16.00KiB 16.00KiB none none --- ---
0/257 1.02MiB 16.00KiB none none --- ---
0/258 1.02MiB 16.00KiB none none 1/0 ---
1/0 0.00B 0.00B none none --- 0/258
^^^^^^^^^^^^^^^^^^^^
[CAUSE]
The problem is in btrfs_qgroup_inherit(), we don't have good enough
check to determine if the new relation would break the existing
accounting.
Unlike btrfs_add_qgroup_relation(), which has proper check to determine
if we can do quick update without a rescan, in btrfs_qgroup_inherit() we
can even assign a snapshot to multiple qgroups.
[FIX]
Fix it by manually marking qgroup inconsistent for snapshot inheritance.
For subvolume creation, since all its extents are exclusively owned, we
don't need to rescan.
In theory, we should call relation check like quick_update_accounting()
when doing qgroup inheritance and inform user about qgroup accounting
inconsistency.
But we don't have good mechanism to relay that back to the user in the
snapshot creation context, thus we can only silently mark the qgroup
inconsistent.
Anyway, user shouldn't use qgroup inheritance during snapshot creation,
and should add qgroup relationship after snapshot creation by 'btrfs
qgroup assign', which has a much better UI to inform user about qgroup
inconsistent and kick in rescan automatically.
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-04-02 14:37:35 +08:00
|
|
|
if (need_rescan)
|
|
|
|
fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
|
2012-06-28 18:03:02 +02:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2020-07-13 18:50:49 +08:00
|
|
|
static bool qgroup_check_limits(const struct btrfs_qgroup *qg, u64 num_bytes)
|
2017-01-25 09:50:33 -05:00
|
|
|
{
|
|
|
|
if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&
|
2017-12-12 15:34:25 +08:00
|
|
|
qgroup_rsv_total(qg) + (s64)qg->rfer + num_bytes > qg->max_rfer)
|
2017-01-25 09:50:33 -05:00
|
|
|
return false;
|
|
|
|
|
|
|
|
if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) &&
|
2017-12-12 15:34:25 +08:00
|
|
|
qgroup_rsv_total(qg) + (s64)qg->excl + num_bytes > qg->max_excl)
|
2017-01-25 09:50:33 -05:00
|
|
|
return false;
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2017-12-12 15:34:25 +08:00
|
|
|
static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce,
|
|
|
|
enum btrfs_qgroup_rsv_type type)
|
2012-06-28 18:03:02 +02:00
|
|
|
{
|
|
|
|
struct btrfs_qgroup *qgroup;
|
|
|
|
struct btrfs_fs_info *fs_info = root->fs_info;
|
|
|
|
u64 ref_root = root->root_key.objectid;
|
|
|
|
int ret = 0;
|
|
|
|
struct ulist_node *unode;
|
|
|
|
struct ulist_iterator uiter;
|
|
|
|
|
|
|
|
if (!is_fstree(ref_root))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
if (num_bytes == 0)
|
|
|
|
return 0;
|
2017-05-11 21:17:33 +00:00
|
|
|
|
|
|
|
if (test_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags) &&
|
|
|
|
capable(CAP_SYS_RESOURCE))
|
|
|
|
enforce = false;
|
|
|
|
|
2012-06-28 18:03:02 +02:00
|
|
|
spin_lock(&fs_info->qgroup_lock);
|
2019-11-25 21:58:50 -03:00
|
|
|
if (!fs_info->quota_root)
|
2012-06-28 18:03:02 +02:00
|
|
|
goto out;
|
|
|
|
|
|
|
|
qgroup = find_qgroup_rb(fs_info, ref_root);
|
|
|
|
if (!qgroup)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* in a first step, we check all affected qgroups if any limits would
|
|
|
|
* be exceeded
|
|
|
|
*/
|
2013-05-06 11:03:27 +00:00
|
|
|
ulist_reinit(fs_info->qgroup_ulist);
|
|
|
|
ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
|
2018-03-27 19:04:50 +02:00
|
|
|
qgroup_to_aux(qgroup), GFP_ATOMIC);
|
2013-04-17 14:00:36 +00:00
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
2012-06-28 18:03:02 +02:00
|
|
|
ULIST_ITER_INIT(&uiter);
|
2013-05-06 11:03:27 +00:00
|
|
|
while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
|
2012-06-28 18:03:02 +02:00
|
|
|
struct btrfs_qgroup *qg;
|
|
|
|
struct btrfs_qgroup_list *glist;
|
|
|
|
|
2016-10-26 16:23:50 +02:00
|
|
|
qg = unode_aux_to_qgroup(unode);
|
2012-06-28 18:03:02 +02:00
|
|
|
|
2020-07-13 18:50:49 +08:00
|
|
|
if (enforce && !qgroup_check_limits(qg, num_bytes)) {
|
2012-06-28 18:03:02 +02:00
|
|
|
ret = -EDQUOT;
|
2013-03-06 11:51:47 +00:00
|
|
|
goto out;
|
|
|
|
}
|
2012-06-28 18:03:02 +02:00
|
|
|
|
|
|
|
list_for_each_entry(glist, &qg->groups, next_group) {
|
2013-05-06 11:03:27 +00:00
|
|
|
ret = ulist_add(fs_info->qgroup_ulist,
|
|
|
|
glist->group->qgroupid,
|
2018-03-27 19:04:50 +02:00
|
|
|
qgroup_to_aux(glist->group), GFP_ATOMIC);
|
2013-04-17 14:00:36 +00:00
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
2012-06-28 18:03:02 +02:00
|
|
|
}
|
|
|
|
}
|
2013-04-17 14:00:36 +00:00
|
|
|
ret = 0;
|
2012-06-28 18:03:02 +02:00
|
|
|
/*
|
|
|
|
* no limits exceeded, now record the reservation into all qgroups
|
|
|
|
*/
|
|
|
|
ULIST_ITER_INIT(&uiter);
|
2013-05-06 11:03:27 +00:00
|
|
|
while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
|
2012-06-28 18:03:02 +02:00
|
|
|
struct btrfs_qgroup *qg;
|
|
|
|
|
2016-10-26 16:23:50 +02:00
|
|
|
qg = unode_aux_to_qgroup(unode);
|
2012-06-28 18:03:02 +02:00
|
|
|
|
2017-12-12 15:34:27 +08:00
|
|
|
qgroup_rsv_add(fs_info, qg, num_bytes, type);
|
2012-06-28 18:03:02 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
out:
|
|
|
|
spin_unlock(&fs_info->qgroup_lock);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2017-12-12 15:34:30 +08:00
|
|
|
/*
|
|
|
|
* Free @num_bytes of reserved space with @type for qgroup. (Normally level 0
|
|
|
|
* qgroup).
|
|
|
|
*
|
|
|
|
* Will handle all higher level qgroup too.
|
|
|
|
*
|
|
|
|
* NOTE: If @num_bytes is (u64)-1, this means to free all bytes of this qgroup.
|
|
|
|
* This special case is only used for META_PERTRANS type.
|
|
|
|
*/
|
2015-09-08 17:08:37 +08:00
|
|
|
void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
|
2017-12-12 15:34:23 +08:00
|
|
|
u64 ref_root, u64 num_bytes,
|
|
|
|
enum btrfs_qgroup_rsv_type type)
|
2012-06-28 18:03:02 +02:00
|
|
|
{
|
|
|
|
struct btrfs_qgroup *qgroup;
|
|
|
|
struct ulist_node *unode;
|
|
|
|
struct ulist_iterator uiter;
|
2013-04-17 14:00:36 +00:00
|
|
|
int ret = 0;
|
2012-06-28 18:03:02 +02:00
|
|
|
|
|
|
|
if (!is_fstree(ref_root))
|
|
|
|
return;
|
|
|
|
|
|
|
|
if (num_bytes == 0)
|
|
|
|
return;
|
|
|
|
|
2017-12-12 15:34:30 +08:00
|
|
|
if (num_bytes == (u64)-1 && type != BTRFS_QGROUP_RSV_META_PERTRANS) {
|
|
|
|
WARN(1, "%s: Invalid type to free", __func__);
|
|
|
|
return;
|
|
|
|
}
|
2012-06-28 18:03:02 +02:00
|
|
|
spin_lock(&fs_info->qgroup_lock);
|
|
|
|
|
2019-11-25 21:58:50 -03:00
|
|
|
if (!fs_info->quota_root)
|
2012-06-28 18:03:02 +02:00
|
|
|
goto out;
|
|
|
|
|
|
|
|
qgroup = find_qgroup_rb(fs_info, ref_root);
|
|
|
|
if (!qgroup)
|
|
|
|
goto out;
|
|
|
|
|
2017-12-12 15:34:30 +08:00
|
|
|
if (num_bytes == (u64)-1)
|
2017-12-12 15:34:34 +08:00
|
|
|
/*
|
|
|
|
* We're freeing all pertrans rsv, get reserved value from
|
|
|
|
* level 0 qgroup as real num_bytes to free.
|
|
|
|
*/
|
2017-12-12 15:34:30 +08:00
|
|
|
num_bytes = qgroup->rsv.values[type];
|
|
|
|
|
2013-05-06 11:03:27 +00:00
|
|
|
ulist_reinit(fs_info->qgroup_ulist);
|
|
|
|
ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
|
2018-03-27 19:04:50 +02:00
|
|
|
qgroup_to_aux(qgroup), GFP_ATOMIC);
|
2013-04-17 14:00:36 +00:00
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
2012-06-28 18:03:02 +02:00
|
|
|
ULIST_ITER_INIT(&uiter);
|
2013-05-06 11:03:27 +00:00
|
|
|
while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
|
2012-06-28 18:03:02 +02:00
|
|
|
struct btrfs_qgroup *qg;
|
|
|
|
struct btrfs_qgroup_list *glist;
|
|
|
|
|
2016-10-26 16:23:50 +02:00
|
|
|
qg = unode_aux_to_qgroup(unode);
|
2012-06-28 18:03:02 +02:00
|
|
|
|
2017-12-12 15:34:27 +08:00
|
|
|
qgroup_rsv_release(fs_info, qg, num_bytes, type);
|
2012-06-28 18:03:02 +02:00
|
|
|
|
|
|
|
list_for_each_entry(glist, &qg->groups, next_group) {
|
2013-05-06 11:03:27 +00:00
|
|
|
ret = ulist_add(fs_info->qgroup_ulist,
|
|
|
|
glist->group->qgroupid,
|
2018-03-27 19:04:50 +02:00
|
|
|
qgroup_to_aux(glist->group), GFP_ATOMIC);
|
2013-04-17 14:00:36 +00:00
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
2012-06-28 18:03:02 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
out:
|
|
|
|
spin_unlock(&fs_info->qgroup_lock);
|
|
|
|
}
|
|
|
|
|
btrfs: qgroup: Finish rescan when hit the last leaf of extent tree
Under the following case, qgroup rescan can double account cowed tree
blocks:
In this case, extent tree only has one tree block.
-
| transid=5 last committed=4
| btrfs_qgroup_rescan_worker()
| |- btrfs_start_transaction()
| | transid = 5
| |- qgroup_rescan_leaf()
| |- btrfs_search_slot_for_read() on extent tree
| Get the only extent tree block from commit root (transid = 4).
| Scan it, set qgroup_rescan_progress to the last
| EXTENT/META_ITEM + 1
| now qgroup_rescan_progress = A + 1.
|
| fs tree get CoWed, new tree block is at A + 16K
| transid 5 get committed
-
| transid=6 last committed=5
| btrfs_qgroup_rescan_worker()
| btrfs_qgroup_rescan_worker()
| |- btrfs_start_transaction()
| | transid = 5
| |- qgroup_rescan_leaf()
| |- btrfs_search_slot_for_read() on extent tree
| Get the only extent tree block from commit root (transid = 5).
| scan it using qgroup_rescan_progress (A + 1).
| found new tree block beyong A, and it's fs tree block,
| account it to increase qgroup numbers.
-
In above case, tree block A, and tree block A + 16K get accounted twice,
while qgroup rescan should stop when it already reach the last leaf,
other than continue using its qgroup_rescan_progress.
Such case could happen by just looping btrfs/017 and with some
possibility it can hit such double qgroup accounting problem.
Fix it by checking the path to determine if we should finish qgroup
rescan, other than relying on next loop to exit.
Reported-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2018-05-14 09:38:13 +08:00
|
|
|
/*
|
|
|
|
* Check if the leaf is the last leaf. Which means all node pointers
|
|
|
|
* are at their last position.
|
|
|
|
*/
|
|
|
|
static bool is_last_leaf(struct btrfs_path *path)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for (i = 1; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) {
|
|
|
|
if (path->slots[i] != btrfs_header_nritems(path->nodes[i]) - 1)
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2013-04-25 16:04:51 +00:00
|
|
|
/*
|
|
|
|
* returns < 0 on error, 0 when more leafs are to be scanned.
|
2015-02-27 16:24:24 +08:00
|
|
|
* returns 1 when done.
|
2013-04-25 16:04:51 +00:00
|
|
|
*/
|
2018-07-18 14:45:42 +08:00
|
|
|
static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans,
|
|
|
|
struct btrfs_path *path)
|
2013-04-25 16:04:51 +00:00
|
|
|
{
|
2018-07-18 14:45:42 +08:00
|
|
|
struct btrfs_fs_info *fs_info = trans->fs_info;
|
2013-04-25 16:04:51 +00:00
|
|
|
struct btrfs_key found;
|
2015-10-26 09:19:43 +08:00
|
|
|
struct extent_buffer *scratch_leaf = NULL;
|
2013-04-25 16:04:51 +00:00
|
|
|
struct ulist *roots = NULL;
|
2014-05-13 17:30:47 -07:00
|
|
|
u64 num_bytes;
|
btrfs: qgroup: Finish rescan when hit the last leaf of extent tree
Under the following case, qgroup rescan can double account cowed tree
blocks:
In this case, extent tree only has one tree block.
-
| transid=5 last committed=4
| btrfs_qgroup_rescan_worker()
| |- btrfs_start_transaction()
| | transid = 5
| |- qgroup_rescan_leaf()
| |- btrfs_search_slot_for_read() on extent tree
| Get the only extent tree block from commit root (transid = 4).
| Scan it, set qgroup_rescan_progress to the last
| EXTENT/META_ITEM + 1
| now qgroup_rescan_progress = A + 1.
|
| fs tree get CoWed, new tree block is at A + 16K
| transid 5 get committed
-
| transid=6 last committed=5
| btrfs_qgroup_rescan_worker()
| btrfs_qgroup_rescan_worker()
| |- btrfs_start_transaction()
| | transid = 5
| |- qgroup_rescan_leaf()
| |- btrfs_search_slot_for_read() on extent tree
| Get the only extent tree block from commit root (transid = 5).
| scan it using qgroup_rescan_progress (A + 1).
| found new tree block beyong A, and it's fs tree block,
| account it to increase qgroup numbers.
-
In above case, tree block A, and tree block A + 16K get accounted twice,
while qgroup rescan should stop when it already reach the last leaf,
other than continue using its qgroup_rescan_progress.
Such case could happen by just looping btrfs/017 and with some
possibility it can hit such double qgroup accounting problem.
Fix it by checking the path to determine if we should finish qgroup
rescan, other than relying on next loop to exit.
Reported-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2018-05-14 09:38:13 +08:00
|
|
|
bool done;
|
2013-04-25 16:04:51 +00:00
|
|
|
int slot;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
mutex_lock(&fs_info->qgroup_rescan_lock);
|
|
|
|
ret = btrfs_search_slot_for_read(fs_info->extent_root,
|
|
|
|
&fs_info->qgroup_rescan_progress,
|
|
|
|
path, 1, 0);
|
|
|
|
|
2016-09-20 10:05:02 -04:00
|
|
|
btrfs_debug(fs_info,
|
|
|
|
"current progress key (%llu %u %llu), search_slot ret %d",
|
|
|
|
fs_info->qgroup_rescan_progress.objectid,
|
|
|
|
fs_info->qgroup_rescan_progress.type,
|
|
|
|
fs_info->qgroup_rescan_progress.offset, ret);
|
2013-04-25 16:04:51 +00:00
|
|
|
|
|
|
|
if (ret) {
|
|
|
|
/*
|
|
|
|
* The rescan is about to end, we will not be scanning any
|
|
|
|
* further blocks. We cannot unset the RESCAN flag here, because
|
|
|
|
* we want to commit the transaction if everything went well.
|
|
|
|
* To make the live accounting work in this phase, we set our
|
|
|
|
* scan progress pointer such that every real extent objectid
|
|
|
|
* will be smaller.
|
|
|
|
*/
|
|
|
|
fs_info->qgroup_rescan_progress.objectid = (u64)-1;
|
|
|
|
btrfs_release_path(path);
|
|
|
|
mutex_unlock(&fs_info->qgroup_rescan_lock);
|
|
|
|
return ret;
|
|
|
|
}
|
btrfs: qgroup: Finish rescan when hit the last leaf of extent tree
Under the following case, qgroup rescan can double account cowed tree
blocks:
In this case, extent tree only has one tree block.
-
| transid=5 last committed=4
| btrfs_qgroup_rescan_worker()
| |- btrfs_start_transaction()
| | transid = 5
| |- qgroup_rescan_leaf()
| |- btrfs_search_slot_for_read() on extent tree
| Get the only extent tree block from commit root (transid = 4).
| Scan it, set qgroup_rescan_progress to the last
| EXTENT/META_ITEM + 1
| now qgroup_rescan_progress = A + 1.
|
| fs tree get CoWed, new tree block is at A + 16K
| transid 5 get committed
-
| transid=6 last committed=5
| btrfs_qgroup_rescan_worker()
| btrfs_qgroup_rescan_worker()
| |- btrfs_start_transaction()
| | transid = 5
| |- qgroup_rescan_leaf()
| |- btrfs_search_slot_for_read() on extent tree
| Get the only extent tree block from commit root (transid = 5).
| scan it using qgroup_rescan_progress (A + 1).
| found new tree block beyong A, and it's fs tree block,
| account it to increase qgroup numbers.
-
In above case, tree block A, and tree block A + 16K get accounted twice,
while qgroup rescan should stop when it already reach the last leaf,
other than continue using its qgroup_rescan_progress.
Such case could happen by just looping btrfs/017 and with some
possibility it can hit such double qgroup accounting problem.
Fix it by checking the path to determine if we should finish qgroup
rescan, other than relying on next loop to exit.
Reported-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2018-05-14 09:38:13 +08:00
|
|
|
done = is_last_leaf(path);
|
2013-04-25 16:04:51 +00:00
|
|
|
|
|
|
|
btrfs_item_key_to_cpu(path->nodes[0], &found,
|
|
|
|
btrfs_header_nritems(path->nodes[0]) - 1);
|
|
|
|
fs_info->qgroup_rescan_progress.objectid = found.objectid + 1;
|
|
|
|
|
2015-10-26 09:19:43 +08:00
|
|
|
scratch_leaf = btrfs_clone_extent_buffer(path->nodes[0]);
|
|
|
|
if (!scratch_leaf) {
|
|
|
|
ret = -ENOMEM;
|
|
|
|
mutex_unlock(&fs_info->qgroup_rescan_lock);
|
|
|
|
goto out;
|
|
|
|
}
|
2013-04-25 16:04:51 +00:00
|
|
|
slot = path->slots[0];
|
|
|
|
btrfs_release_path(path);
|
|
|
|
mutex_unlock(&fs_info->qgroup_rescan_lock);
|
|
|
|
|
|
|
|
for (; slot < btrfs_header_nritems(scratch_leaf); ++slot) {
|
|
|
|
btrfs_item_key_to_cpu(scratch_leaf, &found, slot);
|
2014-01-23 16:45:10 -05:00
|
|
|
if (found.type != BTRFS_EXTENT_ITEM_KEY &&
|
|
|
|
found.type != BTRFS_METADATA_ITEM_KEY)
|
2013-04-25 16:04:51 +00:00
|
|
|
continue;
|
2014-01-23 16:45:10 -05:00
|
|
|
if (found.type == BTRFS_METADATA_ITEM_KEY)
|
2016-06-15 09:22:56 -04:00
|
|
|
num_bytes = fs_info->nodesize;
|
2014-01-23 16:45:10 -05:00
|
|
|
else
|
|
|
|
num_bytes = found.offset;
|
|
|
|
|
2014-05-13 17:30:47 -07:00
|
|
|
ret = btrfs_find_all_roots(NULL, fs_info, found.objectid, 0,
|
btrfs: add a flag to iterate_inodes_from_logical to find all extent refs for uncompressed extents
The LOGICAL_INO ioctl provides a backward mapping from extent bytenr and
offset (encoded as a single logical address) to a list of extent refs.
LOGICAL_INO complements TREE_SEARCH, which provides the forward mapping
(extent ref -> extent bytenr and offset, or logical address). These are
useful capabilities for programs that manipulate extents and extent
references from userspace (e.g. dedup and defrag utilities).
When the extents are uncompressed (and not encrypted and not other),
check_extent_in_eb performs filtering of the extent refs to remove any
extent refs which do not contain the same extent offset as the 'logical'
parameter's extent offset. This prevents LOGICAL_INO from returning
references to more than a single block.
To find the set of extent references to an uncompressed extent from [a, b),
userspace has to run a loop like this pseudocode:
for (i = a; i < b; ++i)
extent_ref_set += LOGICAL_INO(i);
At each iteration of the loop (up to 32768 iterations for a 128M extent),
data we are interested in is collected in the kernel, then deleted by
the filter in check_extent_in_eb.
When the extents are compressed (or encrypted or other), the 'logical'
parameter must be an extent bytenr (the 'a' parameter in the loop).
No filtering by extent offset is done (or possible?) so the result is
the complete set of extent refs for the entire extent. This removes
the need for the loop, since we get all the extent refs in one call.
Add an 'ignore_offset' argument to iterate_inodes_from_logical,
[...several levels of function call graph...], and check_extent_in_eb, so
that we can disable the extent offset filtering for uncompressed extents.
This flag can be set by an improved version of the LOGICAL_INO ioctl to
get either behavior as desired.
There is no functional change in this patch. The new flag is always
false.
Signed-off-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org>
Reviewed-by: David Sterba <dsterba@suse.com>
[ minor coding style fixes ]
Signed-off-by: David Sterba <dsterba@suse.com>
2017-09-22 13:58:45 -04:00
|
|
|
&roots, false);
|
2013-04-25 16:04:51 +00:00
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
2015-04-13 11:02:16 +08:00
|
|
|
/* For rescan, just pass old_roots as NULL */
|
2018-07-18 14:45:39 +08:00
|
|
|
ret = btrfs_qgroup_account_extent(trans, found.objectid,
|
|
|
|
num_bytes, NULL, roots);
|
2015-04-13 11:02:16 +08:00
|
|
|
if (ret < 0)
|
2014-05-13 17:30:47 -07:00
|
|
|
goto out;
|
2013-04-25 16:04:51 +00:00
|
|
|
}
|
|
|
|
out:
|
2018-08-15 18:26:56 +03:00
|
|
|
if (scratch_leaf)
|
2015-10-26 09:19:43 +08:00
|
|
|
free_extent_buffer(scratch_leaf);
|
2013-04-25 16:04:51 +00:00
|
|
|
|
2018-06-27 18:19:55 +08:00
|
|
|
if (done && !ret) {
|
btrfs: qgroup: Finish rescan when hit the last leaf of extent tree
Under the following case, qgroup rescan can double account cowed tree
blocks:
In this case, extent tree only has one tree block.
-
| transid=5 last committed=4
| btrfs_qgroup_rescan_worker()
| |- btrfs_start_transaction()
| | transid = 5
| |- qgroup_rescan_leaf()
| |- btrfs_search_slot_for_read() on extent tree
| Get the only extent tree block from commit root (transid = 4).
| Scan it, set qgroup_rescan_progress to the last
| EXTENT/META_ITEM + 1
| now qgroup_rescan_progress = A + 1.
|
| fs tree get CoWed, new tree block is at A + 16K
| transid 5 get committed
-
| transid=6 last committed=5
| btrfs_qgroup_rescan_worker()
| btrfs_qgroup_rescan_worker()
| |- btrfs_start_transaction()
| | transid = 5
| |- qgroup_rescan_leaf()
| |- btrfs_search_slot_for_read() on extent tree
| Get the only extent tree block from commit root (transid = 5).
| scan it using qgroup_rescan_progress (A + 1).
| found new tree block beyong A, and it's fs tree block,
| account it to increase qgroup numbers.
-
In above case, tree block A, and tree block A + 16K get accounted twice,
while qgroup rescan should stop when it already reach the last leaf,
other than continue using its qgroup_rescan_progress.
Such case could happen by just looping btrfs/017 and with some
possibility it can hit such double qgroup accounting problem.
Fix it by checking the path to determine if we should finish qgroup
rescan, other than relying on next loop to exit.
Reported-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2018-05-14 09:38:13 +08:00
|
|
|
ret = 1;
|
2018-06-27 18:19:55 +08:00
|
|
|
fs_info->qgroup_rescan_progress.objectid = (u64)-1;
|
|
|
|
}
|
2013-04-25 16:04:51 +00:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2014-02-28 10:46:19 +08:00
|
|
|
static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
|
2013-04-25 16:04:51 +00:00
|
|
|
{
|
Btrfs: fix qgroup rescan resume on mount
When called during mount, we cannot start the rescan worker thread until
open_ctree is done. This commit restuctures the qgroup rescan internals to
enable a clean deferral of the rescan resume operation.
First of all, the struct qgroup_rescan is removed, saving us a malloc and
some initialization synchronizations problems. Its only element (the worker
struct) now lives within fs_info just as the rest of the rescan code.
Then setting up a rescan worker is split into several reusable stages.
Currently we have three different rescan startup scenarios:
(A) rescan ioctl
(B) rescan resume by mount
(C) rescan by quota enable
Each case needs its own combination of the four following steps:
(1) set the progress [A, C: zero; B: state of umount]
(2) commit the transaction [A]
(3) set the counters [A, C: zero; B: state of umount]
(4) start worker [A, B, C]
qgroup_rescan_init does step (1). There's no extra function added to commit
a transaction, we've got that already. qgroup_rescan_zero_tracking does
step (3). Step (4) is nothing more than a call to the generic
btrfs_queue_worker.
We also get rid of a double check for the rescan progress during
btrfs_qgroup_account_ref, which is no longer required due to having step 2
from the list above.
As a side effect, this commit prepares to move the rescan start code from
btrfs_run_qgroups (which is run during commit) to a less time critical
section.
Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2013-05-28 15:47:24 +00:00
|
|
|
struct btrfs_fs_info *fs_info = container_of(work, struct btrfs_fs_info,
|
|
|
|
qgroup_rescan_work);
|
2013-04-25 16:04:51 +00:00
|
|
|
struct btrfs_path *path;
|
|
|
|
struct btrfs_trans_handle *trans = NULL;
|
|
|
|
int err = -ENOMEM;
|
2015-02-27 16:24:25 +08:00
|
|
|
int ret = 0;
|
2013-04-25 16:04:51 +00:00
|
|
|
|
|
|
|
path = btrfs_alloc_path();
|
|
|
|
if (!path)
|
|
|
|
goto out;
|
2018-05-14 09:38:12 +08:00
|
|
|
/*
|
|
|
|
* Rescan should only search for commit root, and any later difference
|
|
|
|
* should be recorded by qgroup
|
|
|
|
*/
|
|
|
|
path->search_commit_root = 1;
|
|
|
|
path->skip_locking = 1;
|
2013-04-25 16:04:51 +00:00
|
|
|
|
|
|
|
err = 0;
|
2015-11-04 15:56:16 -08:00
|
|
|
while (!err && !btrfs_fs_closing(fs_info)) {
|
2013-04-25 16:04:51 +00:00
|
|
|
trans = btrfs_start_transaction(fs_info->fs_root, 0);
|
|
|
|
if (IS_ERR(trans)) {
|
|
|
|
err = PTR_ERR(trans);
|
|
|
|
break;
|
|
|
|
}
|
2016-09-02 15:40:02 -04:00
|
|
|
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
|
2013-04-25 16:04:51 +00:00
|
|
|
err = -EINTR;
|
|
|
|
} else {
|
2018-07-18 14:45:42 +08:00
|
|
|
err = qgroup_rescan_leaf(trans, path);
|
2013-04-25 16:04:51 +00:00
|
|
|
}
|
|
|
|
if (err > 0)
|
2016-09-09 21:39:03 -04:00
|
|
|
btrfs_commit_transaction(trans);
|
2013-04-25 16:04:51 +00:00
|
|
|
else
|
2016-09-09 21:39:03 -04:00
|
|
|
btrfs_end_transaction(trans);
|
2013-04-25 16:04:51 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
out:
|
|
|
|
btrfs_free_path(path);
|
|
|
|
|
|
|
|
mutex_lock(&fs_info->qgroup_rescan_lock);
|
2015-02-27 16:24:24 +08:00
|
|
|
if (err > 0 &&
|
2013-04-25 16:04:51 +00:00
|
|
|
fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) {
|
|
|
|
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
|
|
|
|
} else if (err < 0) {
|
|
|
|
fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
|
|
|
|
}
|
|
|
|
mutex_unlock(&fs_info->qgroup_rescan_lock);
|
|
|
|
|
2015-02-27 16:24:25 +08:00
|
|
|
/*
|
2016-05-19 21:18:45 -04:00
|
|
|
* only update status, since the previous part has already updated the
|
2015-02-27 16:24:25 +08:00
|
|
|
* qgroup info.
|
|
|
|
*/
|
|
|
|
trans = btrfs_start_transaction(fs_info->quota_root, 1);
|
|
|
|
if (IS_ERR(trans)) {
|
|
|
|
err = PTR_ERR(trans);
|
Btrfs: fix race setting up and completing qgroup rescan workers
There is a race between setting up a qgroup rescan worker and completing
a qgroup rescan worker that can lead to callers of the qgroup rescan wait
ioctl to either not wait for the rescan worker to complete or to hang
forever due to missing wake ups. The following diagram shows a sequence
of steps that illustrates the race.
CPU 1 CPU 2 CPU 3
btrfs_ioctl_quota_rescan()
btrfs_qgroup_rescan()
qgroup_rescan_init()
mutex_lock(&fs_info->qgroup_rescan_lock)
spin_lock(&fs_info->qgroup_lock)
fs_info->qgroup_flags |=
BTRFS_QGROUP_STATUS_FLAG_RESCAN
init_completion(
&fs_info->qgroup_rescan_completion)
fs_info->qgroup_rescan_running = true
mutex_unlock(&fs_info->qgroup_rescan_lock)
spin_unlock(&fs_info->qgroup_lock)
btrfs_init_work()
--> starts the worker
btrfs_qgroup_rescan_worker()
mutex_lock(&fs_info->qgroup_rescan_lock)
fs_info->qgroup_flags &=
~BTRFS_QGROUP_STATUS_FLAG_RESCAN
mutex_unlock(&fs_info->qgroup_rescan_lock)
starts transaction, updates qgroup status
item, etc
btrfs_ioctl_quota_rescan()
btrfs_qgroup_rescan()
qgroup_rescan_init()
mutex_lock(&fs_info->qgroup_rescan_lock)
spin_lock(&fs_info->qgroup_lock)
fs_info->qgroup_flags |=
BTRFS_QGROUP_STATUS_FLAG_RESCAN
init_completion(
&fs_info->qgroup_rescan_completion)
fs_info->qgroup_rescan_running = true
mutex_unlock(&fs_info->qgroup_rescan_lock)
spin_unlock(&fs_info->qgroup_lock)
btrfs_init_work()
--> starts another worker
mutex_lock(&fs_info->qgroup_rescan_lock)
fs_info->qgroup_rescan_running = false
mutex_unlock(&fs_info->qgroup_rescan_lock)
complete_all(&fs_info->qgroup_rescan_completion)
Before the rescan worker started by the task at CPU 3 completes, if
another task calls btrfs_ioctl_quota_rescan(), it will get -EINPROGRESS
because the flag BTRFS_QGROUP_STATUS_FLAG_RESCAN is set at
fs_info->qgroup_flags, which is expected and correct behaviour.
However if other task calls btrfs_ioctl_quota_rescan_wait() before the
rescan worker started by the task at CPU 3 completes, it will return
immediately without waiting for the new rescan worker to complete,
because fs_info->qgroup_rescan_running is set to false by CPU 2.
This race is making test case btrfs/171 (from fstests) to fail often:
btrfs/171 9s ... - output mismatch (see /home/fdmanana/git/hub/xfstests/results//btrfs/171.out.bad)
--- tests/btrfs/171.out 2018-09-16 21:30:48.505104287 +0100
+++ /home/fdmanana/git/hub/xfstests/results//btrfs/171.out.bad 2019-09-19 02:01:36.938486039 +0100
@@ -1,2 +1,3 @@
QA output created by 171
+ERROR: quota rescan failed: Operation now in progress
Silence is golden
...
(Run 'diff -u /home/fdmanana/git/hub/xfstests/tests/btrfs/171.out /home/fdmanana/git/hub/xfstests/results//btrfs/171.out.bad' to see the entire diff)
That is because the test calls the btrfs-progs commands "qgroup quota
rescan -w", "qgroup assign" and "qgroup remove" in a sequence that makes
calls to the rescan start ioctl fail with -EINPROGRESS (note the "btrfs"
commands 'qgroup assign' and 'qgroup remove' often call the rescan start
ioctl after calling the qgroup assign ioctl,
btrfs_ioctl_qgroup_assign()), since previous waits didn't actually wait
for a rescan worker to complete.
Another problem the race can cause is missing wake ups for waiters,
since the call to complete_all() happens outside a critical section and
after clearing the flag BTRFS_QGROUP_STATUS_FLAG_RESCAN. In the sequence
diagram above, if we have a waiter for the first rescan task (executed
by CPU 2), then fs_info->qgroup_rescan_completion.wait is not empty, and
if after the rescan worker clears BTRFS_QGROUP_STATUS_FLAG_RESCAN and
before it calls complete_all() against
fs_info->qgroup_rescan_completion, the task at CPU 3 calls
init_completion() against fs_info->qgroup_rescan_completion which
re-initilizes its wait queue to an empty queue, therefore causing the
rescan worker at CPU 2 to call complete_all() against an empty queue,
never waking up the task waiting for that rescan worker.
Fix this by clearing BTRFS_QGROUP_STATUS_FLAG_RESCAN and setting
fs_info->qgroup_rescan_running to false in the same critical section,
delimited by the mutex fs_info->qgroup_rescan_lock, as well as doing the
call to complete_all() in that same critical section. This gives the
protection needed to avoid rescan wait ioctl callers not waiting for a
running rescan worker and the lost wake ups problem, since setting that
rescan flag and boolean as well as initializing the wait queue is done
already in a critical section delimited by that mutex (at
qgroup_rescan_init()).
Fixes: 57254b6ebce4ce ("Btrfs: add ioctl to wait for qgroup rescan completion")
Fixes: d2c609b834d62f ("btrfs: properly track when rescan worker is running")
CC: stable@vger.kernel.org # 4.4+
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-09-24 10:49:54 +01:00
|
|
|
trans = NULL;
|
2015-02-27 16:24:25 +08:00
|
|
|
btrfs_err(fs_info,
|
2017-07-13 15:32:18 +02:00
|
|
|
"fail to start transaction for status update: %d",
|
2015-02-27 16:24:25 +08:00
|
|
|
err);
|
|
|
|
}
|
Btrfs: fix race setting up and completing qgroup rescan workers
There is a race between setting up a qgroup rescan worker and completing
a qgroup rescan worker that can lead to callers of the qgroup rescan wait
ioctl to either not wait for the rescan worker to complete or to hang
forever due to missing wake ups. The following diagram shows a sequence
of steps that illustrates the race.
CPU 1 CPU 2 CPU 3
btrfs_ioctl_quota_rescan()
btrfs_qgroup_rescan()
qgroup_rescan_init()
mutex_lock(&fs_info->qgroup_rescan_lock)
spin_lock(&fs_info->qgroup_lock)
fs_info->qgroup_flags |=
BTRFS_QGROUP_STATUS_FLAG_RESCAN
init_completion(
&fs_info->qgroup_rescan_completion)
fs_info->qgroup_rescan_running = true
mutex_unlock(&fs_info->qgroup_rescan_lock)
spin_unlock(&fs_info->qgroup_lock)
btrfs_init_work()
--> starts the worker
btrfs_qgroup_rescan_worker()
mutex_lock(&fs_info->qgroup_rescan_lock)
fs_info->qgroup_flags &=
~BTRFS_QGROUP_STATUS_FLAG_RESCAN
mutex_unlock(&fs_info->qgroup_rescan_lock)
starts transaction, updates qgroup status
item, etc
btrfs_ioctl_quota_rescan()
btrfs_qgroup_rescan()
qgroup_rescan_init()
mutex_lock(&fs_info->qgroup_rescan_lock)
spin_lock(&fs_info->qgroup_lock)
fs_info->qgroup_flags |=
BTRFS_QGROUP_STATUS_FLAG_RESCAN
init_completion(
&fs_info->qgroup_rescan_completion)
fs_info->qgroup_rescan_running = true
mutex_unlock(&fs_info->qgroup_rescan_lock)
spin_unlock(&fs_info->qgroup_lock)
btrfs_init_work()
--> starts another worker
mutex_lock(&fs_info->qgroup_rescan_lock)
fs_info->qgroup_rescan_running = false
mutex_unlock(&fs_info->qgroup_rescan_lock)
complete_all(&fs_info->qgroup_rescan_completion)
Before the rescan worker started by the task at CPU 3 completes, if
another task calls btrfs_ioctl_quota_rescan(), it will get -EINPROGRESS
because the flag BTRFS_QGROUP_STATUS_FLAG_RESCAN is set at
fs_info->qgroup_flags, which is expected and correct behaviour.
However if other task calls btrfs_ioctl_quota_rescan_wait() before the
rescan worker started by the task at CPU 3 completes, it will return
immediately without waiting for the new rescan worker to complete,
because fs_info->qgroup_rescan_running is set to false by CPU 2.
This race is making test case btrfs/171 (from fstests) to fail often:
btrfs/171 9s ... - output mismatch (see /home/fdmanana/git/hub/xfstests/results//btrfs/171.out.bad)
--- tests/btrfs/171.out 2018-09-16 21:30:48.505104287 +0100
+++ /home/fdmanana/git/hub/xfstests/results//btrfs/171.out.bad 2019-09-19 02:01:36.938486039 +0100
@@ -1,2 +1,3 @@
QA output created by 171
+ERROR: quota rescan failed: Operation now in progress
Silence is golden
...
(Run 'diff -u /home/fdmanana/git/hub/xfstests/tests/btrfs/171.out /home/fdmanana/git/hub/xfstests/results//btrfs/171.out.bad' to see the entire diff)
That is because the test calls the btrfs-progs commands "qgroup quota
rescan -w", "qgroup assign" and "qgroup remove" in a sequence that makes
calls to the rescan start ioctl fail with -EINPROGRESS (note the "btrfs"
commands 'qgroup assign' and 'qgroup remove' often call the rescan start
ioctl after calling the qgroup assign ioctl,
btrfs_ioctl_qgroup_assign()), since previous waits didn't actually wait
for a rescan worker to complete.
Another problem the race can cause is missing wake ups for waiters,
since the call to complete_all() happens outside a critical section and
after clearing the flag BTRFS_QGROUP_STATUS_FLAG_RESCAN. In the sequence
diagram above, if we have a waiter for the first rescan task (executed
by CPU 2), then fs_info->qgroup_rescan_completion.wait is not empty, and
if after the rescan worker clears BTRFS_QGROUP_STATUS_FLAG_RESCAN and
before it calls complete_all() against
fs_info->qgroup_rescan_completion, the task at CPU 3 calls
init_completion() against fs_info->qgroup_rescan_completion which
re-initilizes its wait queue to an empty queue, therefore causing the
rescan worker at CPU 2 to call complete_all() against an empty queue,
never waking up the task waiting for that rescan worker.
Fix this by clearing BTRFS_QGROUP_STATUS_FLAG_RESCAN and setting
fs_info->qgroup_rescan_running to false in the same critical section,
delimited by the mutex fs_info->qgroup_rescan_lock, as well as doing the
call to complete_all() in that same critical section. This gives the
protection needed to avoid rescan wait ioctl callers not waiting for a
running rescan worker and the lost wake ups problem, since setting that
rescan flag and boolean as well as initializing the wait queue is done
already in a critical section delimited by that mutex (at
qgroup_rescan_init()).
Fixes: 57254b6ebce4ce ("Btrfs: add ioctl to wait for qgroup rescan completion")
Fixes: d2c609b834d62f ("btrfs: properly track when rescan worker is running")
CC: stable@vger.kernel.org # 4.4+
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-09-24 10:49:54 +01:00
|
|
|
|
|
|
|
mutex_lock(&fs_info->qgroup_rescan_lock);
|
|
|
|
if (!btrfs_fs_closing(fs_info))
|
|
|
|
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
|
|
|
|
if (trans) {
|
|
|
|
ret = update_qgroup_status_item(trans);
|
|
|
|
if (ret < 0) {
|
|
|
|
err = ret;
|
|
|
|
btrfs_err(fs_info, "fail to update qgroup status: %d",
|
|
|
|
err);
|
|
|
|
}
|
2015-02-27 16:24:25 +08:00
|
|
|
}
|
Btrfs: fix race setting up and completing qgroup rescan workers
There is a race between setting up a qgroup rescan worker and completing
a qgroup rescan worker that can lead to callers of the qgroup rescan wait
ioctl to either not wait for the rescan worker to complete or to hang
forever due to missing wake ups. The following diagram shows a sequence
of steps that illustrates the race.
CPU 1 CPU 2 CPU 3
btrfs_ioctl_quota_rescan()
btrfs_qgroup_rescan()
qgroup_rescan_init()
mutex_lock(&fs_info->qgroup_rescan_lock)
spin_lock(&fs_info->qgroup_lock)
fs_info->qgroup_flags |=
BTRFS_QGROUP_STATUS_FLAG_RESCAN
init_completion(
&fs_info->qgroup_rescan_completion)
fs_info->qgroup_rescan_running = true
mutex_unlock(&fs_info->qgroup_rescan_lock)
spin_unlock(&fs_info->qgroup_lock)
btrfs_init_work()
--> starts the worker
btrfs_qgroup_rescan_worker()
mutex_lock(&fs_info->qgroup_rescan_lock)
fs_info->qgroup_flags &=
~BTRFS_QGROUP_STATUS_FLAG_RESCAN
mutex_unlock(&fs_info->qgroup_rescan_lock)
starts transaction, updates qgroup status
item, etc
btrfs_ioctl_quota_rescan()
btrfs_qgroup_rescan()
qgroup_rescan_init()
mutex_lock(&fs_info->qgroup_rescan_lock)
spin_lock(&fs_info->qgroup_lock)
fs_info->qgroup_flags |=
BTRFS_QGROUP_STATUS_FLAG_RESCAN
init_completion(
&fs_info->qgroup_rescan_completion)
fs_info->qgroup_rescan_running = true
mutex_unlock(&fs_info->qgroup_rescan_lock)
spin_unlock(&fs_info->qgroup_lock)
btrfs_init_work()
--> starts another worker
mutex_lock(&fs_info->qgroup_rescan_lock)
fs_info->qgroup_rescan_running = false
mutex_unlock(&fs_info->qgroup_rescan_lock)
complete_all(&fs_info->qgroup_rescan_completion)
Before the rescan worker started by the task at CPU 3 completes, if
another task calls btrfs_ioctl_quota_rescan(), it will get -EINPROGRESS
because the flag BTRFS_QGROUP_STATUS_FLAG_RESCAN is set at
fs_info->qgroup_flags, which is expected and correct behaviour.
However if other task calls btrfs_ioctl_quota_rescan_wait() before the
rescan worker started by the task at CPU 3 completes, it will return
immediately without waiting for the new rescan worker to complete,
because fs_info->qgroup_rescan_running is set to false by CPU 2.
This race is making test case btrfs/171 (from fstests) to fail often:
btrfs/171 9s ... - output mismatch (see /home/fdmanana/git/hub/xfstests/results//btrfs/171.out.bad)
--- tests/btrfs/171.out 2018-09-16 21:30:48.505104287 +0100
+++ /home/fdmanana/git/hub/xfstests/results//btrfs/171.out.bad 2019-09-19 02:01:36.938486039 +0100
@@ -1,2 +1,3 @@
QA output created by 171
+ERROR: quota rescan failed: Operation now in progress
Silence is golden
...
(Run 'diff -u /home/fdmanana/git/hub/xfstests/tests/btrfs/171.out /home/fdmanana/git/hub/xfstests/results//btrfs/171.out.bad' to see the entire diff)
That is because the test calls the btrfs-progs commands "qgroup quota
rescan -w", "qgroup assign" and "qgroup remove" in a sequence that makes
calls to the rescan start ioctl fail with -EINPROGRESS (note the "btrfs"
commands 'qgroup assign' and 'qgroup remove' often call the rescan start
ioctl after calling the qgroup assign ioctl,
btrfs_ioctl_qgroup_assign()), since previous waits didn't actually wait
for a rescan worker to complete.
Another problem the race can cause is missing wake ups for waiters,
since the call to complete_all() happens outside a critical section and
after clearing the flag BTRFS_QGROUP_STATUS_FLAG_RESCAN. In the sequence
diagram above, if we have a waiter for the first rescan task (executed
by CPU 2), then fs_info->qgroup_rescan_completion.wait is not empty, and
if after the rescan worker clears BTRFS_QGROUP_STATUS_FLAG_RESCAN and
before it calls complete_all() against
fs_info->qgroup_rescan_completion, the task at CPU 3 calls
init_completion() against fs_info->qgroup_rescan_completion which
re-initilizes its wait queue to an empty queue, therefore causing the
rescan worker at CPU 2 to call complete_all() against an empty queue,
never waking up the task waiting for that rescan worker.
Fix this by clearing BTRFS_QGROUP_STATUS_FLAG_RESCAN and setting
fs_info->qgroup_rescan_running to false in the same critical section,
delimited by the mutex fs_info->qgroup_rescan_lock, as well as doing the
call to complete_all() in that same critical section. This gives the
protection needed to avoid rescan wait ioctl callers not waiting for a
running rescan worker and the lost wake ups problem, since setting that
rescan flag and boolean as well as initializing the wait queue is done
already in a critical section delimited by that mutex (at
qgroup_rescan_init()).
Fixes: 57254b6ebce4ce ("Btrfs: add ioctl to wait for qgroup rescan completion")
Fixes: d2c609b834d62f ("btrfs: properly track when rescan worker is running")
CC: stable@vger.kernel.org # 4.4+
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-09-24 10:49:54 +01:00
|
|
|
fs_info->qgroup_rescan_running = false;
|
|
|
|
complete_all(&fs_info->qgroup_rescan_completion);
|
|
|
|
mutex_unlock(&fs_info->qgroup_rescan_lock);
|
|
|
|
|
|
|
|
if (!trans)
|
|
|
|
return;
|
|
|
|
|
2016-09-09 21:39:03 -04:00
|
|
|
btrfs_end_transaction(trans);
|
2015-02-27 16:24:25 +08:00
|
|
|
|
2015-11-04 15:56:16 -08:00
|
|
|
if (btrfs_fs_closing(fs_info)) {
|
|
|
|
btrfs_info(fs_info, "qgroup scan paused");
|
|
|
|
} else if (err >= 0) {
|
2013-12-20 11:37:06 -05:00
|
|
|
btrfs_info(fs_info, "qgroup scan completed%s",
|
2015-02-27 16:24:24 +08:00
|
|
|
err > 0 ? " (inconsistency flag cleared)" : "");
|
2013-04-25 16:04:51 +00:00
|
|
|
} else {
|
2013-12-20 11:37:06 -05:00
|
|
|
btrfs_err(fs_info, "qgroup scan failed with %d", err);
|
2013-04-25 16:04:51 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
Btrfs: fix qgroup rescan resume on mount
When called during mount, we cannot start the rescan worker thread until
open_ctree is done. This commit restuctures the qgroup rescan internals to
enable a clean deferral of the rescan resume operation.
First of all, the struct qgroup_rescan is removed, saving us a malloc and
some initialization synchronizations problems. Its only element (the worker
struct) now lives within fs_info just as the rest of the rescan code.
Then setting up a rescan worker is split into several reusable stages.
Currently we have three different rescan startup scenarios:
(A) rescan ioctl
(B) rescan resume by mount
(C) rescan by quota enable
Each case needs its own combination of the four following steps:
(1) set the progress [A, C: zero; B: state of umount]
(2) commit the transaction [A]
(3) set the counters [A, C: zero; B: state of umount]
(4) start worker [A, B, C]
qgroup_rescan_init does step (1). There's no extra function added to commit
a transaction, we've got that already. qgroup_rescan_zero_tracking does
step (3). Step (4) is nothing more than a call to the generic
btrfs_queue_worker.
We also get rid of a double check for the rescan progress during
btrfs_qgroup_account_ref, which is no longer required due to having step 2
from the list above.
As a side effect, this commit prepares to move the rescan start code from
btrfs_run_qgroups (which is run during commit) to a less time critical
section.
Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2013-05-28 15:47:24 +00:00
|
|
|
/*
|
|
|
|
* Checks that (a) no rescan is running and (b) quota is enabled. Allocates all
|
|
|
|
* memory required for the rescan context.
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
|
|
|
|
int init_flags)
|
2013-04-25 16:04:51 +00:00
|
|
|
{
|
|
|
|
int ret = 0;
|
|
|
|
|
2018-05-02 13:28:03 +08:00
|
|
|
if (!init_flags) {
|
|
|
|
/* we're resuming qgroup rescan at mount time */
|
2018-06-27 00:43:15 +01:00
|
|
|
if (!(fs_info->qgroup_flags &
|
|
|
|
BTRFS_QGROUP_STATUS_FLAG_RESCAN)) {
|
2018-05-02 13:28:03 +08:00
|
|
|
btrfs_warn(fs_info,
|
2019-11-18 14:16:44 +02:00
|
|
|
"qgroup rescan init failed, qgroup rescan is not queued");
|
2018-06-27 00:43:15 +01:00
|
|
|
ret = -EINVAL;
|
|
|
|
} else if (!(fs_info->qgroup_flags &
|
|
|
|
BTRFS_QGROUP_STATUS_FLAG_ON)) {
|
2018-05-02 13:28:03 +08:00
|
|
|
btrfs_warn(fs_info,
|
2019-11-18 14:16:44 +02:00
|
|
|
"qgroup rescan init failed, qgroup is not enabled");
|
2018-06-27 00:43:15 +01:00
|
|
|
ret = -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
Btrfs: fix qgroup rescan resume on mount
When called during mount, we cannot start the rescan worker thread until
open_ctree is done. This commit restuctures the qgroup rescan internals to
enable a clean deferral of the rescan resume operation.
First of all, the struct qgroup_rescan is removed, saving us a malloc and
some initialization synchronizations problems. Its only element (the worker
struct) now lives within fs_info just as the rest of the rescan code.
Then setting up a rescan worker is split into several reusable stages.
Currently we have three different rescan startup scenarios:
(A) rescan ioctl
(B) rescan resume by mount
(C) rescan by quota enable
Each case needs its own combination of the four following steps:
(1) set the progress [A, C: zero; B: state of umount]
(2) commit the transaction [A]
(3) set the counters [A, C: zero; B: state of umount]
(4) start worker [A, B, C]
qgroup_rescan_init does step (1). There's no extra function added to commit
a transaction, we've got that already. qgroup_rescan_zero_tracking does
step (3). Step (4) is nothing more than a call to the generic
btrfs_queue_worker.
We also get rid of a double check for the rescan progress during
btrfs_qgroup_account_ref, which is no longer required due to having step 2
from the list above.
As a side effect, this commit prepares to move the rescan start code from
btrfs_run_qgroups (which is run during commit) to a less time critical
section.
Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2013-05-28 15:47:24 +00:00
|
|
|
}
|
2013-04-25 16:04:51 +00:00
|
|
|
|
|
|
|
mutex_lock(&fs_info->qgroup_rescan_lock);
|
Btrfs: fix qgroup rescan resume on mount
When called during mount, we cannot start the rescan worker thread until
open_ctree is done. This commit restuctures the qgroup rescan internals to
enable a clean deferral of the rescan resume operation.
First of all, the struct qgroup_rescan is removed, saving us a malloc and
some initialization synchronizations problems. Its only element (the worker
struct) now lives within fs_info just as the rest of the rescan code.
Then setting up a rescan worker is split into several reusable stages.
Currently we have three different rescan startup scenarios:
(A) rescan ioctl
(B) rescan resume by mount
(C) rescan by quota enable
Each case needs its own combination of the four following steps:
(1) set the progress [A, C: zero; B: state of umount]
(2) commit the transaction [A]
(3) set the counters [A, C: zero; B: state of umount]
(4) start worker [A, B, C]
qgroup_rescan_init does step (1). There's no extra function added to commit
a transaction, we've got that already. qgroup_rescan_zero_tracking does
step (3). Step (4) is nothing more than a call to the generic
btrfs_queue_worker.
We also get rid of a double check for the rescan progress during
btrfs_qgroup_account_ref, which is no longer required due to having step 2
from the list above.
As a side effect, this commit prepares to move the rescan start code from
btrfs_run_qgroups (which is run during commit) to a less time critical
section.
Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2013-05-28 15:47:24 +00:00
|
|
|
|
|
|
|
if (init_flags) {
|
2018-05-02 13:28:03 +08:00
|
|
|
if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
|
|
|
|
btrfs_warn(fs_info,
|
|
|
|
"qgroup rescan is already in progress");
|
Btrfs: fix qgroup rescan resume on mount
When called during mount, we cannot start the rescan worker thread until
open_ctree is done. This commit restuctures the qgroup rescan internals to
enable a clean deferral of the rescan resume operation.
First of all, the struct qgroup_rescan is removed, saving us a malloc and
some initialization synchronizations problems. Its only element (the worker
struct) now lives within fs_info just as the rest of the rescan code.
Then setting up a rescan worker is split into several reusable stages.
Currently we have three different rescan startup scenarios:
(A) rescan ioctl
(B) rescan resume by mount
(C) rescan by quota enable
Each case needs its own combination of the four following steps:
(1) set the progress [A, C: zero; B: state of umount]
(2) commit the transaction [A]
(3) set the counters [A, C: zero; B: state of umount]
(4) start worker [A, B, C]
qgroup_rescan_init does step (1). There's no extra function added to commit
a transaction, we've got that already. qgroup_rescan_zero_tracking does
step (3). Step (4) is nothing more than a call to the generic
btrfs_queue_worker.
We also get rid of a double check for the rescan progress during
btrfs_qgroup_account_ref, which is no longer required due to having step 2
from the list above.
As a side effect, this commit prepares to move the rescan start code from
btrfs_run_qgroups (which is run during commit) to a less time critical
section.
Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2013-05-28 15:47:24 +00:00
|
|
|
ret = -EINPROGRESS;
|
2018-05-02 13:28:03 +08:00
|
|
|
} else if (!(fs_info->qgroup_flags &
|
|
|
|
BTRFS_QGROUP_STATUS_FLAG_ON)) {
|
|
|
|
btrfs_warn(fs_info,
|
|
|
|
"qgroup rescan init failed, qgroup is not enabled");
|
Btrfs: fix qgroup rescan resume on mount
When called during mount, we cannot start the rescan worker thread until
open_ctree is done. This commit restuctures the qgroup rescan internals to
enable a clean deferral of the rescan resume operation.
First of all, the struct qgroup_rescan is removed, saving us a malloc and
some initialization synchronizations problems. Its only element (the worker
struct) now lives within fs_info just as the rest of the rescan code.
Then setting up a rescan worker is split into several reusable stages.
Currently we have three different rescan startup scenarios:
(A) rescan ioctl
(B) rescan resume by mount
(C) rescan by quota enable
Each case needs its own combination of the four following steps:
(1) set the progress [A, C: zero; B: state of umount]
(2) commit the transaction [A]
(3) set the counters [A, C: zero; B: state of umount]
(4) start worker [A, B, C]
qgroup_rescan_init does step (1). There's no extra function added to commit
a transaction, we've got that already. qgroup_rescan_zero_tracking does
step (3). Step (4) is nothing more than a call to the generic
btrfs_queue_worker.
We also get rid of a double check for the rescan progress during
btrfs_qgroup_account_ref, which is no longer required due to having step 2
from the list above.
As a side effect, this commit prepares to move the rescan start code from
btrfs_run_qgroups (which is run during commit) to a less time critical
section.
Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2013-05-28 15:47:24 +00:00
|
|
|
ret = -EINVAL;
|
2018-05-02 13:28:03 +08:00
|
|
|
}
|
Btrfs: fix qgroup rescan resume on mount
When called during mount, we cannot start the rescan worker thread until
open_ctree is done. This commit restuctures the qgroup rescan internals to
enable a clean deferral of the rescan resume operation.
First of all, the struct qgroup_rescan is removed, saving us a malloc and
some initialization synchronizations problems. Its only element (the worker
struct) now lives within fs_info just as the rest of the rescan code.
Then setting up a rescan worker is split into several reusable stages.
Currently we have three different rescan startup scenarios:
(A) rescan ioctl
(B) rescan resume by mount
(C) rescan by quota enable
Each case needs its own combination of the four following steps:
(1) set the progress [A, C: zero; B: state of umount]
(2) commit the transaction [A]
(3) set the counters [A, C: zero; B: state of umount]
(4) start worker [A, B, C]
qgroup_rescan_init does step (1). There's no extra function added to commit
a transaction, we've got that already. qgroup_rescan_zero_tracking does
step (3). Step (4) is nothing more than a call to the generic
btrfs_queue_worker.
We also get rid of a double check for the rescan progress during
btrfs_qgroup_account_ref, which is no longer required due to having step 2
from the list above.
As a side effect, this commit prepares to move the rescan start code from
btrfs_run_qgroups (which is run during commit) to a less time critical
section.
Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2013-05-28 15:47:24 +00:00
|
|
|
|
|
|
|
if (ret) {
|
|
|
|
mutex_unlock(&fs_info->qgroup_rescan_lock);
|
2018-05-02 13:28:03 +08:00
|
|
|
return ret;
|
Btrfs: fix qgroup rescan resume on mount
When called during mount, we cannot start the rescan worker thread until
open_ctree is done. This commit restuctures the qgroup rescan internals to
enable a clean deferral of the rescan resume operation.
First of all, the struct qgroup_rescan is removed, saving us a malloc and
some initialization synchronizations problems. Its only element (the worker
struct) now lives within fs_info just as the rest of the rescan code.
Then setting up a rescan worker is split into several reusable stages.
Currently we have three different rescan startup scenarios:
(A) rescan ioctl
(B) rescan resume by mount
(C) rescan by quota enable
Each case needs its own combination of the four following steps:
(1) set the progress [A, C: zero; B: state of umount]
(2) commit the transaction [A]
(3) set the counters [A, C: zero; B: state of umount]
(4) start worker [A, B, C]
qgroup_rescan_init does step (1). There's no extra function added to commit
a transaction, we've got that already. qgroup_rescan_zero_tracking does
step (3). Step (4) is nothing more than a call to the generic
btrfs_queue_worker.
We also get rid of a double check for the rescan progress during
btrfs_qgroup_account_ref, which is no longer required due to having step 2
from the list above.
As a side effect, this commit prepares to move the rescan start code from
btrfs_run_qgroups (which is run during commit) to a less time critical
section.
Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2013-05-28 15:47:24 +00:00
|
|
|
}
|
|
|
|
fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN;
|
2013-04-25 16:04:51 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
memset(&fs_info->qgroup_rescan_progress, 0,
|
|
|
|
sizeof(fs_info->qgroup_rescan_progress));
|
Btrfs: fix qgroup rescan resume on mount
When called during mount, we cannot start the rescan worker thread until
open_ctree is done. This commit restuctures the qgroup rescan internals to
enable a clean deferral of the rescan resume operation.
First of all, the struct qgroup_rescan is removed, saving us a malloc and
some initialization synchronizations problems. Its only element (the worker
struct) now lives within fs_info just as the rest of the rescan code.
Then setting up a rescan worker is split into several reusable stages.
Currently we have three different rescan startup scenarios:
(A) rescan ioctl
(B) rescan resume by mount
(C) rescan by quota enable
Each case needs its own combination of the four following steps:
(1) set the progress [A, C: zero; B: state of umount]
(2) commit the transaction [A]
(3) set the counters [A, C: zero; B: state of umount]
(4) start worker [A, B, C]
qgroup_rescan_init does step (1). There's no extra function added to commit
a transaction, we've got that already. qgroup_rescan_zero_tracking does
step (3). Step (4) is nothing more than a call to the generic
btrfs_queue_worker.
We also get rid of a double check for the rescan progress during
btrfs_qgroup_account_ref, which is no longer required due to having step 2
from the list above.
As a side effect, this commit prepares to move the rescan start code from
btrfs_run_qgroups (which is run during commit) to a less time critical
section.
Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2013-05-28 15:47:24 +00:00
|
|
|
fs_info->qgroup_rescan_progress.objectid = progress_objectid;
|
2015-11-05 10:06:23 +00:00
|
|
|
init_completion(&fs_info->qgroup_rescan_completion);
|
Btrfs: fix qgroup rescan resume on mount
When called during mount, we cannot start the rescan worker thread until
open_ctree is done. This commit restuctures the qgroup rescan internals to
enable a clean deferral of the rescan resume operation.
First of all, the struct qgroup_rescan is removed, saving us a malloc and
some initialization synchronizations problems. Its only element (the worker
struct) now lives within fs_info just as the rest of the rescan code.
Then setting up a rescan worker is split into several reusable stages.
Currently we have three different rescan startup scenarios:
(A) rescan ioctl
(B) rescan resume by mount
(C) rescan by quota enable
Each case needs its own combination of the four following steps:
(1) set the progress [A, C: zero; B: state of umount]
(2) commit the transaction [A]
(3) set the counters [A, C: zero; B: state of umount]
(4) start worker [A, B, C]
qgroup_rescan_init does step (1). There's no extra function added to commit
a transaction, we've got that already. qgroup_rescan_zero_tracking does
step (3). Step (4) is nothing more than a call to the generic
btrfs_queue_worker.
We also get rid of a double check for the rescan progress during
btrfs_qgroup_account_ref, which is no longer required due to having step 2
from the list above.
As a side effect, this commit prepares to move the rescan start code from
btrfs_run_qgroups (which is run during commit) to a less time critical
section.
Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2013-05-28 15:47:24 +00:00
|
|
|
mutex_unlock(&fs_info->qgroup_rescan_lock);
|
|
|
|
|
2014-02-28 10:46:16 +08:00
|
|
|
btrfs_init_work(&fs_info->qgroup_rescan_work,
|
|
|
|
btrfs_qgroup_rescan_worker, NULL, NULL);
|
Btrfs: fix qgroup rescan resume on mount
When called during mount, we cannot start the rescan worker thread until
open_ctree is done. This commit restuctures the qgroup rescan internals to
enable a clean deferral of the rescan resume operation.
First of all, the struct qgroup_rescan is removed, saving us a malloc and
some initialization synchronizations problems. Its only element (the worker
struct) now lives within fs_info just as the rest of the rescan code.
Then setting up a rescan worker is split into several reusable stages.
Currently we have three different rescan startup scenarios:
(A) rescan ioctl
(B) rescan resume by mount
(C) rescan by quota enable
Each case needs its own combination of the four following steps:
(1) set the progress [A, C: zero; B: state of umount]
(2) commit the transaction [A]
(3) set the counters [A, C: zero; B: state of umount]
(4) start worker [A, B, C]
qgroup_rescan_init does step (1). There's no extra function added to commit
a transaction, we've got that already. qgroup_rescan_zero_tracking does
step (3). Step (4) is nothing more than a call to the generic
btrfs_queue_worker.
We also get rid of a double check for the rescan progress during
btrfs_qgroup_account_ref, which is no longer required due to having step 2
from the list above.
As a side effect, this commit prepares to move the rescan start code from
btrfs_run_qgroups (which is run during commit) to a less time critical
section.
Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2013-05-28 15:47:24 +00:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info)
|
|
|
|
{
|
|
|
|
struct rb_node *n;
|
|
|
|
struct btrfs_qgroup *qgroup;
|
|
|
|
|
|
|
|
spin_lock(&fs_info->qgroup_lock);
|
2013-04-25 16:04:51 +00:00
|
|
|
/* clear all current qgroup tracking information */
|
|
|
|
for (n = rb_first(&fs_info->qgroup_tree); n; n = rb_next(n)) {
|
|
|
|
qgroup = rb_entry(n, struct btrfs_qgroup, node);
|
|
|
|
qgroup->rfer = 0;
|
|
|
|
qgroup->rfer_cmpr = 0;
|
|
|
|
qgroup->excl = 0;
|
|
|
|
qgroup->excl_cmpr = 0;
|
2018-08-10 10:20:26 +08:00
|
|
|
qgroup_dirty(fs_info, qgroup);
|
2013-04-25 16:04:51 +00:00
|
|
|
}
|
|
|
|
spin_unlock(&fs_info->qgroup_lock);
|
Btrfs: fix qgroup rescan resume on mount
When called during mount, we cannot start the rescan worker thread until
open_ctree is done. This commit restuctures the qgroup rescan internals to
enable a clean deferral of the rescan resume operation.
First of all, the struct qgroup_rescan is removed, saving us a malloc and
some initialization synchronizations problems. Its only element (the worker
struct) now lives within fs_info just as the rest of the rescan code.
Then setting up a rescan worker is split into several reusable stages.
Currently we have three different rescan startup scenarios:
(A) rescan ioctl
(B) rescan resume by mount
(C) rescan by quota enable
Each case needs its own combination of the four following steps:
(1) set the progress [A, C: zero; B: state of umount]
(2) commit the transaction [A]
(3) set the counters [A, C: zero; B: state of umount]
(4) start worker [A, B, C]
qgroup_rescan_init does step (1). There's no extra function added to commit
a transaction, we've got that already. qgroup_rescan_zero_tracking does
step (3). Step (4) is nothing more than a call to the generic
btrfs_queue_worker.
We also get rid of a double check for the rescan progress during
btrfs_qgroup_account_ref, which is no longer required due to having step 2
from the list above.
As a side effect, this commit prepares to move the rescan start code from
btrfs_run_qgroups (which is run during commit) to a less time critical
section.
Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2013-05-28 15:47:24 +00:00
|
|
|
}
|
2013-04-25 16:04:51 +00:00
|
|
|
|
Btrfs: fix qgroup rescan resume on mount
When called during mount, we cannot start the rescan worker thread until
open_ctree is done. This commit restuctures the qgroup rescan internals to
enable a clean deferral of the rescan resume operation.
First of all, the struct qgroup_rescan is removed, saving us a malloc and
some initialization synchronizations problems. Its only element (the worker
struct) now lives within fs_info just as the rest of the rescan code.
Then setting up a rescan worker is split into several reusable stages.
Currently we have three different rescan startup scenarios:
(A) rescan ioctl
(B) rescan resume by mount
(C) rescan by quota enable
Each case needs its own combination of the four following steps:
(1) set the progress [A, C: zero; B: state of umount]
(2) commit the transaction [A]
(3) set the counters [A, C: zero; B: state of umount]
(4) start worker [A, B, C]
qgroup_rescan_init does step (1). There's no extra function added to commit
a transaction, we've got that already. qgroup_rescan_zero_tracking does
step (3). Step (4) is nothing more than a call to the generic
btrfs_queue_worker.
We also get rid of a double check for the rescan progress during
btrfs_qgroup_account_ref, which is no longer required due to having step 2
from the list above.
As a side effect, this commit prepares to move the rescan start code from
btrfs_run_qgroups (which is run during commit) to a less time critical
section.
Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2013-05-28 15:47:24 +00:00
|
|
|
int
|
|
|
|
btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)
|
|
|
|
{
|
|
|
|
int ret = 0;
|
|
|
|
struct btrfs_trans_handle *trans;
|
|
|
|
|
|
|
|
ret = qgroup_rescan_init(fs_info, 0, 1);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We have set the rescan_progress to 0, which means no more
|
|
|
|
* delayed refs will be accounted by btrfs_qgroup_account_ref.
|
|
|
|
* However, btrfs_qgroup_account_ref may be right after its call
|
|
|
|
* to btrfs_find_all_roots, in which case it would still do the
|
|
|
|
* accounting.
|
|
|
|
* To solve this, we're committing the transaction, which will
|
|
|
|
* ensure we run all delayed refs and only after that, we are
|
|
|
|
* going to clear all tracking information for a clean start.
|
|
|
|
*/
|
|
|
|
|
|
|
|
trans = btrfs_join_transaction(fs_info->fs_root);
|
|
|
|
if (IS_ERR(trans)) {
|
|
|
|
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
|
|
|
|
return PTR_ERR(trans);
|
|
|
|
}
|
2016-09-09 21:39:03 -04:00
|
|
|
ret = btrfs_commit_transaction(trans);
|
Btrfs: fix qgroup rescan resume on mount
When called during mount, we cannot start the rescan worker thread until
open_ctree is done. This commit restuctures the qgroup rescan internals to
enable a clean deferral of the rescan resume operation.
First of all, the struct qgroup_rescan is removed, saving us a malloc and
some initialization synchronizations problems. Its only element (the worker
struct) now lives within fs_info just as the rest of the rescan code.
Then setting up a rescan worker is split into several reusable stages.
Currently we have three different rescan startup scenarios:
(A) rescan ioctl
(B) rescan resume by mount
(C) rescan by quota enable
Each case needs its own combination of the four following steps:
(1) set the progress [A, C: zero; B: state of umount]
(2) commit the transaction [A]
(3) set the counters [A, C: zero; B: state of umount]
(4) start worker [A, B, C]
qgroup_rescan_init does step (1). There's no extra function added to commit
a transaction, we've got that already. qgroup_rescan_zero_tracking does
step (3). Step (4) is nothing more than a call to the generic
btrfs_queue_worker.
We also get rid of a double check for the rescan progress during
btrfs_qgroup_account_ref, which is no longer required due to having step 2
from the list above.
As a side effect, this commit prepares to move the rescan start code from
btrfs_run_qgroups (which is run during commit) to a less time critical
section.
Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2013-05-28 15:47:24 +00:00
|
|
|
if (ret) {
|
|
|
|
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
qgroup_rescan_zero_tracking(fs_info);
|
|
|
|
|
2020-02-07 13:38:20 +08:00
|
|
|
mutex_lock(&fs_info->qgroup_rescan_lock);
|
|
|
|
fs_info->qgroup_rescan_running = true;
|
2014-02-28 10:46:16 +08:00
|
|
|
btrfs_queue_work(fs_info->qgroup_rescan_workers,
|
|
|
|
&fs_info->qgroup_rescan_work);
|
2020-02-07 13:38:20 +08:00
|
|
|
mutex_unlock(&fs_info->qgroup_rescan_lock);
|
2013-04-25 16:04:51 +00:00
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
2013-05-06 19:14:17 +00:00
|
|
|
|
2016-08-08 22:08:06 -04:00
|
|
|
int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info,
|
|
|
|
bool interruptible)
|
2013-05-06 19:14:17 +00:00
|
|
|
{
|
|
|
|
int running;
|
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
mutex_lock(&fs_info->qgroup_rescan_lock);
|
2016-08-15 12:10:33 -04:00
|
|
|
running = fs_info->qgroup_rescan_running;
|
2013-05-06 19:14:17 +00:00
|
|
|
mutex_unlock(&fs_info->qgroup_rescan_lock);
|
|
|
|
|
2016-08-08 22:08:06 -04:00
|
|
|
if (!running)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
if (interruptible)
|
2013-05-06 19:14:17 +00:00
|
|
|
ret = wait_for_completion_interruptible(
|
|
|
|
&fs_info->qgroup_rescan_completion);
|
2016-08-08 22:08:06 -04:00
|
|
|
else
|
|
|
|
wait_for_completion(&fs_info->qgroup_rescan_completion);
|
2013-05-06 19:14:17 +00:00
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
Btrfs: fix qgroup rescan resume on mount
When called during mount, we cannot start the rescan worker thread until
open_ctree is done. This commit restuctures the qgroup rescan internals to
enable a clean deferral of the rescan resume operation.
First of all, the struct qgroup_rescan is removed, saving us a malloc and
some initialization synchronizations problems. Its only element (the worker
struct) now lives within fs_info just as the rest of the rescan code.
Then setting up a rescan worker is split into several reusable stages.
Currently we have three different rescan startup scenarios:
(A) rescan ioctl
(B) rescan resume by mount
(C) rescan by quota enable
Each case needs its own combination of the four following steps:
(1) set the progress [A, C: zero; B: state of umount]
(2) commit the transaction [A]
(3) set the counters [A, C: zero; B: state of umount]
(4) start worker [A, B, C]
qgroup_rescan_init does step (1). There's no extra function added to commit
a transaction, we've got that already. qgroup_rescan_zero_tracking does
step (3). Step (4) is nothing more than a call to the generic
btrfs_queue_worker.
We also get rid of a double check for the rescan progress during
btrfs_qgroup_account_ref, which is no longer required due to having step 2
from the list above.
As a side effect, this commit prepares to move the rescan start code from
btrfs_run_qgroups (which is run during commit) to a less time critical
section.
Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2013-05-28 15:47:24 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* this is only called from open_ctree where we're still single threaded, thus
|
|
|
|
* locking is omitted here.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info)
|
|
|
|
{
|
2020-02-07 13:38:20 +08:00
|
|
|
if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
|
|
|
|
mutex_lock(&fs_info->qgroup_rescan_lock);
|
|
|
|
fs_info->qgroup_rescan_running = true;
|
2014-02-28 10:46:16 +08:00
|
|
|
btrfs_queue_work(fs_info->qgroup_rescan_workers,
|
|
|
|
&fs_info->qgroup_rescan_work);
|
2020-02-07 13:38:20 +08:00
|
|
|
mutex_unlock(&fs_info->qgroup_rescan_lock);
|
|
|
|
}
|
Btrfs: fix qgroup rescan resume on mount
When called during mount, we cannot start the rescan worker thread until
open_ctree is done. This commit restuctures the qgroup rescan internals to
enable a clean deferral of the rescan resume operation.
First of all, the struct qgroup_rescan is removed, saving us a malloc and
some initialization synchronizations problems. Its only element (the worker
struct) now lives within fs_info just as the rest of the rescan code.
Then setting up a rescan worker is split into several reusable stages.
Currently we have three different rescan startup scenarios:
(A) rescan ioctl
(B) rescan resume by mount
(C) rescan by quota enable
Each case needs its own combination of the four following steps:
(1) set the progress [A, C: zero; B: state of umount]
(2) commit the transaction [A]
(3) set the counters [A, C: zero; B: state of umount]
(4) start worker [A, B, C]
qgroup_rescan_init does step (1). There's no extra function added to commit
a transaction, we've got that already. qgroup_rescan_zero_tracking does
step (3). Step (4) is nothing more than a call to the generic
btrfs_queue_worker.
We also get rid of a double check for the rescan progress during
btrfs_qgroup_account_ref, which is no longer required due to having step 2
from the list above.
As a side effect, this commit prepares to move the rescan start code from
btrfs_run_qgroups (which is run during commit) to a less time critical
section.
Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2013-05-28 15:47:24 +00:00
|
|
|
}
|
2015-10-12 16:05:40 +08:00
|
|
|
|
btrfs: qgroup: allow to unreserve range without releasing other ranges
[PROBLEM]
Before this patch, when btrfs_qgroup_reserve_data() fails, we free all
reserved space of the changeset.
For example:
ret = btrfs_qgroup_reserve_data(inode, changeset, 0, SZ_1M);
ret = btrfs_qgroup_reserve_data(inode, changeset, SZ_1M, SZ_1M);
ret = btrfs_qgroup_reserve_data(inode, changeset, SZ_2M, SZ_1M);
If the last btrfs_qgroup_reserve_data() failed, it will release the
entire [0, 3M) range.
This behavior is kind of OK for now, as when we hit -EDQUOT, we normally
go error handling and need to release all reserved ranges anyway.
But this also means the following call is not possible:
ret = btrfs_qgroup_reserve_data();
if (ret == -EDQUOT) {
/* Do something to free some qgroup space */
ret = btrfs_qgroup_reserve_data();
}
As if the first btrfs_qgroup_reserve_data() fails, it will free all
reserved qgroup space.
[CAUSE]
This is because we release all reserved ranges when
btrfs_qgroup_reserve_data() fails.
[FIX]
This patch will implement a new function, qgroup_unreserve_range(), to
iterate through the ulist nodes, to find any nodes in the failure range,
and remove the EXTENT_QGROUP_RESERVED bits from the io_tree, and
decrease the extent_changeset::bytes_changed, so that we can revert to
previous state.
This allows later patches to retry btrfs_qgroup_reserve_data() if EDQUOT
happens.
Suggested-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-07-08 14:24:45 +08:00
|
|
|
#define rbtree_iterate_from_safe(node, next, start) \
|
|
|
|
for (node = start; node && ({ next = rb_next(node); 1;}); node = next)
|
|
|
|
|
|
|
|
static int qgroup_unreserve_range(struct btrfs_inode *inode,
|
|
|
|
struct extent_changeset *reserved, u64 start,
|
|
|
|
u64 len)
|
|
|
|
{
|
|
|
|
struct rb_node *node;
|
|
|
|
struct rb_node *next;
|
2020-10-23 14:26:33 +03:00
|
|
|
struct ulist_node *entry;
|
btrfs: qgroup: allow to unreserve range without releasing other ranges
[PROBLEM]
Before this patch, when btrfs_qgroup_reserve_data() fails, we free all
reserved space of the changeset.
For example:
ret = btrfs_qgroup_reserve_data(inode, changeset, 0, SZ_1M);
ret = btrfs_qgroup_reserve_data(inode, changeset, SZ_1M, SZ_1M);
ret = btrfs_qgroup_reserve_data(inode, changeset, SZ_2M, SZ_1M);
If the last btrfs_qgroup_reserve_data() failed, it will release the
entire [0, 3M) range.
This behavior is kind of OK for now, as when we hit -EDQUOT, we normally
go error handling and need to release all reserved ranges anyway.
But this also means the following call is not possible:
ret = btrfs_qgroup_reserve_data();
if (ret == -EDQUOT) {
/* Do something to free some qgroup space */
ret = btrfs_qgroup_reserve_data();
}
As if the first btrfs_qgroup_reserve_data() fails, it will free all
reserved qgroup space.
[CAUSE]
This is because we release all reserved ranges when
btrfs_qgroup_reserve_data() fails.
[FIX]
This patch will implement a new function, qgroup_unreserve_range(), to
iterate through the ulist nodes, to find any nodes in the failure range,
and remove the EXTENT_QGROUP_RESERVED bits from the io_tree, and
decrease the extent_changeset::bytes_changed, so that we can revert to
previous state.
This allows later patches to retry btrfs_qgroup_reserve_data() if EDQUOT
happens.
Suggested-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-07-08 14:24:45 +08:00
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
node = reserved->range_changed.root.rb_node;
|
2020-10-23 14:26:33 +03:00
|
|
|
if (!node)
|
|
|
|
return 0;
|
btrfs: qgroup: allow to unreserve range without releasing other ranges
[PROBLEM]
Before this patch, when btrfs_qgroup_reserve_data() fails, we free all
reserved space of the changeset.
For example:
ret = btrfs_qgroup_reserve_data(inode, changeset, 0, SZ_1M);
ret = btrfs_qgroup_reserve_data(inode, changeset, SZ_1M, SZ_1M);
ret = btrfs_qgroup_reserve_data(inode, changeset, SZ_2M, SZ_1M);
If the last btrfs_qgroup_reserve_data() failed, it will release the
entire [0, 3M) range.
This behavior is kind of OK for now, as when we hit -EDQUOT, we normally
go error handling and need to release all reserved ranges anyway.
But this also means the following call is not possible:
ret = btrfs_qgroup_reserve_data();
if (ret == -EDQUOT) {
/* Do something to free some qgroup space */
ret = btrfs_qgroup_reserve_data();
}
As if the first btrfs_qgroup_reserve_data() fails, it will free all
reserved qgroup space.
[CAUSE]
This is because we release all reserved ranges when
btrfs_qgroup_reserve_data() fails.
[FIX]
This patch will implement a new function, qgroup_unreserve_range(), to
iterate through the ulist nodes, to find any nodes in the failure range,
and remove the EXTENT_QGROUP_RESERVED bits from the io_tree, and
decrease the extent_changeset::bytes_changed, so that we can revert to
previous state.
This allows later patches to retry btrfs_qgroup_reserve_data() if EDQUOT
happens.
Suggested-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-07-08 14:24:45 +08:00
|
|
|
while (node) {
|
|
|
|
entry = rb_entry(node, struct ulist_node, rb_node);
|
|
|
|
if (entry->val < start)
|
|
|
|
node = node->rb_right;
|
|
|
|
else
|
2020-10-23 14:26:33 +03:00
|
|
|
node = node->rb_left;
|
btrfs: qgroup: allow to unreserve range without releasing other ranges
[PROBLEM]
Before this patch, when btrfs_qgroup_reserve_data() fails, we free all
reserved space of the changeset.
For example:
ret = btrfs_qgroup_reserve_data(inode, changeset, 0, SZ_1M);
ret = btrfs_qgroup_reserve_data(inode, changeset, SZ_1M, SZ_1M);
ret = btrfs_qgroup_reserve_data(inode, changeset, SZ_2M, SZ_1M);
If the last btrfs_qgroup_reserve_data() failed, it will release the
entire [0, 3M) range.
This behavior is kind of OK for now, as when we hit -EDQUOT, we normally
go error handling and need to release all reserved ranges anyway.
But this also means the following call is not possible:
ret = btrfs_qgroup_reserve_data();
if (ret == -EDQUOT) {
/* Do something to free some qgroup space */
ret = btrfs_qgroup_reserve_data();
}
As if the first btrfs_qgroup_reserve_data() fails, it will free all
reserved qgroup space.
[CAUSE]
This is because we release all reserved ranges when
btrfs_qgroup_reserve_data() fails.
[FIX]
This patch will implement a new function, qgroup_unreserve_range(), to
iterate through the ulist nodes, to find any nodes in the failure range,
and remove the EXTENT_QGROUP_RESERVED bits from the io_tree, and
decrease the extent_changeset::bytes_changed, so that we can revert to
previous state.
This allows later patches to retry btrfs_qgroup_reserve_data() if EDQUOT
happens.
Suggested-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-07-08 14:24:45 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
if (entry->val > start && rb_prev(&entry->rb_node))
|
|
|
|
entry = rb_entry(rb_prev(&entry->rb_node), struct ulist_node,
|
|
|
|
rb_node);
|
|
|
|
|
|
|
|
rbtree_iterate_from_safe(node, next, &entry->rb_node) {
|
|
|
|
u64 entry_start;
|
|
|
|
u64 entry_end;
|
|
|
|
u64 entry_len;
|
|
|
|
int clear_ret;
|
|
|
|
|
|
|
|
entry = rb_entry(node, struct ulist_node, rb_node);
|
|
|
|
entry_start = entry->val;
|
|
|
|
entry_end = entry->aux;
|
|
|
|
entry_len = entry_end - entry_start + 1;
|
|
|
|
|
|
|
|
if (entry_start >= start + len)
|
|
|
|
break;
|
|
|
|
if (entry_start + entry_len <= start)
|
|
|
|
continue;
|
|
|
|
/*
|
|
|
|
* Now the entry is in [start, start + len), revert the
|
|
|
|
* EXTENT_QGROUP_RESERVED bit.
|
|
|
|
*/
|
|
|
|
clear_ret = clear_extent_bits(&inode->io_tree, entry_start,
|
|
|
|
entry_end, EXTENT_QGROUP_RESERVED);
|
|
|
|
if (!ret && clear_ret < 0)
|
|
|
|
ret = clear_ret;
|
|
|
|
|
|
|
|
ulist_del(&reserved->range_changed, entry->val, entry->aux);
|
|
|
|
if (likely(reserved->bytes_changed >= entry_len)) {
|
|
|
|
reserved->bytes_changed -= entry_len;
|
|
|
|
} else {
|
|
|
|
WARN_ON(1);
|
|
|
|
reserved->bytes_changed = 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2015-10-12 16:05:40 +08:00
|
|
|
/*
|
btrfs: qgroup: try to flush qgroup space when we get -EDQUOT
[PROBLEM]
There are known problem related to how btrfs handles qgroup reserved
space. One of the most obvious case is the the test case btrfs/153,
which do fallocate, then write into the preallocated range.
btrfs/153 1s ... - output mismatch (see xfstests-dev/results//btrfs/153.out.bad)
--- tests/btrfs/153.out 2019-10-22 15:18:14.068965341 +0800
+++ xfstests-dev/results//btrfs/153.out.bad 2020-07-01 20:24:40.730000089 +0800
@@ -1,2 +1,5 @@
QA output created by 153
+pwrite: Disk quota exceeded
+/mnt/scratch/testfile2: Disk quota exceeded
+/mnt/scratch/testfile2: Disk quota exceeded
Silence is golden
...
(Run 'diff -u xfstests-dev/tests/btrfs/153.out xfstests-dev/results//btrfs/153.out.bad' to see the entire diff)
[CAUSE]
Since commit c6887cd11149 ("Btrfs: don't do nocow check unless we have to"),
we always reserve space no matter if it's COW or not.
Such behavior change is mostly for performance, and reverting it is not
a good idea anyway.
For preallcoated extent, we reserve qgroup data space for it already,
and since we also reserve data space for qgroup at buffered write time,
it needs twice the space for us to write into preallocated space.
This leads to the -EDQUOT in buffered write routine.
And we can't follow the same solution, unlike data/meta space check,
qgroup reserved space is shared between data/metadata.
The EDQUOT can happen at the metadata reservation, so doing NODATACOW
check after qgroup reservation failure is not a solution.
[FIX]
To solve the problem, we don't return -EDQUOT directly, but every time
we got a -EDQUOT, we try to flush qgroup space:
- Flush all inodes of the root
NODATACOW writes will free the qgroup reserved at run_dealloc_range().
However we don't have the infrastructure to only flush NODATACOW
inodes, here we flush all inodes anyway.
- Wait for ordered extents
This would convert the preallocated metadata space into per-trans
metadata, which can be freed in later transaction commit.
- Commit transaction
This will free all per-trans metadata space.
Also we don't want to trigger flush multiple times, so here we introduce
a per-root wait list and a new root status, to ensure only one thread
starts the flushing.
Fixes: c6887cd11149 ("Btrfs: don't do nocow check unless we have to")
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-07-13 18:50:48 +08:00
|
|
|
* Try to free some space for qgroup.
|
2015-10-12 16:05:40 +08:00
|
|
|
*
|
btrfs: qgroup: try to flush qgroup space when we get -EDQUOT
[PROBLEM]
There are known problem related to how btrfs handles qgroup reserved
space. One of the most obvious case is the the test case btrfs/153,
which do fallocate, then write into the preallocated range.
btrfs/153 1s ... - output mismatch (see xfstests-dev/results//btrfs/153.out.bad)
--- tests/btrfs/153.out 2019-10-22 15:18:14.068965341 +0800
+++ xfstests-dev/results//btrfs/153.out.bad 2020-07-01 20:24:40.730000089 +0800
@@ -1,2 +1,5 @@
QA output created by 153
+pwrite: Disk quota exceeded
+/mnt/scratch/testfile2: Disk quota exceeded
+/mnt/scratch/testfile2: Disk quota exceeded
Silence is golden
...
(Run 'diff -u xfstests-dev/tests/btrfs/153.out xfstests-dev/results//btrfs/153.out.bad' to see the entire diff)
[CAUSE]
Since commit c6887cd11149 ("Btrfs: don't do nocow check unless we have to"),
we always reserve space no matter if it's COW or not.
Such behavior change is mostly for performance, and reverting it is not
a good idea anyway.
For preallcoated extent, we reserve qgroup data space for it already,
and since we also reserve data space for qgroup at buffered write time,
it needs twice the space for us to write into preallocated space.
This leads to the -EDQUOT in buffered write routine.
And we can't follow the same solution, unlike data/meta space check,
qgroup reserved space is shared between data/metadata.
The EDQUOT can happen at the metadata reservation, so doing NODATACOW
check after qgroup reservation failure is not a solution.
[FIX]
To solve the problem, we don't return -EDQUOT directly, but every time
we got a -EDQUOT, we try to flush qgroup space:
- Flush all inodes of the root
NODATACOW writes will free the qgroup reserved at run_dealloc_range().
However we don't have the infrastructure to only flush NODATACOW
inodes, here we flush all inodes anyway.
- Wait for ordered extents
This would convert the preallocated metadata space into per-trans
metadata, which can be freed in later transaction commit.
- Commit transaction
This will free all per-trans metadata space.
Also we don't want to trigger flush multiple times, so here we introduce
a per-root wait list and a new root status, to ensure only one thread
starts the flushing.
Fixes: c6887cd11149 ("Btrfs: don't do nocow check unless we have to")
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-07-13 18:50:48 +08:00
|
|
|
* For qgroup, there are only 3 ways to free qgroup space:
|
|
|
|
* - Flush nodatacow write
|
|
|
|
* Any nodatacow write will free its reserved data space at run_delalloc_range().
|
|
|
|
* In theory, we should only flush nodatacow inodes, but it's not yet
|
|
|
|
* possible, so we need to flush the whole root.
|
2015-10-12 16:05:40 +08:00
|
|
|
*
|
btrfs: qgroup: try to flush qgroup space when we get -EDQUOT
[PROBLEM]
There are known problem related to how btrfs handles qgroup reserved
space. One of the most obvious case is the the test case btrfs/153,
which do fallocate, then write into the preallocated range.
btrfs/153 1s ... - output mismatch (see xfstests-dev/results//btrfs/153.out.bad)
--- tests/btrfs/153.out 2019-10-22 15:18:14.068965341 +0800
+++ xfstests-dev/results//btrfs/153.out.bad 2020-07-01 20:24:40.730000089 +0800
@@ -1,2 +1,5 @@
QA output created by 153
+pwrite: Disk quota exceeded
+/mnt/scratch/testfile2: Disk quota exceeded
+/mnt/scratch/testfile2: Disk quota exceeded
Silence is golden
...
(Run 'diff -u xfstests-dev/tests/btrfs/153.out xfstests-dev/results//btrfs/153.out.bad' to see the entire diff)
[CAUSE]
Since commit c6887cd11149 ("Btrfs: don't do nocow check unless we have to"),
we always reserve space no matter if it's COW or not.
Such behavior change is mostly for performance, and reverting it is not
a good idea anyway.
For preallcoated extent, we reserve qgroup data space for it already,
and since we also reserve data space for qgroup at buffered write time,
it needs twice the space for us to write into preallocated space.
This leads to the -EDQUOT in buffered write routine.
And we can't follow the same solution, unlike data/meta space check,
qgroup reserved space is shared between data/metadata.
The EDQUOT can happen at the metadata reservation, so doing NODATACOW
check after qgroup reservation failure is not a solution.
[FIX]
To solve the problem, we don't return -EDQUOT directly, but every time
we got a -EDQUOT, we try to flush qgroup space:
- Flush all inodes of the root
NODATACOW writes will free the qgroup reserved at run_dealloc_range().
However we don't have the infrastructure to only flush NODATACOW
inodes, here we flush all inodes anyway.
- Wait for ordered extents
This would convert the preallocated metadata space into per-trans
metadata, which can be freed in later transaction commit.
- Commit transaction
This will free all per-trans metadata space.
Also we don't want to trigger flush multiple times, so here we introduce
a per-root wait list and a new root status, to ensure only one thread
starts the flushing.
Fixes: c6887cd11149 ("Btrfs: don't do nocow check unless we have to")
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-07-13 18:50:48 +08:00
|
|
|
* - Wait for ordered extents
|
|
|
|
* When ordered extents are finished, their reserved metadata is finally
|
|
|
|
* converted to per_trans status, which can be freed by later commit
|
|
|
|
* transaction.
|
2015-10-12 16:05:40 +08:00
|
|
|
*
|
btrfs: qgroup: try to flush qgroup space when we get -EDQUOT
[PROBLEM]
There are known problem related to how btrfs handles qgroup reserved
space. One of the most obvious case is the the test case btrfs/153,
which do fallocate, then write into the preallocated range.
btrfs/153 1s ... - output mismatch (see xfstests-dev/results//btrfs/153.out.bad)
--- tests/btrfs/153.out 2019-10-22 15:18:14.068965341 +0800
+++ xfstests-dev/results//btrfs/153.out.bad 2020-07-01 20:24:40.730000089 +0800
@@ -1,2 +1,5 @@
QA output created by 153
+pwrite: Disk quota exceeded
+/mnt/scratch/testfile2: Disk quota exceeded
+/mnt/scratch/testfile2: Disk quota exceeded
Silence is golden
...
(Run 'diff -u xfstests-dev/tests/btrfs/153.out xfstests-dev/results//btrfs/153.out.bad' to see the entire diff)
[CAUSE]
Since commit c6887cd11149 ("Btrfs: don't do nocow check unless we have to"),
we always reserve space no matter if it's COW or not.
Such behavior change is mostly for performance, and reverting it is not
a good idea anyway.
For preallcoated extent, we reserve qgroup data space for it already,
and since we also reserve data space for qgroup at buffered write time,
it needs twice the space for us to write into preallocated space.
This leads to the -EDQUOT in buffered write routine.
And we can't follow the same solution, unlike data/meta space check,
qgroup reserved space is shared between data/metadata.
The EDQUOT can happen at the metadata reservation, so doing NODATACOW
check after qgroup reservation failure is not a solution.
[FIX]
To solve the problem, we don't return -EDQUOT directly, but every time
we got a -EDQUOT, we try to flush qgroup space:
- Flush all inodes of the root
NODATACOW writes will free the qgroup reserved at run_dealloc_range().
However we don't have the infrastructure to only flush NODATACOW
inodes, here we flush all inodes anyway.
- Wait for ordered extents
This would convert the preallocated metadata space into per-trans
metadata, which can be freed in later transaction commit.
- Commit transaction
This will free all per-trans metadata space.
Also we don't want to trigger flush multiple times, so here we introduce
a per-root wait list and a new root status, to ensure only one thread
starts the flushing.
Fixes: c6887cd11149 ("Btrfs: don't do nocow check unless we have to")
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-07-13 18:50:48 +08:00
|
|
|
* - Commit transaction
|
|
|
|
* This would free the meta_per_trans space.
|
|
|
|
* In theory this shouldn't provide much space, but any more qgroup space
|
|
|
|
* is needed.
|
2015-10-12 16:05:40 +08:00
|
|
|
*/
|
btrfs: qgroup: try to flush qgroup space when we get -EDQUOT
[PROBLEM]
There are known problem related to how btrfs handles qgroup reserved
space. One of the most obvious case is the the test case btrfs/153,
which do fallocate, then write into the preallocated range.
btrfs/153 1s ... - output mismatch (see xfstests-dev/results//btrfs/153.out.bad)
--- tests/btrfs/153.out 2019-10-22 15:18:14.068965341 +0800
+++ xfstests-dev/results//btrfs/153.out.bad 2020-07-01 20:24:40.730000089 +0800
@@ -1,2 +1,5 @@
QA output created by 153
+pwrite: Disk quota exceeded
+/mnt/scratch/testfile2: Disk quota exceeded
+/mnt/scratch/testfile2: Disk quota exceeded
Silence is golden
...
(Run 'diff -u xfstests-dev/tests/btrfs/153.out xfstests-dev/results//btrfs/153.out.bad' to see the entire diff)
[CAUSE]
Since commit c6887cd11149 ("Btrfs: don't do nocow check unless we have to"),
we always reserve space no matter if it's COW or not.
Such behavior change is mostly for performance, and reverting it is not
a good idea anyway.
For preallcoated extent, we reserve qgroup data space for it already,
and since we also reserve data space for qgroup at buffered write time,
it needs twice the space for us to write into preallocated space.
This leads to the -EDQUOT in buffered write routine.
And we can't follow the same solution, unlike data/meta space check,
qgroup reserved space is shared between data/metadata.
The EDQUOT can happen at the metadata reservation, so doing NODATACOW
check after qgroup reservation failure is not a solution.
[FIX]
To solve the problem, we don't return -EDQUOT directly, but every time
we got a -EDQUOT, we try to flush qgroup space:
- Flush all inodes of the root
NODATACOW writes will free the qgroup reserved at run_dealloc_range().
However we don't have the infrastructure to only flush NODATACOW
inodes, here we flush all inodes anyway.
- Wait for ordered extents
This would convert the preallocated metadata space into per-trans
metadata, which can be freed in later transaction commit.
- Commit transaction
This will free all per-trans metadata space.
Also we don't want to trigger flush multiple times, so here we introduce
a per-root wait list and a new root status, to ensure only one thread
starts the flushing.
Fixes: c6887cd11149 ("Btrfs: don't do nocow check unless we have to")
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-07-13 18:50:48 +08:00
|
|
|
static int try_flush_qgroup(struct btrfs_root *root)
|
|
|
|
{
|
|
|
|
struct btrfs_trans_handle *trans;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We don't want to run flush again and again, so if there is a running
|
|
|
|
* one, we won't try to start a new flush, but exit directly.
|
|
|
|
*/
|
|
|
|
if (test_and_set_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state)) {
|
|
|
|
wait_event(root->qgroup_flush_wait,
|
|
|
|
!test_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state));
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = btrfs_start_delalloc_snapshot(root);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1);
|
|
|
|
|
|
|
|
trans = btrfs_join_transaction(root);
|
|
|
|
if (IS_ERR(trans)) {
|
|
|
|
ret = PTR_ERR(trans);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = btrfs_commit_transaction(trans);
|
|
|
|
out:
|
|
|
|
clear_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state);
|
|
|
|
wake_up(&root->qgroup_flush_wait);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int qgroup_reserve_data(struct btrfs_inode *inode,
|
2017-02-27 15:10:38 +08:00
|
|
|
struct extent_changeset **reserved_ret, u64 start,
|
|
|
|
u64 len)
|
2015-10-12 16:05:40 +08:00
|
|
|
{
|
2020-06-03 08:55:37 +03:00
|
|
|
struct btrfs_root *root = inode->root;
|
2017-02-27 15:10:38 +08:00
|
|
|
struct extent_changeset *reserved;
|
btrfs: qgroup: allow to unreserve range without releasing other ranges
[PROBLEM]
Before this patch, when btrfs_qgroup_reserve_data() fails, we free all
reserved space of the changeset.
For example:
ret = btrfs_qgroup_reserve_data(inode, changeset, 0, SZ_1M);
ret = btrfs_qgroup_reserve_data(inode, changeset, SZ_1M, SZ_1M);
ret = btrfs_qgroup_reserve_data(inode, changeset, SZ_2M, SZ_1M);
If the last btrfs_qgroup_reserve_data() failed, it will release the
entire [0, 3M) range.
This behavior is kind of OK for now, as when we hit -EDQUOT, we normally
go error handling and need to release all reserved ranges anyway.
But this also means the following call is not possible:
ret = btrfs_qgroup_reserve_data();
if (ret == -EDQUOT) {
/* Do something to free some qgroup space */
ret = btrfs_qgroup_reserve_data();
}
As if the first btrfs_qgroup_reserve_data() fails, it will free all
reserved qgroup space.
[CAUSE]
This is because we release all reserved ranges when
btrfs_qgroup_reserve_data() fails.
[FIX]
This patch will implement a new function, qgroup_unreserve_range(), to
iterate through the ulist nodes, to find any nodes in the failure range,
and remove the EXTENT_QGROUP_RESERVED bits from the io_tree, and
decrease the extent_changeset::bytes_changed, so that we can revert to
previous state.
This allows later patches to retry btrfs_qgroup_reserve_data() if EDQUOT
happens.
Suggested-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-07-08 14:24:45 +08:00
|
|
|
bool new_reserved = false;
|
2017-02-27 15:10:38 +08:00
|
|
|
u64 orig_reserved;
|
|
|
|
u64 to_reserve;
|
2015-10-12 16:05:40 +08:00
|
|
|
int ret;
|
|
|
|
|
2016-09-02 15:40:02 -04:00
|
|
|
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags) ||
|
2018-08-06 14:25:24 +09:00
|
|
|
!is_fstree(root->root_key.objectid) || len == 0)
|
2015-10-12 16:05:40 +08:00
|
|
|
return 0;
|
|
|
|
|
2017-02-27 15:10:38 +08:00
|
|
|
/* @reserved parameter is mandatory for qgroup */
|
|
|
|
if (WARN_ON(!reserved_ret))
|
|
|
|
return -EINVAL;
|
|
|
|
if (!*reserved_ret) {
|
btrfs: qgroup: allow to unreserve range without releasing other ranges
[PROBLEM]
Before this patch, when btrfs_qgroup_reserve_data() fails, we free all
reserved space of the changeset.
For example:
ret = btrfs_qgroup_reserve_data(inode, changeset, 0, SZ_1M);
ret = btrfs_qgroup_reserve_data(inode, changeset, SZ_1M, SZ_1M);
ret = btrfs_qgroup_reserve_data(inode, changeset, SZ_2M, SZ_1M);
If the last btrfs_qgroup_reserve_data() failed, it will release the
entire [0, 3M) range.
This behavior is kind of OK for now, as when we hit -EDQUOT, we normally
go error handling and need to release all reserved ranges anyway.
But this also means the following call is not possible:
ret = btrfs_qgroup_reserve_data();
if (ret == -EDQUOT) {
/* Do something to free some qgroup space */
ret = btrfs_qgroup_reserve_data();
}
As if the first btrfs_qgroup_reserve_data() fails, it will free all
reserved qgroup space.
[CAUSE]
This is because we release all reserved ranges when
btrfs_qgroup_reserve_data() fails.
[FIX]
This patch will implement a new function, qgroup_unreserve_range(), to
iterate through the ulist nodes, to find any nodes in the failure range,
and remove the EXTENT_QGROUP_RESERVED bits from the io_tree, and
decrease the extent_changeset::bytes_changed, so that we can revert to
previous state.
This allows later patches to retry btrfs_qgroup_reserve_data() if EDQUOT
happens.
Suggested-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-07-08 14:24:45 +08:00
|
|
|
new_reserved = true;
|
2017-02-27 15:10:38 +08:00
|
|
|
*reserved_ret = extent_changeset_alloc();
|
|
|
|
if (!*reserved_ret)
|
|
|
|
return -ENOMEM;
|
|
|
|
}
|
|
|
|
reserved = *reserved_ret;
|
|
|
|
/* Record already reserved space */
|
|
|
|
orig_reserved = reserved->bytes_changed;
|
2020-06-03 08:55:37 +03:00
|
|
|
ret = set_record_extent_bits(&inode->io_tree, start,
|
2017-02-27 15:10:38 +08:00
|
|
|
start + len -1, EXTENT_QGROUP_RESERVED, reserved);
|
|
|
|
|
|
|
|
/* Newly reserved space */
|
|
|
|
to_reserve = reserved->bytes_changed - orig_reserved;
|
2020-06-03 08:55:37 +03:00
|
|
|
trace_btrfs_qgroup_reserve_data(&inode->vfs_inode, start, len,
|
2017-02-27 15:10:38 +08:00
|
|
|
to_reserve, QGROUP_RESERVE);
|
2015-10-12 16:05:40 +08:00
|
|
|
if (ret < 0)
|
btrfs: qgroup: allow to unreserve range without releasing other ranges
[PROBLEM]
Before this patch, when btrfs_qgroup_reserve_data() fails, we free all
reserved space of the changeset.
For example:
ret = btrfs_qgroup_reserve_data(inode, changeset, 0, SZ_1M);
ret = btrfs_qgroup_reserve_data(inode, changeset, SZ_1M, SZ_1M);
ret = btrfs_qgroup_reserve_data(inode, changeset, SZ_2M, SZ_1M);
If the last btrfs_qgroup_reserve_data() failed, it will release the
entire [0, 3M) range.
This behavior is kind of OK for now, as when we hit -EDQUOT, we normally
go error handling and need to release all reserved ranges anyway.
But this also means the following call is not possible:
ret = btrfs_qgroup_reserve_data();
if (ret == -EDQUOT) {
/* Do something to free some qgroup space */
ret = btrfs_qgroup_reserve_data();
}
As if the first btrfs_qgroup_reserve_data() fails, it will free all
reserved qgroup space.
[CAUSE]
This is because we release all reserved ranges when
btrfs_qgroup_reserve_data() fails.
[FIX]
This patch will implement a new function, qgroup_unreserve_range(), to
iterate through the ulist nodes, to find any nodes in the failure range,
and remove the EXTENT_QGROUP_RESERVED bits from the io_tree, and
decrease the extent_changeset::bytes_changed, so that we can revert to
previous state.
This allows later patches to retry btrfs_qgroup_reserve_data() if EDQUOT
happens.
Suggested-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-07-08 14:24:45 +08:00
|
|
|
goto out;
|
2017-12-12 15:34:25 +08:00
|
|
|
ret = qgroup_reserve(root, to_reserve, true, BTRFS_QGROUP_RSV_DATA);
|
2015-10-12 16:05:40 +08:00
|
|
|
if (ret < 0)
|
|
|
|
goto cleanup;
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
cleanup:
|
btrfs: qgroup: allow to unreserve range without releasing other ranges
[PROBLEM]
Before this patch, when btrfs_qgroup_reserve_data() fails, we free all
reserved space of the changeset.
For example:
ret = btrfs_qgroup_reserve_data(inode, changeset, 0, SZ_1M);
ret = btrfs_qgroup_reserve_data(inode, changeset, SZ_1M, SZ_1M);
ret = btrfs_qgroup_reserve_data(inode, changeset, SZ_2M, SZ_1M);
If the last btrfs_qgroup_reserve_data() failed, it will release the
entire [0, 3M) range.
This behavior is kind of OK for now, as when we hit -EDQUOT, we normally
go error handling and need to release all reserved ranges anyway.
But this also means the following call is not possible:
ret = btrfs_qgroup_reserve_data();
if (ret == -EDQUOT) {
/* Do something to free some qgroup space */
ret = btrfs_qgroup_reserve_data();
}
As if the first btrfs_qgroup_reserve_data() fails, it will free all
reserved qgroup space.
[CAUSE]
This is because we release all reserved ranges when
btrfs_qgroup_reserve_data() fails.
[FIX]
This patch will implement a new function, qgroup_unreserve_range(), to
iterate through the ulist nodes, to find any nodes in the failure range,
and remove the EXTENT_QGROUP_RESERVED bits from the io_tree, and
decrease the extent_changeset::bytes_changed, so that we can revert to
previous state.
This allows later patches to retry btrfs_qgroup_reserve_data() if EDQUOT
happens.
Suggested-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-07-08 14:24:45 +08:00
|
|
|
qgroup_unreserve_range(inode, reserved, start, len);
|
|
|
|
out:
|
|
|
|
if (new_reserved) {
|
|
|
|
extent_changeset_release(reserved);
|
|
|
|
kfree(reserved);
|
|
|
|
*reserved_ret = NULL;
|
|
|
|
}
|
2015-10-12 16:05:40 +08:00
|
|
|
return ret;
|
|
|
|
}
|
2015-10-12 16:28:06 +08:00
|
|
|
|
btrfs: qgroup: try to flush qgroup space when we get -EDQUOT
[PROBLEM]
There are known problem related to how btrfs handles qgroup reserved
space. One of the most obvious case is the the test case btrfs/153,
which do fallocate, then write into the preallocated range.
btrfs/153 1s ... - output mismatch (see xfstests-dev/results//btrfs/153.out.bad)
--- tests/btrfs/153.out 2019-10-22 15:18:14.068965341 +0800
+++ xfstests-dev/results//btrfs/153.out.bad 2020-07-01 20:24:40.730000089 +0800
@@ -1,2 +1,5 @@
QA output created by 153
+pwrite: Disk quota exceeded
+/mnt/scratch/testfile2: Disk quota exceeded
+/mnt/scratch/testfile2: Disk quota exceeded
Silence is golden
...
(Run 'diff -u xfstests-dev/tests/btrfs/153.out xfstests-dev/results//btrfs/153.out.bad' to see the entire diff)
[CAUSE]
Since commit c6887cd11149 ("Btrfs: don't do nocow check unless we have to"),
we always reserve space no matter if it's COW or not.
Such behavior change is mostly for performance, and reverting it is not
a good idea anyway.
For preallcoated extent, we reserve qgroup data space for it already,
and since we also reserve data space for qgroup at buffered write time,
it needs twice the space for us to write into preallocated space.
This leads to the -EDQUOT in buffered write routine.
And we can't follow the same solution, unlike data/meta space check,
qgroup reserved space is shared between data/metadata.
The EDQUOT can happen at the metadata reservation, so doing NODATACOW
check after qgroup reservation failure is not a solution.
[FIX]
To solve the problem, we don't return -EDQUOT directly, but every time
we got a -EDQUOT, we try to flush qgroup space:
- Flush all inodes of the root
NODATACOW writes will free the qgroup reserved at run_dealloc_range().
However we don't have the infrastructure to only flush NODATACOW
inodes, here we flush all inodes anyway.
- Wait for ordered extents
This would convert the preallocated metadata space into per-trans
metadata, which can be freed in later transaction commit.
- Commit transaction
This will free all per-trans metadata space.
Also we don't want to trigger flush multiple times, so here we introduce
a per-root wait list and a new root status, to ensure only one thread
starts the flushing.
Fixes: c6887cd11149 ("Btrfs: don't do nocow check unless we have to")
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-07-13 18:50:48 +08:00
|
|
|
/*
|
|
|
|
* Reserve qgroup space for range [start, start + len).
|
|
|
|
*
|
|
|
|
* This function will either reserve space from related qgroups or do nothing
|
|
|
|
* if the range is already reserved.
|
|
|
|
*
|
|
|
|
* Return 0 for successful reservation
|
|
|
|
* Return <0 for error (including -EQUOT)
|
|
|
|
*
|
|
|
|
* NOTE: This function may sleep for memory allocation, dirty page flushing and
|
|
|
|
* commit transaction. So caller should not hold any dirty page locked.
|
|
|
|
*/
|
|
|
|
int btrfs_qgroup_reserve_data(struct btrfs_inode *inode,
|
|
|
|
struct extent_changeset **reserved_ret, u64 start,
|
|
|
|
u64 len)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
ret = qgroup_reserve_data(inode, reserved_ret, start, len);
|
|
|
|
if (ret <= 0 && ret != -EDQUOT)
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
ret = try_flush_qgroup(inode->root);
|
|
|
|
if (ret < 0)
|
|
|
|
return ret;
|
|
|
|
return qgroup_reserve_data(inode, reserved_ret, start, len);
|
|
|
|
}
|
|
|
|
|
btrfs: qgroup: Fix qgroup reserved space underflow by only freeing reserved ranges
[BUG]
For the following case, btrfs can underflow qgroup reserved space
at an error path:
(Page size 4K, function name without "btrfs_" prefix)
Task A | Task B
----------------------------------------------------------------------
Buffered_write [0, 2K) |
|- check_data_free_space() |
| |- qgroup_reserve_data() |
| Range aligned to page |
| range [0, 4K) <<< |
| 4K bytes reserved <<< |
|- copy pages to page cache |
| Buffered_write [2K, 4K)
| |- check_data_free_space()
| | |- qgroup_reserved_data()
| | Range alinged to page
| | range [0, 4K)
| | Already reserved by A <<<
| | 0 bytes reserved <<<
| |- delalloc_reserve_metadata()
| | And it *FAILED* (Maybe EQUOTA)
| |- free_reserved_data_space()
|- qgroup_free_data()
Range aligned to page range
[0, 4K)
Freeing 4K
(Special thanks to Chandan for the detailed report and analyse)
[CAUSE]
Above Task B is freeing reserved data range [0, 4K) which is actually
reserved by Task A.
And at writeback time, page dirty by Task A will go through writeback
routine, which will free 4K reserved data space at file extent insert
time, causing the qgroup underflow.
[FIX]
For btrfs_qgroup_free_data(), add @reserved parameter to only free
data ranges reserved by previous btrfs_qgroup_reserve_data().
So in above case, Task B will try to free 0 byte, so no underflow.
Reported-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Tested-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2017-02-27 15:10:39 +08:00
|
|
|
/* Free ranges specified by @reserved, normally in error path */
|
2020-06-03 08:55:09 +03:00
|
|
|
static int qgroup_free_reserved_data(struct btrfs_inode *inode,
|
btrfs: qgroup: Fix qgroup reserved space underflow by only freeing reserved ranges
[BUG]
For the following case, btrfs can underflow qgroup reserved space
at an error path:
(Page size 4K, function name without "btrfs_" prefix)
Task A | Task B
----------------------------------------------------------------------
Buffered_write [0, 2K) |
|- check_data_free_space() |
| |- qgroup_reserve_data() |
| Range aligned to page |
| range [0, 4K) <<< |
| 4K bytes reserved <<< |
|- copy pages to page cache |
| Buffered_write [2K, 4K)
| |- check_data_free_space()
| | |- qgroup_reserved_data()
| | Range alinged to page
| | range [0, 4K)
| | Already reserved by A <<<
| | 0 bytes reserved <<<
| |- delalloc_reserve_metadata()
| | And it *FAILED* (Maybe EQUOTA)
| |- free_reserved_data_space()
|- qgroup_free_data()
Range aligned to page range
[0, 4K)
Freeing 4K
(Special thanks to Chandan for the detailed report and analyse)
[CAUSE]
Above Task B is freeing reserved data range [0, 4K) which is actually
reserved by Task A.
And at writeback time, page dirty by Task A will go through writeback
routine, which will free 4K reserved data space at file extent insert
time, causing the qgroup underflow.
[FIX]
For btrfs_qgroup_free_data(), add @reserved parameter to only free
data ranges reserved by previous btrfs_qgroup_reserve_data().
So in above case, Task B will try to free 0 byte, so no underflow.
Reported-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Tested-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2017-02-27 15:10:39 +08:00
|
|
|
struct extent_changeset *reserved, u64 start, u64 len)
|
|
|
|
{
|
2020-06-03 08:55:09 +03:00
|
|
|
struct btrfs_root *root = inode->root;
|
btrfs: qgroup: Fix qgroup reserved space underflow by only freeing reserved ranges
[BUG]
For the following case, btrfs can underflow qgroup reserved space
at an error path:
(Page size 4K, function name without "btrfs_" prefix)
Task A | Task B
----------------------------------------------------------------------
Buffered_write [0, 2K) |
|- check_data_free_space() |
| |- qgroup_reserve_data() |
| Range aligned to page |
| range [0, 4K) <<< |
| 4K bytes reserved <<< |
|- copy pages to page cache |
| Buffered_write [2K, 4K)
| |- check_data_free_space()
| | |- qgroup_reserved_data()
| | Range alinged to page
| | range [0, 4K)
| | Already reserved by A <<<
| | 0 bytes reserved <<<
| |- delalloc_reserve_metadata()
| | And it *FAILED* (Maybe EQUOTA)
| |- free_reserved_data_space()
|- qgroup_free_data()
Range aligned to page range
[0, 4K)
Freeing 4K
(Special thanks to Chandan for the detailed report and analyse)
[CAUSE]
Above Task B is freeing reserved data range [0, 4K) which is actually
reserved by Task A.
And at writeback time, page dirty by Task A will go through writeback
routine, which will free 4K reserved data space at file extent insert
time, causing the qgroup underflow.
[FIX]
For btrfs_qgroup_free_data(), add @reserved parameter to only free
data ranges reserved by previous btrfs_qgroup_reserve_data().
So in above case, Task B will try to free 0 byte, so no underflow.
Reported-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Tested-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2017-02-27 15:10:39 +08:00
|
|
|
struct ulist_node *unode;
|
|
|
|
struct ulist_iterator uiter;
|
|
|
|
struct extent_changeset changeset;
|
|
|
|
int freed = 0;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
extent_changeset_init(&changeset);
|
|
|
|
len = round_up(start + len, root->fs_info->sectorsize);
|
|
|
|
start = round_down(start, root->fs_info->sectorsize);
|
|
|
|
|
|
|
|
ULIST_ITER_INIT(&uiter);
|
|
|
|
while ((unode = ulist_next(&reserved->range_changed, &uiter))) {
|
|
|
|
u64 range_start = unode->val;
|
|
|
|
/* unode->aux is the inclusive end */
|
|
|
|
u64 range_len = unode->aux - range_start + 1;
|
|
|
|
u64 free_start;
|
|
|
|
u64 free_len;
|
|
|
|
|
|
|
|
extent_changeset_release(&changeset);
|
|
|
|
|
|
|
|
/* Only free range in range [start, start + len) */
|
|
|
|
if (range_start >= start + len ||
|
|
|
|
range_start + range_len <= start)
|
|
|
|
continue;
|
|
|
|
free_start = max(range_start, start);
|
|
|
|
free_len = min(start + len, range_start + range_len) -
|
|
|
|
free_start;
|
|
|
|
/*
|
|
|
|
* TODO: To also modify reserved->ranges_reserved to reflect
|
|
|
|
* the modification.
|
|
|
|
*
|
|
|
|
* However as long as we free qgroup reserved according to
|
|
|
|
* EXTENT_QGROUP_RESERVED, we won't double free.
|
|
|
|
* So not need to rush.
|
|
|
|
*/
|
2020-06-03 08:55:09 +03:00
|
|
|
ret = clear_record_extent_bits(&inode->io_tree, free_start,
|
|
|
|
free_start + free_len - 1,
|
btrfs: qgroup: Fix qgroup reserved space underflow by only freeing reserved ranges
[BUG]
For the following case, btrfs can underflow qgroup reserved space
at an error path:
(Page size 4K, function name without "btrfs_" prefix)
Task A | Task B
----------------------------------------------------------------------
Buffered_write [0, 2K) |
|- check_data_free_space() |
| |- qgroup_reserve_data() |
| Range aligned to page |
| range [0, 4K) <<< |
| 4K bytes reserved <<< |
|- copy pages to page cache |
| Buffered_write [2K, 4K)
| |- check_data_free_space()
| | |- qgroup_reserved_data()
| | Range alinged to page
| | range [0, 4K)
| | Already reserved by A <<<
| | 0 bytes reserved <<<
| |- delalloc_reserve_metadata()
| | And it *FAILED* (Maybe EQUOTA)
| |- free_reserved_data_space()
|- qgroup_free_data()
Range aligned to page range
[0, 4K)
Freeing 4K
(Special thanks to Chandan for the detailed report and analyse)
[CAUSE]
Above Task B is freeing reserved data range [0, 4K) which is actually
reserved by Task A.
And at writeback time, page dirty by Task A will go through writeback
routine, which will free 4K reserved data space at file extent insert
time, causing the qgroup underflow.
[FIX]
For btrfs_qgroup_free_data(), add @reserved parameter to only free
data ranges reserved by previous btrfs_qgroup_reserve_data().
So in above case, Task B will try to free 0 byte, so no underflow.
Reported-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Tested-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2017-02-27 15:10:39 +08:00
|
|
|
EXTENT_QGROUP_RESERVED, &changeset);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
freed += changeset.bytes_changed;
|
|
|
|
}
|
2018-08-06 14:25:24 +09:00
|
|
|
btrfs_qgroup_free_refroot(root->fs_info, root->root_key.objectid, freed,
|
2017-12-12 15:34:23 +08:00
|
|
|
BTRFS_QGROUP_RSV_DATA);
|
btrfs: qgroup: Fix qgroup reserved space underflow by only freeing reserved ranges
[BUG]
For the following case, btrfs can underflow qgroup reserved space
at an error path:
(Page size 4K, function name without "btrfs_" prefix)
Task A | Task B
----------------------------------------------------------------------
Buffered_write [0, 2K) |
|- check_data_free_space() |
| |- qgroup_reserve_data() |
| Range aligned to page |
| range [0, 4K) <<< |
| 4K bytes reserved <<< |
|- copy pages to page cache |
| Buffered_write [2K, 4K)
| |- check_data_free_space()
| | |- qgroup_reserved_data()
| | Range alinged to page
| | range [0, 4K)
| | Already reserved by A <<<
| | 0 bytes reserved <<<
| |- delalloc_reserve_metadata()
| | And it *FAILED* (Maybe EQUOTA)
| |- free_reserved_data_space()
|- qgroup_free_data()
Range aligned to page range
[0, 4K)
Freeing 4K
(Special thanks to Chandan for the detailed report and analyse)
[CAUSE]
Above Task B is freeing reserved data range [0, 4K) which is actually
reserved by Task A.
And at writeback time, page dirty by Task A will go through writeback
routine, which will free 4K reserved data space at file extent insert
time, causing the qgroup underflow.
[FIX]
For btrfs_qgroup_free_data(), add @reserved parameter to only free
data ranges reserved by previous btrfs_qgroup_reserve_data().
So in above case, Task B will try to free 0 byte, so no underflow.
Reported-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Tested-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2017-02-27 15:10:39 +08:00
|
|
|
ret = freed;
|
|
|
|
out:
|
|
|
|
extent_changeset_release(&changeset);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2020-06-03 08:55:10 +03:00
|
|
|
static int __btrfs_qgroup_release_data(struct btrfs_inode *inode,
|
btrfs: qgroup: Fix qgroup reserved space underflow by only freeing reserved ranges
[BUG]
For the following case, btrfs can underflow qgroup reserved space
at an error path:
(Page size 4K, function name without "btrfs_" prefix)
Task A | Task B
----------------------------------------------------------------------
Buffered_write [0, 2K) |
|- check_data_free_space() |
| |- qgroup_reserve_data() |
| Range aligned to page |
| range [0, 4K) <<< |
| 4K bytes reserved <<< |
|- copy pages to page cache |
| Buffered_write [2K, 4K)
| |- check_data_free_space()
| | |- qgroup_reserved_data()
| | Range alinged to page
| | range [0, 4K)
| | Already reserved by A <<<
| | 0 bytes reserved <<<
| |- delalloc_reserve_metadata()
| | And it *FAILED* (Maybe EQUOTA)
| |- free_reserved_data_space()
|- qgroup_free_data()
Range aligned to page range
[0, 4K)
Freeing 4K
(Special thanks to Chandan for the detailed report and analyse)
[CAUSE]
Above Task B is freeing reserved data range [0, 4K) which is actually
reserved by Task A.
And at writeback time, page dirty by Task A will go through writeback
routine, which will free 4K reserved data space at file extent insert
time, causing the qgroup underflow.
[FIX]
For btrfs_qgroup_free_data(), add @reserved parameter to only free
data ranges reserved by previous btrfs_qgroup_reserve_data().
So in above case, Task B will try to free 0 byte, so no underflow.
Reported-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Tested-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2017-02-27 15:10:39 +08:00
|
|
|
struct extent_changeset *reserved, u64 start, u64 len,
|
|
|
|
int free)
|
2015-10-12 16:28:06 +08:00
|
|
|
{
|
|
|
|
struct extent_changeset changeset;
|
2015-09-28 16:57:53 +08:00
|
|
|
int trace_op = QGROUP_RELEASE;
|
2015-10-12 16:28:06 +08:00
|
|
|
int ret;
|
|
|
|
|
2020-06-03 08:55:10 +03:00
|
|
|
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &inode->root->fs_info->flags))
|
2018-10-09 14:36:45 +08:00
|
|
|
return 0;
|
|
|
|
|
btrfs: qgroup: Fix qgroup reserved space underflow by only freeing reserved ranges
[BUG]
For the following case, btrfs can underflow qgroup reserved space
at an error path:
(Page size 4K, function name without "btrfs_" prefix)
Task A | Task B
----------------------------------------------------------------------
Buffered_write [0, 2K) |
|- check_data_free_space() |
| |- qgroup_reserve_data() |
| Range aligned to page |
| range [0, 4K) <<< |
| 4K bytes reserved <<< |
|- copy pages to page cache |
| Buffered_write [2K, 4K)
| |- check_data_free_space()
| | |- qgroup_reserved_data()
| | Range alinged to page
| | range [0, 4K)
| | Already reserved by A <<<
| | 0 bytes reserved <<<
| |- delalloc_reserve_metadata()
| | And it *FAILED* (Maybe EQUOTA)
| |- free_reserved_data_space()
|- qgroup_free_data()
Range aligned to page range
[0, 4K)
Freeing 4K
(Special thanks to Chandan for the detailed report and analyse)
[CAUSE]
Above Task B is freeing reserved data range [0, 4K) which is actually
reserved by Task A.
And at writeback time, page dirty by Task A will go through writeback
routine, which will free 4K reserved data space at file extent insert
time, causing the qgroup underflow.
[FIX]
For btrfs_qgroup_free_data(), add @reserved parameter to only free
data ranges reserved by previous btrfs_qgroup_reserve_data().
So in above case, Task B will try to free 0 byte, so no underflow.
Reported-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Tested-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2017-02-27 15:10:39 +08:00
|
|
|
/* In release case, we shouldn't have @reserved */
|
|
|
|
WARN_ON(!free && reserved);
|
|
|
|
if (free && reserved)
|
2020-06-03 08:55:10 +03:00
|
|
|
return qgroup_free_reserved_data(inode, reserved, start, len);
|
2017-02-27 15:10:38 +08:00
|
|
|
extent_changeset_init(&changeset);
|
2020-06-03 08:55:10 +03:00
|
|
|
ret = clear_record_extent_bits(&inode->io_tree, start, start + len -1,
|
|
|
|
EXTENT_QGROUP_RESERVED, &changeset);
|
2015-10-12 16:28:06 +08:00
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
|
2017-03-13 15:52:09 +08:00
|
|
|
if (free)
|
2015-09-28 16:57:53 +08:00
|
|
|
trace_op = QGROUP_FREE;
|
2020-06-03 08:55:10 +03:00
|
|
|
trace_btrfs_qgroup_release_data(&inode->vfs_inode, start, len,
|
2015-09-28 16:57:53 +08:00
|
|
|
changeset.bytes_changed, trace_op);
|
2017-03-13 15:52:09 +08:00
|
|
|
if (free)
|
2020-06-03 08:55:10 +03:00
|
|
|
btrfs_qgroup_free_refroot(inode->root->fs_info,
|
|
|
|
inode->root->root_key.objectid,
|
2017-12-12 15:34:23 +08:00
|
|
|
changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA);
|
2017-02-27 15:10:36 +08:00
|
|
|
ret = changeset.bytes_changed;
|
2015-10-12 16:28:06 +08:00
|
|
|
out:
|
2017-02-27 15:10:38 +08:00
|
|
|
extent_changeset_release(&changeset);
|
2015-10-12 16:28:06 +08:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Free a reserved space range from io_tree and related qgroups
|
|
|
|
*
|
|
|
|
* Should be called when a range of pages get invalidated before reaching disk.
|
|
|
|
* Or for error cleanup case.
|
btrfs: qgroup: Fix qgroup reserved space underflow by only freeing reserved ranges
[BUG]
For the following case, btrfs can underflow qgroup reserved space
at an error path:
(Page size 4K, function name without "btrfs_" prefix)
Task A | Task B
----------------------------------------------------------------------
Buffered_write [0, 2K) |
|- check_data_free_space() |
| |- qgroup_reserve_data() |
| Range aligned to page |
| range [0, 4K) <<< |
| 4K bytes reserved <<< |
|- copy pages to page cache |
| Buffered_write [2K, 4K)
| |- check_data_free_space()
| | |- qgroup_reserved_data()
| | Range alinged to page
| | range [0, 4K)
| | Already reserved by A <<<
| | 0 bytes reserved <<<
| |- delalloc_reserve_metadata()
| | And it *FAILED* (Maybe EQUOTA)
| |- free_reserved_data_space()
|- qgroup_free_data()
Range aligned to page range
[0, 4K)
Freeing 4K
(Special thanks to Chandan for the detailed report and analyse)
[CAUSE]
Above Task B is freeing reserved data range [0, 4K) which is actually
reserved by Task A.
And at writeback time, page dirty by Task A will go through writeback
routine, which will free 4K reserved data space at file extent insert
time, causing the qgroup underflow.
[FIX]
For btrfs_qgroup_free_data(), add @reserved parameter to only free
data ranges reserved by previous btrfs_qgroup_reserve_data().
So in above case, Task B will try to free 0 byte, so no underflow.
Reported-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Tested-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2017-02-27 15:10:39 +08:00
|
|
|
* if @reserved is given, only reserved range in [@start, @start + @len) will
|
|
|
|
* be freed.
|
2015-10-12 16:28:06 +08:00
|
|
|
*
|
|
|
|
* For data written to disk, use btrfs_qgroup_release_data().
|
|
|
|
*
|
|
|
|
* NOTE: This function may sleep for memory allocation.
|
|
|
|
*/
|
2020-06-03 08:55:11 +03:00
|
|
|
int btrfs_qgroup_free_data(struct btrfs_inode *inode,
|
btrfs: qgroup: Fix qgroup reserved space underflow by only freeing reserved ranges
[BUG]
For the following case, btrfs can underflow qgroup reserved space
at an error path:
(Page size 4K, function name without "btrfs_" prefix)
Task A | Task B
----------------------------------------------------------------------
Buffered_write [0, 2K) |
|- check_data_free_space() |
| |- qgroup_reserve_data() |
| Range aligned to page |
| range [0, 4K) <<< |
| 4K bytes reserved <<< |
|- copy pages to page cache |
| Buffered_write [2K, 4K)
| |- check_data_free_space()
| | |- qgroup_reserved_data()
| | Range alinged to page
| | range [0, 4K)
| | Already reserved by A <<<
| | 0 bytes reserved <<<
| |- delalloc_reserve_metadata()
| | And it *FAILED* (Maybe EQUOTA)
| |- free_reserved_data_space()
|- qgroup_free_data()
Range aligned to page range
[0, 4K)
Freeing 4K
(Special thanks to Chandan for the detailed report and analyse)
[CAUSE]
Above Task B is freeing reserved data range [0, 4K) which is actually
reserved by Task A.
And at writeback time, page dirty by Task A will go through writeback
routine, which will free 4K reserved data space at file extent insert
time, causing the qgroup underflow.
[FIX]
For btrfs_qgroup_free_data(), add @reserved parameter to only free
data ranges reserved by previous btrfs_qgroup_reserve_data().
So in above case, Task B will try to free 0 byte, so no underflow.
Reported-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Tested-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2017-02-27 15:10:39 +08:00
|
|
|
struct extent_changeset *reserved, u64 start, u64 len)
|
2015-10-12 16:28:06 +08:00
|
|
|
{
|
2020-06-03 08:55:11 +03:00
|
|
|
return __btrfs_qgroup_release_data(inode, reserved, start, len, 1);
|
2015-10-12 16:28:06 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Release a reserved space range from io_tree only.
|
|
|
|
*
|
|
|
|
* Should be called when a range of pages get written to disk and corresponding
|
|
|
|
* FILE_EXTENT is inserted into corresponding root.
|
|
|
|
*
|
|
|
|
* Since new qgroup accounting framework will only update qgroup numbers at
|
|
|
|
* commit_transaction() time, its reserved space shouldn't be freed from
|
|
|
|
* related qgroups.
|
|
|
|
*
|
|
|
|
* But we should release the range from io_tree, to allow further write to be
|
|
|
|
* COWed.
|
|
|
|
*
|
|
|
|
* NOTE: This function may sleep for memory allocation.
|
|
|
|
*/
|
2020-06-03 08:55:18 +03:00
|
|
|
int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len)
|
2015-10-12 16:28:06 +08:00
|
|
|
{
|
2020-06-03 08:55:18 +03:00
|
|
|
return __btrfs_qgroup_release_data(inode, NULL, start, len, 0);
|
2015-10-12 16:28:06 +08:00
|
|
|
}
|
2015-09-08 17:08:38 +08:00
|
|
|
|
2017-12-12 15:34:34 +08:00
|
|
|
static void add_root_meta_rsv(struct btrfs_root *root, int num_bytes,
|
|
|
|
enum btrfs_qgroup_rsv_type type)
|
|
|
|
{
|
|
|
|
if (type != BTRFS_QGROUP_RSV_META_PREALLOC &&
|
|
|
|
type != BTRFS_QGROUP_RSV_META_PERTRANS)
|
|
|
|
return;
|
|
|
|
if (num_bytes == 0)
|
|
|
|
return;
|
|
|
|
|
|
|
|
spin_lock(&root->qgroup_meta_rsv_lock);
|
|
|
|
if (type == BTRFS_QGROUP_RSV_META_PREALLOC)
|
|
|
|
root->qgroup_meta_rsv_prealloc += num_bytes;
|
|
|
|
else
|
|
|
|
root->qgroup_meta_rsv_pertrans += num_bytes;
|
|
|
|
spin_unlock(&root->qgroup_meta_rsv_lock);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int sub_root_meta_rsv(struct btrfs_root *root, int num_bytes,
|
|
|
|
enum btrfs_qgroup_rsv_type type)
|
|
|
|
{
|
|
|
|
if (type != BTRFS_QGROUP_RSV_META_PREALLOC &&
|
|
|
|
type != BTRFS_QGROUP_RSV_META_PERTRANS)
|
|
|
|
return 0;
|
|
|
|
if (num_bytes == 0)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
spin_lock(&root->qgroup_meta_rsv_lock);
|
|
|
|
if (type == BTRFS_QGROUP_RSV_META_PREALLOC) {
|
|
|
|
num_bytes = min_t(u64, root->qgroup_meta_rsv_prealloc,
|
|
|
|
num_bytes);
|
|
|
|
root->qgroup_meta_rsv_prealloc -= num_bytes;
|
|
|
|
} else {
|
|
|
|
num_bytes = min_t(u64, root->qgroup_meta_rsv_pertrans,
|
|
|
|
num_bytes);
|
|
|
|
root->qgroup_meta_rsv_pertrans -= num_bytes;
|
|
|
|
}
|
|
|
|
spin_unlock(&root->qgroup_meta_rsv_lock);
|
|
|
|
return num_bytes;
|
|
|
|
}
|
|
|
|
|
btrfs: qgroup: try to flush qgroup space when we get -EDQUOT
[PROBLEM]
There are known problem related to how btrfs handles qgroup reserved
space. One of the most obvious case is the the test case btrfs/153,
which do fallocate, then write into the preallocated range.
btrfs/153 1s ... - output mismatch (see xfstests-dev/results//btrfs/153.out.bad)
--- tests/btrfs/153.out 2019-10-22 15:18:14.068965341 +0800
+++ xfstests-dev/results//btrfs/153.out.bad 2020-07-01 20:24:40.730000089 +0800
@@ -1,2 +1,5 @@
QA output created by 153
+pwrite: Disk quota exceeded
+/mnt/scratch/testfile2: Disk quota exceeded
+/mnt/scratch/testfile2: Disk quota exceeded
Silence is golden
...
(Run 'diff -u xfstests-dev/tests/btrfs/153.out xfstests-dev/results//btrfs/153.out.bad' to see the entire diff)
[CAUSE]
Since commit c6887cd11149 ("Btrfs: don't do nocow check unless we have to"),
we always reserve space no matter if it's COW or not.
Such behavior change is mostly for performance, and reverting it is not
a good idea anyway.
For preallcoated extent, we reserve qgroup data space for it already,
and since we also reserve data space for qgroup at buffered write time,
it needs twice the space for us to write into preallocated space.
This leads to the -EDQUOT in buffered write routine.
And we can't follow the same solution, unlike data/meta space check,
qgroup reserved space is shared between data/metadata.
The EDQUOT can happen at the metadata reservation, so doing NODATACOW
check after qgroup reservation failure is not a solution.
[FIX]
To solve the problem, we don't return -EDQUOT directly, but every time
we got a -EDQUOT, we try to flush qgroup space:
- Flush all inodes of the root
NODATACOW writes will free the qgroup reserved at run_dealloc_range().
However we don't have the infrastructure to only flush NODATACOW
inodes, here we flush all inodes anyway.
- Wait for ordered extents
This would convert the preallocated metadata space into per-trans
metadata, which can be freed in later transaction commit.
- Commit transaction
This will free all per-trans metadata space.
Also we don't want to trigger flush multiple times, so here we introduce
a per-root wait list and a new root status, to ensure only one thread
starts the flushing.
Fixes: c6887cd11149 ("Btrfs: don't do nocow check unless we have to")
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-07-13 18:50:48 +08:00
|
|
|
static int qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
|
btrfs: qgroup: Split meta rsv type into meta_prealloc and meta_pertrans
Btrfs uses 2 different methods to reseve metadata qgroup space.
1) Reserve at btrfs_start_transaction() time
This is quite straightforward, caller will use the trans handler
allocated to modify b-trees.
In this case, reserved metadata should be kept until qgroup numbers
are updated.
2) Reserve by using block_rsv first, and later btrfs_join_transaction()
This is more complicated, caller will reserve space using block_rsv
first, and then later call btrfs_join_transaction() to get a trans
handle.
In this case, before we modify trees, the reserved space can be
modified on demand, and after btrfs_join_transaction(), such reserved
space should also be kept until qgroup numbers are updated.
Since these two types behave differently, split the original "META"
reservation type into 2 sub-types:
META_PERTRANS:
For above case 1)
META_PREALLOC:
For reservations that happened before btrfs_join_transaction() of
case 2)
NOTE: This patch will only convert existing qgroup meta reservation
callers according to its situation, not ensuring all callers are at
correct timing.
Such fix will be added in later patches.
Signed-off-by: Qu Wenruo <wqu@suse.com>
[ update comments ]
Signed-off-by: David Sterba <dsterba@suse.com>
2017-12-12 15:34:29 +08:00
|
|
|
enum btrfs_qgroup_rsv_type type, bool enforce)
|
2015-09-08 17:08:38 +08:00
|
|
|
{
|
2016-06-22 18:54:23 -04:00
|
|
|
struct btrfs_fs_info *fs_info = root->fs_info;
|
2015-09-08 17:08:38 +08:00
|
|
|
int ret;
|
|
|
|
|
2016-06-22 18:54:23 -04:00
|
|
|
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
|
2018-08-06 14:25:24 +09:00
|
|
|
!is_fstree(root->root_key.objectid) || num_bytes == 0)
|
2015-09-08 17:08:38 +08:00
|
|
|
return 0;
|
|
|
|
|
2016-06-22 18:54:23 -04:00
|
|
|
BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
|
2019-10-17 10:38:36 +08:00
|
|
|
trace_qgroup_meta_reserve(root, (s64)num_bytes, type);
|
btrfs: qgroup: Split meta rsv type into meta_prealloc and meta_pertrans
Btrfs uses 2 different methods to reseve metadata qgroup space.
1) Reserve at btrfs_start_transaction() time
This is quite straightforward, caller will use the trans handler
allocated to modify b-trees.
In this case, reserved metadata should be kept until qgroup numbers
are updated.
2) Reserve by using block_rsv first, and later btrfs_join_transaction()
This is more complicated, caller will reserve space using block_rsv
first, and then later call btrfs_join_transaction() to get a trans
handle.
In this case, before we modify trees, the reserved space can be
modified on demand, and after btrfs_join_transaction(), such reserved
space should also be kept until qgroup numbers are updated.
Since these two types behave differently, split the original "META"
reservation type into 2 sub-types:
META_PERTRANS:
For above case 1)
META_PREALLOC:
For reservations that happened before btrfs_join_transaction() of
case 2)
NOTE: This patch will only convert existing qgroup meta reservation
callers according to its situation, not ensuring all callers are at
correct timing.
Such fix will be added in later patches.
Signed-off-by: Qu Wenruo <wqu@suse.com>
[ update comments ]
Signed-off-by: David Sterba <dsterba@suse.com>
2017-12-12 15:34:29 +08:00
|
|
|
ret = qgroup_reserve(root, num_bytes, enforce, type);
|
2015-09-08 17:08:38 +08:00
|
|
|
if (ret < 0)
|
|
|
|
return ret;
|
2017-12-12 15:34:34 +08:00
|
|
|
/*
|
|
|
|
* Record what we have reserved into root.
|
|
|
|
*
|
|
|
|
* To avoid quota disabled->enabled underflow.
|
|
|
|
* In that case, we may try to free space we haven't reserved
|
|
|
|
* (since quota was disabled), so record what we reserved into root.
|
|
|
|
* And ensure later release won't underflow this number.
|
|
|
|
*/
|
|
|
|
add_root_meta_rsv(root, num_bytes, type);
|
2015-09-08 17:08:38 +08:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
btrfs: qgroup: try to flush qgroup space when we get -EDQUOT
[PROBLEM]
There are known problem related to how btrfs handles qgroup reserved
space. One of the most obvious case is the the test case btrfs/153,
which do fallocate, then write into the preallocated range.
btrfs/153 1s ... - output mismatch (see xfstests-dev/results//btrfs/153.out.bad)
--- tests/btrfs/153.out 2019-10-22 15:18:14.068965341 +0800
+++ xfstests-dev/results//btrfs/153.out.bad 2020-07-01 20:24:40.730000089 +0800
@@ -1,2 +1,5 @@
QA output created by 153
+pwrite: Disk quota exceeded
+/mnt/scratch/testfile2: Disk quota exceeded
+/mnt/scratch/testfile2: Disk quota exceeded
Silence is golden
...
(Run 'diff -u xfstests-dev/tests/btrfs/153.out xfstests-dev/results//btrfs/153.out.bad' to see the entire diff)
[CAUSE]
Since commit c6887cd11149 ("Btrfs: don't do nocow check unless we have to"),
we always reserve space no matter if it's COW or not.
Such behavior change is mostly for performance, and reverting it is not
a good idea anyway.
For preallcoated extent, we reserve qgroup data space for it already,
and since we also reserve data space for qgroup at buffered write time,
it needs twice the space for us to write into preallocated space.
This leads to the -EDQUOT in buffered write routine.
And we can't follow the same solution, unlike data/meta space check,
qgroup reserved space is shared between data/metadata.
The EDQUOT can happen at the metadata reservation, so doing NODATACOW
check after qgroup reservation failure is not a solution.
[FIX]
To solve the problem, we don't return -EDQUOT directly, but every time
we got a -EDQUOT, we try to flush qgroup space:
- Flush all inodes of the root
NODATACOW writes will free the qgroup reserved at run_dealloc_range().
However we don't have the infrastructure to only flush NODATACOW
inodes, here we flush all inodes anyway.
- Wait for ordered extents
This would convert the preallocated metadata space into per-trans
metadata, which can be freed in later transaction commit.
- Commit transaction
This will free all per-trans metadata space.
Also we don't want to trigger flush multiple times, so here we introduce
a per-root wait list and a new root status, to ensure only one thread
starts the flushing.
Fixes: c6887cd11149 ("Btrfs: don't do nocow check unless we have to")
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-07-13 18:50:48 +08:00
|
|
|
int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
|
|
|
|
enum btrfs_qgroup_rsv_type type, bool enforce)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
ret = qgroup_reserve_meta(root, num_bytes, type, enforce);
|
|
|
|
if (ret <= 0 && ret != -EDQUOT)
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
ret = try_flush_qgroup(root);
|
|
|
|
if (ret < 0)
|
|
|
|
return ret;
|
|
|
|
return qgroup_reserve_meta(root, num_bytes, type, enforce);
|
|
|
|
}
|
|
|
|
|
btrfs: qgroup: Split meta rsv type into meta_prealloc and meta_pertrans
Btrfs uses 2 different methods to reseve metadata qgroup space.
1) Reserve at btrfs_start_transaction() time
This is quite straightforward, caller will use the trans handler
allocated to modify b-trees.
In this case, reserved metadata should be kept until qgroup numbers
are updated.
2) Reserve by using block_rsv first, and later btrfs_join_transaction()
This is more complicated, caller will reserve space using block_rsv
first, and then later call btrfs_join_transaction() to get a trans
handle.
In this case, before we modify trees, the reserved space can be
modified on demand, and after btrfs_join_transaction(), such reserved
space should also be kept until qgroup numbers are updated.
Since these two types behave differently, split the original "META"
reservation type into 2 sub-types:
META_PERTRANS:
For above case 1)
META_PREALLOC:
For reservations that happened before btrfs_join_transaction() of
case 2)
NOTE: This patch will only convert existing qgroup meta reservation
callers according to its situation, not ensuring all callers are at
correct timing.
Such fix will be added in later patches.
Signed-off-by: Qu Wenruo <wqu@suse.com>
[ update comments ]
Signed-off-by: David Sterba <dsterba@suse.com>
2017-12-12 15:34:29 +08:00
|
|
|
void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root)
|
2015-09-08 17:08:38 +08:00
|
|
|
{
|
2016-06-22 18:54:23 -04:00
|
|
|
struct btrfs_fs_info *fs_info = root->fs_info;
|
2015-09-08 17:08:38 +08:00
|
|
|
|
2016-06-22 18:54:23 -04:00
|
|
|
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
|
2018-08-06 14:25:24 +09:00
|
|
|
!is_fstree(root->root_key.objectid))
|
2015-09-08 17:08:38 +08:00
|
|
|
return;
|
|
|
|
|
2017-12-12 15:34:30 +08:00
|
|
|
/* TODO: Update trace point to handle such free */
|
2017-12-12 15:34:35 +08:00
|
|
|
trace_qgroup_meta_free_all_pertrans(root);
|
2017-12-12 15:34:30 +08:00
|
|
|
/* Special value -1 means to free all reserved space */
|
2018-08-06 14:25:24 +09:00
|
|
|
btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid, (u64)-1,
|
btrfs: qgroup: Split meta rsv type into meta_prealloc and meta_pertrans
Btrfs uses 2 different methods to reseve metadata qgroup space.
1) Reserve at btrfs_start_transaction() time
This is quite straightforward, caller will use the trans handler
allocated to modify b-trees.
In this case, reserved metadata should be kept until qgroup numbers
are updated.
2) Reserve by using block_rsv first, and later btrfs_join_transaction()
This is more complicated, caller will reserve space using block_rsv
first, and then later call btrfs_join_transaction() to get a trans
handle.
In this case, before we modify trees, the reserved space can be
modified on demand, and after btrfs_join_transaction(), such reserved
space should also be kept until qgroup numbers are updated.
Since these two types behave differently, split the original "META"
reservation type into 2 sub-types:
META_PERTRANS:
For above case 1)
META_PREALLOC:
For reservations that happened before btrfs_join_transaction() of
case 2)
NOTE: This patch will only convert existing qgroup meta reservation
callers according to its situation, not ensuring all callers are at
correct timing.
Such fix will be added in later patches.
Signed-off-by: Qu Wenruo <wqu@suse.com>
[ update comments ]
Signed-off-by: David Sterba <dsterba@suse.com>
2017-12-12 15:34:29 +08:00
|
|
|
BTRFS_QGROUP_RSV_META_PERTRANS);
|
2015-09-08 17:08:38 +08:00
|
|
|
}
|
|
|
|
|
btrfs: qgroup: Split meta rsv type into meta_prealloc and meta_pertrans
Btrfs uses 2 different methods to reseve metadata qgroup space.
1) Reserve at btrfs_start_transaction() time
This is quite straightforward, caller will use the trans handler
allocated to modify b-trees.
In this case, reserved metadata should be kept until qgroup numbers
are updated.
2) Reserve by using block_rsv first, and later btrfs_join_transaction()
This is more complicated, caller will reserve space using block_rsv
first, and then later call btrfs_join_transaction() to get a trans
handle.
In this case, before we modify trees, the reserved space can be
modified on demand, and after btrfs_join_transaction(), such reserved
space should also be kept until qgroup numbers are updated.
Since these two types behave differently, split the original "META"
reservation type into 2 sub-types:
META_PERTRANS:
For above case 1)
META_PREALLOC:
For reservations that happened before btrfs_join_transaction() of
case 2)
NOTE: This patch will only convert existing qgroup meta reservation
callers according to its situation, not ensuring all callers are at
correct timing.
Such fix will be added in later patches.
Signed-off-by: Qu Wenruo <wqu@suse.com>
[ update comments ]
Signed-off-by: David Sterba <dsterba@suse.com>
2017-12-12 15:34:29 +08:00
|
|
|
void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes,
|
|
|
|
enum btrfs_qgroup_rsv_type type)
|
2015-09-08 17:08:38 +08:00
|
|
|
{
|
2016-06-22 18:54:23 -04:00
|
|
|
struct btrfs_fs_info *fs_info = root->fs_info;
|
|
|
|
|
|
|
|
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
|
2018-08-06 14:25:24 +09:00
|
|
|
!is_fstree(root->root_key.objectid))
|
2015-09-08 17:08:38 +08:00
|
|
|
return;
|
|
|
|
|
2017-12-12 15:34:34 +08:00
|
|
|
/*
|
|
|
|
* reservation for META_PREALLOC can happen before quota is enabled,
|
|
|
|
* which can lead to underflow.
|
|
|
|
* Here ensure we will only free what we really have reserved.
|
|
|
|
*/
|
|
|
|
num_bytes = sub_root_meta_rsv(root, num_bytes, type);
|
2016-06-22 18:54:23 -04:00
|
|
|
BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
|
2019-10-17 10:38:36 +08:00
|
|
|
trace_qgroup_meta_reserve(root, -(s64)num_bytes, type);
|
2018-08-06 14:25:24 +09:00
|
|
|
btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid,
|
|
|
|
num_bytes, type);
|
2015-09-08 17:08:38 +08:00
|
|
|
}
|
2015-10-13 09:53:10 +08:00
|
|
|
|
2017-12-12 15:34:31 +08:00
|
|
|
static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root,
|
|
|
|
int num_bytes)
|
|
|
|
{
|
|
|
|
struct btrfs_qgroup *qgroup;
|
|
|
|
struct ulist_node *unode;
|
|
|
|
struct ulist_iterator uiter;
|
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
if (num_bytes == 0)
|
|
|
|
return;
|
2019-11-25 21:58:50 -03:00
|
|
|
if (!fs_info->quota_root)
|
2017-12-12 15:34:31 +08:00
|
|
|
return;
|
|
|
|
|
|
|
|
spin_lock(&fs_info->qgroup_lock);
|
|
|
|
qgroup = find_qgroup_rb(fs_info, ref_root);
|
|
|
|
if (!qgroup)
|
|
|
|
goto out;
|
|
|
|
ulist_reinit(fs_info->qgroup_ulist);
|
|
|
|
ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
|
2018-03-27 19:04:50 +02:00
|
|
|
qgroup_to_aux(qgroup), GFP_ATOMIC);
|
2017-12-12 15:34:31 +08:00
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
ULIST_ITER_INIT(&uiter);
|
|
|
|
while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
|
|
|
|
struct btrfs_qgroup *qg;
|
|
|
|
struct btrfs_qgroup_list *glist;
|
|
|
|
|
|
|
|
qg = unode_aux_to_qgroup(unode);
|
|
|
|
|
|
|
|
qgroup_rsv_release(fs_info, qg, num_bytes,
|
|
|
|
BTRFS_QGROUP_RSV_META_PREALLOC);
|
|
|
|
qgroup_rsv_add(fs_info, qg, num_bytes,
|
|
|
|
BTRFS_QGROUP_RSV_META_PERTRANS);
|
|
|
|
list_for_each_entry(glist, &qg->groups, next_group) {
|
|
|
|
ret = ulist_add(fs_info->qgroup_ulist,
|
|
|
|
glist->group->qgroupid,
|
2018-03-27 19:04:50 +02:00
|
|
|
qgroup_to_aux(glist->group), GFP_ATOMIC);
|
2017-12-12 15:34:31 +08:00
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
out:
|
|
|
|
spin_unlock(&fs_info->qgroup_lock);
|
|
|
|
}
|
|
|
|
|
|
|
|
void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes)
|
|
|
|
{
|
|
|
|
struct btrfs_fs_info *fs_info = root->fs_info;
|
|
|
|
|
|
|
|
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
|
2018-08-06 14:25:24 +09:00
|
|
|
!is_fstree(root->root_key.objectid))
|
2017-12-12 15:34:31 +08:00
|
|
|
return;
|
2017-12-12 15:34:34 +08:00
|
|
|
/* Same as btrfs_qgroup_free_meta_prealloc() */
|
|
|
|
num_bytes = sub_root_meta_rsv(root, num_bytes,
|
|
|
|
BTRFS_QGROUP_RSV_META_PREALLOC);
|
2017-12-12 15:34:35 +08:00
|
|
|
trace_qgroup_meta_convert(root, num_bytes);
|
2018-08-06 14:25:24 +09:00
|
|
|
qgroup_convert_meta(fs_info, root->root_key.objectid, num_bytes);
|
2017-12-12 15:34:31 +08:00
|
|
|
}
|
|
|
|
|
2015-10-13 09:53:10 +08:00
|
|
|
/*
|
2016-05-19 21:18:45 -04:00
|
|
|
* Check qgroup reserved space leaking, normally at destroy inode
|
2015-10-13 09:53:10 +08:00
|
|
|
* time
|
|
|
|
*/
|
2020-06-03 08:55:46 +03:00
|
|
|
void btrfs_qgroup_check_reserved_leak(struct btrfs_inode *inode)
|
2015-10-13 09:53:10 +08:00
|
|
|
{
|
|
|
|
struct extent_changeset changeset;
|
|
|
|
struct ulist_node *unode;
|
|
|
|
struct ulist_iterator iter;
|
|
|
|
int ret;
|
|
|
|
|
2017-02-27 15:10:38 +08:00
|
|
|
extent_changeset_init(&changeset);
|
2020-06-03 08:55:46 +03:00
|
|
|
ret = clear_record_extent_bits(&inode->io_tree, 0, (u64)-1,
|
2016-04-26 23:54:39 +02:00
|
|
|
EXTENT_QGROUP_RESERVED, &changeset);
|
2015-10-13 09:53:10 +08:00
|
|
|
|
|
|
|
WARN_ON(ret < 0);
|
|
|
|
if (WARN_ON(changeset.bytes_changed)) {
|
|
|
|
ULIST_ITER_INIT(&iter);
|
2017-02-13 13:42:29 +01:00
|
|
|
while ((unode = ulist_next(&changeset.range_changed, &iter))) {
|
2020-06-03 08:55:46 +03:00
|
|
|
btrfs_warn(inode->root->fs_info,
|
|
|
|
"leaking qgroup reserved space, ino: %llu, start: %llu, end: %llu",
|
|
|
|
btrfs_ino(inode), unode->val, unode->aux);
|
2015-10-13 09:53:10 +08:00
|
|
|
}
|
2020-06-03 08:55:46 +03:00
|
|
|
btrfs_qgroup_free_refroot(inode->root->fs_info,
|
|
|
|
inode->root->root_key.objectid,
|
2017-12-12 15:34:23 +08:00
|
|
|
changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA);
|
2017-02-13 14:24:35 +01:00
|
|
|
|
2015-10-13 09:53:10 +08:00
|
|
|
}
|
2017-02-27 15:10:38 +08:00
|
|
|
extent_changeset_release(&changeset);
|
2015-10-13 09:53:10 +08:00
|
|
|
}
|
btrfs: qgroup: Introduce per-root swapped blocks infrastructure
To allow delayed subtree swap rescan, btrfs needs to record per-root
information about which tree blocks get swapped. This patch introduces
the required infrastructure.
The designed workflow will be:
1) Record the subtree root block that gets swapped.
During subtree swap:
O = Old tree blocks
N = New tree blocks
reloc tree subvolume tree X
Root Root
/ \ / \
NA OB OA OB
/ | | \ / | | \
NC ND OE OF OC OD OE OF
In this case, NA and OA are going to be swapped, record (NA, OA) into
subvolume tree X.
2) After subtree swap.
reloc tree subvolume tree X
Root Root
/ \ / \
OA OB NA OB
/ | | \ / | | \
OC OD OE OF NC ND OE OF
3a) COW happens for OB
If we are going to COW tree block OB, we check OB's bytenr against
tree X's swapped_blocks structure.
If it doesn't fit any, nothing will happen.
3b) COW happens for NA
Check NA's bytenr against tree X's swapped_blocks, and get a hit.
Then we do subtree scan on both subtrees OA and NA.
Resulting 6 tree blocks to be scanned (OA, OC, OD, NA, NC, ND).
Then no matter what we do to subvolume tree X, qgroup numbers will
still be correct.
Then NA's record gets removed from X's swapped_blocks.
4) Transaction commit
Any record in X's swapped_blocks gets removed, since there is no
modification to swapped subtrees, no need to trigger heavy qgroup
subtree rescan for them.
This will introduce 128 bytes overhead for each btrfs_root even qgroup
is not enabled. This is to reduce memory allocations and potential
failures.
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-01-23 15:15:16 +08:00
|
|
|
|
|
|
|
void btrfs_qgroup_init_swapped_blocks(
|
|
|
|
struct btrfs_qgroup_swapped_blocks *swapped_blocks)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
|
|
|
spin_lock_init(&swapped_blocks->lock);
|
|
|
|
for (i = 0; i < BTRFS_MAX_LEVEL; i++)
|
|
|
|
swapped_blocks->blocks[i] = RB_ROOT;
|
|
|
|
swapped_blocks->swapped = false;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Delete all swapped blocks record of @root.
|
|
|
|
* Every record here means we skipped a full subtree scan for qgroup.
|
|
|
|
*
|
|
|
|
* Gets called when committing one transaction.
|
|
|
|
*/
|
|
|
|
void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root)
|
|
|
|
{
|
|
|
|
struct btrfs_qgroup_swapped_blocks *swapped_blocks;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
swapped_blocks = &root->swapped_blocks;
|
|
|
|
|
|
|
|
spin_lock(&swapped_blocks->lock);
|
|
|
|
if (!swapped_blocks->swapped)
|
|
|
|
goto out;
|
|
|
|
for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
|
|
|
|
struct rb_root *cur_root = &swapped_blocks->blocks[i];
|
|
|
|
struct btrfs_qgroup_swapped_block *entry;
|
|
|
|
struct btrfs_qgroup_swapped_block *next;
|
|
|
|
|
|
|
|
rbtree_postorder_for_each_entry_safe(entry, next, cur_root,
|
|
|
|
node)
|
|
|
|
kfree(entry);
|
|
|
|
swapped_blocks->blocks[i] = RB_ROOT;
|
|
|
|
}
|
|
|
|
swapped_blocks->swapped = false;
|
|
|
|
out:
|
|
|
|
spin_unlock(&swapped_blocks->lock);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Add subtree roots record into @subvol_root.
|
|
|
|
*
|
|
|
|
* @subvol_root: tree root of the subvolume tree get swapped
|
|
|
|
* @bg: block group under balance
|
|
|
|
* @subvol_parent/slot: pointer to the subtree root in subvolume tree
|
|
|
|
* @reloc_parent/slot: pointer to the subtree root in reloc tree
|
|
|
|
* BOTH POINTERS ARE BEFORE TREE SWAP
|
|
|
|
* @last_snapshot: last snapshot generation of the subvolume tree
|
|
|
|
*/
|
|
|
|
int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
|
|
|
|
struct btrfs_root *subvol_root,
|
2019-10-29 19:20:18 +01:00
|
|
|
struct btrfs_block_group *bg,
|
btrfs: qgroup: Introduce per-root swapped blocks infrastructure
To allow delayed subtree swap rescan, btrfs needs to record per-root
information about which tree blocks get swapped. This patch introduces
the required infrastructure.
The designed workflow will be:
1) Record the subtree root block that gets swapped.
During subtree swap:
O = Old tree blocks
N = New tree blocks
reloc tree subvolume tree X
Root Root
/ \ / \
NA OB OA OB
/ | | \ / | | \
NC ND OE OF OC OD OE OF
In this case, NA and OA are going to be swapped, record (NA, OA) into
subvolume tree X.
2) After subtree swap.
reloc tree subvolume tree X
Root Root
/ \ / \
OA OB NA OB
/ | | \ / | | \
OC OD OE OF NC ND OE OF
3a) COW happens for OB
If we are going to COW tree block OB, we check OB's bytenr against
tree X's swapped_blocks structure.
If it doesn't fit any, nothing will happen.
3b) COW happens for NA
Check NA's bytenr against tree X's swapped_blocks, and get a hit.
Then we do subtree scan on both subtrees OA and NA.
Resulting 6 tree blocks to be scanned (OA, OC, OD, NA, NC, ND).
Then no matter what we do to subvolume tree X, qgroup numbers will
still be correct.
Then NA's record gets removed from X's swapped_blocks.
4) Transaction commit
Any record in X's swapped_blocks gets removed, since there is no
modification to swapped subtrees, no need to trigger heavy qgroup
subtree rescan for them.
This will introduce 128 bytes overhead for each btrfs_root even qgroup
is not enabled. This is to reduce memory allocations and potential
failures.
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-01-23 15:15:16 +08:00
|
|
|
struct extent_buffer *subvol_parent, int subvol_slot,
|
|
|
|
struct extent_buffer *reloc_parent, int reloc_slot,
|
|
|
|
u64 last_snapshot)
|
|
|
|
{
|
|
|
|
struct btrfs_fs_info *fs_info = subvol_root->fs_info;
|
|
|
|
struct btrfs_qgroup_swapped_blocks *blocks = &subvol_root->swapped_blocks;
|
|
|
|
struct btrfs_qgroup_swapped_block *block;
|
|
|
|
struct rb_node **cur;
|
|
|
|
struct rb_node *parent = NULL;
|
|
|
|
int level = btrfs_header_level(subvol_parent) - 1;
|
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
if (btrfs_node_ptr_generation(subvol_parent, subvol_slot) >
|
|
|
|
btrfs_node_ptr_generation(reloc_parent, reloc_slot)) {
|
|
|
|
btrfs_err_rl(fs_info,
|
|
|
|
"%s: bad parameter order, subvol_gen=%llu reloc_gen=%llu",
|
|
|
|
__func__,
|
|
|
|
btrfs_node_ptr_generation(subvol_parent, subvol_slot),
|
|
|
|
btrfs_node_ptr_generation(reloc_parent, reloc_slot));
|
|
|
|
return -EUCLEAN;
|
|
|
|
}
|
|
|
|
|
|
|
|
block = kmalloc(sizeof(*block), GFP_NOFS);
|
|
|
|
if (!block) {
|
|
|
|
ret = -ENOMEM;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* @reloc_parent/slot is still before swap, while @block is going to
|
|
|
|
* record the bytenr after swap, so we do the swap here.
|
|
|
|
*/
|
|
|
|
block->subvol_bytenr = btrfs_node_blockptr(reloc_parent, reloc_slot);
|
|
|
|
block->subvol_generation = btrfs_node_ptr_generation(reloc_parent,
|
|
|
|
reloc_slot);
|
|
|
|
block->reloc_bytenr = btrfs_node_blockptr(subvol_parent, subvol_slot);
|
|
|
|
block->reloc_generation = btrfs_node_ptr_generation(subvol_parent,
|
|
|
|
subvol_slot);
|
|
|
|
block->last_snapshot = last_snapshot;
|
|
|
|
block->level = level;
|
btrfs: qgroup: Check bg while resuming relocation to avoid NULL pointer dereference
[BUG]
When mounting a fs with reloc tree and has qgroup enabled, it can cause
NULL pointer dereference at mount time:
BUG: kernel NULL pointer dereference, address: 00000000000000a8
#PF: supervisor read access in kernel mode
#PF: error_code(0x0000) - not-present page
PGD 0 P4D 0
Oops: 0000 [#1] PREEMPT SMP NOPTI
RIP: 0010:btrfs_qgroup_add_swapped_blocks+0x186/0x300 [btrfs]
Call Trace:
replace_path.isra.23+0x685/0x900 [btrfs]
merge_reloc_root+0x26e/0x5f0 [btrfs]
merge_reloc_roots+0x10a/0x1a0 [btrfs]
btrfs_recover_relocation+0x3cd/0x420 [btrfs]
open_ctree+0x1bc8/0x1ed0 [btrfs]
btrfs_mount_root+0x544/0x680 [btrfs]
legacy_get_tree+0x34/0x60
vfs_get_tree+0x2d/0xf0
fc_mount+0x12/0x40
vfs_kern_mount.part.12+0x61/0xa0
vfs_kern_mount+0x13/0x20
btrfs_mount+0x16f/0x860 [btrfs]
legacy_get_tree+0x34/0x60
vfs_get_tree+0x2d/0xf0
do_mount+0x81f/0xac0
ksys_mount+0xbf/0xe0
__x64_sys_mount+0x25/0x30
do_syscall_64+0x65/0x240
entry_SYSCALL_64_after_hwframe+0x49/0xbe
[CAUSE]
In btrfs_recover_relocation(), we don't have enough info to determine
which block group we're relocating, but only to merge existing reloc
trees.
Thus in btrfs_recover_relocation(), rc->block_group is NULL.
btrfs_qgroup_add_swapped_blocks() hasn't taken this into consideration,
and causes a NULL pointer dereference.
The bug is introduced by commit 3d0174f78e72 ("btrfs: qgroup: Only trace
data extents in leaves if we're relocating data block group"), and
later qgroup refactoring still keeps this optimization.
[FIX]
Thankfully in the context of btrfs_recover_relocation(), there is no
other progress can modify tree blocks, thus those swapped tree blocks
pair will never affect qgroup numbers, no matter whatever we set for
block->trace_leaf.
So we only need to check if @bg is NULL before accessing @bg->flags.
Reported-by: Juan Erbes <jerbes@gmail.com>
Link: https://bugzilla.opensuse.org/show_bug.cgi?id=1134806
Fixes: 3d0174f78e72 ("btrfs: qgroup: Only trace data extents in leaves if we're relocating data block group")
CC: stable@vger.kernel.org # 4.20+
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-05-21 19:28:08 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If we have bg == NULL, we're called from btrfs_recover_relocation(),
|
|
|
|
* no one else can modify tree blocks thus we qgroup will not change
|
|
|
|
* no matter the value of trace_leaf.
|
|
|
|
*/
|
|
|
|
if (bg && bg->flags & BTRFS_BLOCK_GROUP_DATA)
|
btrfs: qgroup: Introduce per-root swapped blocks infrastructure
To allow delayed subtree swap rescan, btrfs needs to record per-root
information about which tree blocks get swapped. This patch introduces
the required infrastructure.
The designed workflow will be:
1) Record the subtree root block that gets swapped.
During subtree swap:
O = Old tree blocks
N = New tree blocks
reloc tree subvolume tree X
Root Root
/ \ / \
NA OB OA OB
/ | | \ / | | \
NC ND OE OF OC OD OE OF
In this case, NA and OA are going to be swapped, record (NA, OA) into
subvolume tree X.
2) After subtree swap.
reloc tree subvolume tree X
Root Root
/ \ / \
OA OB NA OB
/ | | \ / | | \
OC OD OE OF NC ND OE OF
3a) COW happens for OB
If we are going to COW tree block OB, we check OB's bytenr against
tree X's swapped_blocks structure.
If it doesn't fit any, nothing will happen.
3b) COW happens for NA
Check NA's bytenr against tree X's swapped_blocks, and get a hit.
Then we do subtree scan on both subtrees OA and NA.
Resulting 6 tree blocks to be scanned (OA, OC, OD, NA, NC, ND).
Then no matter what we do to subvolume tree X, qgroup numbers will
still be correct.
Then NA's record gets removed from X's swapped_blocks.
4) Transaction commit
Any record in X's swapped_blocks gets removed, since there is no
modification to swapped subtrees, no need to trigger heavy qgroup
subtree rescan for them.
This will introduce 128 bytes overhead for each btrfs_root even qgroup
is not enabled. This is to reduce memory allocations and potential
failures.
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-01-23 15:15:16 +08:00
|
|
|
block->trace_leaf = true;
|
|
|
|
else
|
|
|
|
block->trace_leaf = false;
|
|
|
|
btrfs_node_key_to_cpu(reloc_parent, &block->first_key, reloc_slot);
|
|
|
|
|
|
|
|
/* Insert @block into @blocks */
|
|
|
|
spin_lock(&blocks->lock);
|
|
|
|
cur = &blocks->blocks[level].rb_node;
|
|
|
|
while (*cur) {
|
|
|
|
struct btrfs_qgroup_swapped_block *entry;
|
|
|
|
|
|
|
|
parent = *cur;
|
|
|
|
entry = rb_entry(parent, struct btrfs_qgroup_swapped_block,
|
|
|
|
node);
|
|
|
|
|
|
|
|
if (entry->subvol_bytenr < block->subvol_bytenr) {
|
|
|
|
cur = &(*cur)->rb_left;
|
|
|
|
} else if (entry->subvol_bytenr > block->subvol_bytenr) {
|
|
|
|
cur = &(*cur)->rb_right;
|
|
|
|
} else {
|
|
|
|
if (entry->subvol_generation !=
|
|
|
|
block->subvol_generation ||
|
|
|
|
entry->reloc_bytenr != block->reloc_bytenr ||
|
|
|
|
entry->reloc_generation !=
|
|
|
|
block->reloc_generation) {
|
|
|
|
/*
|
|
|
|
* Duplicated but mismatch entry found.
|
|
|
|
* Shouldn't happen.
|
|
|
|
*
|
|
|
|
* Marking qgroup inconsistent should be enough
|
|
|
|
* for end users.
|
|
|
|
*/
|
|
|
|
WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
|
|
|
|
ret = -EEXIST;
|
|
|
|
}
|
|
|
|
kfree(block);
|
|
|
|
goto out_unlock;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
rb_link_node(&block->node, parent, cur);
|
|
|
|
rb_insert_color(&block->node, &blocks->blocks[level]);
|
|
|
|
blocks->swapped = true;
|
|
|
|
out_unlock:
|
|
|
|
spin_unlock(&blocks->lock);
|
|
|
|
out:
|
|
|
|
if (ret < 0)
|
|
|
|
fs_info->qgroup_flags |=
|
|
|
|
BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
|
|
|
|
return ret;
|
|
|
|
}
|
btrfs: qgroup: Use delayed subtree rescan for balance
Before this patch, qgroup code traces the whole subtree of subvolume and
reloc trees unconditionally.
This makes qgroup numbers consistent, but it could cause tons of
unnecessary extent tracing, which causes a lot of overhead.
However for subtree swap of balance, just swap both subtrees because
they contain the same contents and tree structure, so qgroup numbers
won't change.
It's the race window between subtree swap and transaction commit could
cause qgroup number change.
This patch will delay the qgroup subtree scan until COW happens for the
subtree root.
So if there is no other operations for the fs, balance won't cause extra
qgroup overhead. (best case scenario)
Depending on the workload, most of the subtree scan can still be
avoided.
Only for worst case scenario, it will fall back to old subtree swap
overhead. (scan all swapped subtrees)
[[Benchmark]]
Hardware:
VM 4G vRAM, 8 vCPUs,
disk is using 'unsafe' cache mode,
backing device is SAMSUNG 850 evo SSD.
Host has 16G ram.
Mkfs parameter:
--nodesize 4K (To bump up tree size)
Initial subvolume contents:
4G data copied from /usr and /lib.
(With enough regular small files)
Snapshots:
16 snapshots of the original subvolume.
each snapshot has 3 random files modified.
balance parameter:
-m
So the content should be pretty similar to a real world root fs layout.
And after file system population, there is no other activity, so it
should be the best case scenario.
| v4.20-rc1 | w/ patchset | diff
-----------------------------------------------------------------------
relocated extents | 22615 | 22457 | -0.1%
qgroup dirty extents | 163457 | 121606 | -25.6%
time (sys) | 22.884s | 18.842s | -17.6%
time (real) | 27.724s | 22.884s | -17.5%
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-01-23 15:15:17 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Check if the tree block is a subtree root, and if so do the needed
|
|
|
|
* delayed subtree trace for qgroup.
|
|
|
|
*
|
|
|
|
* This is called during btrfs_cow_block().
|
|
|
|
*/
|
|
|
|
int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
|
|
|
|
struct btrfs_root *root,
|
|
|
|
struct extent_buffer *subvol_eb)
|
|
|
|
{
|
|
|
|
struct btrfs_fs_info *fs_info = root->fs_info;
|
|
|
|
struct btrfs_qgroup_swapped_blocks *blocks = &root->swapped_blocks;
|
|
|
|
struct btrfs_qgroup_swapped_block *block;
|
|
|
|
struct extent_buffer *reloc_eb = NULL;
|
|
|
|
struct rb_node *node;
|
|
|
|
bool found = false;
|
|
|
|
bool swapped = false;
|
|
|
|
int level = btrfs_header_level(subvol_eb);
|
|
|
|
int ret = 0;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
|
|
|
|
return 0;
|
|
|
|
if (!is_fstree(root->root_key.objectid) || !root->reloc_root)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
spin_lock(&blocks->lock);
|
|
|
|
if (!blocks->swapped) {
|
|
|
|
spin_unlock(&blocks->lock);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
node = blocks->blocks[level].rb_node;
|
|
|
|
|
|
|
|
while (node) {
|
|
|
|
block = rb_entry(node, struct btrfs_qgroup_swapped_block, node);
|
|
|
|
if (block->subvol_bytenr < subvol_eb->start) {
|
|
|
|
node = node->rb_left;
|
|
|
|
} else if (block->subvol_bytenr > subvol_eb->start) {
|
|
|
|
node = node->rb_right;
|
|
|
|
} else {
|
|
|
|
found = true;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (!found) {
|
|
|
|
spin_unlock(&blocks->lock);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
/* Found one, remove it from @blocks first and update blocks->swapped */
|
|
|
|
rb_erase(&block->node, &blocks->blocks[level]);
|
|
|
|
for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
|
|
|
|
if (RB_EMPTY_ROOT(&blocks->blocks[i])) {
|
|
|
|
swapped = true;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
blocks->swapped = swapped;
|
|
|
|
spin_unlock(&blocks->lock);
|
|
|
|
|
|
|
|
/* Read out reloc subtree root */
|
|
|
|
reloc_eb = read_tree_block(fs_info, block->reloc_bytenr,
|
|
|
|
block->reloc_generation, block->level,
|
|
|
|
&block->first_key);
|
|
|
|
if (IS_ERR(reloc_eb)) {
|
|
|
|
ret = PTR_ERR(reloc_eb);
|
|
|
|
reloc_eb = NULL;
|
|
|
|
goto free_out;
|
|
|
|
}
|
|
|
|
if (!extent_buffer_uptodate(reloc_eb)) {
|
|
|
|
ret = -EIO;
|
|
|
|
goto free_out;
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = qgroup_trace_subtree_swap(trans, reloc_eb, subvol_eb,
|
|
|
|
block->last_snapshot, block->trace_leaf);
|
|
|
|
free_out:
|
|
|
|
kfree(block);
|
|
|
|
free_extent_buffer(reloc_eb);
|
|
|
|
out:
|
|
|
|
if (ret < 0) {
|
|
|
|
btrfs_err_rl(fs_info,
|
|
|
|
"failed to account subtree at bytenr %llu: %d",
|
|
|
|
subvol_eb->start, ret);
|
|
|
|
fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
|
|
|
|
}
|
|
|
|
return ret;
|
|
|
|
}
|
2020-02-11 15:25:37 +08:00
|
|
|
|
|
|
|
void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans)
|
|
|
|
{
|
|
|
|
struct btrfs_qgroup_extent_record *entry;
|
|
|
|
struct btrfs_qgroup_extent_record *next;
|
|
|
|
struct rb_root *root;
|
|
|
|
|
|
|
|
root = &trans->delayed_refs.dirty_extent_root;
|
|
|
|
rbtree_postorder_for_each_entry_safe(entry, next, root, node) {
|
|
|
|
ulist_free(entry->old_roots);
|
|
|
|
kfree(entry);
|
|
|
|
}
|
|
|
|
}
|