iocost: protect iocg->abs_vdebt with iocg->waitq.lock

abs_vdebt is an atomic_64 which tracks how much over budget a given cgroup
is and controls the activation of use_delay mechanism. Once a cgroup goes
over budget from forced IOs, it has to pay it back with its future budget.
The progress guarantee on debt paying comes from the iocg being active -
active iocgs are processed by the periodic timer, which ensures that as time
passes the debts dissipate and the iocg returns to normal operation.

However, both iocg activation and vdebt handling are asynchronous and a
sequence like the following may happen.

1. The iocg is in the process of being deactivated by the periodic timer.

2. A bio enters ioc_rqos_throttle(), calls iocg_activate() which returns
   without anything because it still sees that the iocg is already active.

3. The iocg is deactivated.

4. The bio from #2 is over budget but needs to be forced. It increases
   abs_vdebt and goes over the threshold and enables use_delay.

5. IO control is enabled for the iocg's subtree and now IOs are attributed
   to the descendant cgroups and the iocg itself no longer issues IOs.

This leaves the iocg with stuck abs_vdebt - it has debt but inactive and no
further IOs which can activate it. This can end up unduly punishing all the
descendants cgroups.

The usual throttling path has the same issue - the iocg must be active while
throttled to ensure that future event will wake it up - and solves the
problem by synchronizing the throttling path with a spinlock. abs_vdebt
handling is another form of overage handling and shares a lot of
characteristics including the fact that it isn't in the hottest path.

This patch fixes the above and other possible races by strictly
synchronizing abs_vdebt and use_delay handling with iocg->waitq.lock.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Vlad Dmitriev <vvd@fb.com>
Cc: stable@vger.kernel.org # v5.4+
Fixes: e1518f63f2 ("blk-iocost: Don't let merges push vtime into the future")
Signed-off-by: Jens Axboe <axboe@kernel.dk>
This commit is contained in:
Tejun Heo 2020-05-04 19:27:54 -04:00 committed by Jens Axboe
parent 10c70d95c0
commit 0b80f9866e
2 changed files with 77 additions and 47 deletions

View File

@ -466,7 +466,7 @@ struct ioc_gq {
*/ */
atomic64_t vtime; atomic64_t vtime;
atomic64_t done_vtime; atomic64_t done_vtime;
atomic64_t abs_vdebt; u64 abs_vdebt;
u64 last_vtime; u64 last_vtime;
/* /*
@ -1142,7 +1142,7 @@ static void iocg_kick_waitq(struct ioc_gq *iocg, struct ioc_now *now)
struct iocg_wake_ctx ctx = { .iocg = iocg }; struct iocg_wake_ctx ctx = { .iocg = iocg };
u64 margin_ns = (u64)(ioc->period_us * u64 margin_ns = (u64)(ioc->period_us *
WAITQ_TIMER_MARGIN_PCT / 100) * NSEC_PER_USEC; WAITQ_TIMER_MARGIN_PCT / 100) * NSEC_PER_USEC;
u64 abs_vdebt, vdebt, vshortage, expires, oexpires; u64 vdebt, vshortage, expires, oexpires;
s64 vbudget; s64 vbudget;
u32 hw_inuse; u32 hw_inuse;
@ -1152,18 +1152,15 @@ static void iocg_kick_waitq(struct ioc_gq *iocg, struct ioc_now *now)
vbudget = now->vnow - atomic64_read(&iocg->vtime); vbudget = now->vnow - atomic64_read(&iocg->vtime);
/* pay off debt */ /* pay off debt */
abs_vdebt = atomic64_read(&iocg->abs_vdebt); vdebt = abs_cost_to_cost(iocg->abs_vdebt, hw_inuse);
vdebt = abs_cost_to_cost(abs_vdebt, hw_inuse);
if (vdebt && vbudget > 0) { if (vdebt && vbudget > 0) {
u64 delta = min_t(u64, vbudget, vdebt); u64 delta = min_t(u64, vbudget, vdebt);
u64 abs_delta = min(cost_to_abs_cost(delta, hw_inuse), u64 abs_delta = min(cost_to_abs_cost(delta, hw_inuse),
abs_vdebt); iocg->abs_vdebt);
atomic64_add(delta, &iocg->vtime); atomic64_add(delta, &iocg->vtime);
atomic64_add(delta, &iocg->done_vtime); atomic64_add(delta, &iocg->done_vtime);
atomic64_sub(abs_delta, &iocg->abs_vdebt); iocg->abs_vdebt -= abs_delta;
if (WARN_ON_ONCE(atomic64_read(&iocg->abs_vdebt) < 0))
atomic64_set(&iocg->abs_vdebt, 0);
} }
/* /*
@ -1219,12 +1216,18 @@ static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now, u64 cost)
u64 expires, oexpires; u64 expires, oexpires;
u32 hw_inuse; u32 hw_inuse;
lockdep_assert_held(&iocg->waitq.lock);
/* debt-adjust vtime */ /* debt-adjust vtime */
current_hweight(iocg, NULL, &hw_inuse); current_hweight(iocg, NULL, &hw_inuse);
vtime += abs_cost_to_cost(atomic64_read(&iocg->abs_vdebt), hw_inuse); vtime += abs_cost_to_cost(iocg->abs_vdebt, hw_inuse);
/* clear or maintain depending on the overage */ /*
if (time_before_eq64(vtime, now->vnow)) { * Clear or maintain depending on the overage. Non-zero vdebt is what
* guarantees that @iocg is online and future iocg_kick_delay() will
* clear use_delay. Don't leave it on when there's no vdebt.
*/
if (!iocg->abs_vdebt || time_before_eq64(vtime, now->vnow)) {
blkcg_clear_delay(blkg); blkcg_clear_delay(blkg);
return false; return false;
} }
@ -1258,9 +1261,12 @@ static enum hrtimer_restart iocg_delay_timer_fn(struct hrtimer *timer)
{ {
struct ioc_gq *iocg = container_of(timer, struct ioc_gq, delay_timer); struct ioc_gq *iocg = container_of(timer, struct ioc_gq, delay_timer);
struct ioc_now now; struct ioc_now now;
unsigned long flags;
spin_lock_irqsave(&iocg->waitq.lock, flags);
ioc_now(iocg->ioc, &now); ioc_now(iocg->ioc, &now);
iocg_kick_delay(iocg, &now, 0); iocg_kick_delay(iocg, &now, 0);
spin_unlock_irqrestore(&iocg->waitq.lock, flags);
return HRTIMER_NORESTART; return HRTIMER_NORESTART;
} }
@ -1368,14 +1374,13 @@ static void ioc_timer_fn(struct timer_list *timer)
* should have woken up in the last period and expire idle iocgs. * should have woken up in the last period and expire idle iocgs.
*/ */
list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) { list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) {
if (!waitqueue_active(&iocg->waitq) && if (!waitqueue_active(&iocg->waitq) && iocg->abs_vdebt &&
!atomic64_read(&iocg->abs_vdebt) && !iocg_is_idle(iocg)) !iocg_is_idle(iocg))
continue; continue;
spin_lock(&iocg->waitq.lock); spin_lock(&iocg->waitq.lock);
if (waitqueue_active(&iocg->waitq) || if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt) {
atomic64_read(&iocg->abs_vdebt)) {
/* might be oversleeping vtime / hweight changes, kick */ /* might be oversleeping vtime / hweight changes, kick */
iocg_kick_waitq(iocg, &now); iocg_kick_waitq(iocg, &now);
iocg_kick_delay(iocg, &now, 0); iocg_kick_delay(iocg, &now, 0);
@ -1718,28 +1723,49 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
* tests are racy but the races aren't systemic - we only miss once * tests are racy but the races aren't systemic - we only miss once
* in a while which is fine. * in a while which is fine.
*/ */
if (!waitqueue_active(&iocg->waitq) && if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
!atomic64_read(&iocg->abs_vdebt) &&
time_before_eq64(vtime + cost, now.vnow)) { time_before_eq64(vtime + cost, now.vnow)) {
iocg_commit_bio(iocg, bio, cost); iocg_commit_bio(iocg, bio, cost);
return; return;
} }
/* /*
* We're over budget. If @bio has to be issued regardless, * We activated above but w/o any synchronization. Deactivation is
* remember the abs_cost instead of advancing vtime. * synchronized with waitq.lock and we won't get deactivated as long
* iocg_kick_waitq() will pay off the debt before waking more IOs. * as we're waiting or has debt, so we're good if we're activated
* here. In the unlikely case that we aren't, just issue the IO.
*/
spin_lock_irq(&iocg->waitq.lock);
if (unlikely(list_empty(&iocg->active_list))) {
spin_unlock_irq(&iocg->waitq.lock);
iocg_commit_bio(iocg, bio, cost);
return;
}
/*
* We're over budget. If @bio has to be issued regardless, remember
* the abs_cost instead of advancing vtime. iocg_kick_waitq() will pay
* off the debt before waking more IOs.
*
* This way, the debt is continuously paid off each period with the * This way, the debt is continuously paid off each period with the
* actual budget available to the cgroup. If we just wound vtime, * actual budget available to the cgroup. If we just wound vtime, we
* we would incorrectly use the current hw_inuse for the entire * would incorrectly use the current hw_inuse for the entire amount
* amount which, for example, can lead to the cgroup staying * which, for example, can lead to the cgroup staying blocked for a
* blocked for a long time even with substantially raised hw_inuse. * long time even with substantially raised hw_inuse.
*
* An iocg with vdebt should stay online so that the timer can keep
* deducting its vdebt and [de]activate use_delay mechanism
* accordingly. We don't want to race against the timer trying to
* clear them and leave @iocg inactive w/ dangling use_delay heavily
* penalizing the cgroup and its descendants.
*/ */
if (bio_issue_as_root_blkg(bio) || fatal_signal_pending(current)) { if (bio_issue_as_root_blkg(bio) || fatal_signal_pending(current)) {
atomic64_add(abs_cost, &iocg->abs_vdebt); iocg->abs_vdebt += abs_cost;
if (iocg_kick_delay(iocg, &now, cost)) if (iocg_kick_delay(iocg, &now, cost))
blkcg_schedule_throttle(rqos->q, blkcg_schedule_throttle(rqos->q,
(bio->bi_opf & REQ_SWAP) == REQ_SWAP); (bio->bi_opf & REQ_SWAP) == REQ_SWAP);
spin_unlock_irq(&iocg->waitq.lock);
return; return;
} }
@ -1756,20 +1782,6 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
* All waiters are on iocg->waitq and the wait states are * All waiters are on iocg->waitq and the wait states are
* synchronized using waitq.lock. * synchronized using waitq.lock.
*/ */
spin_lock_irq(&iocg->waitq.lock);
/*
* We activated above but w/o any synchronization. Deactivation is
* synchronized with waitq.lock and we won't get deactivated as
* long as we're waiting, so we're good if we're activated here.
* In the unlikely case that we are deactivated, just issue the IO.
*/
if (unlikely(list_empty(&iocg->active_list))) {
spin_unlock_irq(&iocg->waitq.lock);
iocg_commit_bio(iocg, bio, cost);
return;
}
init_waitqueue_func_entry(&wait.wait, iocg_wake_fn); init_waitqueue_func_entry(&wait.wait, iocg_wake_fn);
wait.wait.private = current; wait.wait.private = current;
wait.bio = bio; wait.bio = bio;
@ -1801,6 +1813,7 @@ static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq,
struct ioc_now now; struct ioc_now now;
u32 hw_inuse; u32 hw_inuse;
u64 abs_cost, cost; u64 abs_cost, cost;
unsigned long flags;
/* bypass if disabled or for root cgroup */ /* bypass if disabled or for root cgroup */
if (!ioc->enabled || !iocg->level) if (!ioc->enabled || !iocg->level)
@ -1820,15 +1833,28 @@ static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq,
iocg->cursor = bio_end; iocg->cursor = bio_end;
/* /*
* Charge if there's enough vtime budget and the existing request * Charge if there's enough vtime budget and the existing request has
* has cost assigned. Otherwise, account it as debt. See debt * cost assigned.
* handling in ioc_rqos_throttle() for details.
*/ */
if (rq->bio && rq->bio->bi_iocost_cost && if (rq->bio && rq->bio->bi_iocost_cost &&
time_before_eq64(atomic64_read(&iocg->vtime) + cost, now.vnow)) time_before_eq64(atomic64_read(&iocg->vtime) + cost, now.vnow)) {
iocg_commit_bio(iocg, bio, cost); iocg_commit_bio(iocg, bio, cost);
else return;
atomic64_add(abs_cost, &iocg->abs_vdebt); }
/*
* Otherwise, account it as debt if @iocg is online, which it should
* be for the vast majority of cases. See debt handling in
* ioc_rqos_throttle() for details.
*/
spin_lock_irqsave(&iocg->waitq.lock, flags);
if (likely(!list_empty(&iocg->active_list))) {
iocg->abs_vdebt += abs_cost;
iocg_kick_delay(iocg, &now, cost);
} else {
iocg_commit_bio(iocg, bio, cost);
}
spin_unlock_irqrestore(&iocg->waitq.lock, flags);
} }
static void ioc_rqos_done_bio(struct rq_qos *rqos, struct bio *bio) static void ioc_rqos_done_bio(struct rq_qos *rqos, struct bio *bio)
@ -1998,7 +2024,6 @@ static void ioc_pd_init(struct blkg_policy_data *pd)
iocg->ioc = ioc; iocg->ioc = ioc;
atomic64_set(&iocg->vtime, now.vnow); atomic64_set(&iocg->vtime, now.vnow);
atomic64_set(&iocg->done_vtime, now.vnow); atomic64_set(&iocg->done_vtime, now.vnow);
atomic64_set(&iocg->abs_vdebt, 0);
atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period)); atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period));
INIT_LIST_HEAD(&iocg->active_list); INIT_LIST_HEAD(&iocg->active_list);
iocg->hweight_active = HWEIGHT_WHOLE; iocg->hweight_active = HWEIGHT_WHOLE;

View File

@ -159,7 +159,12 @@ class IocgStat:
else: else:
self.inflight_pct = 0 self.inflight_pct = 0
self.debt_ms = iocg.abs_vdebt.counter.value_() / VTIME_PER_USEC / 1000 # vdebt used to be an atomic64_t and is now u64, support both
try:
self.debt_ms = iocg.abs_vdebt.counter.value_() / VTIME_PER_USEC / 1000
except:
self.debt_ms = iocg.abs_vdebt.value_() / VTIME_PER_USEC / 1000
self.use_delay = blkg.use_delay.counter.value_() self.use_delay = blkg.use_delay.counter.value_()
self.delay_ms = blkg.delay_nsec.counter.value_() / 1_000_000 self.delay_ms = blkg.delay_nsec.counter.value_() / 1_000_000