mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2025-01-01 10:43:43 +00:00
4fbf8b136d
Use atomic_try_cmpxchg() instead of atomic_cmpxchg(*ptr, old, new) == old in rcuref_put_slowpath(). On x86 the CMPXCHG instruction returns success in the ZF flag, so this change saves a compare after CMPXCHG. Additionaly, the compiler reorders some code blocks to follow likely/unlikely annotations in the atomic_try_cmpxchg() macro, improving the code from: 9a: f0 0f b1 0b lock cmpxchg %ecx,(%rbx) 9e: 83 f8 ff cmp $0xffffffff,%eax a1: 74 04 je a7 <rcuref_put_slowpath+0x27> a3: 31 c0 xor %eax,%eax to: 9a: f0 0f b1 0b lock cmpxchg %ecx,(%rbx) 9e: 75 4c jne ec <rcuref_put_slowpath+0x6c> a0: b0 01 mov $0x1,%al No functional change intended. Signed-off-by: Uros Bizjak <ubizjak@gmail.com> Signed-off-by: Ingo Molnar <mingo@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Paul E. McKenney <paulmck@kernel.org> Link: https://lore.kernel.org/r/20230509150255.3691-1-ubizjak@gmail.com
282 lines
9.5 KiB
C
282 lines
9.5 KiB
C
// SPDX-License-Identifier: GPL-2.0-only
|
|
|
|
/*
|
|
* rcuref - A scalable reference count implementation for RCU managed objects
|
|
*
|
|
* rcuref is provided to replace open coded reference count implementations
|
|
* based on atomic_t. It protects explicitely RCU managed objects which can
|
|
* be visible even after the last reference has been dropped and the object
|
|
* is heading towards destruction.
|
|
*
|
|
* A common usage pattern is:
|
|
*
|
|
* get()
|
|
* rcu_read_lock();
|
|
* p = get_ptr();
|
|
* if (p && !atomic_inc_not_zero(&p->refcnt))
|
|
* p = NULL;
|
|
* rcu_read_unlock();
|
|
* return p;
|
|
*
|
|
* put()
|
|
* if (!atomic_dec_return(&->refcnt)) {
|
|
* remove_ptr(p);
|
|
* kfree_rcu((p, rcu);
|
|
* }
|
|
*
|
|
* atomic_inc_not_zero() is implemented with a try_cmpxchg() loop which has
|
|
* O(N^2) behaviour under contention with N concurrent operations.
|
|
*
|
|
* rcuref uses atomic_add_negative_relaxed() for the fast path, which scales
|
|
* better under contention.
|
|
*
|
|
* Why not refcount?
|
|
* =================
|
|
*
|
|
* In principle it should be possible to make refcount use the rcuref
|
|
* scheme, but the destruction race described below cannot be prevented
|
|
* unless the protected object is RCU managed.
|
|
*
|
|
* Theory of operation
|
|
* ===================
|
|
*
|
|
* rcuref uses an unsigned integer reference counter. As long as the
|
|
* counter value is greater than or equal to RCUREF_ONEREF and not larger
|
|
* than RCUREF_MAXREF the reference is alive:
|
|
*
|
|
* ONEREF MAXREF SATURATED RELEASED DEAD NOREF
|
|
* 0 0x7FFFFFFF 0x8000000 0xA0000000 0xBFFFFFFF 0xC0000000 0xE0000000 0xFFFFFFFF
|
|
* <---valid --------> <-------saturation zone-------> <-----dead zone----->
|
|
*
|
|
* The get() and put() operations do unconditional increments and
|
|
* decrements. The result is checked after the operation. This optimizes
|
|
* for the fast path.
|
|
*
|
|
* If the reference count is saturated or dead, then the increments and
|
|
* decrements are not harmful as the reference count still stays in the
|
|
* respective zones and is always set back to STATURATED resp. DEAD. The
|
|
* zones have room for 2^28 racing operations in each direction, which
|
|
* makes it practically impossible to escape the zones.
|
|
*
|
|
* Once the last reference is dropped the reference count becomes
|
|
* RCUREF_NOREF which forces rcuref_put() into the slowpath operation. The
|
|
* slowpath then tries to set the reference count from RCUREF_NOREF to
|
|
* RCUREF_DEAD via a cmpxchg(). This opens a small window where a
|
|
* concurrent rcuref_get() can acquire the reference count and bring it
|
|
* back to RCUREF_ONEREF or even drop the reference again and mark it DEAD.
|
|
*
|
|
* If the cmpxchg() succeeds then a concurrent rcuref_get() will result in
|
|
* DEAD + 1, which is inside the dead zone. If that happens the reference
|
|
* count is put back to DEAD.
|
|
*
|
|
* The actual race is possible due to the unconditional increment and
|
|
* decrements in rcuref_get() and rcuref_put():
|
|
*
|
|
* T1 T2
|
|
* get() put()
|
|
* if (atomic_add_negative(-1, &ref->refcnt))
|
|
* succeeds-> atomic_cmpxchg(&ref->refcnt, NOREF, DEAD);
|
|
*
|
|
* atomic_add_negative(1, &ref->refcnt); <- Elevates refcount to DEAD + 1
|
|
*
|
|
* As the result of T1's add is negative, the get() goes into the slow path
|
|
* and observes refcnt being in the dead zone which makes the operation fail.
|
|
*
|
|
* Possible critical states:
|
|
*
|
|
* Context Counter References Operation
|
|
* T1 0 1 init()
|
|
* T2 1 2 get()
|
|
* T1 0 1 put()
|
|
* T2 -1 0 put() tries to mark dead
|
|
* T1 0 1 get()
|
|
* T2 0 1 put() mark dead fails
|
|
* T1 -1 0 put() tries to mark dead
|
|
* T1 DEAD 0 put() mark dead succeeds
|
|
* T2 DEAD+1 0 get() fails and puts it back to DEAD
|
|
*
|
|
* Of course there are more complex scenarios, but the above illustrates
|
|
* the working principle. The rest is left to the imagination of the
|
|
* reader.
|
|
*
|
|
* Deconstruction race
|
|
* ===================
|
|
*
|
|
* The release operation must be protected by prohibiting a grace period in
|
|
* order to prevent a possible use after free:
|
|
*
|
|
* T1 T2
|
|
* put() get()
|
|
* // ref->refcnt = ONEREF
|
|
* if (!atomic_add_negative(-1, &ref->refcnt))
|
|
* return false; <- Not taken
|
|
*
|
|
* // ref->refcnt == NOREF
|
|
* --> preemption
|
|
* // Elevates ref->refcnt to ONEREF
|
|
* if (!atomic_add_negative(1, &ref->refcnt))
|
|
* return true; <- taken
|
|
*
|
|
* if (put(&p->ref)) { <-- Succeeds
|
|
* remove_pointer(p);
|
|
* kfree_rcu(p, rcu);
|
|
* }
|
|
*
|
|
* RCU grace period ends, object is freed
|
|
*
|
|
* atomic_cmpxchg(&ref->refcnt, NOREF, DEAD); <- UAF
|
|
*
|
|
* This is prevented by disabling preemption around the put() operation as
|
|
* that's in most kernel configurations cheaper than a rcu_read_lock() /
|
|
* rcu_read_unlock() pair and in many cases even a NOOP. In any case it
|
|
* prevents the grace period which keeps the object alive until all put()
|
|
* operations complete.
|
|
*
|
|
* Saturation protection
|
|
* =====================
|
|
*
|
|
* The reference count has a saturation limit RCUREF_MAXREF (INT_MAX).
|
|
* Once this is exceedded the reference count becomes stale by setting it
|
|
* to RCUREF_SATURATED, which will cause a memory leak, but it prevents
|
|
* wrap arounds which obviously cause worse problems than a memory
|
|
* leak. When saturation is reached a warning is emitted.
|
|
*
|
|
* Race conditions
|
|
* ===============
|
|
*
|
|
* All reference count increment/decrement operations are unconditional and
|
|
* only verified after the fact. This optimizes for the good case and takes
|
|
* the occasional race vs. a dead or already saturated refcount into
|
|
* account. The saturation and dead zones are large enough to accomodate
|
|
* for that.
|
|
*
|
|
* Memory ordering
|
|
* ===============
|
|
*
|
|
* Memory ordering rules are slightly relaxed wrt regular atomic_t functions
|
|
* and provide only what is strictly required for refcounts.
|
|
*
|
|
* The increments are fully relaxed; these will not provide ordering. The
|
|
* rationale is that whatever is used to obtain the object to increase the
|
|
* reference count on will provide the ordering. For locked data
|
|
* structures, its the lock acquire, for RCU/lockless data structures its
|
|
* the dependent load.
|
|
*
|
|
* rcuref_get() provides a control dependency ordering future stores which
|
|
* ensures that the object is not modified when acquiring a reference
|
|
* fails.
|
|
*
|
|
* rcuref_put() provides release order, i.e. all prior loads and stores
|
|
* will be issued before. It also provides a control dependency ordering
|
|
* against the subsequent destruction of the object.
|
|
*
|
|
* If rcuref_put() successfully dropped the last reference and marked the
|
|
* object DEAD it also provides acquire ordering.
|
|
*/
|
|
|
|
#include <linux/export.h>
|
|
#include <linux/rcuref.h>
|
|
|
|
/**
|
|
* rcuref_get_slowpath - Slowpath of rcuref_get()
|
|
* @ref: Pointer to the reference count
|
|
*
|
|
* Invoked when the reference count is outside of the valid zone.
|
|
*
|
|
* Return:
|
|
* False if the reference count was already marked dead
|
|
*
|
|
* True if the reference count is saturated, which prevents the
|
|
* object from being deconstructed ever.
|
|
*/
|
|
bool rcuref_get_slowpath(rcuref_t *ref)
|
|
{
|
|
unsigned int cnt = atomic_read(&ref->refcnt);
|
|
|
|
/*
|
|
* If the reference count was already marked dead, undo the
|
|
* increment so it stays in the middle of the dead zone and return
|
|
* fail.
|
|
*/
|
|
if (cnt >= RCUREF_RELEASED) {
|
|
atomic_set(&ref->refcnt, RCUREF_DEAD);
|
|
return false;
|
|
}
|
|
|
|
/*
|
|
* If it was saturated, warn and mark it so. In case the increment
|
|
* was already on a saturated value restore the saturation
|
|
* marker. This keeps it in the middle of the saturation zone and
|
|
* prevents the reference count from overflowing. This leaks the
|
|
* object memory, but prevents the obvious reference count overflow
|
|
* damage.
|
|
*/
|
|
if (WARN_ONCE(cnt > RCUREF_MAXREF, "rcuref saturated - leaking memory"))
|
|
atomic_set(&ref->refcnt, RCUREF_SATURATED);
|
|
return true;
|
|
}
|
|
EXPORT_SYMBOL_GPL(rcuref_get_slowpath);
|
|
|
|
/**
|
|
* rcuref_put_slowpath - Slowpath of __rcuref_put()
|
|
* @ref: Pointer to the reference count
|
|
*
|
|
* Invoked when the reference count is outside of the valid zone.
|
|
*
|
|
* Return:
|
|
* True if this was the last reference with no future references
|
|
* possible. This signals the caller that it can safely schedule the
|
|
* object, which is protected by the reference counter, for
|
|
* deconstruction.
|
|
*
|
|
* False if there are still active references or the put() raced
|
|
* with a concurrent get()/put() pair. Caller is not allowed to
|
|
* deconstruct the protected object.
|
|
*/
|
|
bool rcuref_put_slowpath(rcuref_t *ref)
|
|
{
|
|
unsigned int cnt = atomic_read(&ref->refcnt);
|
|
|
|
/* Did this drop the last reference? */
|
|
if (likely(cnt == RCUREF_NOREF)) {
|
|
/*
|
|
* Carefully try to set the reference count to RCUREF_DEAD.
|
|
*
|
|
* This can fail if a concurrent get() operation has
|
|
* elevated it again or the corresponding put() even marked
|
|
* it dead already. Both are valid situations and do not
|
|
* require a retry. If this fails the caller is not
|
|
* allowed to deconstruct the object.
|
|
*/
|
|
if (!atomic_try_cmpxchg_release(&ref->refcnt, &cnt, RCUREF_DEAD))
|
|
return false;
|
|
|
|
/*
|
|
* The caller can safely schedule the object for
|
|
* deconstruction. Provide acquire ordering.
|
|
*/
|
|
smp_acquire__after_ctrl_dep();
|
|
return true;
|
|
}
|
|
|
|
/*
|
|
* If the reference count was already in the dead zone, then this
|
|
* put() operation is imbalanced. Warn, put the reference count back to
|
|
* DEAD and tell the caller to not deconstruct the object.
|
|
*/
|
|
if (WARN_ONCE(cnt >= RCUREF_RELEASED, "rcuref - imbalanced put()")) {
|
|
atomic_set(&ref->refcnt, RCUREF_DEAD);
|
|
return false;
|
|
}
|
|
|
|
/*
|
|
* This is a put() operation on a saturated refcount. Restore the
|
|
* mean saturation value and tell the caller to not deconstruct the
|
|
* object.
|
|
*/
|
|
if (cnt > RCUREF_MAXREF)
|
|
atomic_set(&ref->refcnt, RCUREF_SATURATED);
|
|
return false;
|
|
}
|
|
EXPORT_SYMBOL_GPL(rcuref_put_slowpath);
|