mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git
synced 2025-01-04 04:02:26 +00:00
KVM: Block memslot updates across range_start() and range_end()
We would like to avoid taking mmu_lock for .invalidate_range_{start,end}() notifications that are unrelated to KVM. Because mmu_notifier_count must be modified while holding mmu_lock for write, and must always be paired across start->end to stay balanced, lock elision must happen in both or none. Therefore, in preparation for this change, this patch prevents memslot updates across range_start() and range_end().

Note, technically flag-only memslot updates could be allowed in parallel, but stalling a memslot update for a relatively short amount of time is not a scalability issue, and this is all more than complex enough.

A long note on the locking: a previous version of the patch used an rwsem to block the memslot update while the MMU notifiers run, but this resulted in the following deadlock involving the pseudo-lock tagged as "mmu_notifier_invalidate_range_start".

  ======================================================
  WARNING: possible circular locking dependency detected
  5.12.0-rc3+ #6 Tainted: G OE
  ------------------------------------------------------
  qemu-system-x86/3069 is trying to acquire lock:
  ffffffff9c775ca0 (mmu_notifier_invalidate_range_start){+.+.}-{0:0}, at: __mmu_notifier_invalidate_range_end+0x5/0x190

  but task is already holding lock:
  ffffaff7410a9160 (&kvm->mmu_notifier_slots_lock){.+.+}-{3:3}, at: kvm_mmu_notifier_invalidate_range_start+0x36d/0x4f0 [kvm]

  which lock already depends on the new lock.

This corresponds to the following MMU notifier logic:

  invalidate_range_start
    take pseudo lock
    down_read()           (*)
    release pseudo lock
  invalidate_range_end
    take pseudo lock      (**)
    up_read()
    release pseudo lock

At point (*) we take the mmu_notifier_slots_lock inside the pseudo lock; at point (**) we take the pseudo lock inside the mmu_notifier_slots_lock.
This could cause a deadlock (ignoring for a second that the pseudo lock is not a lock):

- invalidate_range_start waits on down_read(), because the rwsem is held by install_new_memslots

- install_new_memslots waits on down_write(), because the rwsem is held till (another) invalidate_range_end finishes

- invalidate_range_end sits waiting on the pseudo lock, held by invalidate_range_start.

Removing the fairness of the rwsem breaks the cycle (in lockdep terms, it would change the *shared* rwsem readers into *shared recursive* readers), so open-code the wait using a readers count and a spinlock. This also allows handling blockable and non-blockable critical sections in the same way.

Losing the rwsem fairness does theoretically allow MMU notifiers to block install_new_memslots forever. Note that mm/mmu_notifier.c's own retry scheme in mmu_interval_read_begin also uses wait/wake_up and is likewise not fair.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
This commit is contained in:
parent
db105fab8d
commit
52ac8b358b
@ -21,6 +21,12 @@ The acquisition orders for mutexes are as follows:
|
||||
can be taken inside a kvm->srcu read-side critical section,
|
||||
while kvm->slots_lock cannot.
|
||||
|
||||
- kvm->mn_active_invalidate_count ensures that pairs of
|
||||
invalidate_range_start() and invalidate_range_end() callbacks
|
||||
use the same memslots array. kvm->slots_lock and kvm->slots_arch_lock
|
||||
are taken on the waiting side in install_new_memslots, so MMU notifiers
|
||||
must not take either kvm->slots_lock or kvm->slots_arch_lock.
|
||||
|
||||
On x86:
|
||||
|
||||
- vcpu->mutex is taken outside kvm->arch.hyperv.hv_lock
|
||||
|
@ -548,6 +548,11 @@ struct kvm {
|
||||
struct kvm_memslots __rcu *memslots[KVM_ADDRESS_SPACE_NUM];
|
||||
struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
|
||||
|
||||
/* Used to wait for completion of MMU notifiers. */
|
||||
spinlock_t mn_invalidate_lock;
|
||||
unsigned long mn_active_invalidate_count;
|
||||
struct rcuwait mn_memslots_update_rcuwait;
|
||||
|
||||
/*
|
||||
* created_vcpus is protected by kvm->lock, and is incremented
|
||||
* at the beginning of KVM_CREATE_VCPU. online_vcpus is only
|
||||
|
@ -604,11 +604,9 @@ static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
|
||||
trace_kvm_set_spte_hva(address);
|
||||
|
||||
/*
|
||||
* .change_pte() must be surrounded by .invalidate_range_{start,end}(),
|
||||
* and so always runs with an elevated notifier count. This obviates
|
||||
* the need to bump the sequence count.
|
||||
* .change_pte() must be surrounded by .invalidate_range_{start,end}().
|
||||
*/
|
||||
WARN_ON_ONCE(!kvm->mmu_notifier_count);
|
||||
WARN_ON_ONCE(!READ_ONCE(kvm->mn_active_invalidate_count));
|
||||
|
||||
kvm_handle_hva_range(mn, address, address + 1, pte, kvm_set_spte_gfn);
|
||||
}
|
||||
@ -658,6 +656,18 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
|
||||
|
||||
trace_kvm_unmap_hva_range(range->start, range->end);
|
||||
|
||||
/*
|
||||
* Prevent memslot modification between range_start() and range_end()
|
||||
* so that conditionally locking provides the same result in both
|
||||
* functions. Without that guarantee, the mmu_notifier_count
|
||||
* adjustments will be imbalanced.
|
||||
*
|
||||
* Pairs with the decrement in range_end().
|
||||
*/
|
||||
spin_lock(&kvm->mn_invalidate_lock);
|
||||
kvm->mn_active_invalidate_count++;
|
||||
spin_unlock(&kvm->mn_invalidate_lock);
|
||||
|
||||
__kvm_handle_hva_range(kvm, &hva_range);
|
||||
|
||||
return 0;
|
||||
@ -694,9 +704,22 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
|
||||
.flush_on_ret = false,
|
||||
.may_block = mmu_notifier_range_blockable(range),
|
||||
};
|
||||
bool wake;
|
||||
|
||||
__kvm_handle_hva_range(kvm, &hva_range);
|
||||
|
||||
/* Pairs with the increment in range_start(). */
|
||||
spin_lock(&kvm->mn_invalidate_lock);
|
||||
wake = (--kvm->mn_active_invalidate_count == 0);
|
||||
spin_unlock(&kvm->mn_invalidate_lock);
|
||||
|
||||
/*
|
||||
* There can only be one waiter, since the wait happens under
|
||||
* slots_lock.
|
||||
*/
|
||||
if (wake)
|
||||
rcuwait_wake_up(&kvm->mn_memslots_update_rcuwait);
|
||||
|
||||
BUG_ON(kvm->mmu_notifier_count < 0);
|
||||
}
|
||||
|
||||
@ -977,6 +1000,9 @@ static struct kvm *kvm_create_vm(unsigned long type)
|
||||
mutex_init(&kvm->irq_lock);
|
||||
mutex_init(&kvm->slots_lock);
|
||||
mutex_init(&kvm->slots_arch_lock);
|
||||
spin_lock_init(&kvm->mn_invalidate_lock);
|
||||
rcuwait_init(&kvm->mn_memslots_update_rcuwait);
|
||||
|
||||
INIT_LIST_HEAD(&kvm->devices);
|
||||
|
||||
BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
|
||||
@ -1099,6 +1125,16 @@ static void kvm_destroy_vm(struct kvm *kvm)
|
||||
kvm_coalesced_mmio_free(kvm);
|
||||
#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
|
||||
mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
|
||||
/*
|
||||
* At this point, pending calls to invalidate_range_start()
|
||||
* have completed but no more MMU notifiers will run, so
|
||||
* mn_active_invalidate_count may remain unbalanced.
|
||||
* No threads can be waiting in install_new_memslots as the
|
||||
* last reference on KVM has been dropped, but freeing
|
||||
* memslots would deadlock without this manual intervention.
|
||||
*/
|
||||
WARN_ON(rcuwait_active(&kvm->mn_memslots_update_rcuwait));
|
||||
kvm->mn_active_invalidate_count = 0;
|
||||
#else
|
||||
kvm_arch_flush_shadow_all(kvm);
|
||||
#endif
|
||||
@ -1360,7 +1396,21 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
|
||||
WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
|
||||
slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
|
||||
|
||||
/*
|
||||
* Do not store the new memslots while there are invalidations in
|
||||
* progress (preparatory change for the next commit).
|
||||
*/
|
||||
spin_lock(&kvm->mn_invalidate_lock);
|
||||
prepare_to_rcuwait(&kvm->mn_memslots_update_rcuwait);
|
||||
while (kvm->mn_active_invalidate_count) {
|
||||
set_current_state(TASK_UNINTERRUPTIBLE);
|
||||
spin_unlock(&kvm->mn_invalidate_lock);
|
||||
schedule();
|
||||
spin_lock(&kvm->mn_invalidate_lock);
|
||||
}
|
||||
finish_rcuwait(&kvm->mn_memslots_update_rcuwait);
|
||||
rcu_assign_pointer(kvm->memslots[as_id], slots);
|
||||
spin_unlock(&kvm->mn_invalidate_lock);
|
||||
|
||||
/*
|
||||
* Acquired in kvm_set_memslot. Must be released before synchronize
|
||||
|
Loading…
Reference in New Issue
Block a user