diff --git a/arch/x86/kernel/cet.c b/arch/x86/kernel/cet.c
index d2c732a34e5d..303bf74d175b 100644
--- a/arch/x86/kernel/cet.c
+++ b/arch/x86/kernel/cet.c
@@ -81,6 +81,34 @@ static void do_user_cp_fault(struct pt_regs *regs, unsigned long error_code)
 
 static __ro_after_init bool ibt_fatal = true;
 
+/*
+ * By definition, all missing-ENDBRANCH #CPs are a result of WFE && !ENDBR.
+ *
+ * For the kernel IBT no ENDBR selftest where #CPs are deliberately triggered,
+ * the WFE state of the interrupted context needs to be cleared to let execution
+ * continue. Otherwise when the CPU resumes from the instruction that just
+ * caused the previous #CP, another missing-ENDBRANCH #CP is raised and the CPU
+ * enters a dead loop.
+ *
+ * This is not a problem with IDT because it doesn't preserve WFE and IRET doesn't
+ * set WFE. But FRED provides space on the entry stack (in an expanded CS area)
+ * to save and restore the WFE state, thus the WFE state is no longer clobbered,
+ * so software must clear it.
+ */
+static void ibt_clear_fred_wfe(struct pt_regs *regs)
+{
+	/*
+	 * No need to do any FRED checks.
+	 *
+	 * For IDT event delivery, the high-order 48 bits of CS are pushed
+	 * as 0s into the stack, and later IRET ignores these bits.
+	 *
+	 * For FRED, a test to check if fred_cs.wfe is set would be dropped
+	 * by compilers.
+	 */
+	regs->fred_cs.wfe = 0;
+}
+
 static void do_kernel_cp_fault(struct pt_regs *regs, unsigned long error_code)
 {
 	if ((error_code & CP_EC) != CP_ENDBR) {
@@ -90,6 +118,7 @@ static void do_kernel_cp_fault(struct pt_regs *regs, unsigned long error_code)
 
 	if (unlikely(regs->ip == (unsigned long)&ibt_selftest_noendbr)) {
 		regs->ax = 0;
+		ibt_clear_fred_wfe(regs);
 		return;
 	}
 
@@ -97,6 +126,7 @@ static void do_kernel_cp_fault(struct pt_regs *regs, unsigned long error_code)
 	if (!ibt_fatal) {
 		printk(KERN_DEFAULT CUT_HERE);
 		__warn(__FILE__, __LINE__, (void *)regs->ip, TAINT_WARN, regs, NULL);
+		ibt_clear_fred_wfe(regs);
 		return;
 	}
 	BUG();
diff --git a/drivers/pci/msi/irqdomain.c b/drivers/pci/msi/irqdomain.c
index 569125726b3e..d7ba8795d60f 100644
--- a/drivers/pci/msi/irqdomain.c
+++ b/drivers/pci/msi/irqdomain.c
@@ -350,8 +350,11 @@ bool pci_msi_domain_supports(struct pci_dev *pdev, unsigned int feature_mask,
 
 	domain = dev_get_msi_domain(&pdev->dev);
 
-	if (!domain || !irq_domain_is_hierarchy(domain))
-		return mode == ALLOW_LEGACY;
+	if (!domain || !irq_domain_is_hierarchy(domain)) {
+		if (IS_ENABLED(CONFIG_PCI_MSI_ARCH_FALLBACKS))
+			return mode == ALLOW_LEGACY;
+		return false;
+	}
 
 	if (!irq_domain_is_msi_parent(domain)) {
 		/*
diff --git a/drivers/pci/msi/msi.c b/drivers/pci/msi/msi.c
index 3a45879d85db..2f647cac4cae 100644
--- a/drivers/pci/msi/msi.c
+++ b/drivers/pci/msi/msi.c
@@ -433,6 +433,10 @@ int __pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec,
 	if (WARN_ON_ONCE(dev->msi_enabled))
 		return -EINVAL;
 
+	/* Test for the availability of MSI support */
+	if (!pci_msi_domain_supports(dev, 0, ALLOW_LEGACY))
+		return -ENOTSUPP;
+
 	nvec = pci_msi_vec_count(dev);
 	if (nvec < 0)
 		return nvec;
diff --git a/drivers/virt/coco/tdx-guest/tdx-guest.c b/drivers/virt/coco/tdx-guest/tdx-guest.c
index d7db6c824e13..224e7dde9cde 100644
--- a/drivers/virt/coco/tdx-guest/tdx-guest.c
+++ b/drivers/virt/coco/tdx-guest/tdx-guest.c
@@ -124,10 +124,8 @@ static void *alloc_quote_buf(void)
 	if (!addr)
 		return NULL;
 
-	if (set_memory_decrypted((unsigned long)addr, count)) {
-		free_pages_exact(addr, len);
+	if (set_memory_decrypted((unsigned long)addr, count))
 		return NULL;
-	}
 
 	return addr;
 }
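For context on the PCI/MSI hunks above: on a platform whose devices have no hierarchical MSI irqdomain and whose kernel is built without CONFIG_PCI_MSI_ARCH_FALLBACKS, pci_msi_domain_supports() now reports MSI as unsupported and __pci_enable_msi_range() fails early with -ENOTSUPP instead of falling into the arch fallback path. Below is a minimal, hypothetical driver-side sketch of how callers usually stay robust against that: request MSI-X/MSI but allow INTx as a last resort. All names, the handler, and the vector counts are illustrative; the INTx flag is spelled PCI_IRQ_INTX on recent kernels and PCI_IRQ_LEGACY on older ones.

#include <linux/interrupt.h>
#include <linux/pci.h>

static irqreturn_t example_handler(int irq, void *data)
{
	/* A real driver would acknowledge the device here. */
	return IRQ_HANDLED;
}

static int example_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
	int ret, nvec;

	ret = pcim_enable_device(pdev);
	if (ret)
		return ret;

	/*
	 * Prefer MSI-X or MSI, but accept a single INTx vector so the
	 * driver still loads when MSI support is unavailable.
	 */
	nvec = pci_alloc_irq_vectors(pdev, 1, 4,
				     PCI_IRQ_MSIX | PCI_IRQ_MSI | PCI_IRQ_INTX);
	if (nvec < 0)
		return nvec;

	/* pci_irq_vector() maps vector index 0 to a Linux IRQ number. */
	return devm_request_irq(&pdev->dev, pci_irq_vector(pdev, 0),
				example_handler, IRQF_SHARED, "example", pdev);
}
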
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 66b311fbd5d6..6c986491c60a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1637,8 +1637,9 @@ static inline unsigned int __task_state_index(unsigned int tsk_state,
 	 * We're lying here, but rather than expose a completely new task state
 	 * to userspace, we can make this appear as if the task has gone through
 	 * a regular rt_mutex_lock() call.
+	 * Report the frozen task uninterruptible.
 	 */
-	if (tsk_state & TASK_RTLOCK_WAIT)
+	if ((tsk_state & TASK_RTLOCK_WAIT) || (tsk_state & TASK_FROZEN))
 		state = TASK_UNINTERRUPTIBLE;
 
 	return fls(state);
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index e858de203eb6..697a56d3d949 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1292,7 +1292,13 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock,
 	 */
 	get_task_struct(owner);
 
+	preempt_disable();
 	raw_spin_unlock_irq(&lock->wait_lock);
+	/* wake up any tasks on the wake_q before calling rt_mutex_adjust_prio_chain */
+	wake_up_q(wake_q);
+	wake_q_init(wake_q);
+	preempt_enable();
+
 	res = rt_mutex_adjust_prio_chain(owner, chwalk, lock,
 					 next_lock, waiter, task);
 
@@ -1596,6 +1602,7 @@ static void __sched remove_waiter(struct rt_mutex_base *lock,
 *			 or TASK_UNINTERRUPTIBLE)
 * @timeout:		 the pre-initialized and started timer, or NULL for none
 * @waiter:		 the pre-initialized rt_mutex_waiter
+ * @wake_q:		 wake_q of tasks to wake when we drop the lock->wait_lock
 *
 * Must be called with lock->wait_lock held and interrupts disabled
 */
@@ -1603,7 +1610,8 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock,
 					   struct ww_acquire_ctx *ww_ctx,
 					   unsigned int state,
 					   struct hrtimer_sleeper *timeout,
-					   struct rt_mutex_waiter *waiter)
+					   struct rt_mutex_waiter *waiter,
+					   struct wake_q_head *wake_q)
 	__releases(&lock->wait_lock) __acquires(&lock->wait_lock)
 {
 	struct rt_mutex *rtm = container_of(lock, struct rt_mutex, rtmutex);
@@ -1634,7 +1642,13 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock,
 			owner = rt_mutex_owner(lock);
 		else
 			owner = NULL;
+		preempt_disable();
 		raw_spin_unlock_irq(&lock->wait_lock);
+		if (wake_q) {
+			wake_up_q(wake_q);
+			wake_q_init(wake_q);
+		}
+		preempt_enable();
 
 		if (!owner || !rtmutex_spin_on_owner(lock, waiter, owner))
 			rt_mutex_schedule();
@@ -1708,7 +1722,7 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock,
 	ret = task_blocks_on_rt_mutex(lock, waiter, current, ww_ctx, chwalk, wake_q);
 
 	if (likely(!ret))
-		ret = rt_mutex_slowlock_block(lock, ww_ctx, state, NULL, waiter);
+		ret = rt_mutex_slowlock_block(lock, ww_ctx, state, NULL, waiter, wake_q);
 
 	if (likely(!ret)) {
 		/* acquired the lock */
diff --git a/kernel/locking/rtmutex_api.c b/kernel/locking/rtmutex_api.c
index 33ea31d6a7b3..191e4720e546 100644
--- a/kernel/locking/rtmutex_api.c
+++ b/kernel/locking/rtmutex_api.c
@@ -383,7 +383,7 @@ int __sched rt_mutex_wait_proxy_lock(struct rt_mutex_base *lock,
 	raw_spin_lock_irq(&lock->wait_lock);
 	/* sleep on the mutex */
 	set_current_state(TASK_INTERRUPTIBLE);
-	ret = rt_mutex_slowlock_block(lock, NULL, TASK_INTERRUPTIBLE, to, waiter);
+	ret = rt_mutex_slowlock_block(lock, NULL, TASK_INTERRUPTIBLE, to, waiter, NULL);
 	/*
 	 * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
 	 * have to fix that up.
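For context on the rtmutex hunks above: they apply the kernel's deferred-wakeup pattern, where wakeups are collected on a wake_q while wait_lock is held and flushed only after the lock is dropped, with preemption disabled across the unlock-and-flush so the (possibly higher-priority) woken task cannot preempt the waker in between. A minimal sketch of that pattern in isolation, using a made-up object and lock rather than the rtmutex internals:

#include <linux/sched.h>
#include <linux/sched/wake_q.h>
#include <linux/spinlock.h>

/* Hypothetical object protected by a raw spinlock. */
struct example_obj {
	raw_spinlock_t		lock;
	struct task_struct	*waiter;	/* task to wake, if any */
	bool			ready;
};

static void example_complete(struct example_obj *obj)
{
	DEFINE_WAKE_Q(wake_q);

	raw_spin_lock_irq(&obj->lock);
	obj->ready = true;
	if (obj->waiter)
		wake_q_add(&wake_q, obj->waiter);	/* defer the wakeup */

	preempt_disable();
	raw_spin_unlock_irq(&obj->lock);
	wake_up_q(&wake_q);	/* flush only after the lock is dropped */
	preempt_enable();
}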