From b9c44b91476b67327a521568a854babecc4070ab Mon Sep 17 00:00:00 2001 From: Yabin Cui Date: Wed, 15 May 2024 12:36:07 -0700 Subject: [PATCH 001/224] perf/core: Save raw sample data conditionally based on sample type Currently, space for raw sample data is always allocated within sample records for both BPF output and tracepoint events. This leads to unused space in sample records when raw sample data is not requested. This patch enforces checking the sample type of an event in perf_sample_save_raw_data(), so that raw sample data is only saved when explicitly requested, reducing overhead when it is not needed. Fixes: 0a9081cf0a11 ("perf/core: Add perf_sample_save_raw_data() helper") Signed-off-by: Yabin Cui Signed-off-by: Ingo Molnar Reviewed-by: Ian Rogers Acked-by: Namhyung Kim Link: https://lore.kernel.org/r/20240515193610.2350456-2-yabinc@google.com --- arch/s390/kernel/perf_cpum_cf.c | 2 +- arch/s390/kernel/perf_pai_crypto.c | 2 +- arch/s390/kernel/perf_pai_ext.c | 2 +- arch/x86/events/amd/ibs.c | 2 +- include/linux/perf_event.h | 6 +++++ kernel/events/core.c | 35 +++++++++++++++--------------- kernel/trace/bpf_trace.c | 11 +++++----- 7 files changed, 34 insertions(+), 26 deletions(-) diff --git a/arch/s390/kernel/perf_cpum_cf.c b/arch/s390/kernel/perf_cpum_cf.c index e2e0aa463fbd..c3075e4a8efc 100644 --- a/arch/s390/kernel/perf_cpum_cf.c +++ b/arch/s390/kernel/perf_cpum_cf.c @@ -981,7 +981,7 @@ static int cfdiag_push_sample(struct perf_event *event, if (event->attr.sample_type & PERF_SAMPLE_RAW) { raw.frag.size = cpuhw->usedss; raw.frag.data = cpuhw->stop; - perf_sample_save_raw_data(&data, &raw); + perf_sample_save_raw_data(&data, event, &raw); } overflow = perf_event_overflow(event, &data, &regs); diff --git a/arch/s390/kernel/perf_pai_crypto.c b/arch/s390/kernel/perf_pai_crypto.c index fa7325454266..10725f5a6f0f 100644 --- a/arch/s390/kernel/perf_pai_crypto.c +++ b/arch/s390/kernel/perf_pai_crypto.c @@ -478,7 +478,7 @@ static int paicrypt_push_sample(size_t rawsize, struct paicrypt_map *cpump, if (event->attr.sample_type & PERF_SAMPLE_RAW) { raw.frag.size = rawsize; raw.frag.data = cpump->save; - perf_sample_save_raw_data(&data, &raw); + perf_sample_save_raw_data(&data, event, &raw); } overflow = perf_event_overflow(event, &data, &regs); diff --git a/arch/s390/kernel/perf_pai_ext.c b/arch/s390/kernel/perf_pai_ext.c index 7f462bef1fc0..a8f0bad99cf0 100644 --- a/arch/s390/kernel/perf_pai_ext.c +++ b/arch/s390/kernel/perf_pai_ext.c @@ -503,7 +503,7 @@ static int paiext_push_sample(size_t rawsize, struct paiext_map *cpump, if (event->attr.sample_type & PERF_SAMPLE_RAW) { raw.frag.size = rawsize; raw.frag.data = cpump->save; - perf_sample_save_raw_data(&data, &raw); + perf_sample_save_raw_data(&data, event, &raw); } overflow = perf_event_overflow(event, &data, &regs); diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index e91970b01d62..c3a2f6f57770 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -1118,7 +1118,7 @@ fail: .data = ibs_data.data, }, }; - perf_sample_save_raw_data(&data, &raw); + perf_sample_save_raw_data(&data, event, &raw); } if (perf_ibs == &perf_ibs_op) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index cb99ec8c9e96..f7c0a3f2f502 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1287,12 +1287,18 @@ static inline void perf_sample_save_callchain(struct perf_sample_data *data, } static inline void perf_sample_save_raw_data(struct perf_sample_data *data, + struct perf_event *event, struct 
perf_raw_record *raw) { struct perf_raw_frag *frag = &raw->frag; u32 sum = 0; int size; + if (!(event->attr.sample_type & PERF_SAMPLE_RAW)) + return; + if (WARN_ON_ONCE(data->sample_flags & PERF_SAMPLE_RAW)) + return; + do { sum += frag->size; if (perf_raw_frag_last(frag)) diff --git a/kernel/events/core.c b/kernel/events/core.c index 1869164a4e99..3670b91f182f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -10451,9 +10451,9 @@ static struct pmu perf_tracepoint = { }; static int perf_tp_filter_match(struct perf_event *event, - struct perf_sample_data *data) + struct perf_raw_record *raw) { - void *record = data->raw->frag.data; + void *record = raw->frag.data; /* only top level events have filters set */ if (event->parent) @@ -10465,7 +10465,7 @@ static int perf_tp_filter_match(struct perf_event *event, } static int perf_tp_event_match(struct perf_event *event, - struct perf_sample_data *data, + struct perf_raw_record *raw, struct pt_regs *regs) { if (event->hw.state & PERF_HES_STOPPED) @@ -10476,7 +10476,7 @@ static int perf_tp_event_match(struct perf_event *event, if (event->attr.exclude_kernel && !user_mode(regs)) return 0; - if (!perf_tp_filter_match(event, data)) + if (!perf_tp_filter_match(event, raw)) return 0; return 1; @@ -10502,6 +10502,7 @@ EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit); static void __perf_tp_event_target_task(u64 count, void *record, struct pt_regs *regs, struct perf_sample_data *data, + struct perf_raw_record *raw, struct perf_event *event) { struct trace_entry *entry = record; @@ -10511,13 +10512,17 @@ static void __perf_tp_event_target_task(u64 count, void *record, /* Cannot deliver synchronous signal to other task. */ if (event->attr.sigtrap) return; - if (perf_tp_event_match(event, data, regs)) + if (perf_tp_event_match(event, raw, regs)) { + perf_sample_data_init(data, 0, 0); + perf_sample_save_raw_data(data, event, raw); perf_swevent_event(event, count, data, regs); + } } static void perf_tp_event_target_task(u64 count, void *record, struct pt_regs *regs, struct perf_sample_data *data, + struct perf_raw_record *raw, struct perf_event_context *ctx) { unsigned int cpu = smp_processor_id(); @@ -10525,15 +10530,15 @@ static void perf_tp_event_target_task(u64 count, void *record, struct perf_event *event, *sibling; perf_event_groups_for_cpu_pmu(event, &ctx->pinned_groups, cpu, pmu) { - __perf_tp_event_target_task(count, record, regs, data, event); + __perf_tp_event_target_task(count, record, regs, data, raw, event); for_each_sibling_event(sibling, event) - __perf_tp_event_target_task(count, record, regs, data, sibling); + __perf_tp_event_target_task(count, record, regs, data, raw, sibling); } perf_event_groups_for_cpu_pmu(event, &ctx->flexible_groups, cpu, pmu) { - __perf_tp_event_target_task(count, record, regs, data, event); + __perf_tp_event_target_task(count, record, regs, data, raw, event); for_each_sibling_event(sibling, event) - __perf_tp_event_target_task(count, record, regs, data, sibling); + __perf_tp_event_target_task(count, record, regs, data, raw, sibling); } } @@ -10551,15 +10556,10 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, }, }; - perf_sample_data_init(&data, 0, 0); - perf_sample_save_raw_data(&data, &raw); - perf_trace_buf_update(record, event_type); hlist_for_each_entry_rcu(event, head, hlist_entry) { - if (perf_tp_event_match(event, &data, regs)) { - perf_swevent_event(event, count, &data, regs); - + if (perf_tp_event_match(event, &raw, regs)) { /* * Here use the same on-stack 
perf_sample_data, * some members in data are event-specific and @@ -10569,7 +10569,8 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, * because data->sample_flags is set. */ perf_sample_data_init(&data, 0, 0); - perf_sample_save_raw_data(&data, &raw); + perf_sample_save_raw_data(&data, event, &raw); + perf_swevent_event(event, count, &data, regs); } } @@ -10586,7 +10587,7 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, goto unlock; raw_spin_lock(&ctx->lock); - perf_tp_event_target_task(count, record, regs, &data, ctx); + perf_tp_event_target_task(count, record, regs, &data, &raw, ctx); raw_spin_unlock(&ctx->lock); unlock: rcu_read_unlock(); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index fdab7ecd8dfa..162bacf8aa5d 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -619,7 +619,8 @@ static const struct bpf_func_proto bpf_perf_event_read_value_proto = { static __always_inline u64 __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, - u64 flags, struct perf_sample_data *sd) + u64 flags, struct perf_raw_record *raw, + struct perf_sample_data *sd) { struct bpf_array *array = container_of(map, struct bpf_array, map); unsigned int cpu = smp_processor_id(); @@ -644,6 +645,8 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, if (unlikely(event->oncpu != cpu)) return -EOPNOTSUPP; + perf_sample_save_raw_data(sd, event, raw); + return perf_event_output(event, sd, regs); } @@ -687,9 +690,8 @@ BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, } perf_sample_data_init(sd, 0, 0); - perf_sample_save_raw_data(sd, &raw); - err = __bpf_perf_event_output(regs, map, flags, sd); + err = __bpf_perf_event_output(regs, map, flags, &raw, sd); out: this_cpu_dec(bpf_trace_nest_level); preempt_enable(); @@ -748,9 +750,8 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, perf_fetch_caller_regs(regs); perf_sample_data_init(sd, 0, 0); - perf_sample_save_raw_data(sd, &raw); - ret = __bpf_perf_event_output(regs, map, flags, sd); + ret = __bpf_perf_event_output(regs, map, flags, &raw, sd); out: this_cpu_dec(bpf_event_output_nest_level); preempt_enable(); From f226805bc5f60adf03783d8e4cbfe303ccecd64e Mon Sep 17 00:00:00 2001 From: Yabin Cui Date: Wed, 15 May 2024 12:36:08 -0700 Subject: [PATCH 002/224] perf/core: Check sample_type in perf_sample_save_callchain Check sample_type in perf_sample_save_callchain() to prevent saving callchain data when it isn't required. Suggested-by: Namhyung Kim Signed-off-by: Yabin Cui Signed-off-by: Ingo Molnar Reviewed-by: Ian Rogers Acked-by: Namhyung Kim Link: https://lore.kernel.org/r/20240515193610.2350456-3-yabinc@google.com --- arch/x86/events/amd/ibs.c | 3 +-- arch/x86/events/intel/ds.c | 6 ++---- include/linux/perf_event.h | 5 +++++ 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index c3a2f6f57770..f02939655b2a 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -1129,8 +1129,7 @@ fail: * recorded as part of interrupt regs. Thus we need to use rip from * interrupt regs while unwinding call stack. 
*/ - if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) - perf_sample_save_callchain(&data, event, iregs); + perf_sample_save_callchain(&data, event, iregs); throttle = perf_event_overflow(event, &data, ®s); out: diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 8afc4ad3cd16..4990a2409807 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1789,8 +1789,7 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event, * previous PMI context or an (I)RET happened between the record and * PMI. */ - if (sample_type & PERF_SAMPLE_CALLCHAIN) - perf_sample_save_callchain(data, event, iregs); + perf_sample_save_callchain(data, event, iregs); /* * We use the interrupt regs as a base because the PEBS record does not @@ -1957,8 +1956,7 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, * previous PMI context or an (I)RET happened between the record and * PMI. */ - if (sample_type & PERF_SAMPLE_CALLCHAIN) - perf_sample_save_callchain(data, event, iregs); + perf_sample_save_callchain(data, event, iregs); *regs = *iregs; /* The ip in basic is EventingIP */ diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index f7c0a3f2f502..3ac202d971fb 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1279,6 +1279,11 @@ static inline void perf_sample_save_callchain(struct perf_sample_data *data, { int size = 1; + if (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)) + return; + if (WARN_ON_ONCE(data->sample_flags & PERF_SAMPLE_CALLCHAIN)) + return; + data->callchain = perf_callchain(event, regs); size += data->callchain->nr; From faac6f105ef169e2e5678c14e1ffebf2a7d780b6 Mon Sep 17 00:00:00 2001 From: Yabin Cui Date: Wed, 15 May 2024 12:36:09 -0700 Subject: [PATCH 003/224] perf/core: Check sample_type in perf_sample_save_brstack Check sample_type in perf_sample_save_brstack() to prevent saving branch stack data when it isn't required. 
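As with the raw-data and callchain patches before it, this gives the helper the same shape as the others. A minimal sketch of the pattern now shared by the perf_sample_save_*() helpers, with PERF_SAMPLE_FOO and perf_sample_save_foo() as placeholder names rather than real kernel identifiers:

	static inline void perf_sample_save_foo(struct perf_sample_data *data,
						struct perf_event *event)
	{
		/* Nothing to do if this sample type was not requested. */
		if (!(event->attr.sample_type & PERF_SAMPLE_FOO))
			return;
		/* Warn once if the data was already saved for this sample. */
		if (WARN_ON_ONCE(data->sample_flags & PERF_SAMPLE_FOO))
			return;

		/* ... save the requested data into *data ... */

		data->sample_flags |= PERF_SAMPLE_FOO;
	}

Callers can then invoke the helper unconditionally and drop their own sample_type checks, as the hunks below do.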
Suggested-by: Namhyung Kim Signed-off-by: Yabin Cui Signed-off-by: Ingo Molnar Reviewed-by: Ian Rogers Acked-by: Namhyung Kim Link: https://lore.kernel.org/r/20240515193610.2350456-4-yabinc@google.com --- arch/x86/events/amd/core.c | 3 +-- arch/x86/events/core.c | 3 +-- arch/x86/events/intel/ds.c | 3 +-- include/linux/perf_event.h | 15 ++++++++++----- 4 files changed, 13 insertions(+), 11 deletions(-) diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index b4a1a2576510..30d6ceb4c8ad 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -1001,8 +1001,7 @@ static int amd_pmu_v2_handle_irq(struct pt_regs *regs) if (!x86_perf_event_set_period(event)) continue; - if (has_branch_stack(event)) - perf_sample_save_brstack(&data, event, &cpuc->lbr_stack, NULL); + perf_sample_save_brstack(&data, event, &cpuc->lbr_stack, NULL); if (perf_event_overflow(event, &data, regs)) x86_pmu_stop(event, 0); diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index c75c482d4c52..8f218ac0d445 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -1707,8 +1707,7 @@ int x86_pmu_handle_irq(struct pt_regs *regs) perf_sample_data_init(&data, 0, event->hw.last_period); - if (has_branch_stack(event)) - perf_sample_save_brstack(&data, event, &cpuc->lbr_stack, NULL); + perf_sample_save_brstack(&data, event, &cpuc->lbr_stack, NULL); if (perf_event_overflow(event, &data, regs)) x86_pmu_stop(event, 0); diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 4990a2409807..acfd87207547 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1888,8 +1888,7 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event, if (x86_pmu.intel_cap.pebs_format >= 3) setup_pebs_time(event, data, pebs->tsc); - if (has_branch_stack(event)) - perf_sample_save_brstack(data, event, &cpuc->lbr_stack, NULL); + perf_sample_save_brstack(data, event, &cpuc->lbr_stack, NULL); } static void adaptive_pebs_save_regs(struct pt_regs *regs, diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 3ac202d971fb..bf831b1485ff 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1320,6 +1320,11 @@ static inline void perf_sample_save_raw_data(struct perf_sample_data *data, data->sample_flags |= PERF_SAMPLE_RAW; } +static inline bool has_branch_stack(struct perf_event *event) +{ + return event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK; +} + static inline void perf_sample_save_brstack(struct perf_sample_data *data, struct perf_event *event, struct perf_branch_stack *brs, @@ -1327,6 +1332,11 @@ static inline void perf_sample_save_brstack(struct perf_sample_data *data, { int size = sizeof(u64); /* nr */ + if (!has_branch_stack(event)) + return; + if (WARN_ON_ONCE(data->sample_flags & PERF_SAMPLE_BRANCH_STACK)) + return; + if (branch_sample_hw_index(event)) size += sizeof(u64); size += brs->nr * sizeof(struct perf_branch_entry); @@ -1716,11 +1726,6 @@ static inline unsigned long perf_arch_guest_misc_flags(struct pt_regs *regs) # define perf_arch_guest_misc_flags(regs) perf_arch_guest_misc_flags(regs) #endif -static inline bool has_branch_stack(struct perf_event *event) -{ - return event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK; -} - static inline bool needs_branch_stack(struct perf_event *event) { return event->attr.branch_sample_type != 0; From 209954cbc7d0ce1a190fc725d20ce303d74d2680 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Thu, 14 Nov 2024 10:26:16 -0500 Subject: [PATCH 004/224] x86/mm/tlb: Update 
mm_cpumask lazily On busy multi-threaded workloads, there can be significant contention on the mm_cpumask at context switch time. Reduce that contention by updating mm_cpumask lazily, setting the CPU bit at context switch time (if not already set), and clearing the CPU bit at the first TLB flush sent to a CPU where the process isn't running. When a flurry of TLB flushes for a process happen, only the first one will be sent to CPUs where the process isn't running. The others will be sent to CPUs where the process is currently running. On an AMD Milan system with 36 cores, there is a noticeable difference: $ hackbench --groups 20 --loops 10000 Before: ~4.5s +/- 0.1s After: ~4.2s +/- 0.1s Signed-off-by: Rik van Riel Signed-off-by: Ingo Molnar Cc: Dave Hansen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Mel Gorman Link: https://lore.kernel.org/r/20241114152723.1294686-2-riel@surriel.com --- arch/x86/kernel/alternative.c | 10 +++++++--- arch/x86/mm/tlb.c | 19 +++++++++---------- 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index d17518ca19b8..8b66a555d2f0 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -1825,11 +1825,18 @@ static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm) return temp_state; } +__ro_after_init struct mm_struct *poking_mm; +__ro_after_init unsigned long poking_addr; + static inline void unuse_temporary_mm(temp_mm_state_t prev_state) { lockdep_assert_irqs_disabled(); + switch_mm_irqs_off(NULL, prev_state.mm, current); + /* Clear the cpumask, to indicate no TLB flushing is needed anywhere */ + cpumask_clear_cpu(raw_smp_processor_id(), mm_cpumask(poking_mm)); + /* * Restore the breakpoints if they were disabled before the temporary mm * was loaded. @@ -1838,9 +1845,6 @@ static inline void unuse_temporary_mm(temp_mm_state_t prev_state) hw_breakpoint_restore(); } -__ro_after_init struct mm_struct *poking_mm; -__ro_after_init unsigned long poking_addr; - static void text_poke_memcpy(void *dst, const void *src, size_t len) { memcpy(dst, src, len); diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index b0d5a644fc84..cc4e57ae690f 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -606,18 +606,15 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next, cond_mitigation(tsk); /* - * Stop remote flushes for the previous mm. - * Skip kernel threads; we never send init_mm TLB flushing IPIs, - * but the bitmap manipulation can cause cache line contention. + * Leave this CPU in prev's mm_cpumask. Atomic writes to + * mm_cpumask can be expensive under contention. The CPU + * will be removed lazily at TLB flush time. 
*/ - if (prev != &init_mm) { - VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu, - mm_cpumask(prev))); - cpumask_clear_cpu(cpu, mm_cpumask(prev)); - } + VM_WARN_ON_ONCE(prev != &init_mm && !cpumask_test_cpu(cpu, + mm_cpumask(prev))); /* Start receiving IPIs and then read tlb_gen (and LAM below) */ - if (next != &init_mm) + if (next != &init_mm && !cpumask_test_cpu(cpu, mm_cpumask(next))) cpumask_set_cpu(cpu, mm_cpumask(next)); next_tlb_gen = atomic64_read(&next->context.tlb_gen); @@ -761,8 +758,10 @@ static void flush_tlb_func(void *info) count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); /* Can only happen on remote CPUs */ - if (f->mm && f->mm != loaded_mm) + if (f->mm && f->mm != loaded_mm) { + cpumask_clear_cpu(raw_smp_processor_id(), mm_cpumask(f->mm)); return; + } } if (unlikely(loaded_mm == &init_mm)) From 2815a56e4b7252a836969f5674ee356ea1ce482c Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Thu, 14 Nov 2024 10:26:17 -0500 Subject: [PATCH 005/224] x86/mm/tlb: Add tracepoint for TLB flush IPI to stale CPU Add a tracepoint when we send a TLB flush IPI to a CPU that used to be in the mm_cpumask, but isn't any more. Suggested-by: Dave Hansen Signed-off-by: Rik van Riel Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20241114152723.1294686-3-riel@surriel.com --- arch/x86/mm/tlb.c | 1 + include/linux/mm_types.h | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index cc4e57ae690f..1aac4fa90d3d 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -760,6 +760,7 @@ static void flush_tlb_func(void *info) /* Can only happen on remote CPUs */ if (f->mm && f->mm != loaded_mm) { cpumask_clear_cpu(raw_smp_processor_id(), mm_cpumask(f->mm)); + trace_tlb_flush(TLB_REMOTE_WRONG_CPU, 0); return; } } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 6e3bdf8e38bc..6b6f05404304 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1335,6 +1335,7 @@ enum tlb_flush_reason { TLB_LOCAL_SHOOTDOWN, TLB_LOCAL_MM_SHOOTDOWN, TLB_REMOTE_SEND_IPI, + TLB_REMOTE_WRONG_CPU, NR_TLB_FLUSH_REASONS, }; From 108ad0999085df2366dd9ef437573955cb3f5586 Mon Sep 17 00:00:00 2001 From: Suleiman Souhlal Date: Mon, 18 Nov 2024 13:37:45 +0900 Subject: [PATCH 006/224] sched: Don't try to catch up excess steal time. When steal time exceeds the measured delta when updating clock_task, we currently try to catch up the excess in future updates. However, in some situations this results in inaccurate run times for future users of clock_task, as they end up getting additional steal time that did not actually happen. This is because there is a window between reading the elapsed time in update_rq_clock() and sampling the steal time in update_rq_clock_task(). If the VCPU gets preempted between those two points, any additional steal time is accounted to the outgoing task even though the calculated delta did not actually contain any of that "stolen" time. When this race happens, we can end up with steal time that exceeds the calculated delta, and the previous code would try to catch up that excess steal time in future clock updates, which is given to the next, incoming task, even though it did not actually have any time stolen. This behavior is particularly bad when steal time can be very long, which we've seen when trying to extend steal time to contain the duration that the host was suspended [0]. When this happens, clock_task stays frozen, during which the running task stays running for the whole duration, since its run time doesn't increase. 
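To make the accounting error concrete with made-up numbers: suppose update_rq_clock() measures delta = 10ms, the VCPU is then preempted for 40ms, and update_rq_clock_task() subsequently samples 45ms of new steal time. The previous code clamps steal to delta (10ms) but advances rq->prev_steal_time_rq by only those 10ms, leaving 35ms of "excess" that keeps getting deducted from the deltas of future clock updates, so the next, incoming task appears to make no forward progress even though none of its time was stolen. With this patch, prev_steal_time_rq is advanced to the full sampled value and the 35ms excess is simply dropped.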
However, the race can happen even under normal operation. Ideally we would read the elapsed cpu time and the steal time atomically, to prevent this race from happening in the first place, but doing so is non-trivial. Since the time between those two points isn't otherwise accounted anywhere, neither to the outgoing task nor the incoming task (because the "end of outgoing task" and "start of incoming task" timestamps are the same), I would argue that the right thing to do is to simply drop any excess steal time, in order to prevent these issues. [0] https://lore.kernel.org/kvm/20240820043543.837914-1-suleiman@google.com/ Signed-off-by: Suleiman Souhlal Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20241118043745.1857272-1-suleiman@google.com --- kernel/sched/core.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c6d8232ad9ee..4ffaef81db42 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -766,13 +766,15 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) #endif #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING if (static_key_false((&paravirt_steal_rq_enabled))) { - steal = paravirt_steal_clock(cpu_of(rq)); + u64 prev_steal; + + steal = prev_steal = paravirt_steal_clock(cpu_of(rq)); steal -= rq->prev_steal_time_rq; if (unlikely(steal > delta)) steal = delta; - rq->prev_steal_time_rq += steal; + rq->prev_steal_time_rq = prev_steal; delta -= steal; } #endif From 59297e2093ceced86393a059a4bd36802311f7bb Mon Sep 17 00:00:00 2001 From: Harshit Agarwal Date: Thu, 14 Nov 2024 14:08:11 -0700 Subject: [PATCH 007/224] sched: add READ_ONCE to task_on_rq_queued task_on_rq_queued() reads p->on_rq without READ_ONCE, though p->on_rq is set with WRITE_ONCE in {activate|deactivate}_task and smp_store_release in __block_task, and also read with READ_ONCE in task_on_rq_migrating. Make all of these accesses pair together by adding READ_ONCE in task_on_rq_queued(). Signed-off-by: Harshit Agarwal Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Phil Auld Link: https://lkml.kernel.org/r/20241114210812.1836587-1-jon@nutanix.com --- kernel/sched/sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 76f5f53a645f..0f6790c5279b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2271,7 +2271,7 @@ static inline int task_on_cpu(struct rq *rq, struct task_struct *p) static inline int task_on_rq_queued(struct task_struct *p) { - return p->on_rq == TASK_ON_RQ_QUEUED; + return READ_ONCE(p->on_rq) == TASK_ON_RQ_QUEUED; } static inline int task_on_rq_migrating(struct task_struct *p) From 41d4200b7103152468552ee50998cda914102049 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Thu, 14 Nov 2024 14:28:09 +0000 Subject: [PATCH 008/224] sched/deadline: Restore dl_server bandwidth on non-destructive root domain changes When root domain non-destructive changes (e.g., only modifying one of the existing root domains while the rest is not touched) happen, we still need to clear DEADLINE bandwidth accounting so that it's then properly restored, taking into account DEADLINE tasks associated to each cpuset (associated to each root domain). After the introduction of dl_servers, we fail to restore such servers' contribution after non-destructive changes (as they are only considered on destructive changes when runqueues are attached to the new domains). 
Fix this by making sure we iterate over the dl_servers attached to domains that have not been destroyed and add their bandwidth contribution back correctly. Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Phil Auld Tested-by: Waiman Long Link: https://lore.kernel.org/r/20241114142810.794657-2-juri.lelli@redhat.com --- kernel/sched/deadline.c | 17 ++++++++++++++--- kernel/sched/topology.c | 8 +++++--- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index db47f33cb7d2..ff68ce4a7b79 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -2960,11 +2960,22 @@ void dl_add_task_root_domain(struct task_struct *p) void dl_clear_root_domain(struct root_domain *rd) { - unsigned long flags; + int i; - raw_spin_lock_irqsave(&rd->dl_bw.lock, flags); + guard(raw_spinlock_irqsave)(&rd->dl_bw.lock); rd->dl_bw.total_bw = 0; - raw_spin_unlock_irqrestore(&rd->dl_bw.lock, flags); + + /* + * dl_server bandwidth is only restored when CPUs are attached to root + * domains (after domains are created or CPUs moved back to the + * default root domain). + */ + for_each_cpu(i, rd->span) { + struct sched_dl_entity *dl_se = &cpu_rq(i)->fair_server; + + if (dl_server(dl_se) && cpu_active(i)) + rd->dl_bw.total_bw += dl_se->dl_bw; + } } #endif /* CONFIG_SMP */ diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 9748a4c8d668..9c405f0e7b26 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -2721,9 +2721,11 @@ void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[], /* * This domain won't be destroyed and as such - * its dl_bw->total_bw needs to be cleared. It - * will be recomputed in function - * update_tasks_root_domain(). + * its dl_bw->total_bw needs to be cleared. + * Tasks contribution will be then recomputed + * in function dl_update_tasks_root_domain(), + * dl_servers contribution in function + * dl_restore_server_root_domain(). */ rd = cpu_rq(cpumask_any(doms_cur[i]))->rd; dl_clear_root_domain(rd); From d4742f6ed7ea6df56e381f82ba4532245fa1e561 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Thu, 14 Nov 2024 14:28:10 +0000 Subject: [PATCH 009/224] sched/deadline: Correctly account for allocated bandwidth during hotplug For hotplug operations, DEADLINE needs to check that there is still enough bandwidth left after removing the CPU that is going offline. We however fail to do so currently. Restore the correct behavior by restructuring dl_bw_manage() a bit, so that overflow conditions (not enough bandwidth left) are properly checked. Also account for dl_server bandwidth, i.e. discount such bandwidth in the calculation since NORMAL tasks will be moved away from the CPU anyway as a result of the hotplug operation. 
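For context, __dl_overflow() (defined in kernel/sched/sched.h) implements approximately the following check, so passing fair_server_bw as old_bw below discounts the local dl_server's bandwidth from the allocated total before comparing it against the (capacity-scaled) limit:

	static inline bool __dl_overflow(struct dl_bw *dl_b, unsigned long cap,
					 u64 old_bw, u64 new_bw)
	{
		return dl_b->bw != -1 &&
		       cap_scale(dl_b->bw, cap) < dl_b->total_bw - old_bw + new_bw;
	}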
Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Phil Auld Tested-by: Waiman Long Link: https://lore.kernel.org/r/20241114142810.794657-3-juri.lelli@redhat.com --- kernel/sched/core.c | 2 +- kernel/sched/deadline.c | 48 +++++++++++++++++++++++++++++++++-------- kernel/sched/sched.h | 2 +- 3 files changed, 41 insertions(+), 11 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4ffaef81db42..29f6b2475fdb 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8185,7 +8185,7 @@ static void cpuset_cpu_active(void) static int cpuset_cpu_inactive(unsigned int cpu) { if (!cpuhp_tasks_frozen) { - int ret = dl_bw_check_overflow(cpu); + int ret = dl_bw_deactivate(cpu); if (ret) return ret; diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index ff68ce4a7b79..fa787c7018a4 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -3460,29 +3460,31 @@ int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, } enum dl_bw_request { - dl_bw_req_check_overflow = 0, + dl_bw_req_deactivate = 0, dl_bw_req_alloc, dl_bw_req_free }; static int dl_bw_manage(enum dl_bw_request req, int cpu, u64 dl_bw) { - unsigned long flags; + unsigned long flags, cap; struct dl_bw *dl_b; bool overflow = 0; + u64 fair_server_bw = 0; rcu_read_lock_sched(); dl_b = dl_bw_of(cpu); raw_spin_lock_irqsave(&dl_b->lock, flags); - if (req == dl_bw_req_free) { + cap = dl_bw_capacity(cpu); + switch (req) { + case dl_bw_req_free: __dl_sub(dl_b, dl_bw, dl_bw_cpus(cpu)); - } else { - unsigned long cap = dl_bw_capacity(cpu); - + break; + case dl_bw_req_alloc: overflow = __dl_overflow(dl_b, cap, 0, dl_bw); - if (req == dl_bw_req_alloc && !overflow) { + if (!overflow) { /* * We reserve space in the destination * root_domain, as we can't fail after this point. @@ -3491,6 +3493,34 @@ static int dl_bw_manage(enum dl_bw_request req, int cpu, u64 dl_bw) */ __dl_add(dl_b, dl_bw, dl_bw_cpus(cpu)); } + break; + case dl_bw_req_deactivate: + /* + * cpu is going offline and NORMAL tasks will be moved away + * from it. We can thus discount dl_server bandwidth + * contribution as it won't need to be servicing tasks after + * the cpu is off. + */ + if (cpu_rq(cpu)->fair_server.dl_server) + fair_server_bw = cpu_rq(cpu)->fair_server.dl_bw; + + /* + * Not much to check if no DEADLINE bandwidth is present. + * dl_servers we can discount, as tasks will be moved out the + * offlined CPUs anyway. + */ + if (dl_b->total_bw - fair_server_bw > 0) { + /* + * Leaving at least one CPU for DEADLINE tasks seems a + * wise thing to do. + */ + if (dl_bw_cpus(cpu)) + overflow = __dl_overflow(dl_b, cap, fair_server_bw, 0); + else + overflow = 1; + } + + break; } raw_spin_unlock_irqrestore(&dl_b->lock, flags); @@ -3499,9 +3529,9 @@ static int dl_bw_manage(enum dl_bw_request req, int cpu, u64 dl_bw) return overflow ? 
-EBUSY : 0; } -int dl_bw_check_overflow(int cpu) +int dl_bw_deactivate(int cpu) { - return dl_bw_manage(dl_bw_req_check_overflow, cpu, 0); + return dl_bw_manage(dl_bw_req_deactivate, cpu, 0); } int dl_bw_alloc(int cpu, u64 dl_bw) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 0f6790c5279b..5eb2d5b9722f 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -362,7 +362,7 @@ extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr); extern bool __checkparam_dl(const struct sched_attr *attr); extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr); extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial); -extern int dl_bw_check_overflow(int cpu); +extern int dl_bw_deactivate(int cpu); extern s64 dl_scaled_delta_exec(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec); /* * SCHED_DEADLINE supports servers (nested scheduling) with the following From 53916d5fd3c0b658de3463439dd2b7ce765072cb Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Fri, 15 Nov 2024 11:48:29 +0000 Subject: [PATCH 010/224] sched/deadline: Check bandwidth overflow earlier for hotplug Currently we check for bandwidth overflow potentially due to hotplug operations at the end of sched_cpu_deactivate(), after the cpu going offline has already been removed from scheduling, active_mask, etc. This can create issues for DEADLINE tasks, as there is a substantial race window between the start of sched_cpu_deactivate() and the moment we possibly decide to roll-back the operation if dl_bw_deactivate() returns failure in cpuset_cpu_inactive(). An example is a throttled task that sees its replenishment timer firing while the cpu it was previously running on is considered offline, but before dl_bw_deactivate() had a chance to say no and roll-back happened. Fix this by directly calling dl_bw_deactivate() first thing in sched_cpu_deactivate() and do the required calculation in the former function considering the cpu passed as an argument as offline already. By doing so we also simplify sched_cpu_deactivate(), as there is no need anymore for any kind of roll-back if we fail early. 
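Schematically, and simplifying the surrounding code, the hotplug path changes from:

	sched_cpu_deactivate()
		/* CPU already removed from active mask, rq set offline, ... */
		cpuset_cpu_inactive()
			dl_bw_deactivate()	/* failure => roll everything back */

to:

	sched_cpu_deactivate()
		dl_bw_deactivate()	/* failure => plain early return */
		/* remove CPU from active mask, set rq offline, ... */
		cpuset_cpu_inactive()

with dl_bw_deactivate() now doing its bandwidth math as if the CPU passed in were already offline.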
Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Phil Auld Tested-by: Waiman Long Link: https://lore.kernel.org/r/Zzc1DfPhbvqDDIJR@jlelli-thinkpadt14gen4.remote.csb --- kernel/sched/core.c | 22 +++++++--------------- kernel/sched/deadline.c | 12 ++++++++++-- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 29f6b2475fdb..1dee3f5ef940 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8182,19 +8182,14 @@ static void cpuset_cpu_active(void) cpuset_update_active_cpus(); } -static int cpuset_cpu_inactive(unsigned int cpu) +static void cpuset_cpu_inactive(unsigned int cpu) { if (!cpuhp_tasks_frozen) { - int ret = dl_bw_deactivate(cpu); - - if (ret) - return ret; cpuset_update_active_cpus(); } else { num_cpus_frozen++; partition_sched_domains(1, NULL, NULL); } - return 0; } static inline void sched_smt_present_inc(int cpu) @@ -8256,6 +8251,11 @@ int sched_cpu_deactivate(unsigned int cpu) struct rq *rq = cpu_rq(cpu); int ret; + ret = dl_bw_deactivate(cpu); + + if (ret) + return ret; + /* * Remove CPU from nohz.idle_cpus_mask to prevent participating in * load balancing when not active @@ -8301,15 +8301,7 @@ int sched_cpu_deactivate(unsigned int cpu) return 0; sched_update_numa(cpu, false); - ret = cpuset_cpu_inactive(cpu); - if (ret) { - sched_smt_present_inc(cpu); - sched_set_rq_online(rq, cpu); - balance_push_set(cpu, false); - set_cpu_active(cpu, true); - sched_update_numa(cpu, true); - return ret; - } + cpuset_cpu_inactive(cpu); sched_domains_numa_masks_clear(cpu); return 0; } diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index fa787c7018a4..1c8b8381dd20 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -3495,6 +3495,13 @@ static int dl_bw_manage(enum dl_bw_request req, int cpu, u64 dl_bw) } break; case dl_bw_req_deactivate: + /* + * cpu is not off yet, but we need to do the math by + * considering it off already (i.e., what would happen if we + * turn cpu off?). + */ + cap -= arch_scale_cpu_capacity(cpu); + /* * cpu is going offline and NORMAL tasks will be moved away * from it. We can thus discount dl_server bandwidth @@ -3512,9 +3519,10 @@ static int dl_bw_manage(enum dl_bw_request req, int cpu, u64 dl_bw) if (dl_b->total_bw - fair_server_bw > 0) { /* * Leaving at least one CPU for DEADLINE tasks seems a - * wise thing to do. + * wise thing to do. As said above, cpu is not offline + * yet, so account for that. */ - if (dl_bw_cpus(cpu)) + if (dl_bw_cpus(cpu) - 1) overflow = __dl_overflow(dl_b, cap, fair_server_bw, 0); else overflow = 1; From 3a181f20fb4e9ad3c93ea6c71520c23826042629 Mon Sep 17 00:00:00 2001 From: Wander Lairson Costa Date: Wed, 24 Jul 2024 11:22:48 -0300 Subject: [PATCH 011/224] sched/deadline: Consolidate Timer Cancellation After commit b58652db66c9 ("sched/deadline: Fix task_struct reference leak"), I identified additional calls to hrtimer_try_to_cancel that might also require a dl_server check. It remains unclear whether this omission was intentional or accidental in those contexts. This patch consolidates the timer cancellation logic into dedicated functions, ensuring consistent behavior across all calls. Additionally, it reduces code duplication and improves overall code cleanliness. Note the use of the __always_inline keyword. In some instances, we have a task_struct pointer, dereference the dl member, and then use the container_of macro to retrieve the task_struct pointer again. 
By inlining the code, the compiler can potentially optimize out this redundant round trip. Signed-off-by: Wander Lairson Costa Signed-off-by: Peter Zijlstra (Intel) Acked-by: Juri Lelli Link: https://lore.kernel.org/r/20240724142253.27145-3-wander@redhat.com --- kernel/sched/deadline.c | 41 +++++++++++++++++++++++++++-------------- 1 file changed, 27 insertions(+), 14 deletions(-) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 1c8b8381dd20..33b4646f8b24 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -342,6 +342,29 @@ static void dl_rq_change_utilization(struct rq *rq, struct sched_dl_entity *dl_s __add_rq_bw(new_bw, &rq->dl); } +static __always_inline +void cancel_dl_timer(struct sched_dl_entity *dl_se, struct hrtimer *timer) +{ + /* + * If the timer callback was running (hrtimer_try_to_cancel == -1), + * it will eventually call put_task_struct(). + */ + if (hrtimer_try_to_cancel(timer) == 1 && !dl_server(dl_se)) + put_task_struct(dl_task_of(dl_se)); +} + +static __always_inline +void cancel_replenish_timer(struct sched_dl_entity *dl_se) +{ + cancel_dl_timer(dl_se, &dl_se->dl_timer); +} + +static __always_inline +void cancel_inactive_timer(struct sched_dl_entity *dl_se) +{ + cancel_dl_timer(dl_se, &dl_se->inactive_timer); +} + static void dl_change_utilization(struct task_struct *p, u64 new_bw) { WARN_ON_ONCE(p->dl.flags & SCHED_FLAG_SUGOV); @@ -495,10 +518,7 @@ static void task_contending(struct sched_dl_entity *dl_se, int flags) * will not touch the rq's active utilization, * so we are still safe. */ - if (hrtimer_try_to_cancel(&dl_se->inactive_timer) == 1) { - if (!dl_server(dl_se)) - put_task_struct(dl_task_of(dl_se)); - } + cancel_inactive_timer(dl_se); } else { /* * Since "dl_non_contending" is not set, the @@ -2113,13 +2133,8 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) * The replenish timer needs to be canceled. No * problem if it fires concurrently: boosted threads * are ignored in dl_task_timer(). - * - * If the timer callback was running (hrtimer_try_to_cancel == -1), - * it will eventually call put_task_struct(). */ - if (hrtimer_try_to_cancel(&p->dl.dl_timer) == 1 && - !dl_server(&p->dl)) - put_task_struct(p); + cancel_replenish_timer(&p->dl); p->dl.dl_throttled = 0; } } else if (!dl_prio(p->normal_prio)) { @@ -2287,8 +2302,7 @@ static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused * will not touch the rq's active utilization, * so we are still safe. 
*/ - if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1) - put_task_struct(p); + cancel_inactive_timer(&p->dl); } sub_rq_bw(&p->dl, &rq->dl); rq_unlock(rq, &rf); @@ -3036,8 +3050,7 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) */ static void switched_to_dl(struct rq *rq, struct task_struct *p) { - if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1) - put_task_struct(p); + cancel_inactive_timer(&p->dl); /* * In case a task is setscheduled to SCHED_DEADLINE we need to keep From a76328d44c7ab7d1001a97cb2e84506dde7822d4 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Wed, 27 Nov 2024 17:55:01 +0100 Subject: [PATCH 012/224] sched/fair: Remove CONFIG_CFS_BANDWIDTH=n definition of cfs_bandwidth_used() Andy reported that clang gets upset with CONFIG_CFS_BANDWIDTH=n: kernel/sched/fair.c:6580:20: error: unused function 'cfs_bandwidth_used' [-Werror,-Wunused-function] 6580 | static inline bool cfs_bandwidth_used(void) | ^~~~~~~~~~~~~~~~~~ Indeed, cfs_bandwidth_used() is only used within functions defined under CONFIG_CFS_BANDWIDTH=y. Remove its CONFIG_CFS_BANDWIDTH=n declaration & definition. Reported-by: Andy Shevchenko Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20241127165501.160004-1-vschneid@redhat.com --- kernel/sched/fair.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 05b8f1eb2c14..4283c818bbd1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5373,8 +5373,6 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) static void check_enqueue_throttle(struct cfs_rq *cfs_rq); static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq); -static inline bool cfs_bandwidth_used(void); - static void requeue_delayed_entity(struct sched_entity *se); @@ -6748,11 +6746,6 @@ static void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p) #else /* CONFIG_CFS_BANDWIDTH */ -static inline bool cfs_bandwidth_used(void) -{ - return false; -} - static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {} static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; } static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} From 7087bfb0adc9a12ec3b463b1d38072c5efce5d6c Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 19 Nov 2024 05:55:02 -0800 Subject: [PATCH 013/224] perf/x86/intel/ds: Clarify adaptive PEBS processing Modify the pebs_basic and pebs_meminfo structs to make the bitfields more explicit to ease readability of the code. Co-developed-by: Stephane Eranian Signed-off-by: Stephane Eranian Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20241119135504.1463839-3-kan.liang@linux.intel.com --- arch/x86/events/intel/ds.c | 43 ++++++++++++++----------------- arch/x86/include/asm/perf_event.h | 16 ++++++++++-- 2 files changed, 34 insertions(+), 25 deletions(-) diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 34cba39f6e70..450f318d3219 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1915,8 +1915,6 @@ static void adaptive_pebs_save_regs(struct pt_regs *regs, } #define PEBS_LATENCY_MASK 0xffff -#define PEBS_CACHE_LATENCY_OFFSET 32 -#define PEBS_RETIRE_LATENCY_OFFSET 32 /* * With adaptive PEBS the layout depends on what fields are configured. 
@@ -1930,8 +1928,7 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct pebs_basic *basic = __pebs; void *next_record = basic + 1; - u64 sample_type; - u64 format_size; + u64 sample_type, format_group; struct pebs_meminfo *meminfo = NULL; struct pebs_gprs *gprs = NULL; struct x86_perf_regs *perf_regs; @@ -1943,7 +1940,7 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, perf_regs->xmm_regs = NULL; sample_type = event->attr.sample_type; - format_size = basic->format_size; + format_group = basic->format_group; perf_sample_data_init(data, 0, event->hw.last_period); data->period = event->hw.last_period; @@ -1964,7 +1961,7 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, if (sample_type & PERF_SAMPLE_WEIGHT_STRUCT) { if (x86_pmu.flags & PMU_FL_RETIRE_LATENCY) - data->weight.var3_w = format_size >> PEBS_RETIRE_LATENCY_OFFSET & PEBS_LATENCY_MASK; + data->weight.var3_w = basic->retire_latency; else data->weight.var3_w = 0; } @@ -1974,12 +1971,12 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, * But PERF_SAMPLE_TRANSACTION needs gprs->ax. * Save the pointer here but process later. */ - if (format_size & PEBS_DATACFG_MEMINFO) { + if (format_group & PEBS_DATACFG_MEMINFO) { meminfo = next_record; next_record = meminfo + 1; } - if (format_size & PEBS_DATACFG_GP) { + if (format_group & PEBS_DATACFG_GP) { gprs = next_record; next_record = gprs + 1; @@ -1992,14 +1989,13 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, adaptive_pebs_save_regs(regs, gprs); } - if (format_size & PEBS_DATACFG_MEMINFO) { + if (format_group & PEBS_DATACFG_MEMINFO) { if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) { - u64 weight = meminfo->latency; + u64 latency = x86_pmu.flags & PMU_FL_INSTR_LATENCY ? + meminfo->cache_latency : meminfo->mem_latency; - if (x86_pmu.flags & PMU_FL_INSTR_LATENCY) { - data->weight.var2_w = weight & PEBS_LATENCY_MASK; - weight >>= PEBS_CACHE_LATENCY_OFFSET; - } + if (x86_pmu.flags & PMU_FL_INSTR_LATENCY) + data->weight.var2_w = meminfo->instr_latency; /* * Although meminfo::latency is defined as a u64, @@ -2007,12 +2003,13 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, * in practice on Ice Lake and earlier platforms. 
*/ if (sample_type & PERF_SAMPLE_WEIGHT) { data->weight.full = latency ?: intel_get_tsx_weight(meminfo->tsx_tuning); } else { data->weight.var1_dw = (u32)latency ?: intel_get_tsx_weight(meminfo->tsx_tuning); } + data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE; } @@ -2033,16 +2030,16 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, } } - if (format_size & PEBS_DATACFG_XMMS) { + if (format_group & PEBS_DATACFG_XMMS) { struct pebs_xmm *xmm = next_record; next_record = xmm + 1; perf_regs->xmm_regs = xmm->xmm; } - if (format_size & PEBS_DATACFG_LBRS) { + if (format_group & PEBS_DATACFG_LBRS) { struct lbr_entry *lbr = next_record; - int num_lbr = ((format_size >> PEBS_DATACFG_LBR_SHIFT) + int num_lbr = ((format_group >> PEBS_DATACFG_LBR_SHIFT) & 0xff) + 1; next_record = next_record + num_lbr * sizeof(struct lbr_entry); @@ -2052,11 +2049,11 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, } } - WARN_ONCE(next_record != __pebs + (format_size >> 48), - "PEBS record size %llu, expected %llu, config %llx\n", - format_size >> 48, + WARN_ONCE(next_record != __pebs + basic->format_size, + "PEBS record size %u, expected %llu, config %llx\n", + basic->format_size, (u64)(next_record - __pebs), - basic->format_size); + format_group); } static inline void * diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index d95f902acc52..cb9c4679f45c 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -422,7 +422,9 @@ static inline bool is_topdown_idx(int idx) */ struct pebs_basic { - u64 format_size; + u64 format_group:32, + retire_latency:16, + format_size:16; u64 ip; u64 applicable_counters; u64 tsc; @@ -431,7 +433,17 @@ struct pebs_basic { struct pebs_meminfo { u64 address; u64 aux; - u64 latency; + union { + /* pre Alder Lake */ + u64 mem_latency; + /* Alder Lake and later */ + struct { + u64 instr_latency:16; + u64 pad2:16; + u64 cache_latency:16; + u64 pad3:16; + }; + }; u64 tsx_tuning; }; From 3c00ed344cef4dbb57d8769b961af414132a173a Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 19 Nov 2024 05:55:03 -0800 Subject: [PATCH 014/224] perf/x86/intel/ds: Factor out functions for PEBS records processing Factor out functions to process normal and the last PEBS records, which can be shared with the later patch. Move the event updating related code (intel_pmu_save_and_restart()) to the end, where all samples have been processed. For the current usage, it doesn't matter when perf updates event counts and resets the counter, because all counters are stopped when the PEBS buffer is drained. Drop the return of the !intel_pmu_save_and_restart(event) check, because it never happens. intel_pmu_save_and_restart(event) only returns 0 when !hwc->event_base or the period_left > 0. - The !hwc->event_base is impossible for the PEBS event, since the PEBS event is only available on GP and fixed counters, which always have a valid hwc->event_base. - The check only happens for the case of non-AUTO_RELOAD and single PEBS, which implies that the event must be overflowed. The period_left must always be <= 0 for an overflowed event after the x86_pmu_update(). 
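A rough pseudo-C sketch of the per-counter flow implemented by the new helpers below (simplified; the real code also tracks the record size and the dummy_iregs case):

	__intel_pmu_pebs_events()
	{
		/* All records but the last: emit one sample each. */
		for (each of the first count - 1 records)
			__intel_pmu_pebs_event();	/* setup_sample() + perf_event_output() */

		/* The last record additionally handles throttling (real NMI only), */
		/* then updates the count and reloads/restarts the counter. */
		__intel_pmu_pebs_last_event();
	}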
Co-developed-by: "Peter Zijlstra (Intel)" Signed-off-by: "Peter Zijlstra (Intel)" Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20241119135504.1463839-4-kan.liang@linux.intel.com --- arch/x86/events/intel/ds.c | 109 +++++++++++++++++++++++-------------- 1 file changed, 67 insertions(+), 42 deletions(-) diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 450f318d3219..79a3467c747b 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -2164,46 +2164,33 @@ intel_pmu_save_and_restart_reload(struct perf_event *event, int count) return 0; } +typedef void (*setup_fn)(struct perf_event *, struct pt_regs *, void *, + struct perf_sample_data *, struct pt_regs *); + +static struct pt_regs dummy_iregs; + static __always_inline void __intel_pmu_pebs_event(struct perf_event *event, struct pt_regs *iregs, + struct pt_regs *regs, struct perf_sample_data *data, - void *base, void *top, - int bit, int count, - void (*setup_sample)(struct perf_event *, - struct pt_regs *, - void *, - struct perf_sample_data *, - struct pt_regs *)) + void *at, + setup_fn setup_sample) +{ + setup_sample(event, iregs, at, data, regs); + perf_event_output(event, data, regs); +} + +static __always_inline void +__intel_pmu_pebs_last_event(struct perf_event *event, + struct pt_regs *iregs, + struct pt_regs *regs, + struct perf_sample_data *data, + void *at, + int count, + setup_fn setup_sample) { - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct hw_perf_event *hwc = &event->hw; - struct x86_perf_regs perf_regs; - struct pt_regs *regs = &perf_regs.regs; - void *at = get_next_pebs_record_by_bit(base, top, bit); - static struct pt_regs dummy_iregs; - - if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) { - /* - * Now, auto-reload is only enabled in fixed period mode. - * The reload value is always hwc->sample_period. - * May need to change it, if auto-reload is enabled in - * freq mode later. - */ - intel_pmu_save_and_restart_reload(event, count); - } else if (!intel_pmu_save_and_restart(event)) - return; - - if (!iregs) - iregs = &dummy_iregs; - - while (count > 1) { - setup_sample(event, iregs, at, data, regs); - perf_event_output(event, data, regs); - at += cpuc->pebs_record_size; - at = get_next_pebs_record_by_bit(at, top, bit); - count--; - } setup_sample(event, iregs, at, data, regs); if (iregs == &dummy_iregs) { @@ -2222,6 +2209,44 @@ __intel_pmu_pebs_event(struct perf_event *event, if (perf_event_overflow(event, data, regs)) x86_pmu_stop(event, 0); } + + if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) { + /* + * Now, auto-reload is only enabled in fixed period mode. + * The reload value is always hwc->sample_period. + * May need to change it, if auto-reload is enabled in + * freq mode later. 
+ */ + intel_pmu_save_and_restart_reload(event, count); + } else + intel_pmu_save_and_restart(event); +} + +static __always_inline void +__intel_pmu_pebs_events(struct perf_event *event, + struct pt_regs *iregs, + struct perf_sample_data *data, + void *base, void *top, + int bit, int count, + setup_fn setup_sample) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct x86_perf_regs perf_regs; + struct pt_regs *regs = &perf_regs.regs; + void *at = get_next_pebs_record_by_bit(base, top, bit); + int cnt = count; + + if (!iregs) + iregs = &dummy_iregs; + + while (cnt > 1) { + __intel_pmu_pebs_event(event, iregs, regs, data, at, setup_sample); + at += cpuc->pebs_record_size; + at = get_next_pebs_record_by_bit(at, top, bit); + cnt--; + } + + __intel_pmu_pebs_last_event(event, iregs, regs, data, at, count, setup_sample); } static void intel_pmu_drain_pebs_core(struct pt_regs *iregs, struct perf_sample_data *data) @@ -2258,8 +2283,8 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs, struct perf_sample_ return; } - __intel_pmu_pebs_event(event, iregs, data, at, top, 0, n, - setup_pebs_fixed_sample_data); + __intel_pmu_pebs_events(event, iregs, data, at, top, 0, n, + setup_pebs_fixed_sample_data); } static void intel_pmu_pebs_event_update_no_drain(struct cpu_hw_events *cpuc, int size) @@ -2390,9 +2415,9 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, struct perf_sample_d } if (counts[bit]) { - __intel_pmu_pebs_event(event, iregs, data, base, - top, bit, counts[bit], - setup_pebs_fixed_sample_data); + __intel_pmu_pebs_events(event, iregs, data, base, + top, bit, counts[bit], + setup_pebs_fixed_sample_data); } } } @@ -2444,9 +2469,9 @@ static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_d if (WARN_ON_ONCE(!event->attr.precise_ip)) continue; - __intel_pmu_pebs_event(event, iregs, data, base, - top, bit, counts[bit], - setup_pebs_adaptive_sample_data); + __intel_pmu_pebs_events(event, iregs, data, base, + top, bit, counts[bit], + setup_pebs_adaptive_sample_data); } } From ae55e308bde2267df79c4475daa85e174b7ab4c8 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 19 Nov 2024 05:55:04 -0800 Subject: [PATCH 015/224] perf/x86/intel/ds: Simplify the PEBS records processing for adaptive PEBS The current code may iterate all the PEBS records in the DS area several times. The first loop is to find all active events and calculate the available records for each event. Then iterate the whole buffer again and again to process available records until all active events are processed. The algorithm is inherited from the old generations. The old PEBS hardware does not deal well with the situation when events happen near each other. SW has to drop the error records. Multiple iterations are required. The hardware limit has been addressed on newer platforms with adaptive PEBS. A simple one-iteration algorithm is introduced. With the patch, the samples are output in record order rather than in event order. It doesn't impact the post-processing. The perf tool always sorts the records by time before presenting them to the end user. In an NMI, the last record has to be specially handled. Add a last[] variable to track the last unprocessed record of each event. Test: 11 PEBS events are used in the perf test. Only the basic information is collected. perf record -e instructions:up,...,instructions:up -c 2000003 benchmark ftrace is used to record the duration of intel_pmu_drain_pebs_icl(). The average duration was reduced from 62.04us to 57.94us. 
A small improvement can be observed with the new algorithm. Also, the implementation becomes simpler and more straightforward. Suggested-by: Stephane Eranian Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dapeng Mi Link: https://lore.kernel.org/r/20241119135504.1463839-5-kan.liang@linux.intel.com --- arch/x86/events/intel/ds.c | 43 +++++++++++++++++++++++++------------- 1 file changed, 29 insertions(+), 14 deletions(-) diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 79a3467c747b..8dcf90f6fb59 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -2425,8 +2425,12 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, struct perf_sample_d static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_data *data) { short counts[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS] = {}; + void *last[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS]; struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct debug_store *ds = cpuc->ds; + struct x86_perf_regs perf_regs; + struct pt_regs *regs = &perf_regs.regs; + struct pebs_basic *basic; struct perf_event *event; void *base, *at, *top; int bit; @@ -2448,30 +2452,41 @@ static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_d return; } - for (at = base; at < top; at += cpuc->pebs_record_size) { + if (!iregs) + iregs = &dummy_iregs; + + /* Process all but the last event for each counter. */ + for (at = base; at < top; at += basic->format_size) { u64 pebs_status; - pebs_status = get_pebs_status(at) & cpuc->pebs_enabled; - pebs_status &= mask; + basic = at; + if (basic->format_size != cpuc->pebs_record_size) + continue; - for_each_set_bit(bit, (unsigned long *)&pebs_status, X86_PMC_IDX_MAX) - counts[bit]++; + pebs_status = basic->applicable_counters & cpuc->pebs_enabled & mask; + for_each_set_bit(bit, (unsigned long *)&pebs_status, X86_PMC_IDX_MAX) { + event = cpuc->events[bit]; + + if (WARN_ON_ONCE(!event) || + WARN_ON_ONCE(!event->attr.precise_ip)) + continue; + + if (counts[bit]++) { + __intel_pmu_pebs_event(event, iregs, regs, data, last[bit], + setup_pebs_adaptive_sample_data); + } + last[bit] = at; + } } for_each_set_bit(bit, (unsigned long *)&mask, X86_PMC_IDX_MAX) { - if (counts[bit] == 0) + if (!counts[bit]) continue; event = cpuc->events[bit]; - if (WARN_ON_ONCE(!event)) - continue; - if (WARN_ON_ONCE(!event->attr.precise_ip)) - continue; - - __intel_pmu_pebs_events(event, iregs, data, base, - top, bit, counts[bit], - setup_pebs_adaptive_sample_data); + __intel_pmu_pebs_last_event(event, iregs, regs, data, last[bit], + counts[bit], setup_pebs_adaptive_sample_data); } } From 2f2db347071a8736c2adcdbf2658ce532e0afc0a Mon Sep 17 00:00:00 2001 From: Dhananjay Ugwekar Date: Fri, 15 Nov 2024 06:07:57 +0000 Subject: [PATCH 016/224] perf/x86/rapl: Remove the unused get_rapl_pmu_cpumask() function commit 9e9af8bbb5f9 ("perf/x86/rapl: Clean up cpumask and hotplug") removes the cpumask handling from rapl. Post that, we no longer need the get_rapl_pmu_cpumask() function. So remove it. 
Signed-off-by: Dhananjay Ugwekar Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Zhang Rui Tested-by: Zhang Rui Link: https://lore.kernel.org/r/20241115060805.447565-2-Dhananjay.Ugwekar@amd.com --- arch/x86/events/rapl.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c index a8defc813c36..f70c49ca0ef3 100644 --- a/arch/x86/events/rapl.c +++ b/arch/x86/events/rapl.c @@ -153,7 +153,7 @@ static u64 rapl_timer_ms; static struct perf_msr *rapl_msrs; /* - * Helper functions to get the correct topology macros according to the + * Helper function to get the correct topology id according to the * RAPL PMU scope. */ static inline unsigned int get_rapl_pmu_idx(int cpu) @@ -162,12 +162,6 @@ static inline unsigned int get_rapl_pmu_idx(int cpu) topology_logical_die_id(cpu); } -static inline const struct cpumask *get_rapl_pmu_cpumask(int cpu) -{ - return rapl_pmu_is_pkg_scope() ? topology_core_cpumask(cpu) : - topology_die_cpumask(cpu); -} - static inline struct rapl_pmu *cpu_to_rapl_pmu(unsigned int cpu) { unsigned int rapl_pmu_idx = get_rapl_pmu_idx(cpu); From e4b444347795a1ecc083895582bc2e7f288a22e4 Mon Sep 17 00:00:00 2001 From: K Prateek Nayak Date: Fri, 15 Nov 2024 06:07:58 +0000 Subject: [PATCH 017/224] x86/topology: Introduce topology_logical_core_id() On x86, topology_core_id() returns a unique core ID within the PKG domain. Looking at match_smt() suggests that a core ID just needs to be unique within a LLC domain. For use cases such as the core RAPL PMU, there exists a need for a unique core ID across the entire system with multiple PKG domains. Introduce topology_logical_core_id() to derive a unique core ID across the system. Signed-off-by: K Prateek Nayak Signed-off-by: Dhananjay Ugwekar Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Zhang Rui Reviewed-by: "Gautham R. Shenoy" Tested-by: K Prateek Nayak Tested-by: Oleksandr Natalenko Link: https://lore.kernel.org/r/20241115060805.447565-3-Dhananjay.Ugwekar@amd.com --- Documentation/arch/x86/topology.rst | 4 ++++ arch/x86/include/asm/processor.h | 1 + arch/x86/include/asm/topology.h | 1 + arch/x86/kernel/cpu/debugfs.c | 1 + arch/x86/kernel/cpu/topology_common.c | 1 + 5 files changed, 8 insertions(+) diff --git a/Documentation/arch/x86/topology.rst b/Documentation/arch/x86/topology.rst index 7352ab89a55a..c12837e61bda 100644 --- a/Documentation/arch/x86/topology.rst +++ b/Documentation/arch/x86/topology.rst @@ -135,6 +135,10 @@ Thread-related topology information in the kernel: The ID of the core to which a thread belongs. It is also printed in /proc/cpuinfo "core_id." + - topology_logical_core_id(); + + The logical core ID to which a thread belongs. 
+ System topology examples diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index c0975815980c..cfd8a5591421 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -98,6 +98,7 @@ struct cpuinfo_topology { // Logical ID mappings u32 logical_pkg_id; u32 logical_die_id; + u32 logical_core_id; // AMD Node ID and Nodes per Package info u32 amd_node_id; diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index fd41103ad342..3973cb9bb2e6 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -143,6 +143,7 @@ extern const struct cpumask *cpu_clustergroup_mask(int cpu); #define topology_logical_package_id(cpu) (cpu_data(cpu).topo.logical_pkg_id) #define topology_physical_package_id(cpu) (cpu_data(cpu).topo.pkg_id) #define topology_logical_die_id(cpu) (cpu_data(cpu).topo.logical_die_id) +#define topology_logical_core_id(cpu) (cpu_data(cpu).topo.logical_core_id) #define topology_die_id(cpu) (cpu_data(cpu).topo.die_id) #define topology_core_id(cpu) (cpu_data(cpu).topo.core_id) #define topology_ppin(cpu) (cpu_data(cpu).ppin) diff --git a/arch/x86/kernel/cpu/debugfs.c b/arch/x86/kernel/cpu/debugfs.c index 10719aba6276..cacfd3f6abef 100644 --- a/arch/x86/kernel/cpu/debugfs.c +++ b/arch/x86/kernel/cpu/debugfs.c @@ -25,6 +25,7 @@ static int cpu_debug_show(struct seq_file *m, void *p) seq_printf(m, "cpu_type: %s\n", get_topology_cpu_type_name(c)); seq_printf(m, "logical_pkg_id: %u\n", c->topo.logical_pkg_id); seq_printf(m, "logical_die_id: %u\n", c->topo.logical_die_id); + seq_printf(m, "logical_core_id: %u\n", c->topo.logical_core_id); seq_printf(m, "llc_id: %u\n", c->topo.llc_id); seq_printf(m, "l2c_id: %u\n", c->topo.l2c_id); seq_printf(m, "amd_node_id: %u\n", c->topo.amd_node_id); diff --git a/arch/x86/kernel/cpu/topology_common.c b/arch/x86/kernel/cpu/topology_common.c index 8277c64f88db..b5a5e1411469 100644 --- a/arch/x86/kernel/cpu/topology_common.c +++ b/arch/x86/kernel/cpu/topology_common.c @@ -185,6 +185,7 @@ static void topo_set_ids(struct topo_scan *tscan, bool early) if (!early) { c->topo.logical_pkg_id = topology_get_logical_id(apicid, TOPO_PKG_DOMAIN); c->topo.logical_die_id = topology_get_logical_id(apicid, TOPO_DIE_DOMAIN); + c->topo.logical_core_id = topology_get_logical_id(apicid, TOPO_CORE_DOMAIN); } /* Package relative core ID */ From 1d5e2f637a94a8ca8c8a1e292dd98ee80aa92815 Mon Sep 17 00:00:00 2001 From: Dhananjay Ugwekar Date: Fri, 15 Nov 2024 06:07:59 +0000 Subject: [PATCH 018/224] perf/x86/rapl: Remove the cpu_to_rapl_pmu() function Prepare for the addition of RAPL core energy counter support. After that, one CPU might be mapped to more than one rapl_pmu (a package/die one and a core one). So remove the cpu_to_rapl_pmu() function.
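The bounds check that replaces the removed helper leans on a small idiom worth spelling out: because get_rapl_pmu_idx() returns unsigned int, the -1 that the topology helpers return for unmapped CPUs wraps to UINT_MAX, so a single idx >= nr comparison rejects both missing and out-of-range mappings. A minimal stand-alone sketch of the idiom follows (hypothetical code, not from the patch; topo_id() fakes the topology lookup):

#include <stdio.h>

/* fake stand-in for topology_logical_package_id()/topology_logical_die_id() */
static int topo_id(int cpu)
{
	return cpu < 8 ? cpu / 4 : -1;	/* -1: no mapping for this CPU */
}

static unsigned int get_idx(int cpu)
{
	return topo_id(cpu);		/* -1 wraps to UINT_MAX here */
}

int main(void)
{
	unsigned int nr_pmus = 2;
	int cpu;

	for (cpu = 0; cpu < 12; cpu += 3) {
		unsigned int idx = get_idx(cpu);

		if (idx >= nr_pmus)	/* also catches the wrapped -1 */
			printf("cpu %2d: invalid\n", cpu);
		else
			printf("cpu %2d: pmu %u\n", cpu, idx);
	}
	return 0;
}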
Signed-off-by: Dhananjay Ugwekar Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Zhang Rui Tested-by: Zhang Rui Link: https://lore.kernel.org/r/20241115060805.447565-4-Dhananjay.Ugwekar@amd.com --- arch/x86/events/rapl.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c index f70c49ca0ef3..bf260f4a5800 100644 --- a/arch/x86/events/rapl.c +++ b/arch/x86/events/rapl.c @@ -158,21 +158,15 @@ static struct perf_msr *rapl_msrs; */ static inline unsigned int get_rapl_pmu_idx(int cpu) { + /* + * Returns unsigned int, which converts the '-1' return value + * (for non-existent mappings in topology map) to UINT_MAX, so + * the error check in the caller is simplified. + */ return rapl_pmu_is_pkg_scope() ? topology_logical_package_id(cpu) : topology_logical_die_id(cpu); } -static inline struct rapl_pmu *cpu_to_rapl_pmu(unsigned int cpu) -{ - unsigned int rapl_pmu_idx = get_rapl_pmu_idx(cpu); - - /* - * The unsigned check also catches the '-1' return value for non - * existent mappings in the topology map. - */ - return rapl_pmu_idx < rapl_pmus->nr_rapl_pmu ? rapl_pmus->pmus[rapl_pmu_idx] : NULL; -} - static inline u64 rapl_read_counter(struct perf_event *event) { u64 raw; @@ -350,6 +344,7 @@ static int rapl_pmu_event_init(struct perf_event *event) u64 cfg = event->attr.config & RAPL_EVENT_MASK; int bit, ret = 0; struct rapl_pmu *pmu; + unsigned int rapl_pmu_idx; /* only look at RAPL events */ if (event->attr.type != rapl_pmus->pmu.type) @@ -376,8 +371,12 @@ static int rapl_pmu_event_init(struct perf_event *event) if (event->attr.sample_period) /* no sampling */ return -EINVAL; + rapl_pmu_idx = get_rapl_pmu_idx(event->cpu); + if (rapl_pmu_idx >= rapl_pmus->nr_rapl_pmu) + return -EINVAL; + /* must be done before validate_group */ - pmu = cpu_to_rapl_pmu(event->cpu); + pmu = rapl_pmus->pmus[rapl_pmu_idx]; if (!pmu) return -EINVAL; event->pmu_private = pmu; From 8bf1c86e5ac828d7e8b44fe007bf3b14ac7f2b2d Mon Sep 17 00:00:00 2001 From: Dhananjay Ugwekar Date: Fri, 15 Nov 2024 06:08:00 +0000 Subject: [PATCH 019/224] perf/x86/rapl: Rename rapl_pmu variables Rename struct rapl_pmu variables from "pmu" to "rapl_pmu", to avoid any confusion between variables of the two different structs, pmu and rapl_pmu. rapl_pmu also contains a pointer to struct pmu, which leads to constructs like pmu->pmu that are needlessly confusing; with this change they become the much more readable rapl_pmu->pmu. Also rename the "pmus" member in the rapl_pmus struct, for the same reason. No functional change. Signed-off-by: Dhananjay Ugwekar Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: "Gautham R.
Shenoy" Reviewed-by: Zhang Rui Tested-by: Zhang Rui Link: https://lore.kernel.org/r/20241115060805.447565-5-Dhananjay.Ugwekar@amd.com --- arch/x86/events/rapl.c | 91 +++++++++++++++++++++--------------------- 1 file changed, 46 insertions(+), 45 deletions(-) diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c index bf260f4a5800..9b1ec8a80241 100644 --- a/arch/x86/events/rapl.c +++ b/arch/x86/events/rapl.c @@ -129,7 +129,7 @@ struct rapl_pmu { struct rapl_pmus { struct pmu pmu; unsigned int nr_rapl_pmu; - struct rapl_pmu *pmus[] __counted_by(nr_rapl_pmu); + struct rapl_pmu *rapl_pmu[] __counted_by(nr_rapl_pmu); }; enum rapl_unit_quirk { @@ -228,34 +228,34 @@ static void rapl_start_hrtimer(struct rapl_pmu *pmu) static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer) { - struct rapl_pmu *pmu = container_of(hrtimer, struct rapl_pmu, hrtimer); + struct rapl_pmu *rapl_pmu = container_of(hrtimer, struct rapl_pmu, hrtimer); struct perf_event *event; unsigned long flags; - if (!pmu->n_active) + if (!rapl_pmu->n_active) return HRTIMER_NORESTART; - raw_spin_lock_irqsave(&pmu->lock, flags); + raw_spin_lock_irqsave(&rapl_pmu->lock, flags); - list_for_each_entry(event, &pmu->active_list, active_entry) + list_for_each_entry(event, &rapl_pmu->active_list, active_entry) rapl_event_update(event); - raw_spin_unlock_irqrestore(&pmu->lock, flags); + raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags); - hrtimer_forward_now(hrtimer, pmu->timer_interval); + hrtimer_forward_now(hrtimer, rapl_pmu->timer_interval); return HRTIMER_RESTART; } -static void rapl_hrtimer_init(struct rapl_pmu *pmu) +static void rapl_hrtimer_init(struct rapl_pmu *rapl_pmu) { - struct hrtimer *hr = &pmu->hrtimer; + struct hrtimer *hr = &rapl_pmu->hrtimer; hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hr->function = rapl_hrtimer_handle; } -static void __rapl_pmu_event_start(struct rapl_pmu *pmu, +static void __rapl_pmu_event_start(struct rapl_pmu *rapl_pmu, struct perf_event *event) { if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) @@ -263,39 +263,39 @@ static void __rapl_pmu_event_start(struct rapl_pmu *pmu, event->hw.state = 0; - list_add_tail(&event->active_entry, &pmu->active_list); + list_add_tail(&event->active_entry, &rapl_pmu->active_list); local64_set(&event->hw.prev_count, rapl_read_counter(event)); - pmu->n_active++; - if (pmu->n_active == 1) - rapl_start_hrtimer(pmu); + rapl_pmu->n_active++; + if (rapl_pmu->n_active == 1) + rapl_start_hrtimer(rapl_pmu); } static void rapl_pmu_event_start(struct perf_event *event, int mode) { - struct rapl_pmu *pmu = event->pmu_private; + struct rapl_pmu *rapl_pmu = event->pmu_private; unsigned long flags; - raw_spin_lock_irqsave(&pmu->lock, flags); - __rapl_pmu_event_start(pmu, event); - raw_spin_unlock_irqrestore(&pmu->lock, flags); + raw_spin_lock_irqsave(&rapl_pmu->lock, flags); + __rapl_pmu_event_start(rapl_pmu, event); + raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags); } static void rapl_pmu_event_stop(struct perf_event *event, int mode) { - struct rapl_pmu *pmu = event->pmu_private; + struct rapl_pmu *rapl_pmu = event->pmu_private; struct hw_perf_event *hwc = &event->hw; unsigned long flags; - raw_spin_lock_irqsave(&pmu->lock, flags); + raw_spin_lock_irqsave(&rapl_pmu->lock, flags); /* mark event as deactivated and stopped */ if (!(hwc->state & PERF_HES_STOPPED)) { - WARN_ON_ONCE(pmu->n_active <= 0); - pmu->n_active--; - if (pmu->n_active == 0) - hrtimer_cancel(&pmu->hrtimer); + WARN_ON_ONCE(rapl_pmu->n_active <= 0); + rapl_pmu->n_active--; + if 
(rapl_pmu->n_active == 0) + hrtimer_cancel(&rapl_pmu->hrtimer); list_del(&event->active_entry); @@ -313,23 +313,23 @@ static void rapl_pmu_event_stop(struct perf_event *event, int mode) hwc->state |= PERF_HES_UPTODATE; } - raw_spin_unlock_irqrestore(&pmu->lock, flags); + raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags); } static int rapl_pmu_event_add(struct perf_event *event, int mode) { - struct rapl_pmu *pmu = event->pmu_private; + struct rapl_pmu *rapl_pmu = event->pmu_private; struct hw_perf_event *hwc = &event->hw; unsigned long flags; - raw_spin_lock_irqsave(&pmu->lock, flags); + raw_spin_lock_irqsave(&rapl_pmu->lock, flags); hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; if (mode & PERF_EF_START) - __rapl_pmu_event_start(pmu, event); + __rapl_pmu_event_start(rapl_pmu, event); - raw_spin_unlock_irqrestore(&pmu->lock, flags); + raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags); return 0; } @@ -343,7 +343,7 @@ static int rapl_pmu_event_init(struct perf_event *event) { u64 cfg = event->attr.config & RAPL_EVENT_MASK; int bit, ret = 0; - struct rapl_pmu *pmu; + struct rapl_pmu *rapl_pmu; unsigned int rapl_pmu_idx; /* only look at RAPL events */ @@ -376,10 +376,11 @@ static int rapl_pmu_event_init(struct perf_event *event) return -EINVAL; /* must be done before validate_group */ - pmu = rapl_pmus->pmus[rapl_pmu_idx]; - if (!pmu) + rapl_pmu = rapl_pmus->rapl_pmu[rapl_pmu_idx]; + if (!rapl_pmu) return -EINVAL; - event->pmu_private = pmu; + + event->pmu_private = rapl_pmu; event->hw.event_base = rapl_msrs[bit].msr; event->hw.config = cfg; event->hw.idx = bit; @@ -606,7 +607,7 @@ static void cleanup_rapl_pmus(void) int i; for (i = 0; i < rapl_pmus->nr_rapl_pmu; i++) - kfree(rapl_pmus->pmus[i]); + kfree(rapl_pmus->rapl_pmu[i]); kfree(rapl_pmus); } @@ -621,27 +622,27 @@ static const struct attribute_group *rapl_attr_update[] = { static int __init init_rapl_pmu(void) { - struct rapl_pmu *pmu; + struct rapl_pmu *rapl_pmu; int idx; for (idx = 0; idx < rapl_pmus->nr_rapl_pmu; idx++) { - pmu = kzalloc(sizeof(*pmu), GFP_KERNEL); - if (!pmu) + rapl_pmu = kzalloc(sizeof(*rapl_pmu), GFP_KERNEL); + if (!rapl_pmu) goto free; - raw_spin_lock_init(&pmu->lock); - INIT_LIST_HEAD(&pmu->active_list); - pmu->pmu = &rapl_pmus->pmu; - pmu->timer_interval = ms_to_ktime(rapl_timer_ms); - rapl_hrtimer_init(pmu); + raw_spin_lock_init(&rapl_pmu->lock); + INIT_LIST_HEAD(&rapl_pmu->active_list); + rapl_pmu->pmu = &rapl_pmus->pmu; + rapl_pmu->timer_interval = ms_to_ktime(rapl_timer_ms); + rapl_hrtimer_init(rapl_pmu); - rapl_pmus->pmus[idx] = pmu; + rapl_pmus->rapl_pmu[idx] = rapl_pmu; } return 0; free: for (; idx > 0; idx--) - kfree(rapl_pmus->pmus[idx - 1]); + kfree(rapl_pmus->rapl_pmu[idx - 1]); return -ENOMEM; } @@ -655,7 +656,7 @@ static int __init init_rapl_pmus(void) rapl_pmu_scope = PERF_PMU_SCOPE_DIE; } - rapl_pmus = kzalloc(struct_size(rapl_pmus, pmus, nr_rapl_pmu), GFP_KERNEL); + rapl_pmus = kzalloc(struct_size(rapl_pmus, rapl_pmu, nr_rapl_pmu), GFP_KERNEL); if (!rapl_pmus) return -ENOMEM; From cd29d83a6d815bf8472c9aa3cdd1dcb89cc4c419 Mon Sep 17 00:00:00 2001 From: Dhananjay Ugwekar Date: Fri, 15 Nov 2024 06:08:01 +0000 Subject: [PATCH 020/224] perf/x86/rapl: Make rapl_model struct global Prepare for the addition of RAPL core energy counter support. As there will always be just one rapl_model variable on a system, make it global, to make it easier to access it from any function. No functional change. 
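The change follows the usual match-table shape: resolve the per-model data once at init time from the CPU-id table's driver_data and cache it in a single file-scope pointer that every later helper can read. A stand-alone sketch of that pattern (hypothetical code; the table, model fields and names are made up, not the driver's):

#include <stdio.h>
#include <string.h>

struct model {
	const char *name;
	unsigned long events;
};

struct cpu_id {
	const char *cpu;
	const struct model *driver_data;
};

static const struct model model_a = { "model-a", 0x3 };
static const struct model model_b = { "model-b", 0x7 };

static const struct cpu_id match_table[] = {
	{ "cpu-a", &model_a },
	{ "cpu-b", &model_b },
	{ NULL, NULL },
};

static const struct model *cur_model;	/* plays the role of rapl_model */

static int driver_init(const char *cpu)
{
	const struct cpu_id *id;

	for (id = match_table; id->cpu; id++) {
		if (!strcmp(id->cpu, cpu)) {
			cur_model = id->driver_data;	/* cached once */
			return 0;
		}
	}
	return -1;	/* -ENODEV equivalent */
}

int main(void)
{
	if (driver_init("cpu-b"))
		return 1;

	/* any later helper can now read cur_model directly */
	printf("%s: events %#lx\n", cur_model->name, cur_model->events);
	return 0;
}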
Signed-off-by: Dhananjay Ugwekar Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Zhang Rui Reviewed-by: "Gautham R. Shenoy" Tested-by: Zhang Rui Link: https://lore.kernel.org/r/20241115060805.447565-6-Dhananjay.Ugwekar@amd.com --- arch/x86/events/rapl.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c index 9b1ec8a80241..104968648f33 100644 --- a/arch/x86/events/rapl.c +++ b/arch/x86/events/rapl.c @@ -151,6 +151,7 @@ static struct rapl_pmus *rapl_pmus; static unsigned int rapl_cntr_mask; static u64 rapl_timer_ms; static struct perf_msr *rapl_msrs; +static struct rapl_model *rapl_model; /* * Helper function to get the correct topology id according to the @@ -542,18 +543,18 @@ static struct perf_msr amd_rapl_msrs[] = { [PERF_RAPL_PSYS] = { 0, &rapl_events_psys_group, NULL, false, 0 }, }; -static int rapl_check_hw_unit(struct rapl_model *rm) +static int rapl_check_hw_unit(void) { u64 msr_rapl_power_unit_bits; int i; /* protect rdmsrl() to handle virtualization */ - if (rdmsrl_safe(rm->msr_power_unit, &msr_rapl_power_unit_bits)) + if (rdmsrl_safe(rapl_model->msr_power_unit, &msr_rapl_power_unit_bits)) return -1; for (i = 0; i < NR_RAPL_DOMAINS; i++) rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL; - switch (rm->unit_quirk) { + switch (rapl_model->unit_quirk) { /* * DRAM domain on HSW server and KNL has fixed energy unit which can be * different than the unit from power unit MSR. See @@ -798,21 +799,20 @@ MODULE_DEVICE_TABLE(x86cpu, rapl_model_match); static int __init rapl_pmu_init(void) { const struct x86_cpu_id *id; - struct rapl_model *rm; int ret; id = x86_match_cpu(rapl_model_match); if (!id) return -ENODEV; - rm = (struct rapl_model *) id->driver_data; + rapl_model = (struct rapl_model *) id->driver_data; - rapl_msrs = rm->rapl_msrs; + rapl_msrs = rapl_model->rapl_msrs; rapl_cntr_mask = perf_msr_probe(rapl_msrs, PERF_RAPL_MAX, - false, (void *) &rm->events); + false, (void *) &rapl_model->events); - ret = rapl_check_hw_unit(rm); + ret = rapl_check_hw_unit(); if (ret) return ret; From eeca4c6b2529ff41a10519952bf988c0f3605353 Mon Sep 17 00:00:00 2001 From: Dhananjay Ugwekar Date: Fri, 15 Nov 2024 06:08:02 +0000 Subject: [PATCH 021/224] perf/x86/rapl: Add arguments to the init and cleanup functions Prepare for the addition of RAPL core energy counter support. Add arguments to the init and cleanup functions, which will help in initialization and cleaning up of two separate PMUs. No functional change. Signed-off-by: Dhananjay Ugwekar Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: "Gautham R. 
Shenoy" Reviewed-by: Zhang Rui Tested-by: Zhang Rui Link: https://lore.kernel.org/r/20241115060805.447565-7-Dhananjay.Ugwekar@amd.com --- arch/x86/events/rapl.c | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c index 104968648f33..249bcd361969 100644 --- a/arch/x86/events/rapl.c +++ b/arch/x86/events/rapl.c @@ -603,7 +603,7 @@ static void __init rapl_advertise(void) } } -static void cleanup_rapl_pmus(void) +static void cleanup_rapl_pmus(struct rapl_pmus *rapl_pmus) { int i; @@ -621,7 +621,7 @@ static const struct attribute_group *rapl_attr_update[] = { NULL, }; -static int __init init_rapl_pmu(void) +static int __init init_rapl_pmu(struct rapl_pmus *rapl_pmus) { struct rapl_pmu *rapl_pmu; int idx; @@ -647,20 +647,20 @@ free: return -ENOMEM; } -static int __init init_rapl_pmus(void) +static int __init init_rapl_pmus(struct rapl_pmus **rapl_pmus_ptr, int rapl_pmu_scope) { int nr_rapl_pmu = topology_max_packages(); - int rapl_pmu_scope = PERF_PMU_SCOPE_PKG; + struct rapl_pmus *rapl_pmus; - if (!rapl_pmu_is_pkg_scope()) { - nr_rapl_pmu *= topology_max_dies_per_package(); - rapl_pmu_scope = PERF_PMU_SCOPE_DIE; - } + if (rapl_pmu_scope == PERF_PMU_SCOPE_DIE) + nr_rapl_pmu *= topology_max_dies_per_package(); rapl_pmus = kzalloc(struct_size(rapl_pmus, rapl_pmu, nr_rapl_pmu), GFP_KERNEL); if (!rapl_pmus) return -ENOMEM; + *rapl_pmus_ptr = rapl_pmus; + rapl_pmus->nr_rapl_pmu = nr_rapl_pmu; rapl_pmus->pmu.attr_groups = rapl_attr_groups; rapl_pmus->pmu.attr_update = rapl_attr_update; @@ -675,7 +675,7 @@ static int __init init_rapl_pmus(void) rapl_pmus->pmu.module = THIS_MODULE; rapl_pmus->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE; - return init_rapl_pmu(); + return init_rapl_pmu(rapl_pmus); } static struct rapl_model model_snb = { @@ -799,8 +799,12 @@ MODULE_DEVICE_TABLE(x86cpu, rapl_model_match); static int __init rapl_pmu_init(void) { const struct x86_cpu_id *id; + int rapl_pmu_scope = PERF_PMU_SCOPE_DIE; int ret; + if (rapl_pmu_is_pkg_scope()) + rapl_pmu_scope = PERF_PMU_SCOPE_PKG; + id = x86_match_cpu(rapl_model_match); if (!id) return -ENODEV; @@ -816,7 +820,7 @@ static int __init rapl_pmu_init(void) if (ret) return ret; - ret = init_rapl_pmus(); + ret = init_rapl_pmus(&rapl_pmus, rapl_pmu_scope); if (ret) return ret; @@ -829,7 +833,7 @@ static int __init rapl_pmu_init(void) out: pr_warn("Initialization failed (%d), disabled\n", ret); - cleanup_rapl_pmus(); + cleanup_rapl_pmus(rapl_pmus); return ret; } module_init(rapl_pmu_init); @@ -837,6 +841,6 @@ module_init(rapl_pmu_init); static void __exit intel_rapl_exit(void) { perf_pmu_unregister(&rapl_pmus->pmu); - cleanup_rapl_pmus(); + cleanup_rapl_pmus(rapl_pmus); } module_exit(intel_rapl_exit); From abf03d9bd20cf55ebdc4c7f0955d21759aeb0523 Mon Sep 17 00:00:00 2001 From: Dhananjay Ugwekar Date: Fri, 15 Nov 2024 06:08:03 +0000 Subject: [PATCH 022/224] perf/x86/rapl: Modify the generic variable names to *_pkg* Prepare for the addition of RAPL core energy counter support. Replace the generic names with *_pkg*, to later on differentiate between the scopes of the two different PMUs and their variables. No functional change. Signed-off-by: Dhananjay Ugwekar Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: "Gautham R. 
Shenoy" Reviewed-by: Zhang Rui Tested-by: Zhang Rui Link: https://lore.kernel.org/r/20241115060805.447565-8-Dhananjay.Ugwekar@amd.com --- arch/x86/events/rapl.c | 120 ++++++++++++++++++++--------------------- 1 file changed, 60 insertions(+), 60 deletions(-) diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c index 249bcd361969..8cdc5787c866 100644 --- a/arch/x86/events/rapl.c +++ b/arch/x86/events/rapl.c @@ -70,18 +70,18 @@ MODULE_LICENSE("GPL"); /* * RAPL energy status counters */ -enum perf_rapl_events { +enum perf_rapl_pkg_events { PERF_RAPL_PP0 = 0, /* all cores */ PERF_RAPL_PKG, /* entire package */ PERF_RAPL_RAM, /* DRAM */ PERF_RAPL_PP1, /* gpu */ PERF_RAPL_PSYS, /* psys */ - PERF_RAPL_MAX, - NR_RAPL_DOMAINS = PERF_RAPL_MAX, + PERF_RAPL_PKG_EVENTS_MAX, + NR_RAPL_PKG_DOMAINS = PERF_RAPL_PKG_EVENTS_MAX, }; -static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = { +static const char *const rapl_pkg_domain_names[NR_RAPL_PKG_DOMAINS] __initconst = { "pp0-core", "package", "dram", @@ -112,7 +112,7 @@ static struct perf_pmu_events_attr event_attr_##v = { \ * considered as either pkg-scope or die-scope, and we are considering * them as die-scope. */ -#define rapl_pmu_is_pkg_scope() \ +#define rapl_pkg_pmu_is_pkg_scope() \ (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || \ boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) @@ -139,16 +139,16 @@ enum rapl_unit_quirk { }; struct rapl_model { - struct perf_msr *rapl_msrs; - unsigned long events; + struct perf_msr *rapl_pkg_msrs; + unsigned long pkg_events; unsigned int msr_power_unit; enum rapl_unit_quirk unit_quirk; }; /* 1/2^hw_unit Joule */ -static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly; -static struct rapl_pmus *rapl_pmus; -static unsigned int rapl_cntr_mask; +static int rapl_pkg_hw_unit[NR_RAPL_PKG_DOMAINS] __read_mostly; +static struct rapl_pmus *rapl_pmus_pkg; +static unsigned int rapl_pkg_cntr_mask; static u64 rapl_timer_ms; static struct perf_msr *rapl_msrs; static struct rapl_model *rapl_model; @@ -164,8 +164,8 @@ static inline unsigned int get_rapl_pmu_idx(int cpu) * (for non-existent mappings in topology map) to UINT_MAX, so * the error check in the caller is simplified. */ - return rapl_pmu_is_pkg_scope() ? topology_logical_package_id(cpu) : - topology_logical_die_id(cpu); + return rapl_pkg_pmu_is_pkg_scope() ? topology_logical_package_id(cpu) : + topology_logical_die_id(cpu); } static inline u64 rapl_read_counter(struct perf_event *event) @@ -177,7 +177,7 @@ static inline u64 rapl_read_counter(struct perf_event *event) static inline u64 rapl_scale(u64 v, int cfg) { - if (cfg > NR_RAPL_DOMAINS) { + if (cfg > NR_RAPL_PKG_DOMAINS) { pr_warn("Invalid domain %d, failed to scale data\n", cfg); return v; } @@ -187,7 +187,7 @@ static inline u64 rapl_scale(u64 v, int cfg) * or use ldexp(count, -32). 
* Watts = Joules/Time delta */ - return v << (32 - rapl_hw_unit[cfg - 1]); + return v << (32 - rapl_pkg_hw_unit[cfg - 1]); } static u64 rapl_event_update(struct perf_event *event) @@ -348,7 +348,7 @@ static int rapl_pmu_event_init(struct perf_event *event) unsigned int rapl_pmu_idx; /* only look at RAPL events */ - if (event->attr.type != rapl_pmus->pmu.type) + if (event->attr.type != rapl_pmus_pkg->pmu.type) return -ENOENT; /* check only supported bits are set */ @@ -358,14 +358,14 @@ static int rapl_pmu_event_init(struct perf_event *event) if (event->cpu < 0) return -EINVAL; - if (!cfg || cfg >= NR_RAPL_DOMAINS + 1) + if (!cfg || cfg >= NR_RAPL_PKG_DOMAINS + 1) return -EINVAL; - cfg = array_index_nospec((long)cfg, NR_RAPL_DOMAINS + 1); + cfg = array_index_nospec((long)cfg, NR_RAPL_PKG_DOMAINS + 1); bit = cfg - 1; /* check event supported */ - if (!(rapl_cntr_mask & (1 << bit))) + if (!(rapl_pkg_cntr_mask & (1 << bit))) return -EINVAL; /* unsupported modes and filters */ @@ -373,11 +373,11 @@ static int rapl_pmu_event_init(struct perf_event *event) return -EINVAL; rapl_pmu_idx = get_rapl_pmu_idx(event->cpu); - if (rapl_pmu_idx >= rapl_pmus->nr_rapl_pmu) + if (rapl_pmu_idx >= rapl_pmus_pkg->nr_rapl_pmu) return -EINVAL; /* must be done before validate_group */ - rapl_pmu = rapl_pmus->rapl_pmu[rapl_pmu_idx]; + rapl_pmu = rapl_pmus_pkg->rapl_pmu[rapl_pmu_idx]; if (!rapl_pmu) return -EINVAL; @@ -531,11 +531,11 @@ static struct perf_msr intel_rapl_spr_msrs[] = { }; /* - * Force to PERF_RAPL_MAX size due to: - * - perf_msr_probe(PERF_RAPL_MAX) + * Force to PERF_RAPL_PKG_EVENTS_MAX size due to: + * - perf_msr_probe(PERF_RAPL_PKG_EVENTS_MAX) * - want to use same event codes across both architectures */ -static struct perf_msr amd_rapl_msrs[] = { +static struct perf_msr amd_rapl_pkg_msrs[] = { [PERF_RAPL_PP0] = { 0, &rapl_events_cores_group, NULL, false, 0 }, [PERF_RAPL_PKG] = { MSR_AMD_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr, false, RAPL_MSR_MASK }, [PERF_RAPL_RAM] = { 0, &rapl_events_ram_group, NULL, false, 0 }, @@ -551,8 +551,8 @@ static int rapl_check_hw_unit(void) /* protect rdmsrl() to handle virtualization */ if (rdmsrl_safe(rapl_model->msr_power_unit, &msr_rapl_power_unit_bits)) return -1; - for (i = 0; i < NR_RAPL_DOMAINS; i++) - rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL; + for (i = 0; i < NR_RAPL_PKG_DOMAINS; i++) + rapl_pkg_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL; switch (rapl_model->unit_quirk) { /* @@ -562,11 +562,11 @@ static int rapl_check_hw_unit(void) * of 2. Datasheet, September 2014, Reference Number: 330784-001 " */ case RAPL_UNIT_QUIRK_INTEL_HSW: - rapl_hw_unit[PERF_RAPL_RAM] = 16; + rapl_pkg_hw_unit[PERF_RAPL_RAM] = 16; break; /* SPR uses a fixed energy unit for Psys domain. 
*/ case RAPL_UNIT_QUIRK_INTEL_SPR: - rapl_hw_unit[PERF_RAPL_PSYS] = 0; + rapl_pkg_hw_unit[PERF_RAPL_PSYS] = 0; break; default: break; @@ -581,9 +581,9 @@ static int rapl_check_hw_unit(void) * if hw unit is 32, then we use 2 ms 1/200/2 */ rapl_timer_ms = 2; - if (rapl_hw_unit[0] < 32) { + if (rapl_pkg_hw_unit[0] < 32) { rapl_timer_ms = (1000 / (2 * 100)); - rapl_timer_ms *= (1ULL << (32 - rapl_hw_unit[0] - 1)); + rapl_timer_ms *= (1ULL << (32 - rapl_pkg_hw_unit[0] - 1)); } return 0; } @@ -593,12 +593,12 @@ static void __init rapl_advertise(void) int i; pr_info("API unit is 2^-32 Joules, %d fixed counters, %llu ms ovfl timer\n", - hweight32(rapl_cntr_mask), rapl_timer_ms); + hweight32(rapl_pkg_cntr_mask), rapl_timer_ms); - for (i = 0; i < NR_RAPL_DOMAINS; i++) { - if (rapl_cntr_mask & (1 << i)) { + for (i = 0; i < NR_RAPL_PKG_DOMAINS; i++) { + if (rapl_pkg_cntr_mask & (1 << i)) { pr_info("hw unit of domain %s 2^-%d Joules\n", - rapl_domain_names[i], rapl_hw_unit[i]); + rapl_pkg_domain_names[i], rapl_pkg_hw_unit[i]); } } } @@ -679,71 +679,71 @@ static int __init init_rapl_pmus(struct rapl_pmus **rapl_pmus_ptr, int rapl_pmu_ } static struct rapl_model model_snb = { - .events = BIT(PERF_RAPL_PP0) | + .pkg_events = BIT(PERF_RAPL_PP0) | BIT(PERF_RAPL_PKG) | BIT(PERF_RAPL_PP1), .msr_power_unit = MSR_RAPL_POWER_UNIT, - .rapl_msrs = intel_rapl_msrs, + .rapl_pkg_msrs = intel_rapl_msrs, }; static struct rapl_model model_snbep = { - .events = BIT(PERF_RAPL_PP0) | + .pkg_events = BIT(PERF_RAPL_PP0) | BIT(PERF_RAPL_PKG) | BIT(PERF_RAPL_RAM), .msr_power_unit = MSR_RAPL_POWER_UNIT, - .rapl_msrs = intel_rapl_msrs, + .rapl_pkg_msrs = intel_rapl_msrs, }; static struct rapl_model model_hsw = { - .events = BIT(PERF_RAPL_PP0) | + .pkg_events = BIT(PERF_RAPL_PP0) | BIT(PERF_RAPL_PKG) | BIT(PERF_RAPL_RAM) | BIT(PERF_RAPL_PP1), .msr_power_unit = MSR_RAPL_POWER_UNIT, - .rapl_msrs = intel_rapl_msrs, + .rapl_pkg_msrs = intel_rapl_msrs, }; static struct rapl_model model_hsx = { - .events = BIT(PERF_RAPL_PP0) | + .pkg_events = BIT(PERF_RAPL_PP0) | BIT(PERF_RAPL_PKG) | BIT(PERF_RAPL_RAM), .unit_quirk = RAPL_UNIT_QUIRK_INTEL_HSW, .msr_power_unit = MSR_RAPL_POWER_UNIT, - .rapl_msrs = intel_rapl_msrs, + .rapl_pkg_msrs = intel_rapl_msrs, }; static struct rapl_model model_knl = { - .events = BIT(PERF_RAPL_PKG) | + .pkg_events = BIT(PERF_RAPL_PKG) | BIT(PERF_RAPL_RAM), .unit_quirk = RAPL_UNIT_QUIRK_INTEL_HSW, .msr_power_unit = MSR_RAPL_POWER_UNIT, - .rapl_msrs = intel_rapl_msrs, + .rapl_pkg_msrs = intel_rapl_msrs, }; static struct rapl_model model_skl = { - .events = BIT(PERF_RAPL_PP0) | + .pkg_events = BIT(PERF_RAPL_PP0) | BIT(PERF_RAPL_PKG) | BIT(PERF_RAPL_RAM) | BIT(PERF_RAPL_PP1) | BIT(PERF_RAPL_PSYS), .msr_power_unit = MSR_RAPL_POWER_UNIT, - .rapl_msrs = intel_rapl_msrs, + .rapl_pkg_msrs = intel_rapl_msrs, }; static struct rapl_model model_spr = { - .events = BIT(PERF_RAPL_PP0) | + .pkg_events = BIT(PERF_RAPL_PP0) | BIT(PERF_RAPL_PKG) | BIT(PERF_RAPL_RAM) | BIT(PERF_RAPL_PSYS), .unit_quirk = RAPL_UNIT_QUIRK_INTEL_SPR, .msr_power_unit = MSR_RAPL_POWER_UNIT, - .rapl_msrs = intel_rapl_spr_msrs, + .rapl_pkg_msrs = intel_rapl_spr_msrs, }; static struct rapl_model model_amd_hygon = { - .events = BIT(PERF_RAPL_PKG), + .pkg_events = BIT(PERF_RAPL_PKG), .msr_power_unit = MSR_AMD_RAPL_POWER_UNIT, - .rapl_msrs = amd_rapl_msrs, + .rapl_pkg_msrs = amd_rapl_pkg_msrs, }; static const struct x86_cpu_id rapl_model_match[] __initconst = { @@ -799,11 +799,11 @@ MODULE_DEVICE_TABLE(x86cpu, rapl_model_match); static int __init 
rapl_pmu_init(void) { const struct x86_cpu_id *id; - int rapl_pmu_scope = PERF_PMU_SCOPE_DIE; + int rapl_pkg_pmu_scope = PERF_PMU_SCOPE_DIE; int ret; - if (rapl_pmu_is_pkg_scope()) - rapl_pmu_scope = PERF_PMU_SCOPE_PKG; + if (rapl_pkg_pmu_is_pkg_scope()) + rapl_pkg_pmu_scope = PERF_PMU_SCOPE_PKG; id = x86_match_cpu(rapl_model_match); if (!id) @@ -811,20 +811,20 @@ static int __init rapl_pmu_init(void) rapl_model = (struct rapl_model *) id->driver_data; - rapl_msrs = rapl_model->rapl_msrs; + rapl_msrs = rapl_model->rapl_pkg_msrs; - rapl_cntr_mask = perf_msr_probe(rapl_msrs, PERF_RAPL_MAX, - false, (void *) &rapl_model->events); + rapl_pkg_cntr_mask = perf_msr_probe(rapl_msrs, PERF_RAPL_PKG_EVENTS_MAX, + false, (void *) &rapl_model->pkg_events); ret = rapl_check_hw_unit(); if (ret) return ret; - ret = init_rapl_pmus(&rapl_pmus, rapl_pmu_scope); + ret = init_rapl_pmus(&rapl_pmus_pkg, rapl_pkg_pmu_scope); if (ret) return ret; - ret = perf_pmu_register(&rapl_pmus->pmu, "power", -1); + ret = perf_pmu_register(&rapl_pmus_pkg->pmu, "power", -1); if (ret) goto out; @@ -833,14 +833,14 @@ static int __init rapl_pmu_init(void) out: pr_warn("Initialization failed (%d), disabled\n", ret); - cleanup_rapl_pmus(rapl_pmus); + cleanup_rapl_pmus(rapl_pmus_pkg); return ret; } module_init(rapl_pmu_init); static void __exit intel_rapl_exit(void) { - perf_pmu_unregister(&rapl_pmus->pmu); - cleanup_rapl_pmus(rapl_pmus); + perf_pmu_unregister(&rapl_pmus_pkg->pmu); + cleanup_rapl_pmus(rapl_pmus_pkg); } module_exit(intel_rapl_exit); From bdc57ec7054842e5cb3b0a2da87b0e73075a96e6 Mon Sep 17 00:00:00 2001 From: Dhananjay Ugwekar Date: Fri, 15 Nov 2024 06:08:04 +0000 Subject: [PATCH 023/224] perf/x86/rapl: Remove the global variable rapl_msrs Prepare for the addition of RAPL core energy counter support. After making the rapl_model struct global, the rapl_msrs global variable isn't needed, so remove it. Signed-off-by: Dhananjay Ugwekar Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: "Gautham R. 
Shenoy" Reviewed-by: Zhang Rui Tested-by: Zhang Rui Link: https://lore.kernel.org/r/20241115060805.447565-9-Dhananjay.Ugwekar@amd.com --- arch/x86/events/rapl.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c index 8cdc5787c866..aef2d0e86aba 100644 --- a/arch/x86/events/rapl.c +++ b/arch/x86/events/rapl.c @@ -150,7 +150,6 @@ static int rapl_pkg_hw_unit[NR_RAPL_PKG_DOMAINS] __read_mostly; static struct rapl_pmus *rapl_pmus_pkg; static unsigned int rapl_pkg_cntr_mask; static u64 rapl_timer_ms; -static struct perf_msr *rapl_msrs; static struct rapl_model *rapl_model; /* @@ -382,7 +381,7 @@ static int rapl_pmu_event_init(struct perf_event *event) return -EINVAL; event->pmu_private = rapl_pmu; - event->hw.event_base = rapl_msrs[bit].msr; + event->hw.event_base = rapl_model->rapl_pkg_msrs[bit].msr; event->hw.config = cfg; event->hw.idx = bit; @@ -811,9 +810,7 @@ static int __init rapl_pmu_init(void) rapl_model = (struct rapl_model *) id->driver_data; - rapl_msrs = rapl_model->rapl_pkg_msrs; - - rapl_pkg_cntr_mask = perf_msr_probe(rapl_msrs, PERF_RAPL_PKG_EVENTS_MAX, + rapl_pkg_cntr_mask = perf_msr_probe(rapl_model->rapl_pkg_msrs, PERF_RAPL_PKG_EVENTS_MAX, false, (void *) &rapl_model->pkg_events); ret = rapl_check_hw_unit(); From 54d2759778c1ebd66ee42fac93acf0c2cbf4217c Mon Sep 17 00:00:00 2001 From: Dhananjay Ugwekar Date: Fri, 15 Nov 2024 06:08:05 +0000 Subject: [PATCH 024/224] perf/x86/rapl: Move the cntr_mask to rapl_pmus struct Prepare for the addition of RAPL core energy counter support. Move cntr_mask to rapl_pmus struct instead of adding a new global cntr_mask for the new RAPL power_core PMU. This will also ensure that the second "core_cntr_mask" is only created if needed (i.e. in case of AMD CPUs). Signed-off-by: Dhananjay Ugwekar Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: "Gautham R. 
Shenoy" Reviewed-by: Zhang Rui Tested-by: Zhang Rui Link: https://lore.kernel.org/r/20241115060805.447565-10-Dhananjay.Ugwekar@amd.com --- arch/x86/events/rapl.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c index aef2d0e86aba..139c3086b831 100644 --- a/arch/x86/events/rapl.c +++ b/arch/x86/events/rapl.c @@ -129,6 +129,7 @@ struct rapl_pmu { struct rapl_pmus { struct pmu pmu; unsigned int nr_rapl_pmu; + unsigned int cntr_mask; struct rapl_pmu *rapl_pmu[] __counted_by(nr_rapl_pmu); }; @@ -148,7 +149,6 @@ struct rapl_model { /* 1/2^hw_unit Joule */ static int rapl_pkg_hw_unit[NR_RAPL_PKG_DOMAINS] __read_mostly; static struct rapl_pmus *rapl_pmus_pkg; -static unsigned int rapl_pkg_cntr_mask; static u64 rapl_timer_ms; static struct rapl_model *rapl_model; @@ -364,7 +364,7 @@ static int rapl_pmu_event_init(struct perf_event *event) bit = cfg - 1; /* check event supported */ - if (!(rapl_pkg_cntr_mask & (1 << bit))) + if (!(rapl_pmus_pkg->cntr_mask & (1 << bit))) return -EINVAL; /* unsupported modes and filters */ @@ -592,10 +592,10 @@ static void __init rapl_advertise(void) int i; pr_info("API unit is 2^-32 Joules, %d fixed counters, %llu ms ovfl timer\n", - hweight32(rapl_pkg_cntr_mask), rapl_timer_ms); + hweight32(rapl_pmus_pkg->cntr_mask), rapl_timer_ms); for (i = 0; i < NR_RAPL_PKG_DOMAINS; i++) { - if (rapl_pkg_cntr_mask & (1 << i)) { + if (rapl_pmus_pkg->cntr_mask & (1 << i)) { pr_info("hw unit of domain %s 2^-%d Joules\n", rapl_pkg_domain_names[i], rapl_pkg_hw_unit[i]); } @@ -810,9 +810,6 @@ static int __init rapl_pmu_init(void) rapl_model = (struct rapl_model *) id->driver_data; - rapl_pkg_cntr_mask = perf_msr_probe(rapl_model->rapl_pkg_msrs, PERF_RAPL_PKG_EVENTS_MAX, - false, (void *) &rapl_model->pkg_events); - ret = rapl_check_hw_unit(); if (ret) return ret; @@ -821,6 +818,10 @@ static int __init rapl_pmu_init(void) if (ret) return ret; + rapl_pmus_pkg->cntr_mask = perf_msr_probe(rapl_model->rapl_pkg_msrs, + PERF_RAPL_PKG_EVENTS_MAX, false, + (void *) &rapl_model->pkg_events); + ret = perf_pmu_register(&rapl_pmus_pkg->pmu, "power", -1); if (ret) goto out; From b4943b8bfc41ddd3796f3b87e1efa71a0c689f22 Mon Sep 17 00:00:00 2001 From: Dhananjay Ugwekar Date: Fri, 15 Nov 2024 06:08:06 +0000 Subject: [PATCH 025/224] perf/x86/rapl: Add core energy counter support for AMD CPUs Add a new "power_core" PMU and "energy-core" event for monitoring energy consumption by each individual core. The existing energy-cores event aggregates the energy consumption of CPU cores at the package level. This new event aligns with the AMD's per-core energy counters. Tested the package level and core level PMU counters with workloads pinned to different CPUs. 
Results with workload pinned to CPU 4 in core 4 on an AMD Zen4 Genoa machine: $ sudo perf stat --per-core -e power_core/energy-core/ -- taskset -c 4 stress-ng --matrix 1 --timeout 5s stress-ng: info: [21250] setting to a 5 second run per stressor stress-ng: info: [21250] dispatching hogs: 1 matrix stress-ng: info: [21250] successful run completed in 5.00s Performance counter stats for 'system wide': S0-D0-C0 1 0.00 Joules power_core/energy-core/ S0-D0-C1 1 0.00 Joules power_core/energy-core/ S0-D0-C2 1 0.00 Joules power_core/energy-core/ S0-D0-C3 1 0.00 Joules power_core/energy-core/ S0-D0-C4 1 8.43 Joules power_core/energy-core/ S0-D0-C5 1 0.00 Joules power_core/energy-core/ S0-D0-C6 1 0.00 Joules power_core/energy-core/ S0-D0-C7 1 0.00 Joules power_core/energy-core/ S0-D1-C8 1 0.00 Joules power_core/energy-core/ S0-D1-C9 1 0.00 Joules power_core/energy-core/ S0-D1-C10 1 0.00 Joules power_core/energy-core/ Signed-off-by: Dhananjay Ugwekar Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: "Gautham R. Shenoy" Link: https://lore.kernel.org/r/20241115060805.447565-11-Dhananjay.Ugwekar@amd.com --- arch/x86/events/rapl.c | 181 ++++++++++++++++++++++++++++++++++------- 1 file changed, 150 insertions(+), 31 deletions(-) diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c index 139c3086b831..d3bb3865c1b1 100644 --- a/arch/x86/events/rapl.c +++ b/arch/x86/events/rapl.c @@ -39,6 +39,10 @@ * event: rapl_energy_psys * perf code: 0x5 * + * core counter: consumption of a single physical core + * event: rapl_energy_core (power_core PMU) + * perf code: 0x1 + * * We manage those counters as free running (read-only). They may be * use simultaneously by other tools, such as turbostat. * @@ -81,6 +85,10 @@ enum perf_rapl_pkg_events { NR_RAPL_PKG_DOMAINS = PERF_RAPL_PKG_EVENTS_MAX, }; +#define PERF_RAPL_CORE 0 /* single core */ +#define PERF_RAPL_CORE_EVENTS_MAX 1 +#define NR_RAPL_CORE_DOMAINS PERF_RAPL_CORE_EVENTS_MAX + static const char *const rapl_pkg_domain_names[NR_RAPL_PKG_DOMAINS] __initconst = { "pp0-core", "package", @@ -89,6 +97,8 @@ static const char *const rapl_pkg_domain_names[NR_RAPL_PKG_DOMAINS] __initconst "psys", }; +static const char *const rapl_core_domain_name __initconst = "core"; + /* * event code: LSB 8 bits, passed in attr->config * any other bit is reserved @@ -141,14 +151,18 @@ enum rapl_unit_quirk { struct rapl_model { struct perf_msr *rapl_pkg_msrs; + struct perf_msr *rapl_core_msrs; unsigned long pkg_events; + unsigned long core_events; unsigned int msr_power_unit; enum rapl_unit_quirk unit_quirk; }; /* 1/2^hw_unit Joule */ static int rapl_pkg_hw_unit[NR_RAPL_PKG_DOMAINS] __read_mostly; +static int rapl_core_hw_unit __read_mostly; static struct rapl_pmus *rapl_pmus_pkg; +static struct rapl_pmus *rapl_pmus_core; static u64 rapl_timer_ms; static struct rapl_model *rapl_model; @@ -156,15 +170,23 @@ static struct rapl_model *rapl_model; * Helper function to get the correct topology id according to the * RAPL PMU scope. */ -static inline unsigned int get_rapl_pmu_idx(int cpu) +static inline unsigned int get_rapl_pmu_idx(int cpu, int scope) { /* * Returns unsigned int, which converts the '-1' return value * (for non-existent mappings in topology map) to UINT_MAX, so * the error check in the caller is simplified. */ - return rapl_pkg_pmu_is_pkg_scope() ? 
topology_logical_package_id(cpu) : - topology_logical_die_id(cpu); + switch (scope) { + case PERF_PMU_SCOPE_PKG: + return topology_logical_package_id(cpu); + case PERF_PMU_SCOPE_DIE: + return topology_logical_die_id(cpu); + case PERF_PMU_SCOPE_CORE: + return topology_logical_core_id(cpu); + default: + return -EINVAL; + } } static inline u64 rapl_read_counter(struct perf_event *event) @@ -174,19 +196,20 @@ static inline u64 rapl_read_counter(struct perf_event *event) return raw; } -static inline u64 rapl_scale(u64 v, int cfg) +static inline u64 rapl_scale(u64 v, struct perf_event *event) { - if (cfg > NR_RAPL_PKG_DOMAINS) { - pr_warn("Invalid domain %d, failed to scale data\n", cfg); - return v; - } + int hw_unit = rapl_pkg_hw_unit[event->hw.config - 1]; + + if (event->pmu->scope == PERF_PMU_SCOPE_CORE) + hw_unit = rapl_core_hw_unit; + /* * scale delta to smallest unit (1/2^32) * users must then scale back: count * 1/(1e9*2^32) to get Joules * or use ldexp(count, -32). * Watts = Joules/Time delta */ - return v << (32 - rapl_pkg_hw_unit[cfg - 1]); + return v << (32 - hw_unit); } static u64 rapl_event_update(struct perf_event *event) @@ -213,7 +236,7 @@ static u64 rapl_event_update(struct perf_event *event) delta = (new_raw_count << shift) - (prev_raw_count << shift); delta >>= shift; - sdelta = rapl_scale(delta, event->hw.config); + sdelta = rapl_scale(delta, event); local64_add(sdelta, &event->count); @@ -342,13 +365,14 @@ static void rapl_pmu_event_del(struct perf_event *event, int flags) static int rapl_pmu_event_init(struct perf_event *event) { u64 cfg = event->attr.config & RAPL_EVENT_MASK; - int bit, ret = 0; + int bit, rapl_pmus_scope, ret = 0; struct rapl_pmu *rapl_pmu; unsigned int rapl_pmu_idx; + struct rapl_pmus *rapl_pmus; - /* only look at RAPL events */ - if (event->attr.type != rapl_pmus_pkg->pmu.type) - return -ENOENT; + /* unsupported modes and filters */ + if (event->attr.sample_period) /* no sampling */ + return -EINVAL; /* check only supported bits are set */ if (event->attr.config & ~RAPL_EVENT_MASK) @@ -357,31 +381,49 @@ static int rapl_pmu_event_init(struct perf_event *event) if (event->cpu < 0) return -EINVAL; - if (!cfg || cfg >= NR_RAPL_PKG_DOMAINS + 1) + rapl_pmus = container_of(event->pmu, struct rapl_pmus, pmu); + if (!rapl_pmus) return -EINVAL; + rapl_pmus_scope = rapl_pmus->pmu.scope; - cfg = array_index_nospec((long)cfg, NR_RAPL_PKG_DOMAINS + 1); - bit = cfg - 1; + if (rapl_pmus_scope == PERF_PMU_SCOPE_PKG || rapl_pmus_scope == PERF_PMU_SCOPE_DIE) { + /* only look at RAPL package events */ + if (event->attr.type != rapl_pmus_pkg->pmu.type) + return -ENOENT; + + cfg = array_index_nospec((long)cfg, NR_RAPL_PKG_DOMAINS + 1); + if (!cfg || cfg >= NR_RAPL_PKG_DOMAINS + 1) + return -EINVAL; + + bit = cfg - 1; + event->hw.event_base = rapl_model->rapl_pkg_msrs[bit].msr; + } else if (rapl_pmus_scope == PERF_PMU_SCOPE_CORE) { + /* only look at RAPL core events */ + if (event->attr.type != rapl_pmus_core->pmu.type) + return -ENOENT; + + cfg = array_index_nospec((long)cfg, NR_RAPL_CORE_DOMAINS + 1); + if (!cfg || cfg >= NR_RAPL_PKG_DOMAINS + 1) + return -EINVAL; + + bit = cfg - 1; + event->hw.event_base = rapl_model->rapl_core_msrs[bit].msr; + } else + return -EINVAL; /* check event supported */ - if (!(rapl_pmus_pkg->cntr_mask & (1 << bit))) + if (!(rapl_pmus->cntr_mask & (1 << bit))) return -EINVAL; - /* unsupported modes and filters */ - if (event->attr.sample_period) /* no sampling */ + rapl_pmu_idx = get_rapl_pmu_idx(event->cpu, rapl_pmus_scope); + if (rapl_pmu_idx 
>= rapl_pmus->nr_rapl_pmu) return -EINVAL; - - rapl_pmu_idx = get_rapl_pmu_idx(event->cpu); - if (rapl_pmu_idx >= rapl_pmus_pkg->nr_rapl_pmu) - return -EINVAL; - /* must be done before validate_group */ - rapl_pmu = rapl_pmus_pkg->rapl_pmu[rapl_pmu_idx]; + rapl_pmu = rapl_pmus->rapl_pmu[rapl_pmu_idx]; if (!rapl_pmu) return -EINVAL; event->pmu_private = rapl_pmu; - event->hw.event_base = rapl_model->rapl_pkg_msrs[bit].msr; event->hw.config = cfg; event->hw.idx = bit; @@ -398,12 +440,14 @@ RAPL_EVENT_ATTR_STR(energy-pkg , rapl_pkg, "event=0x02"); RAPL_EVENT_ATTR_STR(energy-ram , rapl_ram, "event=0x03"); RAPL_EVENT_ATTR_STR(energy-gpu , rapl_gpu, "event=0x04"); RAPL_EVENT_ATTR_STR(energy-psys, rapl_psys, "event=0x05"); +RAPL_EVENT_ATTR_STR(energy-core, rapl_core, "event=0x01"); RAPL_EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules"); RAPL_EVENT_ATTR_STR(energy-pkg.unit , rapl_pkg_unit, "Joules"); RAPL_EVENT_ATTR_STR(energy-ram.unit , rapl_ram_unit, "Joules"); RAPL_EVENT_ATTR_STR(energy-gpu.unit , rapl_gpu_unit, "Joules"); RAPL_EVENT_ATTR_STR(energy-psys.unit, rapl_psys_unit, "Joules"); +RAPL_EVENT_ATTR_STR(energy-core.unit, rapl_core_unit, "Joules"); /* * we compute in 0.23 nJ increments regardless of MSR @@ -413,6 +457,7 @@ RAPL_EVENT_ATTR_STR(energy-pkg.scale, rapl_pkg_scale, "2.3283064365386962890 RAPL_EVENT_ATTR_STR(energy-ram.scale, rapl_ram_scale, "2.3283064365386962890625e-10"); RAPL_EVENT_ATTR_STR(energy-gpu.scale, rapl_gpu_scale, "2.3283064365386962890625e-10"); RAPL_EVENT_ATTR_STR(energy-psys.scale, rapl_psys_scale, "2.3283064365386962890625e-10"); +RAPL_EVENT_ATTR_STR(energy-core.scale, rapl_core_scale, "2.3283064365386962890625e-10"); /* * There are no default events, but we need to create @@ -445,6 +490,12 @@ static const struct attribute_group *rapl_attr_groups[] = { NULL, }; +static const struct attribute_group *rapl_core_attr_groups[] = { + &rapl_pmu_format_group, + &rapl_pmu_events_group, + NULL, +}; + static struct attribute *rapl_events_cores[] = { EVENT_PTR(rapl_cores), EVENT_PTR(rapl_cores_unit), @@ -505,6 +556,18 @@ static struct attribute_group rapl_events_psys_group = { .attrs = rapl_events_psys, }; +static struct attribute *rapl_events_core[] = { + EVENT_PTR(rapl_core), + EVENT_PTR(rapl_core_unit), + EVENT_PTR(rapl_core_scale), + NULL, +}; + +static struct attribute_group rapl_events_core_group = { + .name = "events", + .attrs = rapl_events_core, +}; + static bool test_msr(int idx, void *data) { return test_bit(idx, (unsigned long *) data); @@ -542,6 +605,11 @@ static struct perf_msr amd_rapl_pkg_msrs[] = { [PERF_RAPL_PSYS] = { 0, &rapl_events_psys_group, NULL, false, 0 }, }; +static struct perf_msr amd_rapl_core_msrs[] = { + [PERF_RAPL_CORE] = { MSR_AMD_CORE_ENERGY_STATUS, &rapl_events_core_group, + test_msr, false, RAPL_MSR_MASK }, +}; + static int rapl_check_hw_unit(void) { u64 msr_rapl_power_unit_bits; @@ -553,6 +621,8 @@ static int rapl_check_hw_unit(void) for (i = 0; i < NR_RAPL_PKG_DOMAINS; i++) rapl_pkg_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL; + rapl_core_hw_unit = (msr_rapl_power_unit_bits >> 8) & 0x1FULL; + switch (rapl_model->unit_quirk) { /* * DRAM domain on HSW server and KNL has fixed energy unit which can be @@ -571,7 +641,6 @@ static int rapl_check_hw_unit(void) break; } - /* * Calculate the timer rate: * Use reference of 200W for scaling the timeout to avoid counter @@ -590,9 +659,13 @@ static int rapl_check_hw_unit(void) static void __init rapl_advertise(void) { int i; + int num_counters = hweight32(rapl_pmus_pkg->cntr_mask); 
+ + if (rapl_pmus_core) + num_counters += hweight32(rapl_pmus_core->cntr_mask); pr_info("API unit is 2^-32 Joules, %d fixed counters, %llu ms ovfl timer\n", - hweight32(rapl_pmus_pkg->cntr_mask), rapl_timer_ms); + num_counters, rapl_timer_ms); for (i = 0; i < NR_RAPL_PKG_DOMAINS; i++) { if (rapl_pmus_pkg->cntr_mask & (1 << i)) { @@ -600,6 +673,10 @@ static void __init rapl_advertise(void) rapl_pkg_domain_names[i], rapl_pkg_hw_unit[i]); } } + + if (rapl_pmus_core && (rapl_pmus_core->cntr_mask & (1 << PERF_RAPL_CORE))) + pr_info("hw unit of domain %s 2^-%d Joules\n", + rapl_core_domain_name, rapl_core_hw_unit); } static void cleanup_rapl_pmus(struct rapl_pmus *rapl_pmus) @@ -620,6 +697,11 @@ static const struct attribute_group *rapl_attr_update[] = { NULL, }; +static const struct attribute_group *rapl_core_attr_update[] = { + &rapl_events_core_group, + NULL, +}; + static int __init init_rapl_pmu(struct rapl_pmus *rapl_pmus) { struct rapl_pmu *rapl_pmu; @@ -646,13 +728,22 @@ free: return -ENOMEM; } -static int __init init_rapl_pmus(struct rapl_pmus **rapl_pmus_ptr, int rapl_pmu_scope) +static int __init init_rapl_pmus(struct rapl_pmus **rapl_pmus_ptr, int rapl_pmu_scope, + const struct attribute_group **rapl_attr_groups, + const struct attribute_group **rapl_attr_update) { int nr_rapl_pmu = topology_max_packages(); struct rapl_pmus *rapl_pmus; + /* + * rapl_pmu_scope must be either PKG, DIE or CORE + */ if (rapl_pmu_scope == PERF_PMU_SCOPE_DIE) nr_rapl_pmu *= topology_max_dies_per_package(); + else if (rapl_pmu_scope == PERF_PMU_SCOPE_CORE) + nr_rapl_pmu *= topology_num_cores_per_package(); + else if (rapl_pmu_scope != PERF_PMU_SCOPE_PKG) + return -EINVAL; rapl_pmus = kzalloc(struct_size(rapl_pmus, rapl_pmu, nr_rapl_pmu), GFP_KERNEL); if (!rapl_pmus) @@ -741,8 +832,10 @@ static struct rapl_model model_spr = { static struct rapl_model model_amd_hygon = { .pkg_events = BIT(PERF_RAPL_PKG), + .core_events = BIT(PERF_RAPL_CORE), .msr_power_unit = MSR_AMD_RAPL_POWER_UNIT, .rapl_pkg_msrs = amd_rapl_pkg_msrs, + .rapl_core_msrs = amd_rapl_core_msrs, }; static const struct x86_cpu_id rapl_model_match[] __initconst = { @@ -814,7 +907,8 @@ static int __init rapl_pmu_init(void) if (ret) return ret; - ret = init_rapl_pmus(&rapl_pmus_pkg, rapl_pkg_pmu_scope); + ret = init_rapl_pmus(&rapl_pmus_pkg, rapl_pkg_pmu_scope, rapl_attr_groups, + rapl_attr_update); if (ret) return ret; @@ -826,6 +920,27 @@ static int __init rapl_pmu_init(void) if (ret) goto out; + if (rapl_model->core_events) { + ret = init_rapl_pmus(&rapl_pmus_core, PERF_PMU_SCOPE_CORE, + rapl_core_attr_groups, + rapl_core_attr_update); + if (ret) { + pr_warn("power-core PMU initialization failed (%d)\n", ret); + goto core_init_failed; + } + + rapl_pmus_core->cntr_mask = perf_msr_probe(rapl_model->rapl_core_msrs, + PERF_RAPL_CORE_EVENTS_MAX, false, + (void *) &rapl_model->core_events); + + ret = perf_pmu_register(&rapl_pmus_core->pmu, "power_core", -1); + if (ret) { + pr_warn("power-core PMU registration failed (%d)\n", ret); + cleanup_rapl_pmus(rapl_pmus_core); + } + } + +core_init_failed: rapl_advertise(); return 0; @@ -838,6 +953,10 @@ module_init(rapl_pmu_init); static void __exit intel_rapl_exit(void) { + if (rapl_pmus_core) { + perf_pmu_unregister(&rapl_pmus_core->pmu); + cleanup_rapl_pmus(rapl_pmus_core); + } perf_pmu_unregister(&rapl_pmus_pkg->pmu); cleanup_rapl_pmus(rapl_pmus_pkg); } From 96450ead16527cbef559b5bd046182e731228f95 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 22 Nov 2024 09:44:14 -0800 Subject: [PATCH 026/224] 
seqlock: add raw_seqcount_try_begin Add raw_seqcount_try_begin(), which opens a read critical section of the given seqcount_t if the counter is even. This enables eliding the critical section entirely if the counter is odd, instead of doing the speculation knowing it will fail. Suggested-by: Peter Zijlstra Signed-off-by: Suren Baghdasaryan Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: David Hildenbrand Reviewed-by: Liam R. Howlett Link: https://lkml.kernel.org/r/20241122174416.1367052-1-surenb@google.com --- include/linux/seqlock.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index 5298765d6ca4..22c2c48b4265 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -318,6 +318,28 @@ SEQCOUNT_LOCKNAME(mutex, struct mutex, true, mutex) __seq; \ }) +/** + * raw_seqcount_try_begin() - begin a seqcount_t read critical section + * w/o lockdep and w/o counter stabilization + * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants + * + * Similar to raw_seqcount_begin(), except it enables eliding the critical + * section entirely if odd, instead of doing the speculation knowing it will + * fail. + * + * Useful when counter stabilization is more or less equivalent to taking + * the lock and there is a slowpath that does that. + * + * If true, start will be set to the (even) sequence count read. + * + * Return: true when a read critical section is started. + */ +#define raw_seqcount_try_begin(s, start) \ +({ \ + start = raw_read_seqcount(s); \ + !(start & 1); \ +}) + /** * raw_seqcount_begin() - begin a seqcount_t read critical section w/o * lockdep and w/o counter stabilization From 7528585290a1a1d4e0fb4b72261eb2d8c85de2d7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Nov 2024 12:47:48 +0100 Subject: [PATCH 027/224] mm/gup: Use raw_seqcount_try_begin() David pointed out that gup_fast() does exactly what the new raw_seqcount_try_begin() does -- use it. Suggested-by: David Hildenbrand Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: David Hildenbrand --- mm/gup.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index 746070a1d8bf..81ffbd8fec9c 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -3351,8 +3351,7 @@ static unsigned long gup_fast(unsigned long start, unsigned long end, return 0; if (gup_flags & FOLL_PIN) { - seq = raw_read_seqcount(&current->mm->write_protect_seq); - if (seq & 1) + if (!raw_seqcount_try_begin(&current->mm->write_protect_seq, seq)) return 0; } From eb449bd96954b1c1e491d19066cfd2a010f0aa47 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 22 Nov 2024 09:44:15 -0800 Subject: [PATCH 028/224] mm: convert mm_lock_seq to a proper seqcount Convert mm_lock_seq to be seqcount_t and change all mmap_write_lock variants to increment it, in line with the usual seqcount usage pattern. This lets us check whether the mmap_lock is write-locked by checking the mm_lock_seq.sequence counter (odd=locked, even=unlocked). This will be used when implementing mmap_lock speculation functions. As a result, vm_lock_seq is also changed to be unsigned to match the type of mm_lock_seq.sequence. Suggested-by: Peter Zijlstra Signed-off-by: Suren Baghdasaryan Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Liam R.
Howlett Link: https://lkml.kernel.org/r/20241122174416.1367052-2-surenb@google.com --- include/linux/mm.h | 12 ++-- include/linux/mm_types.h | 7 ++- include/linux/mmap_lock.h | 97 +++++++++++++++++++------------- kernel/fork.c | 5 +- mm/init-mm.c | 2 +- tools/testing/vma/vma.c | 4 +- tools/testing/vma/vma_internal.h | 4 +- 7 files changed, 74 insertions(+), 57 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index c39c4945946c..ca59d165f1f2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -710,7 +710,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma) * we don't rely on for anything - the mm_lock_seq read against which we * need ordering is below. */ - if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq)) + if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq.sequence)) return false; if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0)) @@ -727,7 +727,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma) * after it has been unlocked. * This pairs with RELEASE semantics in vma_end_write_all(). */ - if (unlikely(vma->vm_lock_seq == smp_load_acquire(&vma->vm_mm->mm_lock_seq))) { + if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&vma->vm_mm->mm_lock_seq))) { up_read(&vma->vm_lock->lock); return false; } @@ -742,7 +742,7 @@ static inline void vma_end_read(struct vm_area_struct *vma) } /* WARNING! Can only be used if mmap_lock is expected to be write-locked */ -static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq) +static bool __is_vma_write_locked(struct vm_area_struct *vma, unsigned int *mm_lock_seq) { mmap_assert_write_locked(vma->vm_mm); @@ -750,7 +750,7 @@ static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq) * current task is holding mmap_write_lock, both vma->vm_lock_seq and * mm->mm_lock_seq can't be concurrently modified. */ - *mm_lock_seq = vma->vm_mm->mm_lock_seq; + *mm_lock_seq = vma->vm_mm->mm_lock_seq.sequence; return (vma->vm_lock_seq == *mm_lock_seq); } @@ -761,7 +761,7 @@ static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq) */ static inline void vma_start_write(struct vm_area_struct *vma) { - int mm_lock_seq; + unsigned int mm_lock_seq; if (__is_vma_write_locked(vma, &mm_lock_seq)) return; @@ -779,7 +779,7 @@ static inline void vma_start_write(struct vm_area_struct *vma) static inline void vma_assert_write_locked(struct vm_area_struct *vma) { - int mm_lock_seq; + unsigned int mm_lock_seq; VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma); } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 7361a8f3ab68..97e2f4fe1d6c 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -697,7 +697,7 @@ struct vm_area_struct { * counter reuse can only lead to occasional unnecessary use of the * slowpath. */ - int vm_lock_seq; + unsigned int vm_lock_seq; /* Unstable RCU readers are allowed to read this. */ struct vma_lock *vm_lock; #endif @@ -891,6 +891,9 @@ struct mm_struct { * Roughly speaking, incrementing the sequence number is * equivalent to releasing locks on VMAs; reading the sequence * number can be part of taking a read lock on a VMA. + * Incremented every time mmap_lock is write-locked/unlocked. + * Initialized to 0, therefore odd values indicate mmap_lock + * is write-locked and even values that it's released. * * Can be modified under write mmap_lock using RELEASE * semantics. 
@@ -899,7 +902,7 @@ struct mm_struct { * Can be read with ACQUIRE semantics if not holding write * mmap_lock. */ - int mm_lock_seq; + seqcount_t mm_lock_seq; #endif diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index de9dc20b01ba..9715326f5a85 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -71,6 +71,62 @@ static inline void mmap_assert_write_locked(const struct mm_struct *mm) } #ifdef CONFIG_PER_VMA_LOCK +static inline void mm_lock_seqcount_init(struct mm_struct *mm) +{ + seqcount_init(&mm->mm_lock_seq); +} + +static inline void mm_lock_seqcount_begin(struct mm_struct *mm) +{ + do_raw_write_seqcount_begin(&mm->mm_lock_seq); +} + +static inline void mm_lock_seqcount_end(struct mm_struct *mm) +{ + ASSERT_EXCLUSIVE_WRITER(mm->mm_lock_seq); + do_raw_write_seqcount_end(&mm->mm_lock_seq); +} + +#else +static inline void mm_lock_seqcount_init(struct mm_struct *mm) {} +static inline void mm_lock_seqcount_begin(struct mm_struct *mm) {} +static inline void mm_lock_seqcount_end(struct mm_struct *mm) {} +#endif + +static inline void mmap_init_lock(struct mm_struct *mm) +{ + init_rwsem(&mm->mmap_lock); + mm_lock_seqcount_init(mm); +} + +static inline void mmap_write_lock(struct mm_struct *mm) +{ + __mmap_lock_trace_start_locking(mm, true); + down_write(&mm->mmap_lock); + mm_lock_seqcount_begin(mm); + __mmap_lock_trace_acquire_returned(mm, true, true); +} + +static inline void mmap_write_lock_nested(struct mm_struct *mm, int subclass) +{ + __mmap_lock_trace_start_locking(mm, true); + down_write_nested(&mm->mmap_lock, subclass); + mm_lock_seqcount_begin(mm); + __mmap_lock_trace_acquire_returned(mm, true, true); +} + +static inline int mmap_write_lock_killable(struct mm_struct *mm) +{ + int ret; + + __mmap_lock_trace_start_locking(mm, true); + ret = down_write_killable(&mm->mmap_lock); + if (!ret) + mm_lock_seqcount_begin(mm); + __mmap_lock_trace_acquire_returned(mm, true, ret == 0); + return ret; +} + /* * Drop all currently-held per-VMA locks. * This is called from the mmap_lock implementation directly before releasing @@ -82,46 +138,7 @@ static inline void mmap_assert_write_locked(const struct mm_struct *mm) static inline void vma_end_write_all(struct mm_struct *mm) { mmap_assert_write_locked(mm); - /* - * Nobody can concurrently modify mm->mm_lock_seq due to exclusive - * mmap_lock being held. - * We need RELEASE semantics here to ensure that preceding stores into - * the VMA take effect before we unlock it with this store. - * Pairs with ACQUIRE semantics in vma_start_read(). 
- */ - smp_store_release(&mm->mm_lock_seq, mm->mm_lock_seq + 1); -} -#else -static inline void vma_end_write_all(struct mm_struct *mm) {} -#endif - -static inline void mmap_init_lock(struct mm_struct *mm) -{ - init_rwsem(&mm->mmap_lock); -} - -static inline void mmap_write_lock(struct mm_struct *mm) -{ - __mmap_lock_trace_start_locking(mm, true); - down_write(&mm->mmap_lock); - __mmap_lock_trace_acquire_returned(mm, true, true); -} - -static inline void mmap_write_lock_nested(struct mm_struct *mm, int subclass) -{ - __mmap_lock_trace_start_locking(mm, true); - down_write_nested(&mm->mmap_lock, subclass); - __mmap_lock_trace_acquire_returned(mm, true, true); -} - -static inline int mmap_write_lock_killable(struct mm_struct *mm) -{ - int ret; - - __mmap_lock_trace_start_locking(mm, true); - ret = down_write_killable(&mm->mmap_lock); - __mmap_lock_trace_acquire_returned(mm, true, ret == 0); - return ret; + mm_lock_seqcount_end(mm); } static inline void mmap_write_unlock(struct mm_struct *mm) diff --git a/kernel/fork.c b/kernel/fork.c index 1450b461d196..8dc670fe90d4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -448,7 +448,7 @@ static bool vma_lock_alloc(struct vm_area_struct *vma) return false; init_rwsem(&vma->vm_lock->lock); - vma->vm_lock_seq = -1; + vma->vm_lock_seq = UINT_MAX; return true; } @@ -1267,9 +1267,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, seqcount_init(&mm->write_protect_seq); mmap_init_lock(mm); INIT_LIST_HEAD(&mm->mmlist); -#ifdef CONFIG_PER_VMA_LOCK - mm->mm_lock_seq = 0; -#endif mm_pgtables_bytes_init(mm); mm->map_count = 0; mm->locked_vm = 0; diff --git a/mm/init-mm.c b/mm/init-mm.c index 24c809379274..6af3ad675930 100644 --- a/mm/init-mm.c +++ b/mm/init-mm.c @@ -40,7 +40,7 @@ struct mm_struct init_mm = { .arg_lock = __SPIN_LOCK_UNLOCKED(init_mm.arg_lock), .mmlist = LIST_HEAD_INIT(init_mm.mmlist), #ifdef CONFIG_PER_VMA_LOCK - .mm_lock_seq = 0, + .mm_lock_seq = SEQCNT_ZERO(init_mm.mm_lock_seq), #endif .user_ns = &init_user_ns, .cpu_bitmap = CPU_BITS_NONE, diff --git a/tools/testing/vma/vma.c b/tools/testing/vma/vma.c index 8fab5e13c7c3..9bcf1736bf18 100644 --- a/tools/testing/vma/vma.c +++ b/tools/testing/vma/vma.c @@ -89,7 +89,7 @@ static struct vm_area_struct *alloc_and_link_vma(struct mm_struct *mm, * begun. Linking to the tree will have caused this to be incremented, * which means we will get a false positive otherwise. */ - vma->vm_lock_seq = -1; + vma->vm_lock_seq = UINT_MAX; return vma; } @@ -214,7 +214,7 @@ static bool vma_write_started(struct vm_area_struct *vma) int seq = vma->vm_lock_seq; /* We reset after each check. */ - vma->vm_lock_seq = -1; + vma->vm_lock_seq = UINT_MAX; /* The vma_start_write() stub simply increments this value. */ return seq > -1; diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index e76ff579e1fd..1d9fc97b8e80 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -241,7 +241,7 @@ struct vm_area_struct { * counter reuse can only lead to occasional unnecessary use of the * slowpath. 
*/ - int vm_lock_seq; + unsigned int vm_lock_seq; struct vma_lock *vm_lock; #endif @@ -416,7 +416,7 @@ static inline bool vma_lock_alloc(struct vm_area_struct *vma) return false; init_rwsem(&vma->vm_lock->lock); - vma->vm_lock_seq = -1; + vma->vm_lock_seq = UINT_MAX; return true; } From 03a001b156d2da186a5618de242750d06bf81e2d Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 22 Nov 2024 09:44:16 -0800 Subject: [PATCH 029/224] mm: introduce mmap_lock_speculate_{try_begin|retry} Add helper functions to speculatively perform operations without read-locking mmap_lock, expecting that mmap_lock will not be write-locked and mm is not modified from under us. Suggested-by: Peter Zijlstra Signed-off-by: Suren Baghdasaryan Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Liam R. Howlett Link: https://lkml.kernel.org/r/20241122174416.1367052-3-surenb@google.com --- include/linux/mmap_lock.h | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index 9715326f5a85..45a21faa3ff6 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -71,6 +71,7 @@ static inline void mmap_assert_write_locked(const struct mm_struct *mm) } #ifdef CONFIG_PER_VMA_LOCK + static inline void mm_lock_seqcount_init(struct mm_struct *mm) { seqcount_init(&mm->mm_lock_seq); @@ -87,11 +88,39 @@ static inline void mm_lock_seqcount_end(struct mm_struct *mm) do_raw_write_seqcount_end(&mm->mm_lock_seq); } -#else +static inline bool mmap_lock_speculate_try_begin(struct mm_struct *mm, unsigned int *seq) +{ + /* + * Since mmap_lock is a sleeping lock, and waiting for it to become + * unlocked is more or less equivalent with taking it ourselves, don't + * bother with the speculative path if mmap_lock is already write-locked + * and take the slow path, which takes the lock. + */ + return raw_seqcount_try_begin(&mm->mm_lock_seq, *seq); +} + +static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int seq) +{ + return read_seqcount_retry(&mm->mm_lock_seq, seq); +} + +#else /* CONFIG_PER_VMA_LOCK */ + static inline void mm_lock_seqcount_init(struct mm_struct *mm) {} static inline void mm_lock_seqcount_begin(struct mm_struct *mm) {} static inline void mm_lock_seqcount_end(struct mm_struct *mm) {} -#endif + +static inline bool mmap_lock_speculate_try_begin(struct mm_struct *mm, unsigned int *seq) +{ + return false; +} + +static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int seq) +{ + return true; +} + +#endif /* CONFIG_PER_VMA_LOCK */ static inline void mmap_init_lock(struct mm_struct *mm) { From 83e3dc9a5d4d7402adb24090a77327245d593129 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 21 Nov 2024 19:59:21 -0800 Subject: [PATCH 030/224] uprobes: simplify find_active_uprobe_rcu() VMA checks At the point where find_active_uprobe_rcu() is used we know that VMA in question has triggered software breakpoint, so we don't need to validate vma->vm_flags. Keep only vma->vm_file NULL check. 
Suggested-by: Oleg Nesterov Signed-off-by: Andrii Nakryiko Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Masami Hiramatsu (Google) Acked-by: Oleg Nesterov Link: https://lkml.kernel.org/r/20241122035922.3321100-2-andrii@kernel.org --- kernel/events/uprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index fa04b14a7d72..62c14dffa1ba 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -2304,7 +2304,7 @@ static struct uprobe *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swb mmap_read_lock(mm); vma = vma_lookup(mm, bp_vaddr); if (vma) { - if (valid_vma(vma, false)) { + if (vma->vm_file) { struct inode *inode = file_inode(vma->vm_file); loff_t offset = vaddr_to_offset(vma, bp_vaddr); From e0925f2dc4de2d8ba987392d3239e8edf88f8b96 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 21 Nov 2024 19:59:22 -0800 Subject: [PATCH 031/224] uprobes: add speculative lockless VMA-to-inode-to-uprobe resolution MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Given filp_cachep is marked SLAB_TYPESAFE_BY_RCU (and FMODE_BACKING files, a special case, now go through RCU-delayed freeing), we can safely access vma->vm_file->f_inode field locklessly under just rcu_read_lock() protection, which enables looking up uprobe from uprobes_tree completely locklessly and speculatively without the need to acquire mmap_lock for reads. In most cases, anyway, assuming that there are no parallel mm and/or VMA modifications. The underlying struct file's memory won't go away from under us (even if struct file can be reused in the meantime). We rely on newly added mmap_lock_speculate_{try_begin,retry}() helpers to validate that mm_struct stays intact for the entire duration of this speculation. If not, we fall back to mmap_lock-protected lookup. The speculative logic is written in such a way that it will safely handle any garbage values that might be read from vma or file structs. Benchmarking results speak for themselves.
BEFORE (latest tip/perf/core)
=============================
uprobe-nop ( 1 cpus):    3.384 ± 0.004M/s  (  3.384M/s/cpu)
uprobe-nop ( 2 cpus):    5.456 ± 0.005M/s  (  2.728M/s/cpu)
uprobe-nop ( 3 cpus):    7.863 ± 0.015M/s  (  2.621M/s/cpu)
uprobe-nop ( 4 cpus):    9.442 ± 0.008M/s  (  2.360M/s/cpu)
uprobe-nop ( 5 cpus):   11.036 ± 0.013M/s  (  2.207M/s/cpu)
uprobe-nop ( 6 cpus):   10.884 ± 0.019M/s  (  1.814M/s/cpu)
uprobe-nop ( 7 cpus):    7.897 ± 0.145M/s  (  1.128M/s/cpu)
uprobe-nop ( 8 cpus):   10.021 ± 0.128M/s  (  1.253M/s/cpu)
uprobe-nop (10 cpus):    9.932 ± 0.170M/s  (  0.993M/s/cpu)
uprobe-nop (12 cpus):    8.369 ± 0.056M/s  (  0.697M/s/cpu)
uprobe-nop (14 cpus):    8.678 ± 0.017M/s  (  0.620M/s/cpu)
uprobe-nop (16 cpus):    7.392 ± 0.003M/s  (  0.462M/s/cpu)
uprobe-nop (24 cpus):    5.326 ± 0.178M/s  (  0.222M/s/cpu)
uprobe-nop (32 cpus):    5.426 ± 0.059M/s  (  0.170M/s/cpu)
uprobe-nop (40 cpus):    5.262 ± 0.070M/s  (  0.132M/s/cpu)
uprobe-nop (48 cpus):    6.121 ± 0.010M/s  (  0.128M/s/cpu)
uprobe-nop (56 cpus):    6.252 ± 0.035M/s  (  0.112M/s/cpu)
uprobe-nop (64 cpus):    7.644 ± 0.023M/s  (  0.119M/s/cpu)
uprobe-nop (72 cpus):    7.781 ± 0.001M/s  (  0.108M/s/cpu)
uprobe-nop (80 cpus):    8.992 ± 0.048M/s  (  0.112M/s/cpu)

AFTER
=====
uprobe-nop ( 1 cpus):    3.534 ± 0.033M/s  (  3.534M/s/cpu)
uprobe-nop ( 2 cpus):    6.701 ± 0.007M/s  (  3.351M/s/cpu)
uprobe-nop ( 3 cpus):   10.031 ± 0.007M/s  (  3.344M/s/cpu)
uprobe-nop ( 4 cpus):   13.003 ± 0.012M/s  (  3.251M/s/cpu)
uprobe-nop ( 5 cpus):   16.274 ± 0.006M/s  (  3.255M/s/cpu)
uprobe-nop ( 6 cpus):   19.563 ± 0.024M/s  (  3.261M/s/cpu)
uprobe-nop ( 7 cpus):   22.696 ± 0.054M/s  (  3.242M/s/cpu)
uprobe-nop ( 8 cpus):   24.534 ± 0.010M/s  (  3.067M/s/cpu)
uprobe-nop (10 cpus):   30.475 ± 0.117M/s  (  3.047M/s/cpu)
uprobe-nop (12 cpus):   33.371 ± 0.017M/s  (  2.781M/s/cpu)
uprobe-nop (14 cpus):   38.864 ± 0.004M/s  (  2.776M/s/cpu)
uprobe-nop (16 cpus):   41.476 ± 0.020M/s  (  2.592M/s/cpu)
uprobe-nop (24 cpus):   64.696 ± 0.021M/s  (  2.696M/s/cpu)
uprobe-nop (32 cpus):   85.054 ± 0.027M/s  (  2.658M/s/cpu)
uprobe-nop (40 cpus):  101.979 ± 0.032M/s  (  2.549M/s/cpu)
uprobe-nop (48 cpus):  110.518 ± 0.056M/s  (  2.302M/s/cpu)
uprobe-nop (56 cpus):  117.737 ± 0.020M/s  (  2.102M/s/cpu)
uprobe-nop (64 cpus):  124.613 ± 0.079M/s  (  1.947M/s/cpu)
uprobe-nop (72 cpus):  133.239 ± 0.032M/s  (  1.851M/s/cpu)
uprobe-nop (80 cpus):  142.037 ± 0.138M/s  (  1.775M/s/cpu)

Previously total throughput was maxing out at 11mln/s, and gradually declining past 8 cores. With this change, it now keeps growing with each added CPU, reaching 142mln/s at 80 CPUs (this was measured on an 80-core Intel(R) Xeon(R) Gold 6138 CPU @ 2.00GHz).
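For context, the try_begin/retry pattern that the mmap_lock_speculate_*() helpers enable looks roughly like the sketch below. This is an illustrative outline only, not code from the patch: struct thing, do_lookup() and do_locked_lookup() are hypothetical placeholders; only the mmap_lock_speculate_*() and mmap_read_*lock() calls correspond to real interfaces from this series.

static struct thing *lookup_speculative(struct mm_struct *mm, unsigned long addr)
{
	struct thing *res;
	unsigned int seq;

	if (mmap_lock_speculate_try_begin(mm, &seq)) {
		/* Lockless walk; may observe garbage values, must merely not crash. */
		res = do_lookup(mm, addr);
		/* Validate that no writer ran while we speculated. */
		if (res && !mmap_lock_speculate_retry(mm, seq))
			return res;
	}

	/* Slowpath: a writer was (or became) active; take the lock. */
	mmap_read_lock(mm);
	res = do_locked_lookup(mm, addr);
	mmap_read_unlock(mm);
	return res;
}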
Suggested-by: Matthew Wilcox Suggested-by: Peter Zijlstra Signed-off-by: Andrii Nakryiko Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Oleg Nesterov Link: https://lkml.kernel.org/r/20241122035922.3321100-3-andrii@kernel.org --- kernel/events/uprobes.c | 45 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 62c14dffa1ba..daf4314961ab 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -2294,6 +2294,47 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) return is_trap_insn(&opcode); } +static struct uprobe *find_active_uprobe_speculative(unsigned long bp_vaddr) +{ + struct mm_struct *mm = current->mm; + struct uprobe *uprobe = NULL; + struct vm_area_struct *vma; + struct file *vm_file; + loff_t offset; + unsigned int seq; + + guard(rcu)(); + + if (!mmap_lock_speculate_try_begin(mm, &seq)) + return NULL; + + vma = vma_lookup(mm, bp_vaddr); + if (!vma) + return NULL; + + /* + * vm_file memory can be reused for another instance of struct file, + * but can't be freed from under us, so it's safe to read fields from + * it, even if the values are some garbage values; ultimately + * find_uprobe_rcu() + mmap_lock_speculation_end() check will ensure + * that whatever we speculatively found is correct + */ + vm_file = READ_ONCE(vma->vm_file); + if (!vm_file) + return NULL; + + offset = (loff_t)(vma->vm_pgoff << PAGE_SHIFT) + (bp_vaddr - vma->vm_start); + uprobe = find_uprobe_rcu(vm_file->f_inode, offset); + if (!uprobe) + return NULL; + + /* now double check that nothing about MM changed */ + if (mmap_lock_speculate_retry(mm, seq)) + return NULL; + + return uprobe; +} + /* assumes being inside RCU protected region */ static struct uprobe *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swbp) { @@ -2301,6 +2342,10 @@ static struct uprobe *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swb struct uprobe *uprobe = NULL; struct vm_area_struct *vma; + uprobe = find_active_uprobe_speculative(bp_vaddr); + if (uprobe) + return uprobe; + mmap_read_lock(mm); vma = vma_lookup(mm, bp_vaddr); if (vma) { From 2116b349e29a2e9ba17ea2e45b31234e4b350793 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 28 Nov 2024 10:38:52 +0100 Subject: [PATCH 032/224] objtool: Generic annotation infrastructure Avoid endless .discard.foo sections for each annotation, create a single .discard.annotate_insn section that takes an annotation type along with the instruction. Signed-off-by: Peter Zijlstra (Intel) Acked-by: Josh Poimboeuf Link: https://lore.kernel.org/r/20241128094310.932794537@infradead.org --- include/linux/objtool.h | 18 +++++++++++++++++ tools/objtool/check.c | 45 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+) diff --git a/include/linux/objtool.h b/include/linux/objtool.h index b3b8d3dab52d..d98531ecc687 100644 --- a/include/linux/objtool.h +++ b/include/linux/objtool.h @@ -57,6 +57,13 @@ ".long 998b\n\t" \ ".popsection\n\t" +#define ASM_ANNOTATE(type) \ + "911:\n\t" \ + ".pushsection .discard.annotate_insn,\"M\",@progbits,8\n\t" \ + ".long 911b - .\n\t" \ + ".long " __stringify(type) "\n\t" \ + ".popsection\n\t" + #else /* __ASSEMBLY__ */ /* @@ -146,6 +153,14 @@ .popsection .endm +.macro ANNOTATE type:req +.Lhere_\@: + .pushsection .discard.annotate_insn,"M",@progbits,8 + .long .Lhere_\@ - . 
+ .long \type + .popsection +.endm + #endif /* __ASSEMBLY__ */ #else /* !CONFIG_OBJTOOL */ @@ -155,6 +170,7 @@ #define UNWIND_HINT(type, sp_reg, sp_offset, signal) "\n\t" #define STACK_FRAME_NON_STANDARD(func) #define STACK_FRAME_NON_STANDARD_FP(func) +#define ASM_ANNOTATE(type) #define ANNOTATE_NOENDBR #define ASM_REACHABLE #else @@ -167,6 +183,8 @@ .endm .macro REACHABLE .endm +.macro ANNOTATE type:req +.endm #endif #endif /* CONFIG_OBJTOOL */ diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 4ce176ad411f..b0efc8ee16d6 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -2373,6 +2373,49 @@ static int read_unwind_hints(struct objtool_file *file) return 0; } +static int read_annotate(struct objtool_file *file, void (*func)(int type, struct instruction *insn)) +{ + struct section *sec; + struct instruction *insn; + struct reloc *reloc; + int type; + + sec = find_section_by_name(file->elf, ".discard.annotate_insn"); + if (!sec) + return 0; + + if (!sec->rsec) + return 0; + + if (sec->sh.sh_entsize != 8) { + static bool warned = false; + if (!warned) { + WARN("%s: dodgy linker, sh_entsize != 8", sec->name); + warned = true; + } + sec->sh.sh_entsize = 8; + } + + for_each_reloc(sec->rsec, reloc) { + type = *(u32 *)(sec->data->d_buf + (reloc_idx(reloc) * sec->sh.sh_entsize) + 4); + + insn = find_insn(file, reloc->sym->sec, + reloc->sym->offset + reloc_addend(reloc)); + if (!insn) { + WARN("bad .discard.annotate_insn entry: %d of type %d", reloc_idx(reloc), type); + return -1; + } + + func(type, insn); + } + + return 0; +} + +static void __annotate_nop(int type, struct instruction *insn) +{ +} + static int read_noendbr_hints(struct objtool_file *file) { struct instruction *insn; @@ -2670,6 +2713,8 @@ static int decode_sections(struct objtool_file *file) if (ret) return ret; + read_annotate(file, __annotate_nop); + /* * Must be before read_unwind_hints() since that needs insn->noendbr. */ From 22c3d58079688b697f36d670616e463cbb14d058 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 28 Nov 2024 10:38:53 +0100 Subject: [PATCH 033/224] objtool: Convert ANNOTATE_NOENDBR to ANNOTATE Signed-off-by: Peter Zijlstra (Intel) Acked-by: Josh Poimboeuf Link: https://lore.kernel.org/r/20241128094311.042140333@infradead.org --- include/linux/objtool.h | 17 ++++----------- include/linux/objtool_types.h | 5 +++++ tools/include/linux/objtool_types.h | 5 +++++ tools/objtool/check.c | 32 +++++------------------------ 4 files changed, 19 insertions(+), 40 deletions(-) diff --git a/include/linux/objtool.h b/include/linux/objtool.h index d98531ecc687..b5e9c0ab4048 100644 --- a/include/linux/objtool.h +++ b/include/linux/objtool.h @@ -45,12 +45,6 @@ #define STACK_FRAME_NON_STANDARD_FP(func) #endif -#define ANNOTATE_NOENDBR \ - "986: \n\t" \ - ".pushsection .discard.noendbr\n\t" \ - ".long 986b\n\t" \ - ".popsection\n\t" - #define ASM_REACHABLE \ "998:\n\t" \ ".pushsection .discard.reachable\n\t" \ @@ -64,6 +58,8 @@ ".long " __stringify(type) "\n\t" \ ".popsection\n\t" +#define ANNOTATE_NOENDBR ASM_ANNOTATE(ANNOTYPE_NOENDBR) + #else /* __ASSEMBLY__ */ /* @@ -122,13 +118,6 @@ #endif .endm -.macro ANNOTATE_NOENDBR -.Lhere_\@: - .pushsection .discard.noendbr - .long .Lhere_\@ - .popsection -.endm - /* * Use objtool to validate the entry requirement that all code paths do * VALIDATE_UNRET_END before RET. 
@@ -161,6 +150,8 @@ .popsection .endm +#define ANNOTATE_NOENDBR ANNOTATE type=ANNOTYPE_NOENDBR + #endif /* __ASSEMBLY__ */ #else /* !CONFIG_OBJTOOL */ diff --git a/include/linux/objtool_types.h b/include/linux/objtool_types.h index 453a4f4ef39d..4884f8cf8429 100644 --- a/include/linux/objtool_types.h +++ b/include/linux/objtool_types.h @@ -54,4 +54,9 @@ struct unwind_hint { #define UNWIND_HINT_TYPE_SAVE 6 #define UNWIND_HINT_TYPE_RESTORE 7 +/* + * Annotate types + */ +#define ANNOTYPE_NOENDBR 1 + #endif /* _LINUX_OBJTOOL_TYPES_H */ diff --git a/tools/include/linux/objtool_types.h b/tools/include/linux/objtool_types.h index 453a4f4ef39d..4884f8cf8429 100644 --- a/tools/include/linux/objtool_types.h +++ b/tools/include/linux/objtool_types.h @@ -54,4 +54,9 @@ struct unwind_hint { #define UNWIND_HINT_TYPE_SAVE 6 #define UNWIND_HINT_TYPE_RESTORE 7 +/* + * Annotate types + */ +#define ANNOTYPE_NOENDBR 1 + #endif /* _LINUX_OBJTOOL_TYPES_H */ diff --git a/tools/objtool/check.c b/tools/objtool/check.c index b0efc8ee16d6..a74ff26860f7 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -2412,32 +2412,12 @@ static int read_annotate(struct objtool_file *file, void (*func)(int type, struc return 0; } -static void __annotate_nop(int type, struct instruction *insn) +static void __annotate_noendbr(int type, struct instruction *insn) { -} + if (type != ANNOTYPE_NOENDBR) + return; -static int read_noendbr_hints(struct objtool_file *file) -{ - struct instruction *insn; - struct section *rsec; - struct reloc *reloc; - - rsec = find_section_by_name(file->elf, ".rela.discard.noendbr"); - if (!rsec) - return 0; - - for_each_reloc(rsec, reloc) { - insn = find_insn(file, reloc->sym->sec, - reloc->sym->offset + reloc_addend(reloc)); - if (!insn) { - WARN("bad .discard.noendbr entry"); - return -1; - } - - insn->noendbr = 1; - } - - return 0; + insn->noendbr = 1; } static int read_retpoline_hints(struct objtool_file *file) @@ -2713,12 +2693,10 @@ static int decode_sections(struct objtool_file *file) if (ret) return ret; - read_annotate(file, __annotate_nop); - /* * Must be before read_unwind_hints() since that needs insn->noendbr. */ - ret = read_noendbr_hints(file); + ret = read_annotate(file, __annotate_noendbr); if (ret) return ret; From bf5febebd99fddfc6226a94e937d38a8d470b24e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 28 Nov 2024 10:38:54 +0100 Subject: [PATCH 034/224] objtool: Convert ANNOTATE_RETPOLINE_SAFE to ANNOTATE Signed-off-by: Peter Zijlstra (Intel) Acked-by: Josh Poimboeuf Link: https://lore.kernel.org/r/20241128094311.145275669@infradead.org --- arch/x86/include/asm/nospec-branch.h | 13 +------ include/linux/objtool_types.h | 1 + tools/include/linux/objtool_types.h | 1 + tools/objtool/check.c | 58 ++++++++++------------------ 4 files changed, 25 insertions(+), 48 deletions(-) diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 96b410b1d4e8..50340a125953 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -184,12 +184,7 @@ * objtool the subsequent indirect jump/call is vouched safe for retpoline * builds. 
*/ -.macro ANNOTATE_RETPOLINE_SAFE -.Lhere_\@: - .pushsection .discard.retpoline_safe - .long .Lhere_\@ - .popsection -.endm +#define ANNOTATE_RETPOLINE_SAFE ANNOTATE type=ANNOTYPE_RETPOLINE_SAFE /* * (ab)use RETPOLINE_SAFE on RET to annotate away 'bare' RET instructions @@ -350,11 +345,7 @@ #else /* __ASSEMBLY__ */ -#define ANNOTATE_RETPOLINE_SAFE \ - "999:\n\t" \ - ".pushsection .discard.retpoline_safe\n\t" \ - ".long 999b\n\t" \ - ".popsection\n\t" +#define ANNOTATE_RETPOLINE_SAFE ASM_ANNOTATE(ANNOTYPE_RETPOLINE_SAFE) typedef u8 retpoline_thunk_t[RETPOLINE_THUNK_SIZE]; extern retpoline_thunk_t __x86_indirect_thunk_array[]; diff --git a/include/linux/objtool_types.h b/include/linux/objtool_types.h index 4884f8cf8429..1b348361ad1d 100644 --- a/include/linux/objtool_types.h +++ b/include/linux/objtool_types.h @@ -58,5 +58,6 @@ struct unwind_hint { * Annotate types */ #define ANNOTYPE_NOENDBR 1 +#define ANNOTYPE_RETPOLINE_SAFE 2 #endif /* _LINUX_OBJTOOL_TYPES_H */ diff --git a/tools/include/linux/objtool_types.h b/tools/include/linux/objtool_types.h index 4884f8cf8429..1b348361ad1d 100644 --- a/tools/include/linux/objtool_types.h +++ b/tools/include/linux/objtool_types.h @@ -58,5 +58,6 @@ struct unwind_hint { * Annotate types */ #define ANNOTYPE_NOENDBR 1 +#define ANNOTYPE_RETPOLINE_SAFE 2 #endif /* _LINUX_OBJTOOL_TYPES_H */ diff --git a/tools/objtool/check.c b/tools/objtool/check.c index a74ff26860f7..c5b52309b80d 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -2373,12 +2373,12 @@ static int read_unwind_hints(struct objtool_file *file) return 0; } -static int read_annotate(struct objtool_file *file, void (*func)(int type, struct instruction *insn)) +static int read_annotate(struct objtool_file *file, int (*func)(int type, struct instruction *insn)) { struct section *sec; struct instruction *insn; struct reloc *reloc; - int type; + int type, ret; sec = find_section_by_name(file->elf, ".discard.annotate_insn"); if (!sec) @@ -2406,53 +2406,37 @@ static int read_annotate(struct objtool_file *file, void (*func)(int type, struc return -1; } - func(type, insn); + ret = func(type, insn); + if (ret < 0) + return ret; } return 0; } -static void __annotate_noendbr(int type, struct instruction *insn) +static int __annotate_noendbr(int type, struct instruction *insn) { if (type != ANNOTYPE_NOENDBR) - return; - - insn->noendbr = 1; -} - -static int read_retpoline_hints(struct objtool_file *file) -{ - struct section *rsec; - struct instruction *insn; - struct reloc *reloc; - - rsec = find_section_by_name(file->elf, ".rela.discard.retpoline_safe"); - if (!rsec) return 0; - for_each_reloc(rsec, reloc) { - if (reloc->sym->type != STT_SECTION) { - WARN("unexpected relocation symbol type in %s", rsec->name); - return -1; - } + insn->noendbr = 1; + return 0; +} - insn = find_insn(file, reloc->sym->sec, reloc_addend(reloc)); - if (!insn) { - WARN("bad .discard.retpoline_safe entry"); - return -1; - } +static int __annotate_retpoline_safe(int type, struct instruction *insn) +{ + if (type != ANNOTYPE_RETPOLINE_SAFE) + return 0; - if (insn->type != INSN_JUMP_DYNAMIC && - insn->type != INSN_CALL_DYNAMIC && - insn->type != INSN_RETURN && - insn->type != INSN_NOP) { - WARN_INSN(insn, "retpoline_safe hint not an indirect jump/call/ret/nop"); - return -1; - } - - insn->retpoline_safe = true; + if (insn->type != INSN_JUMP_DYNAMIC && + insn->type != INSN_CALL_DYNAMIC && + insn->type != INSN_RETURN && + insn->type != INSN_NOP) { + WARN_INSN(insn, "retpoline_safe hint not an indirect jump/call/ret/nop"); 
+ return -1; } + insn->retpoline_safe = true; return 0; } @@ -2742,7 +2726,7 @@ static int decode_sections(struct objtool_file *file) if (ret) return ret; - ret = read_retpoline_hints(file); + ret = read_annotate(file, __annotate_retpoline_safe); if (ret) return ret; From 317f2a64618c528539d17fe6957a64106087fbd2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 28 Nov 2024 10:38:55 +0100 Subject: [PATCH 035/224] objtool: Convert instrumentation_{begin,end}() to ANNOTATE Signed-off-by: Peter Zijlstra (Intel) Acked-by: Josh Poimboeuf Link: https://lore.kernel.org/r/20241128094311.245980207@infradead.org --- include/linux/instrumentation.h | 11 +++--- include/linux/objtool.h | 12 +++++-- include/linux/objtool_types.h | 2 ++ tools/include/linux/objtool_types.h | 2 ++ tools/objtool/check.c | 53 +++++++---------------------- 5 files changed, 30 insertions(+), 50 deletions(-) diff --git a/include/linux/instrumentation.h b/include/linux/instrumentation.h index bc7babe91b2e..c8f866cf02d8 100644 --- a/include/linux/instrumentation.h +++ b/include/linux/instrumentation.h @@ -4,14 +4,14 @@ #ifdef CONFIG_NOINSTR_VALIDATION +#include #include /* Begin/end of an instrumentation safe region */ #define __instrumentation_begin(c) ({ \ asm volatile(__stringify(c) ": nop\n\t" \ - ".pushsection .discard.instr_begin\n\t" \ - ".long " __stringify(c) "b - .\n\t" \ - ".popsection\n\t" : : "i" (c)); \ + __ASM_ANNOTATE(__ASM_BREF(c), ANNOTYPE_INSTR_BEGIN)\ + : : "i" (c)); \ }) #define instrumentation_begin() __instrumentation_begin(__COUNTER__) @@ -48,9 +48,8 @@ */ #define __instrumentation_end(c) ({ \ asm volatile(__stringify(c) ": nop\n\t" \ - ".pushsection .discard.instr_end\n\t" \ - ".long " __stringify(c) "b - .\n\t" \ - ".popsection\n\t" : : "i" (c)); \ + __ASM_ANNOTATE(__ASM_BREF(c), ANNOTYPE_INSTR_END) \ + : : "i" (c)); \ }) #define instrumentation_end() __instrumentation_end(__COUNTER__) #else /* !CONFIG_NOINSTR_VALIDATION */ diff --git a/include/linux/objtool.h b/include/linux/objtool.h index b5e9c0ab4048..89c67cd7eebe 100644 --- a/include/linux/objtool.h +++ b/include/linux/objtool.h @@ -51,13 +51,18 @@ ".long 998b\n\t" \ ".popsection\n\t" -#define ASM_ANNOTATE(type) \ - "911:\n\t" \ +#define __ASM_BREF(label) label ## b + +#define __ASM_ANNOTATE(label, type) \ ".pushsection .discard.annotate_insn,\"M\",@progbits,8\n\t" \ - ".long 911b - .\n\t" \ + ".long " __stringify(label) " - .\n\t" \ ".long " __stringify(type) "\n\t" \ ".popsection\n\t" +#define ASM_ANNOTATE(type) \ + "911:\n\t" \ + __ASM_ANNOTATE(911b, type) + #define ANNOTATE_NOENDBR ASM_ANNOTATE(ANNOTYPE_NOENDBR) #else /* __ASSEMBLY__ */ @@ -161,6 +166,7 @@ #define UNWIND_HINT(type, sp_reg, sp_offset, signal) "\n\t" #define STACK_FRAME_NON_STANDARD(func) #define STACK_FRAME_NON_STANDARD_FP(func) +#define __ASM_ANNOTATE(label, type) #define ASM_ANNOTATE(type) #define ANNOTATE_NOENDBR #define ASM_REACHABLE diff --git a/include/linux/objtool_types.h b/include/linux/objtool_types.h index 1b348361ad1d..d4d68dd36f7a 100644 --- a/include/linux/objtool_types.h +++ b/include/linux/objtool_types.h @@ -59,5 +59,7 @@ struct unwind_hint { */ #define ANNOTYPE_NOENDBR 1 #define ANNOTYPE_RETPOLINE_SAFE 2 +#define ANNOTYPE_INSTR_BEGIN 3 +#define ANNOTYPE_INSTR_END 4 #endif /* _LINUX_OBJTOOL_TYPES_H */ diff --git a/tools/include/linux/objtool_types.h b/tools/include/linux/objtool_types.h index 1b348361ad1d..d4d68dd36f7a 100644 --- a/tools/include/linux/objtool_types.h +++ b/tools/include/linux/objtool_types.h @@ -59,5 +59,7 @@ struct unwind_hint { */ #define 
ANNOTYPE_NOENDBR 1 #define ANNOTYPE_RETPOLINE_SAFE 2 +#define ANNOTYPE_INSTR_BEGIN 3 +#define ANNOTYPE_INSTR_END 4 #endif /* _LINUX_OBJTOOL_TYPES_H */ diff --git a/tools/objtool/check.c b/tools/objtool/check.c index c5b52309b80d..8e39c7f484d8 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -2440,48 +2440,19 @@ static int __annotate_retpoline_safe(int type, struct instruction *insn) return 0; } -static int read_instr_hints(struct objtool_file *file) +static int __annotate_instr(int type, struct instruction *insn) { - struct section *rsec; - struct instruction *insn; - struct reloc *reloc; - - rsec = find_section_by_name(file->elf, ".rela.discard.instr_end"); - if (!rsec) - return 0; - - for_each_reloc(rsec, reloc) { - if (reloc->sym->type != STT_SECTION) { - WARN("unexpected relocation symbol type in %s", rsec->name); - return -1; - } - - insn = find_insn(file, reloc->sym->sec, reloc_addend(reloc)); - if (!insn) { - WARN("bad .discard.instr_end entry"); - return -1; - } - - insn->instr--; - } - - rsec = find_section_by_name(file->elf, ".rela.discard.instr_begin"); - if (!rsec) - return 0; - - for_each_reloc(rsec, reloc) { - if (reloc->sym->type != STT_SECTION) { - WARN("unexpected relocation symbol type in %s", rsec->name); - return -1; - } - - insn = find_insn(file, reloc->sym->sec, reloc_addend(reloc)); - if (!insn) { - WARN("bad .discard.instr_begin entry"); - return -1; - } - + switch (type) { + case ANNOTYPE_INSTR_BEGIN: insn->instr++; + break; + + case ANNOTYPE_INSTR_END: + insn->instr--; + break; + + default: + break; } return 0; @@ -2730,7 +2701,7 @@ static int decode_sections(struct objtool_file *file) if (ret) return ret; - ret = read_instr_hints(file); + ret = read_annotate(file, __annotate_instr); if (ret) return ret; From 18aa6118a1689b4d73c5ebbd917ae3f20c9c0db1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 28 Nov 2024 10:38:56 +0100 Subject: [PATCH 036/224] objtool: Convert VALIDATE_UNRET_BEGIN to ANNOTATE Signed-off-by: Peter Zijlstra (Intel) Acked-by: Josh Poimboeuf Link: https://lore.kernel.org/r/20241128094311.358508242@infradead.org --- include/linux/objtool.h | 9 +++------ include/linux/objtool_types.h | 1 + tools/include/linux/objtool_types.h | 1 + tools/objtool/check.c | 28 +++++----------------------- 4 files changed, 10 insertions(+), 29 deletions(-) diff --git a/include/linux/objtool.h b/include/linux/objtool.h index 89c67cd7eebe..5f0bf8052dc7 100644 --- a/include/linux/objtool.h +++ b/include/linux/objtool.h @@ -130,15 +130,12 @@ * NOTE: The macro must be used at the beginning of a global symbol, otherwise * it will be ignored. */ -.macro VALIDATE_UNRET_BEGIN #if defined(CONFIG_NOINSTR_VALIDATION) && \ (defined(CONFIG_MITIGATION_UNRET_ENTRY) || defined(CONFIG_MITIGATION_SRSO)) -.Lhere_\@: - .pushsection .discard.validate_unret - .long .Lhere_\@ - . 
- .popsection +#define VALIDATE_UNRET_BEGIN ANNOTATE type=ANNOTYPE_UNRET_BEGIN +#else +#define VALIDATE_UNRET_BEGIN #endif -.endm .macro REACHABLE .Lhere_\@: diff --git a/include/linux/objtool_types.h b/include/linux/objtool_types.h index d4d68dd36f7a..16236a56364b 100644 --- a/include/linux/objtool_types.h +++ b/include/linux/objtool_types.h @@ -61,5 +61,6 @@ struct unwind_hint { #define ANNOTYPE_RETPOLINE_SAFE 2 #define ANNOTYPE_INSTR_BEGIN 3 #define ANNOTYPE_INSTR_END 4 +#define ANNOTYPE_UNRET_BEGIN 5 #endif /* _LINUX_OBJTOOL_TYPES_H */ diff --git a/tools/include/linux/objtool_types.h b/tools/include/linux/objtool_types.h index d4d68dd36f7a..16236a56364b 100644 --- a/tools/include/linux/objtool_types.h +++ b/tools/include/linux/objtool_types.h @@ -61,5 +61,6 @@ struct unwind_hint { #define ANNOTYPE_RETPOLINE_SAFE 2 #define ANNOTYPE_INSTR_BEGIN 3 #define ANNOTYPE_INSTR_END 4 +#define ANNOTYPE_UNRET_BEGIN 5 #endif /* _LINUX_OBJTOOL_TYPES_H */ diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 8e39c7f484d8..2a703748cad1 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -2458,33 +2458,15 @@ static int __annotate_instr(int type, struct instruction *insn) return 0; } -static int read_validate_unret_hints(struct objtool_file *file) +static int __annotate_unret(int type, struct instruction *insn) { - struct section *rsec; - struct instruction *insn; - struct reloc *reloc; - - rsec = find_section_by_name(file->elf, ".rela.discard.validate_unret"); - if (!rsec) + if (type != ANNOTYPE_UNRET_BEGIN) return 0; - for_each_reloc(rsec, reloc) { - if (reloc->sym->type != STT_SECTION) { - WARN("unexpected relocation symbol type in %s", rsec->name); - return -1; - } - - insn = find_insn(file, reloc->sym->sec, reloc_addend(reloc)); - if (!insn) { - WARN("bad .discard.instr_end entry"); - return -1; - } - insn->unret = 1; - } - + insn->unret = 1; return 0; -} +} static int read_intra_function_calls(struct objtool_file *file) { @@ -2705,7 +2687,7 @@ static int decode_sections(struct objtool_file *file) if (ret) return ret; - ret = read_validate_unret_hints(file); + ret = read_annotate(file, __annotate_unret); if (ret) return ret; From f0cd57c35a75f152d3b31b9be3f7f413b96a6d3f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 28 Nov 2024 10:38:57 +0100 Subject: [PATCH 037/224] objtool: Convert ANNOTATE_IGNORE_ALTERNATIVE to ANNOTATE Signed-off-by: Peter Zijlstra (Intel) Acked-by: Josh Poimboeuf Link: https://lore.kernel.org/r/20241128094311.465691316@infradead.org --- arch/x86/include/asm/alternative.h | 14 ++------- include/linux/objtool_types.h | 1 + tools/include/linux/objtool_types.h | 1 + tools/objtool/check.c | 45 +++++++---------------------- 4 files changed, 15 insertions(+), 46 deletions(-) diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index dc03a647776d..595695f85f80 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -4,6 +4,7 @@ #include #include +#include #include #define ALT_FLAGS_SHIFT 16 @@ -58,11 +59,7 @@ * objtool annotation to ignore the alternatives and only consider the original * instruction(s). 
*/ -#define ANNOTATE_IGNORE_ALTERNATIVE \ - "999:\n\t" \ - ".pushsection .discard.ignore_alts\n\t" \ - ".long 999b\n\t" \ - ".popsection\n\t" +#define ANNOTATE_IGNORE_ALTERNATIVE ASM_ANNOTATE(ANNOTYPE_IGNORE_ALTS) /* * The patching flags are part of the upper bits of the @ft_flags parameter when @@ -314,12 +311,7 @@ void nop_func(void); * objtool annotation to ignore the alternatives and only consider the original * instruction(s). */ -.macro ANNOTATE_IGNORE_ALTERNATIVE - .Lannotate_\@: - .pushsection .discard.ignore_alts - .long .Lannotate_\@ - .popsection -.endm +#define ANNOTATE_IGNORE_ALTERNATIVE ANNOTATE type=ANNOTYPE_IGNORE_ALTS /* * Issue one struct alt_instr descriptor entry (need to put it into diff --git a/include/linux/objtool_types.h b/include/linux/objtool_types.h index 16236a56364b..eab15dbe1cb7 100644 --- a/include/linux/objtool_types.h +++ b/include/linux/objtool_types.h @@ -62,5 +62,6 @@ struct unwind_hint { #define ANNOTYPE_INSTR_BEGIN 3 #define ANNOTYPE_INSTR_END 4 #define ANNOTYPE_UNRET_BEGIN 5 +#define ANNOTYPE_IGNORE_ALTS 6 #endif /* _LINUX_OBJTOOL_TYPES_H */ diff --git a/tools/include/linux/objtool_types.h b/tools/include/linux/objtool_types.h index 16236a56364b..eab15dbe1cb7 100644 --- a/tools/include/linux/objtool_types.h +++ b/tools/include/linux/objtool_types.h @@ -62,5 +62,6 @@ struct unwind_hint { #define ANNOTYPE_INSTR_BEGIN 3 #define ANNOTYPE_INSTR_END 4 #define ANNOTYPE_UNRET_BEGIN 5 +#define ANNOTYPE_IGNORE_ALTS 6 #endif /* _LINUX_OBJTOOL_TYPES_H */ diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 2a703748cad1..ba2cb9b69399 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -1309,40 +1309,6 @@ static void add_uaccess_safe(struct objtool_file *file) } } -/* - * FIXME: For now, just ignore any alternatives which add retpolines. This is - * a temporary hack, as it doesn't allow ORC to unwind from inside a retpoline. - * But it at least allows objtool to understand the control flow *around* the - * retpoline. - */ -static int add_ignore_alternatives(struct objtool_file *file) -{ - struct section *rsec; - struct reloc *reloc; - struct instruction *insn; - - rsec = find_section_by_name(file->elf, ".rela.discard.ignore_alts"); - if (!rsec) - return 0; - - for_each_reloc(rsec, reloc) { - if (reloc->sym->type != STT_SECTION) { - WARN("unexpected relocation symbol type in %s", rsec->name); - return -1; - } - - insn = find_insn(file, reloc->sym->sec, reloc_addend(reloc)); - if (!insn) { - WARN("bad .discard.ignore_alts entry"); - return -1; - } - - insn->ignore_alts = true; - } - - return 0; -} - /* * Symbols that replace INSN_CALL_DYNAMIC, every (tail) call to such a symbol * will be added to the .retpoline_sites section. 
@@ -2414,6 +2380,15 @@ static int read_annotate(struct objtool_file *file, int (*func)(int type, struct return 0; } +static int __annotate_ignore_alts(int type, struct instruction *insn) +{ + if (type != ANNOTYPE_IGNORE_ALTS) + return 0; + + insn->ignore_alts = true; + return 0; +} + static int __annotate_noendbr(int type, struct instruction *insn) { if (type != ANNOTYPE_NOENDBR) @@ -2626,7 +2601,7 @@ static int decode_sections(struct objtool_file *file) add_ignores(file); add_uaccess_safe(file); - ret = add_ignore_alternatives(file); + ret = read_annotate(file, __annotate_ignore_alts); if (ret) return ret; From 112765ca1cb9353e71b4f5af4e6e6c4a69c28d99 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 28 Nov 2024 10:38:58 +0100 Subject: [PATCH 038/224] objtool: Convert ANNOTATE_INTRA_FUNCTION_CALL to ANNOTATE Signed-off-by: Peter Zijlstra (Intel) Acked-by: Josh Poimboeuf Link: https://lore.kernel.org/r/20241128094311.584892071@infradead.org --- include/linux/objtool.h | 16 ++--- include/linux/objtool_types.h | 1 + tools/include/linux/objtool_types.h | 1 + tools/objtool/check.c | 96 ++++++++++++----------------- 4 files changed, 47 insertions(+), 67 deletions(-) diff --git a/include/linux/objtool.h b/include/linux/objtool.h index 5f0bf8052dc7..42287c1e32ce 100644 --- a/include/linux/objtool.h +++ b/include/linux/objtool.h @@ -67,16 +67,6 @@ #else /* __ASSEMBLY__ */ -/* - * This macro indicates that the following intra-function call is valid. - * Any non-annotated intra-function call will cause objtool to issue a warning. - */ -#define ANNOTATE_INTRA_FUNCTION_CALL \ - 999: \ - .pushsection .discard.intra_function_calls; \ - .long 999b; \ - .popsection; - /* * In asm, there are two kinds of code: normal C-type callable functions and * the rest. The normal callable functions can be called by other code, and @@ -154,6 +144,12 @@ #define ANNOTATE_NOENDBR ANNOTATE type=ANNOTYPE_NOENDBR +/* + * This macro indicates that the following intra-function call is valid. + * Any non-annotated intra-function call will cause objtool to issue a warning. 
+ */ +#define ANNOTATE_INTRA_FUNCTION_CALL ANNOTATE type=ANNOTYPE_INTRA_FUNCTION_CALL + #endif /* __ASSEMBLY__ */ #else /* !CONFIG_OBJTOOL */ diff --git a/include/linux/objtool_types.h b/include/linux/objtool_types.h index eab15dbe1cb7..23d6fb6d04c7 100644 --- a/include/linux/objtool_types.h +++ b/include/linux/objtool_types.h @@ -63,5 +63,6 @@ struct unwind_hint { #define ANNOTYPE_INSTR_END 4 #define ANNOTYPE_UNRET_BEGIN 5 #define ANNOTYPE_IGNORE_ALTS 6 +#define ANNOTYPE_INTRA_FUNCTION_CALL 7 #endif /* _LINUX_OBJTOOL_TYPES_H */ diff --git a/tools/include/linux/objtool_types.h b/tools/include/linux/objtool_types.h index eab15dbe1cb7..23d6fb6d04c7 100644 --- a/tools/include/linux/objtool_types.h +++ b/tools/include/linux/objtool_types.h @@ -63,5 +63,6 @@ struct unwind_hint { #define ANNOTYPE_INSTR_END 4 #define ANNOTYPE_UNRET_BEGIN 5 #define ANNOTYPE_IGNORE_ALTS 6 +#define ANNOTYPE_INTRA_FUNCTION_CALL 7 #endif /* _LINUX_OBJTOOL_TYPES_H */ diff --git a/tools/objtool/check.c b/tools/objtool/check.c index ba2cb9b69399..2222fe710832 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -2339,7 +2339,8 @@ static int read_unwind_hints(struct objtool_file *file) return 0; } -static int read_annotate(struct objtool_file *file, int (*func)(int type, struct instruction *insn)) +static int read_annotate(struct objtool_file *file, + int (*func)(struct objtool_file *file, int type, struct instruction *insn)) { struct section *sec; struct instruction *insn; @@ -2372,7 +2373,7 @@ static int read_annotate(struct objtool_file *file, int (*func)(int type, struct return -1; } - ret = func(type, insn); + ret = func(file, type, insn); if (ret < 0) return ret; } @@ -2380,7 +2381,7 @@ static int read_annotate(struct objtool_file *file, int (*func)(int type, struct return 0; } -static int __annotate_ignore_alts(int type, struct instruction *insn) +static int __annotate_ignore_alts(struct objtool_file *file, int type, struct instruction *insn) { if (type != ANNOTYPE_IGNORE_ALTS) return 0; @@ -2389,7 +2390,7 @@ static int __annotate_ignore_alts(int type, struct instruction *insn) return 0; } -static int __annotate_noendbr(int type, struct instruction *insn) +static int __annotate_noendbr(struct objtool_file *file, int type, struct instruction *insn) { if (type != ANNOTYPE_NOENDBR) return 0; @@ -2398,7 +2399,37 @@ static int __annotate_noendbr(int type, struct instruction *insn) return 0; } -static int __annotate_retpoline_safe(int type, struct instruction *insn) +static int __annotate_ifc(struct objtool_file *file, int type, struct instruction *insn) +{ + unsigned long dest_off; + + if (type != ANNOTYPE_INTRA_FUNCTION_CALL) + return 0; + + if (insn->type != INSN_CALL) { + WARN_INSN(insn, "intra_function_call not a direct call"); + return -1; + } + + /* + * Treat intra-function CALLs as JMPs, but with a stack_op. + * See add_call_destinations(), which strips stack_ops from + * normal CALLs. 
+ */ + insn->type = INSN_JUMP_UNCONDITIONAL; + + dest_off = arch_jump_destination(insn); + insn->jump_dest = find_insn(file, insn->sec, dest_off); + if (!insn->jump_dest) { + WARN_INSN(insn, "can't find call dest at %s+0x%lx", + insn->sec->name, dest_off); + return -1; + } + + return 0; +} + +static int __annotate_retpoline_safe(struct objtool_file *file, int type, struct instruction *insn) { if (type != ANNOTYPE_RETPOLINE_SAFE) return 0; @@ -2415,7 +2446,7 @@ static int __annotate_retpoline_safe(int type, struct instruction *insn) return 0; } -static int __annotate_instr(int type, struct instruction *insn) +static int __annotate_instr(struct objtool_file *file, int type, struct instruction *insn) { switch (type) { case ANNOTYPE_INSTR_BEGIN: @@ -2433,7 +2464,7 @@ static int __annotate_instr(int type, struct instruction *insn) return 0; } -static int __annotate_unret(int type, struct instruction *insn) +static int __annotate_unret(struct objtool_file *file, int type, struct instruction *insn) { if (type != ANNOTYPE_UNRET_BEGIN) return 0; @@ -2443,55 +2474,6 @@ static int __annotate_unret(int type, struct instruction *insn) } -static int read_intra_function_calls(struct objtool_file *file) -{ - struct instruction *insn; - struct section *rsec; - struct reloc *reloc; - - rsec = find_section_by_name(file->elf, ".rela.discard.intra_function_calls"); - if (!rsec) - return 0; - - for_each_reloc(rsec, reloc) { - unsigned long dest_off; - - if (reloc->sym->type != STT_SECTION) { - WARN("unexpected relocation symbol type in %s", - rsec->name); - return -1; - } - - insn = find_insn(file, reloc->sym->sec, reloc_addend(reloc)); - if (!insn) { - WARN("bad .discard.intra_function_call entry"); - return -1; - } - - if (insn->type != INSN_CALL) { - WARN_INSN(insn, "intra_function_call not a direct call"); - return -1; - } - - /* - * Treat intra-function CALLs as JMPs, but with a stack_op. - * See add_call_destinations(), which strips stack_ops from - * normal CALLs. - */ - insn->type = INSN_JUMP_UNCONDITIONAL; - - dest_off = arch_jump_destination(insn); - insn->jump_dest = find_insn(file, insn->sec, dest_off); - if (!insn->jump_dest) { - WARN_INSN(insn, "can't find call dest at %s+0x%lx", - insn->sec->name, dest_off); - return -1; - } - } - - return 0; -} - /* * Return true if name matches an instrumentation function, where calls to that * function from noinstr code can safely be removed, but compilers won't do so. @@ -2630,7 +2612,7 @@ static int decode_sections(struct objtool_file *file) * Must be before add_call_destination(); it changes INSN_CALL to * INSN_JUMP. */ - ret = read_intra_function_calls(file); + ret = read_annotate(file, __annotate_ifc); if (ret) return ret; From a8a330dd9900024dc18b048c4f0f3c6ad22ff4c1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 28 Nov 2024 10:38:59 +0100 Subject: [PATCH 039/224] objtool: Collapse annotate sequences Reduce read_annotate() runs by collapsing subsequent runs into a single call. 
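To make the format concrete: each annotation consumed by read_annotate() is one 8-byte .discard.annotate_insn entry, a PC-relative instruction reference followed by a type. The following user-space sketch of walking such raw entries is for illustration only; the struct layout is inferred from the read_annotate() loop above and is not objtool code.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Inferred layout of one .discard.annotate_insn entry (sh_entsize == 8). */
struct annotate_entry {
	int32_t  insn_off;	/* .long <label> - . : PC-relative instruction reference */
	uint32_t type;		/* .long <type>      : an ANNOTYPE_* value */
};

/* Walk a buffer holding the raw section contents. */
static void dump_annotations(const void *buf, size_t size)
{
	const struct annotate_entry *ent = buf;
	size_t i, n = size / sizeof(*ent);

	for (i = 0; i < n; i++)
		printf("entry %zu: insn offset %d, type %u\n",
		       i, (int)ent[i].insn_off, (unsigned)ent[i].type);
}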
Signed-off-by: Peter Zijlstra (Intel) Acked-by: Josh Poimboeuf Link: https://lore.kernel.org/r/20241128094311.688871544@infradead.org --- tools/objtool/check.c | 89 ++++++++++++++++--------------------------- 1 file changed, 33 insertions(+), 56 deletions(-) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 2222fe710832..3bea8b2963d3 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -2381,21 +2381,24 @@ static int read_annotate(struct objtool_file *file, return 0; } -static int __annotate_ignore_alts(struct objtool_file *file, int type, struct instruction *insn) +static int __annotate_early(struct objtool_file *file, int type, struct instruction *insn) { - if (type != ANNOTYPE_IGNORE_ALTS) - return 0; + switch (type) { + case ANNOTYPE_IGNORE_ALTS: + insn->ignore_alts = true; + break; - insn->ignore_alts = true; - return 0; -} + /* + * Must be before read_unwind_hints() since that needs insn->noendbr. + */ + case ANNOTYPE_NOENDBR: + insn->noendbr = 1; + break; -static int __annotate_noendbr(struct objtool_file *file, int type, struct instruction *insn) -{ - if (type != ANNOTYPE_NOENDBR) - return 0; + default: + break; + } - insn->noendbr = 1; return 0; } @@ -2429,26 +2432,21 @@ static int __annotate_ifc(struct objtool_file *file, int type, struct instructio return 0; } -static int __annotate_retpoline_safe(struct objtool_file *file, int type, struct instruction *insn) -{ - if (type != ANNOTYPE_RETPOLINE_SAFE) - return 0; - - if (insn->type != INSN_JUMP_DYNAMIC && - insn->type != INSN_CALL_DYNAMIC && - insn->type != INSN_RETURN && - insn->type != INSN_NOP) { - WARN_INSN(insn, "retpoline_safe hint not an indirect jump/call/ret/nop"); - return -1; - } - - insn->retpoline_safe = true; - return 0; -} - -static int __annotate_instr(struct objtool_file *file, int type, struct instruction *insn) +static int __annotate_late(struct objtool_file *file, int type, struct instruction *insn) { switch (type) { + case ANNOTYPE_RETPOLINE_SAFE: + if (insn->type != INSN_JUMP_DYNAMIC && + insn->type != INSN_CALL_DYNAMIC && + insn->type != INSN_RETURN && + insn->type != INSN_NOP) { + WARN_INSN(insn, "retpoline_safe hint not an indirect jump/call/ret/nop"); + return -1; + } + + insn->retpoline_safe = true; + break; + case ANNOTYPE_INSTR_BEGIN: insn->instr++; break; @@ -2457,6 +2455,10 @@ static int __annotate_instr(struct objtool_file *file, int type, struct instruct insn->instr--; break; + case ANNOTYPE_UNRET_BEGIN: + insn->unret = 1; + break; + default: break; } @@ -2464,16 +2466,6 @@ static int __annotate_instr(struct objtool_file *file, int type, struct instruct return 0; } -static int __annotate_unret(struct objtool_file *file, int type, struct instruction *insn) -{ - if (type != ANNOTYPE_UNRET_BEGIN) - return 0; - - insn->unret = 1; - return 0; - -} - /* * Return true if name matches an instrumentation function, where calls to that * function from noinstr code can safely be removed, but compilers won't do so. @@ -2583,14 +2575,7 @@ static int decode_sections(struct objtool_file *file) add_ignores(file); add_uaccess_safe(file); - ret = read_annotate(file, __annotate_ignore_alts); - if (ret) - return ret; - - /* - * Must be before read_unwind_hints() since that needs insn->noendbr. 
- */ - ret = read_annotate(file, __annotate_noendbr); + ret = read_annotate(file, __annotate_early); if (ret) return ret; @@ -2636,15 +2621,7 @@ static int decode_sections(struct objtool_file *file) if (ret) return ret; - ret = read_annotate(file, __annotate_retpoline_safe); - if (ret) - return ret; - - ret = read_annotate(file, __annotate_instr); - if (ret) - return ret; - - ret = read_annotate(file, __annotate_unret); + ret = read_annotate(file, __annotate_late); if (ret) return ret; From bb8170067470cc7af28e4386e600b1e0a6a8956a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 28 Nov 2024 10:39:00 +0100 Subject: [PATCH 040/224] objtool: Collect more annotations in objtool.h Suggested-by: Josh Poimboeuf Signed-off-by: Peter Zijlstra (Intel) Acked-by: Josh Poimboeuf Link: https://lore.kernel.org/r/20241128094311.786598147@infradead.org --- arch/x86/include/asm/alternative.h | 12 ----- arch/x86/include/asm/nospec-branch.h | 9 ---- include/linux/instrumentation.h | 4 +- include/linux/objtool.h | 80 ++++++++++++++++++---------- 4 files changed, 55 insertions(+), 50 deletions(-) diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 595695f85f80..e3903b731305 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -55,12 +55,6 @@ #define LOCK_PREFIX "" #endif -/* - * objtool annotation to ignore the alternatives and only consider the original - * instruction(s). - */ -#define ANNOTATE_IGNORE_ALTERNATIVE ASM_ANNOTATE(ANNOTYPE_IGNORE_ALTS) - /* * The patching flags are part of the upper bits of the @ft_flags parameter when * specifying them. The split is currently like this: @@ -307,12 +301,6 @@ void nop_func(void); .endm #endif -/* - * objtool annotation to ignore the alternatives and only consider the original - * instruction(s). - */ -#define ANNOTATE_IGNORE_ALTERNATIVE ANNOTATE type=ANNOTYPE_IGNORE_ALTS - /* * Issue one struct alt_instr descriptor entry (need to put it into * the section .altinstructions, see below). This entry contains diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 50340a125953..7e8bf78c03d5 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -179,13 +179,6 @@ #ifdef __ASSEMBLY__ -/* - * This should be used immediately before an indirect jump/call. It tells - * objtool the subsequent indirect jump/call is vouched safe for retpoline - * builds. - */ -#define ANNOTATE_RETPOLINE_SAFE ANNOTATE type=ANNOTYPE_RETPOLINE_SAFE - /* * (ab)use RETPOLINE_SAFE on RET to annotate away 'bare' RET instructions * vs RETBleed validation. 
@@ -345,8 +338,6 @@ #else /* __ASSEMBLY__ */ -#define ANNOTATE_RETPOLINE_SAFE ASM_ANNOTATE(ANNOTYPE_RETPOLINE_SAFE) - typedef u8 retpoline_thunk_t[RETPOLINE_THUNK_SIZE]; extern retpoline_thunk_t __x86_indirect_thunk_array[]; extern retpoline_thunk_t __x86_indirect_call_thunk_array[]; diff --git a/include/linux/instrumentation.h b/include/linux/instrumentation.h index c8f866cf02d8..bf675a8aef8a 100644 --- a/include/linux/instrumentation.h +++ b/include/linux/instrumentation.h @@ -10,7 +10,7 @@ /* Begin/end of an instrumentation safe region */ #define __instrumentation_begin(c) ({ \ asm volatile(__stringify(c) ": nop\n\t" \ - __ASM_ANNOTATE(__ASM_BREF(c), ANNOTYPE_INSTR_BEGIN)\ + ANNOTATE_INSTR_BEGIN(__ASM_BREF(c)) \ : : "i" (c)); \ }) #define instrumentation_begin() __instrumentation_begin(__COUNTER__) @@ -48,7 +48,7 @@ */ #define __instrumentation_end(c) ({ \ asm volatile(__stringify(c) ": nop\n\t" \ - __ASM_ANNOTATE(__ASM_BREF(c), ANNOTYPE_INSTR_END) \ + ANNOTATE_INSTR_END(__ASM_BREF(c)) \ : : "i" (c)); \ }) #define instrumentation_end() __instrumentation_end(__COUNTER__) diff --git a/include/linux/objtool.h b/include/linux/objtool.h index 42287c1e32ce..fd487d466bb2 100644 --- a/include/linux/objtool.h +++ b/include/linux/objtool.h @@ -63,8 +63,6 @@ "911:\n\t" \ __ASM_ANNOTATE(911b, type) -#define ANNOTATE_NOENDBR ASM_ANNOTATE(ANNOTYPE_NOENDBR) - #else /* __ASSEMBLY__ */ /* @@ -113,19 +111,6 @@ #endif .endm -/* - * Use objtool to validate the entry requirement that all code paths do - * VALIDATE_UNRET_END before RET. - * - * NOTE: The macro must be used at the beginning of a global symbol, otherwise - * it will be ignored. - */ -#if defined(CONFIG_NOINSTR_VALIDATION) && \ - (defined(CONFIG_MITIGATION_UNRET_ENTRY) || defined(CONFIG_MITIGATION_SRSO)) -#define VALIDATE_UNRET_BEGIN ANNOTATE type=ANNOTYPE_UNRET_BEGIN -#else -#define VALIDATE_UNRET_BEGIN -#endif .macro REACHABLE .Lhere_\@: @@ -142,14 +127,6 @@ .popsection .endm -#define ANNOTATE_NOENDBR ANNOTATE type=ANNOTYPE_NOENDBR - -/* - * This macro indicates that the following intra-function call is valid. - * Any non-annotated intra-function call will cause objtool to issue a warning. - */ -#define ANNOTATE_INTRA_FUNCTION_CALL ANNOTATE type=ANNOTYPE_INTRA_FUNCTION_CALL - #endif /* __ASSEMBLY__ */ #else /* !CONFIG_OBJTOOL */ @@ -161,16 +138,12 @@ #define STACK_FRAME_NON_STANDARD_FP(func) #define __ASM_ANNOTATE(label, type) #define ASM_ANNOTATE(type) -#define ANNOTATE_NOENDBR #define ASM_REACHABLE #else -#define ANNOTATE_INTRA_FUNCTION_CALL .macro UNWIND_HINT type:req sp_reg=0 sp_offset=0 signal=0 .endm .macro STACK_FRAME_NON_STANDARD func:req .endm -.macro ANNOTATE_NOENDBR -.endm .macro REACHABLE .endm .macro ANNOTATE type:req @@ -179,4 +152,57 @@ #endif /* CONFIG_OBJTOOL */ +#ifndef __ASSEMBLY__ +/* + * Annotate away the various 'relocation to !ENDBR` complaints; knowing that + * these relocations will never be used for indirect calls. + */ +#define ANNOTATE_NOENDBR ASM_ANNOTATE(ANNOTYPE_NOENDBR) +/* + * This should be used immediately before an indirect jump/call. It tells + * objtool the subsequent indirect jump/call is vouched safe for retpoline + * builds. + */ +#define ANNOTATE_RETPOLINE_SAFE ASM_ANNOTATE(ANNOTYPE_RETPOLINE_SAFE) +/* + * See linux/instrumentation.h + */ +#define ANNOTATE_INSTR_BEGIN(label) __ASM_ANNOTATE(label, ANNOTYPE_INSTR_BEGIN) +#define ANNOTATE_INSTR_END(label) __ASM_ANNOTATE(label, ANNOTYPE_INSTR_END) +/* + * objtool annotation to ignore the alternatives and only consider the original + * instruction(s). 
+ */ +#define ANNOTATE_IGNORE_ALTERNATIVE ASM_ANNOTATE(ANNOTYPE_IGNORE_ALTS) +/* + * This macro indicates that the following intra-function call is valid. + * Any non-annotated intra-function call will cause objtool to issue a warning. + */ +#define ANNOTATE_INTRA_FUNCTION_CALL ASM_ANNOTATE(ANNOTYPE_INTRA_FUNCTION_CALL) +/* + * Use objtool to validate the entry requirement that all code paths do + * VALIDATE_UNRET_END before RET. + * + * NOTE: The macro must be used at the beginning of a global symbol, otherwise + * it will be ignored. + */ +#define ANNOTATE_UNRET_BEGIN ASM_ANNOTATE(ANNOTYPE_UNRET_BEGIN) + +#else +#define ANNOTATE_NOENDBR ANNOTATE type=ANNOTYPE_NOENDBR +#define ANNOTATE_RETPOLINE_SAFE ANNOTATE type=ANNOTYPE_RETPOLINE_SAFE +/* ANNOTATE_INSTR_BEGIN ANNOTATE type=ANNOTYPE_INSTR_BEGIN */ +/* ANNOTATE_INSTR_END ANNOTATE type=ANNOTYPE_INSTR_END */ +#define ANNOTATE_IGNORE_ALTERNATIVE ANNOTATE type=ANNOTYPE_IGNORE_ALTS +#define ANNOTATE_INTRA_FUNCTION_CALL ANNOTATE type=ANNOTYPE_INTRA_FUNCTION_CALL +#define ANNOTATE_UNRET_BEGIN ANNOTATE type=ANNOTYPE_UNRET_BEGIN +#endif + +#if defined(CONFIG_NOINSTR_VALIDATION) && \ + (defined(CONFIG_MITIGATION_UNRET_ENTRY) || defined(CONFIG_MITIGATION_SRSO)) +#define VALIDATE_UNRET_BEGIN ANNOTATE_UNRET_BEGIN +#else +#define VALIDATE_UNRET_BEGIN +#endif + #endif /* _LINUX_OBJTOOL_H */ From c837de3810982cd41cd70e5170da1931439f025c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 28 Nov 2024 10:39:01 +0100 Subject: [PATCH 041/224] unreachable: Unify Since barrier_before_unreachable() is empty for !GCC it is trivial to unify the two definitions. Less is more. Signed-off-by: Peter Zijlstra (Intel) Acked-by: Josh Poimboeuf Link: https://lore.kernel.org/r/20241128094311.924381359@infradead.org --- include/linux/compiler-gcc.h | 12 ------------ include/linux/compiler.h | 10 +++++++--- 2 files changed, 7 insertions(+), 15 deletions(-) diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index d0ed9583743f..c9b58188ec61 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -52,18 +52,6 @@ */ #define barrier_before_unreachable() asm volatile("") -/* - * Mark a position in code as unreachable. This can be used to - * suppress control flow warnings after asm blocks that transfer - * control elsewhere. - */ -#define unreachable() \ - do { \ - annotate_unreachable(); \ - barrier_before_unreachable(); \ - __builtin_unreachable(); \ - } while (0) - #if defined(CONFIG_ARCH_USE_BUILTIN_BSWAP) #define __HAVE_BUILTIN_BSWAP32__ #define __HAVE_BUILTIN_BSWAP64__ diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 469a64dd6495..7be80897a62f 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -141,12 +141,16 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, #define __annotate_jump_table #endif /* CONFIG_OBJTOOL */ -#ifndef unreachable -# define unreachable() do { \ +/* + * Mark a position in code as unreachable. This can be used to + * suppress control flow warnings after asm blocks that transfer + * control elsewhere. 
+ */ +#define unreachable() do { \ annotate_unreachable(); \ + barrier_before_unreachable(); \ __builtin_unreachable(); \ } while (0) -#endif /* * KENTRY - kernel entry point From 2190966fbc14ca2cd4ea76eefeb96a47d8e390df Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 28 Nov 2024 10:39:02 +0100 Subject: [PATCH 042/224] x86: Convert unreachable() to BUG() Avoid unreachable() as it can (and will in the absence of UBSAN) generate fallthrough code. Use BUG() so we get a UD2 trap (with unreachable annotation). Signed-off-by: Peter Zijlstra (Intel) Acked-by: Josh Poimboeuf Link: https://lore.kernel.org/r/20241128094312.028316261@infradead.org --- arch/x86/kernel/process.c | 2 +- arch/x86/kernel/reboot.c | 2 +- arch/x86/kvm/svm/sev.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index f63f8fd00a91..15507e739c25 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -838,7 +838,7 @@ void __noreturn stop_this_cpu(void *dummy) #ifdef CONFIG_SMP if (smp_ops.stop_this_cpu) { smp_ops.stop_this_cpu(); - unreachable(); + BUG(); } #endif diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 615922838c51..dc1dd3f3e67f 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -883,7 +883,7 @@ static int crash_nmi_callback(unsigned int val, struct pt_regs *regs) if (smp_ops.stop_this_cpu) { smp_ops.stop_this_cpu(); - unreachable(); + BUG(); } /* Assume hlt works */ diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 943bd074a5d3..fe6cc763fd51 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -3820,7 +3820,7 @@ next_range: goto next_range; } - unreachable(); + BUG(); } static int __sev_snp_update_protected_guest_state(struct kvm_vcpu *vcpu) From 624bde3465f660e54a7cd4c1efc3e536349fead5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 28 Nov 2024 10:39:03 +0100 Subject: [PATCH 043/224] loongarch: Use ASM_REACHABLE annotate_reachable() is unreliable since the compiler is free to place random code in between two consecutive asm() statements. This removes the last and only annotate_reachable() user. 
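To see why that pairing is fragile, consider a minimal sketch (illustrative only, not code from the tree) of a trap followed by a reachability marker emitted as a second, independent asm() statement:

	/*
	 * Two separate asm() statements: the compiler treats them as
	 * unrelated, so it may schedule other instructions at the point
	 * marked below. The offset recorded in .discard.reachable then
	 * no longer denotes the instruction right after the trap.
	 */
	asm volatile("break 0");	/* the trap; BRK_BUG on loongarch */
	/* <- the compiler is free to emit unrelated code here */
	asm volatile("1:\n\t"
		     ".pushsection .discard.reachable\n\t"
		     ".long 1b - .\n\t"
		     ".popsection\n\t");

Emitting the marker from within the same asm() statement as the trap, which is what passing ASM_REACHABLE into __BUG_FLAGS() below achieves, removes that freedom.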
Signed-off-by: Peter Zijlstra (Intel) Acked-by: Josh Poimboeuf Link: https://lore.kernel.org/r/20241128094312.133437051@infradead.org --- arch/loongarch/include/asm/bug.h | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/arch/loongarch/include/asm/bug.h b/arch/loongarch/include/asm/bug.h index 08388876ade4..561ac1bf79e2 100644 --- a/arch/loongarch/include/asm/bug.h +++ b/arch/loongarch/include/asm/bug.h @@ -4,6 +4,7 @@ #include #include +#include #ifndef CONFIG_DEBUG_BUGVERBOSE #define _BUGVERBOSE_LOCATION(file, line) @@ -33,25 +34,25 @@ #define ASM_BUG_FLAGS(flags) \ __BUG_ENTRY(flags) \ - break BRK_BUG + break BRK_BUG; #define ASM_BUG() ASM_BUG_FLAGS(0) -#define __BUG_FLAGS(flags) \ - asm_inline volatile (__stringify(ASM_BUG_FLAGS(flags))); +#define __BUG_FLAGS(flags, extra) \ + asm_inline volatile (__stringify(ASM_BUG_FLAGS(flags)) \ + extra); #define __WARN_FLAGS(flags) \ do { \ instrumentation_begin(); \ - __BUG_FLAGS(BUGFLAG_WARNING|(flags)); \ - annotate_reachable(); \ + __BUG_FLAGS(BUGFLAG_WARNING|(flags), ASM_REACHABLE); \ instrumentation_end(); \ } while (0) #define BUG() \ do { \ instrumentation_begin(); \ - __BUG_FLAGS(0); \ + __BUG_FLAGS(0, ""); \ unreachable(); \ } while (0) From 06e24745985c8dd0da18337503afcf2f2fdbdff1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 28 Nov 2024 10:39:04 +0100 Subject: [PATCH 044/224] objtool: Remove annotate_{,un}reachable() There are no users of annotate_reachable() left. And the annotate_unreachable() usage in unreachable() is plain wrong; it will hide dangerous fall-through code-gen. Remove both. Signed-off-by: Peter Zijlstra (Intel) Acked-by: Josh Poimboeuf Link: https://lore.kernel.org/r/20241128094312.235637588@infradead.org --- include/linux/compiler.h | 27 ------------------------- tools/objtool/check.c | 43 ++-------------------------------------- 2 files changed, 2 insertions(+), 68 deletions(-) diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 7be80897a62f..3d9a0e483e51 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -109,35 +109,9 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, /* Unreachable code */ #ifdef CONFIG_OBJTOOL -/* - * These macros help objtool understand GCC code flow for unreachable code. - * The __COUNTER__ based labels are a hack to make each instance of the macros - * unique, to convince GCC not to merge duplicate inline asm statements. - */ -#define __stringify_label(n) #n - -#define __annotate_reachable(c) ({ \ - asm volatile(__stringify_label(c) ":\n\t" \ - ".pushsection .discard.reachable\n\t" \ - ".long " __stringify_label(c) "b - .\n\t" \ - ".popsection\n\t"); \ -}) -#define annotate_reachable() __annotate_reachable(__COUNTER__) - -#define __annotate_unreachable(c) ({ \ - asm volatile(__stringify_label(c) ":\n\t" \ - ".pushsection .discard.unreachable\n\t" \ - ".long " __stringify_label(c) "b - .\n\t" \ - ".popsection\n\t" : : "i" (c)); \ -}) -#define annotate_unreachable() __annotate_unreachable(__COUNTER__) - /* Annotate a C jump table to allow objtool to follow the code flow */ #define __annotate_jump_table __section(".rodata..c_jump_table,\"a\",@progbits #") - #else /* !CONFIG_OBJTOOL */ -#define annotate_reachable() -#define annotate_unreachable() #define __annotate_jump_table #endif /* CONFIG_OBJTOOL */ @@ -147,7 +121,6 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, * control elsewhere. 
*/ #define unreachable() do { \ - annotate_unreachable(); \ barrier_before_unreachable(); \ __builtin_unreachable(); \ } while (0) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 3bea8b2963d3..798cff5bffc4 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -638,47 +638,8 @@ static int add_dead_ends(struct objtool_file *file) uint64_t offset; /* - * Check for manually annotated dead ends. - */ - rsec = find_section_by_name(file->elf, ".rela.discard.unreachable"); - if (!rsec) - goto reachable; - - for_each_reloc(rsec, reloc) { - if (reloc->sym->type == STT_SECTION) { - offset = reloc_addend(reloc); - } else if (reloc->sym->local_label) { - offset = reloc->sym->offset; - } else { - WARN("unexpected relocation symbol type in %s", rsec->name); - return -1; - } - - insn = find_insn(file, reloc->sym->sec, offset); - if (insn) - insn = prev_insn_same_sec(file, insn); - else if (offset == reloc->sym->sec->sh.sh_size) { - insn = find_last_insn(file, reloc->sym->sec); - if (!insn) { - WARN("can't find unreachable insn at %s+0x%" PRIx64, - reloc->sym->sec->name, offset); - return -1; - } - } else { - WARN("can't find unreachable insn at %s+0x%" PRIx64, - reloc->sym->sec->name, offset); - return -1; - } - - insn->dead_end = true; - } - -reachable: - /* - * These manually annotated reachable checks are needed for GCC 4.4, - * where the Linux unreachable() macro isn't supported. In that case - * GCC doesn't know the "ud2" is fatal, so it generates code as if it's - * not a dead end. + * UD2 defaults to being a dead-end, allow them to be annotated for + * non-fatal, eg WARN. */ rsec = find_section_by_name(file->elf, ".rela.discard.reachable"); if (!rsec) From e7a174fb43d24adca066e82d1cb9fdee092d48d1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 28 Nov 2024 10:39:05 +0100 Subject: [PATCH 045/224] objtool: Convert {.UN}REACHABLE to ANNOTATE Suggested-by: Josh Poimboeuf Signed-off-by: Peter Zijlstra (Intel) Acked-by: Josh Poimboeuf Link: https://lore.kernel.org/r/20241128094312.353431347@infradead.org --- arch/loongarch/include/asm/bug.h | 2 +- arch/x86/entry/entry_64.S | 4 +- arch/x86/include/asm/bug.h | 2 +- arch/x86/include/asm/irq_stack.h | 2 +- include/linux/objtool.h | 18 +++---- include/linux/objtool_types.h | 1 + tools/include/linux/objtool_types.h | 1 + tools/objtool/check.c | 82 ++++++++--------------------- 8 files changed, 36 insertions(+), 76 deletions(-) diff --git a/arch/loongarch/include/asm/bug.h b/arch/loongarch/include/asm/bug.h index 561ac1bf79e2..e25404a93882 100644 --- a/arch/loongarch/include/asm/bug.h +++ b/arch/loongarch/include/asm/bug.h @@ -45,7 +45,7 @@ #define __WARN_FLAGS(flags) \ do { \ instrumentation_begin(); \ - __BUG_FLAGS(BUGFLAG_WARNING|(flags), ASM_REACHABLE); \ + __BUG_FLAGS(BUGFLAG_WARNING|(flags), ANNOTATE_REACHABLE);\ instrumentation_end(); \ } while (0) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 1b5be07f8669..9248660ad409 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -311,7 +311,7 @@ SYM_CODE_END(xen_error_entry) call \cfunc /* For some configurations \cfunc ends up being a noreturn. */ - REACHABLE + ANNOTATE_REACHABLE jmp error_return .endm @@ -532,7 +532,7 @@ SYM_CODE_START(\asmsym) call \cfunc /* For some configurations \cfunc ends up being a noreturn. 
*/ - REACHABLE + ANNOTATE_REACHABLE jmp paranoid_exit diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h index 806649c7f23d..dd8fb1779d97 100644 --- a/arch/x86/include/asm/bug.h +++ b/arch/x86/include/asm/bug.h @@ -92,7 +92,7 @@ do { \ do { \ __auto_type __flags = BUGFLAG_WARNING|(flags); \ instrumentation_begin(); \ - _BUG_FLAGS(ASM_UD2, __flags, ASM_REACHABLE); \ + _BUG_FLAGS(ASM_UD2, __flags, ANNOTATE_REACHABLE); \ instrumentation_end(); \ } while (0) diff --git a/arch/x86/include/asm/irq_stack.h b/arch/x86/include/asm/irq_stack.h index b71ad173f877..5455747ed918 100644 --- a/arch/x86/include/asm/irq_stack.h +++ b/arch/x86/include/asm/irq_stack.h @@ -101,7 +101,7 @@ #define ASM_CALL_ARG0 \ "call %c[__func] \n" \ - ASM_REACHABLE + ANNOTATE_REACHABLE #define ASM_CALL_ARG1 \ "movq %[arg1], %%rdi \n" \ diff --git a/include/linux/objtool.h b/include/linux/objtool.h index fd487d466bb2..e3cb13583fba 100644 --- a/include/linux/objtool.h +++ b/include/linux/objtool.h @@ -111,14 +111,6 @@ #endif .endm - -.macro REACHABLE -.Lhere_\@: - .pushsection .discard.reachable - .long .Lhere_\@ - .popsection -.endm - .macro ANNOTATE type:req .Lhere_\@: .pushsection .discard.annotate_insn,"M",@progbits,8 @@ -138,14 +130,11 @@ #define STACK_FRAME_NON_STANDARD_FP(func) #define __ASM_ANNOTATE(label, type) #define ASM_ANNOTATE(type) -#define ASM_REACHABLE #else .macro UNWIND_HINT type:req sp_reg=0 sp_offset=0 signal=0 .endm .macro STACK_FRAME_NON_STANDARD func:req .endm -.macro REACHABLE -.endm .macro ANNOTATE type:req .endm #endif @@ -187,6 +176,12 @@ * it will be ignored. */ #define ANNOTATE_UNRET_BEGIN ASM_ANNOTATE(ANNOTYPE_UNRET_BEGIN) +/* + * This should be used directly after an instruction that is considered + * terminating, like a noreturn CALL or UD2 when we know they are not -- eg + * WARN using UD2. + */ +#define ANNOTATE_REACHABLE ASM_ANNOTATE(ANNOTYPE_REACHABLE) #else #define ANNOTATE_NOENDBR ANNOTATE type=ANNOTYPE_NOENDBR @@ -196,6 +191,7 @@ #define ANNOTATE_IGNORE_ALTERNATIVE ANNOTATE type=ANNOTYPE_IGNORE_ALTS #define ANNOTATE_INTRA_FUNCTION_CALL ANNOTATE type=ANNOTYPE_INTRA_FUNCTION_CALL #define ANNOTATE_UNRET_BEGIN ANNOTATE type=ANNOTYPE_UNRET_BEGIN +#define ANNOTATE_REACHABLE ANNOTATE type=ANNOTYPE_REACHABLE #endif #if defined(CONFIG_NOINSTR_VALIDATION) && \ diff --git a/include/linux/objtool_types.h b/include/linux/objtool_types.h index 23d6fb6d04c7..df5d9fa84dba 100644 --- a/include/linux/objtool_types.h +++ b/include/linux/objtool_types.h @@ -64,5 +64,6 @@ struct unwind_hint { #define ANNOTYPE_UNRET_BEGIN 5 #define ANNOTYPE_IGNORE_ALTS 6 #define ANNOTYPE_INTRA_FUNCTION_CALL 7 +#define ANNOTYPE_REACHABLE 8 #endif /* _LINUX_OBJTOOL_TYPES_H */ diff --git a/tools/include/linux/objtool_types.h b/tools/include/linux/objtool_types.h index 23d6fb6d04c7..df5d9fa84dba 100644 --- a/tools/include/linux/objtool_types.h +++ b/tools/include/linux/objtool_types.h @@ -64,5 +64,6 @@ struct unwind_hint { #define ANNOTYPE_UNRET_BEGIN 5 #define ANNOTYPE_IGNORE_ALTS 6 #define ANNOTYPE_INTRA_FUNCTION_CALL 7 +#define ANNOTYPE_REACHABLE 8 #endif /* _LINUX_OBJTOOL_TYPES_H */ diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 798cff5bffc4..27d0c4153582 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -627,56 +627,6 @@ static struct instruction *find_last_insn(struct objtool_file *file, return insn; } -/* - * Mark "ud2" instructions and manually annotated dead ends. 
- */ -static int add_dead_ends(struct objtool_file *file) -{ - struct section *rsec; - struct reloc *reloc; - struct instruction *insn; - uint64_t offset; - - /* - * UD2 defaults to being a dead-end, allow them to be annotated for - * non-fatal, eg WARN. - */ - rsec = find_section_by_name(file->elf, ".rela.discard.reachable"); - if (!rsec) - return 0; - - for_each_reloc(rsec, reloc) { - if (reloc->sym->type == STT_SECTION) { - offset = reloc_addend(reloc); - } else if (reloc->sym->local_label) { - offset = reloc->sym->offset; - } else { - WARN("unexpected relocation symbol type in %s", rsec->name); - return -1; - } - - insn = find_insn(file, reloc->sym->sec, offset); - if (insn) - insn = prev_insn_same_sec(file, insn); - else if (offset == reloc->sym->sec->sh.sh_size) { - insn = find_last_insn(file, reloc->sym->sec); - if (!insn) { - WARN("can't find reachable insn at %s+0x%" PRIx64, - reloc->sym->sec->name, offset); - return -1; - } - } else { - WARN("can't find reachable insn at %s+0x%" PRIx64, - reloc->sym->sec->name, offset); - return -1; - } - - insn->dead_end = false; - } - - return 0; -} - static int create_static_call_sections(struct objtool_file *file) { struct static_call_site *site; @@ -2306,6 +2256,7 @@ static int read_annotate(struct objtool_file *file, struct section *sec; struct instruction *insn; struct reloc *reloc; + uint64_t offset; int type, ret; sec = find_section_by_name(file->elf, ".discard.annotate_insn"); @@ -2327,8 +2278,19 @@ static int read_annotate(struct objtool_file *file, for_each_reloc(sec->rsec, reloc) { type = *(u32 *)(sec->data->d_buf + (reloc_idx(reloc) * sec->sh.sh_entsize) + 4); - insn = find_insn(file, reloc->sym->sec, - reloc->sym->offset + reloc_addend(reloc)); + offset = reloc->sym->offset + reloc_addend(reloc); + insn = find_insn(file, reloc->sym->sec, offset); + + /* + * Reachable annotations are 'funneh' and act on the previous instruction :/ + */ + if (type == ANNOTYPE_REACHABLE) { + if (insn) + insn = prev_insn_same_sec(file, insn); + else if (offset == reloc->sym->sec->sh.sh_size) + insn = find_last_insn(file, reloc->sym->sec); + } + if (!insn) { WARN("bad .discard.annotate_insn entry: %d of type %d", reloc_idx(reloc), type); return -1; @@ -2420,6 +2382,10 @@ static int __annotate_late(struct objtool_file *file, int type, struct instructi insn->unret = 1; break; + case ANNOTYPE_REACHABLE: + insn->dead_end = false; + break; + default: break; } @@ -2566,14 +2532,6 @@ static int decode_sections(struct objtool_file *file) if (ret) return ret; - /* - * Must be after add_call_destinations() such that it can override - * dead_end_function() marks. - */ - ret = add_dead_ends(file); - if (ret) - return ret; - ret = add_jump_table_alts(file); if (ret) return ret; @@ -2582,6 +2540,10 @@ static int decode_sections(struct objtool_file *file) if (ret) return ret; + /* + * Must be after add_call_destinations() such that it can override + * dead_end_function() marks. + */ ret = read_annotate(file, __annotate_late); if (ret) return ret; From 87116ae6da034242baf06e799f9f0e2a8ee6a796 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 28 Nov 2024 10:39:06 +0100 Subject: [PATCH 046/224] objtool: Fix ANNOTATE_REACHABLE to be a normal annotation Currently REACHABLE is weird for being on the instruction after the instruction it modifies. Since all REACHABLE annotations have an explicit instruction, flip them around. 
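In condensed form (a sketch distilled from the irq_stack.h hunk below, not additional kernel code), the usage pattern changes like this:

	/* before: the marker trails the call and implicitly refers back */
	"call %c[__func] \n"
	ANNOTATE_REACHABLE

	/* after: the call carries a local label that the marker names */
	"1: call %c[__func] \n"
	ANNOTATE_REACHABLE(1b)

With an explicit label on every use, objtool no longer needs to treat REACHABLE annotations as acting on the previous instruction, which is what allows the read_annotate() special case to be dropped below.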
Signed-off-by: Peter Zijlstra (Intel) Acked-by: Josh Poimboeuf Link: https://lore.kernel.org/r/20241128094312.494176035@infradead.org --- arch/loongarch/include/asm/bug.h | 2 +- arch/x86/entry/entry_64.S | 5 ++--- arch/x86/include/asm/bug.h | 2 +- arch/x86/include/asm/irq_stack.h | 4 ++-- include/linux/objtool.h | 4 ++-- tools/objtool/check.c | 23 ----------------------- 6 files changed, 8 insertions(+), 32 deletions(-) diff --git a/arch/loongarch/include/asm/bug.h b/arch/loongarch/include/asm/bug.h index e25404a93882..f6f254f2c5db 100644 --- a/arch/loongarch/include/asm/bug.h +++ b/arch/loongarch/include/asm/bug.h @@ -45,7 +45,7 @@ #define __WARN_FLAGS(flags) \ do { \ instrumentation_begin(); \ - __BUG_FLAGS(BUGFLAG_WARNING|(flags), ANNOTATE_REACHABLE);\ + __BUG_FLAGS(BUGFLAG_WARNING|(flags), ANNOTATE_REACHABLE(10001b));\ instrumentation_end(); \ } while (0) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 9248660ad409..f52dbe0ad93c 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -308,10 +308,9 @@ SYM_CODE_END(xen_error_entry) movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */ .endif - call \cfunc - /* For some configurations \cfunc ends up being a noreturn. */ ANNOTATE_REACHABLE + call \cfunc jmp error_return .endm @@ -529,10 +528,10 @@ SYM_CODE_START(\asmsym) movq %rsp, %rdi /* pt_regs pointer into first argument */ movq ORIG_RAX(%rsp), %rsi /* get error code into 2nd argument*/ movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */ - call \cfunc /* For some configurations \cfunc ends up being a noreturn. */ ANNOTATE_REACHABLE + call \cfunc jmp paranoid_exit diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h index dd8fb1779d97..e85ac0c7c039 100644 --- a/arch/x86/include/asm/bug.h +++ b/arch/x86/include/asm/bug.h @@ -92,7 +92,7 @@ do { \ do { \ __auto_type __flags = BUGFLAG_WARNING|(flags); \ instrumentation_begin(); \ - _BUG_FLAGS(ASM_UD2, __flags, ANNOTATE_REACHABLE); \ + _BUG_FLAGS(ASM_UD2, __flags, ANNOTATE_REACHABLE(1b)); \ instrumentation_end(); \ } while (0) diff --git a/arch/x86/include/asm/irq_stack.h b/arch/x86/include/asm/irq_stack.h index 5455747ed918..562a547c29a5 100644 --- a/arch/x86/include/asm/irq_stack.h +++ b/arch/x86/include/asm/irq_stack.h @@ -100,8 +100,8 @@ } #define ASM_CALL_ARG0 \ - "call %c[__func] \n" \ - ANNOTATE_REACHABLE + "1: call %c[__func] \n" \ + ANNOTATE_REACHABLE(1b) #define ASM_CALL_ARG1 \ "movq %[arg1], %%rdi \n" \ diff --git a/include/linux/objtool.h b/include/linux/objtool.h index e3cb13583fba..c722a921165b 100644 --- a/include/linux/objtool.h +++ b/include/linux/objtool.h @@ -177,11 +177,11 @@ */ #define ANNOTATE_UNRET_BEGIN ASM_ANNOTATE(ANNOTYPE_UNRET_BEGIN) /* - * This should be used directly after an instruction that is considered + * This should be used to refer to an instruction that is considered * terminating, like a noreturn CALL or UD2 when we know they are not -- eg * WARN using UD2. 
*/ -#define ANNOTATE_REACHABLE ASM_ANNOTATE(ANNOTYPE_REACHABLE) +#define ANNOTATE_REACHABLE(label) __ASM_ANNOTATE(label, ANNOTYPE_REACHABLE) #else #define ANNOTATE_NOENDBR ANNOTATE type=ANNOTYPE_NOENDBR diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 27d0c4153582..26bdd3ebf5d2 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -614,19 +614,6 @@ static int init_pv_ops(struct objtool_file *file) return 0; } -static struct instruction *find_last_insn(struct objtool_file *file, - struct section *sec) -{ - struct instruction *insn = NULL; - unsigned int offset; - unsigned int end = (sec->sh.sh_size > 10) ? sec->sh.sh_size - 10 : 0; - - for (offset = sec->sh.sh_size - 1; offset >= end && !insn; offset--) - insn = find_insn(file, sec, offset); - - return insn; -} - static int create_static_call_sections(struct objtool_file *file) { struct static_call_site *site; @@ -2281,16 +2268,6 @@ static int read_annotate(struct objtool_file *file, offset = reloc->sym->offset + reloc_addend(reloc); insn = find_insn(file, reloc->sym->sec, offset); - /* - * Reachable annotations are 'funneh' and act on the previous instruction :/ - */ - if (type == ANNOTYPE_REACHABLE) { - if (insn) - insn = prev_insn_same_sec(file, insn); - else if (offset == reloc->sym->sec->sh.sh_size) - insn = find_last_insn(file, reloc->sym->sec); - } - if (!insn) { WARN("bad .discard.annotate_insn entry: %d of type %d", reloc_idx(reloc), type); return -1; From e7e0eb53c2f0f68fe2577472ce2802a4efd9d7ce Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 28 Nov 2024 10:39:07 +0100 Subject: [PATCH 047/224] objtool: Warn about unknown annotation types Signed-off-by: Peter Zijlstra (Intel) Acked-by: Josh Poimboeuf Link: https://lore.kernel.org/r/20241128094312.611961175@infradead.org --- tools/objtool/check.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 26bdd3ebf5d2..bfb407f3ac96 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -2335,6 +2335,10 @@ static int __annotate_ifc(struct objtool_file *file, int type, struct instructio static int __annotate_late(struct objtool_file *file, int type, struct instruction *insn) { switch (type) { + case ANNOTYPE_NOENDBR: + /* early */ + break; + case ANNOTYPE_RETPOLINE_SAFE: if (insn->type != INSN_JUMP_DYNAMIC && insn->type != INSN_CALL_DYNAMIC && @@ -2359,11 +2363,20 @@ static int __annotate_late(struct objtool_file *file, int type, struct instructi insn->unret = 1; break; + case ANNOTYPE_IGNORE_ALTS: + /* early */ + break; + + case ANNOTYPE_INTRA_FUNCTION_CALL: + /* ifc */ + break; + case ANNOTYPE_REACHABLE: insn->dead_end = false; break; default: + WARN_INSN(insn, "Unknown annotation type: %d", type); break; } From d387ceb17149fed4d85a1ec01b3d65ae0204060d Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 27 Nov 2024 21:00:09 -0500 Subject: [PATCH 048/224] locking/lockdep: Enforce PROVE_RAW_LOCK_NESTING only if ARCH_SUPPORTS_RT Relax the rule to set PROVE_RAW_LOCK_NESTING by default only for arches that support PREEMPT_RT. Arches that do not support PREEMPT_RT will not be forced to address unimportant raw lock nesting issues when they want to enable PROVE_LOCKING. They do have the option to enable it to look for these raw locking nesting problems if they choose to. 
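As a brief illustration of what these checks catch (hypothetical code, not part of this patch): on PREEMPT_RT, spinlock_t becomes a sleeping lock, so a raw_spinlock_t critical section must never acquire one, while the opposite nesting order remains valid:

	static DEFINE_RAW_SPINLOCK(raw_lock);
	static DEFINE_SPINLOCK(lock);

	raw_spin_lock(&raw_lock);
	spin_lock(&lock);		/* invalid: sleeping lock (on RT) under a raw lock */
	spin_unlock(&lock);
	raw_spin_unlock(&raw_lock);

	spin_lock(&lock);
	raw_spin_lock(&raw_lock);	/* fine: a raw lock may nest under spinlock_t */
	raw_spin_unlock(&raw_lock);
	spin_unlock(&lock);

PROVE_RAW_LOCK_NESTING reports the first pattern even on kernels where both lock types spin, which is exactly the burden this change lifts from arches that will never run PREEMPT_RT.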
Suggested-by: Guenter Roeck Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Tested-by: Guenter Roeck Link: https://lore.kernel.org/r/20241128020009.83347-1-longman@redhat.com --- lib/Kconfig.debug | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index f3d723705879..49a3819d4d7c 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1397,9 +1397,9 @@ config PROVE_LOCKING For more details, see Documentation/locking/lockdep-design.rst. config PROVE_RAW_LOCK_NESTING - bool + bool "Enable raw_spinlock - spinlock nesting checks" if !ARCH_SUPPORTS_RT depends on PROVE_LOCKING - default y + default y if ARCH_SUPPORTS_RT help Enable the raw_spinlock vs. spinlock nesting checks which ensure that the lock nesting rules for PREEMPT_RT enabled kernels are From ae5c677729e99b8cb3e6252aaa9b72a92985d203 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 30 Oct 2024 13:52:50 -0400 Subject: [PATCH 049/224] sched/core: Remove HK_TYPE_SCHED The HK_TYPE_SCHED housekeeping type is defined but not set anywhere. So any code that tries to use HK_TYPE_SCHED is essentially dead code. So remove HK_TYPE_SCHED and any code that uses it. Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Acked-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20241030175253.125248-2-longman@redhat.com --- include/linux/sched/isolation.h | 1 - kernel/sched/fair.c | 14 -------------- kernel/sched/isolation.c | 1 - 3 files changed, 16 deletions(-) diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h index 2b461129d1fa..499d5e480882 100644 --- a/include/linux/sched/isolation.h +++ b/include/linux/sched/isolation.h @@ -10,7 +10,6 @@ enum hk_type { HK_TYPE_TIMER, HK_TYPE_RCU, HK_TYPE_MISC, - HK_TYPE_SCHED, HK_TYPE_TICK, HK_TYPE_DOMAIN, HK_TYPE_WQ, diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4283c818bbd1..ef302263f5b3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -12197,9 +12197,6 @@ static inline int on_null_domain(struct rq *rq) * - When one of the busy CPUs notices that there may be an idle rebalancing * needed, they will kick the idle load balancer, which then does idle * load balancing for all the idle CPUs. - * - * - HK_TYPE_MISC CPUs are used for this task, because HK_TYPE_SCHED is not set - * anywhere yet. */ static inline int find_new_ilb(void) { @@ -12444,10 +12441,6 @@ void nohz_balance_enter_idle(int cpu) if (!cpu_active(cpu)) return; - /* Spare idle load balancing on CPUs that don't want to be disturbed: */ - if (!housekeeping_cpu(cpu, HK_TYPE_SCHED)) - return; - /* * Can be set safely without rq->lock held * If a clear happens, it will have evaluated last additions because @@ -12667,13 +12660,6 @@ static void nohz_newidle_balance(struct rq *this_rq) { int this_cpu = this_rq->cpu; - /* - * This CPU doesn't want to be disturbed by scheduler - * housekeeping - */ - if (!housekeeping_cpu(this_cpu, HK_TYPE_SCHED)) - return; - /* Will wake up very soon. 
No time for doing anything else*/ if (this_rq->avg_idle < sysctl_sched_migration_cost) return; diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index 5891e715f00d..5345e11f3d44 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -12,7 +12,6 @@ enum hk_flags { HK_FLAG_TIMER = BIT(HK_TYPE_TIMER), HK_FLAG_RCU = BIT(HK_TYPE_RCU), HK_FLAG_MISC = BIT(HK_TYPE_MISC), - HK_FLAG_SCHED = BIT(HK_TYPE_SCHED), HK_FLAG_TICK = BIT(HK_TYPE_TICK), HK_FLAG_DOMAIN = BIT(HK_TYPE_DOMAIN), HK_FLAG_WQ = BIT(HK_TYPE_WQ), From 1174b9344bc7e7989439cad207fcd94eaab028db Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 30 Oct 2024 13:52:51 -0400 Subject: [PATCH 050/224] sched/isolation: Make "isolcpus=nohz" equivalent to "nohz_full" The "isolcpus=nohz" boot parameter and flag were used to disable the tick when running a single task. Nowadays, this "nohz" flag is seldom used as it is included as part of the "nohz_full" parameter. Extend this flag to cover other kernel noises disabled by the "nohz_full" parameter to make them equivalent. This also eliminates the need to use both the "isolcpus" and the "nohz_full" parameters to fully isolate a given set of CPUs. Suggested-by: Frederic Weisbecker Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Acked-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20241030175253.125248-3-longman@redhat.com --- Documentation/admin-guide/kernel-parameters.txt | 4 +++- kernel/sched/isolation.c | 6 +++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 3872bc6ec49d..3fa0b4e65275 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2432,7 +2432,9 @@ specified in the flag list (default: domain): nohz - Disable the tick when a single task runs. + Disable the tick when a single task runs as well as + disabling other kernel noises like having RCU callbacks + offloaded. This is equivalent to the nohz_full parameter. A residual 1Hz tick is offloaded to workqueues, which you need to affine to housekeeping through the global diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index 5345e11f3d44..6a686322ce3c 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -209,9 +209,13 @@ static int __init housekeeping_isolcpus_setup(char *str) int len; while (isalpha(*str)) { + /* + * isolcpus=nohz is equivalent to nohz_full. + */ if (!strncmp(str, "nohz,", 5)) { str += 5; - flags |= HK_FLAG_TICK; + flags |= HK_FLAG_TICK | HK_FLAG_WQ | HK_FLAG_TIMER | + HK_FLAG_RCU | HK_FLAG_MISC | HK_FLAG_KTHREAD; continue; } From 6010d245ddc9f463bbf0311ac49073a78f444755 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 30 Oct 2024 13:52:52 -0400 Subject: [PATCH 051/224] sched/isolation: Consolidate housekeeping cpumasks that are always identical The housekeeping cpumasks are only set by two boot commandline parameters: "nohz_full" and "isolcpus". When there is more than one of "nohz_full" or "isolcpus", the extra ones must have the same CPU list or the setup will fail partially. The HK_TYPE_DOMAIN and HK_TYPE_MANAGED_IRQ types are settable by "isolcpus" only and their settings can be independent of the other types. The other housekeeping types are all set by "nohz_full" or "isolcpus=nohz" without a way to set them individually. So they all have identical cpumasks. 
There is actually no point in having different cpumasks for these "nohz_full" only housekeeping types. Consolidate these types to use the same cpumask by aliasing them to the same value. If there is a need to set any of them independently in the future, we can break them out to their own cpumasks again. With this change, the number of cpumasks in the housekeeping structure drops from 9 to 3. Other than that, there should be no other functional change. Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Acked-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20241030175253.125248-4-longman@redhat.com --- include/linux/sched/isolation.h | 20 +++++++++++++------- kernel/sched/isolation.c | 19 ++++++------------- 2 files changed, 19 insertions(+), 20 deletions(-) diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h index 499d5e480882..d8501f4709b5 100644 --- a/include/linux/sched/isolation.h +++ b/include/linux/sched/isolation.h @@ -7,15 +7,21 @@ #include enum hk_type { - HK_TYPE_TIMER, - HK_TYPE_RCU, - HK_TYPE_MISC, - HK_TYPE_TICK, HK_TYPE_DOMAIN, - HK_TYPE_WQ, HK_TYPE_MANAGED_IRQ, - HK_TYPE_KTHREAD, - HK_TYPE_MAX + HK_TYPE_KERNEL_NOISE, + HK_TYPE_MAX, + + /* + * The following housekeeping types are only set by the nohz_full + * boot commandline option. So they can share the same value. + */ + HK_TYPE_TICK = HK_TYPE_KERNEL_NOISE, + HK_TYPE_TIMER = HK_TYPE_KERNEL_NOISE, + HK_TYPE_RCU = HK_TYPE_KERNEL_NOISE, + HK_TYPE_MISC = HK_TYPE_KERNEL_NOISE, + HK_TYPE_WQ = HK_TYPE_KERNEL_NOISE, + HK_TYPE_KTHREAD = HK_TYPE_KERNEL_NOISE }; #ifdef CONFIG_CPU_ISOLATION diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index 6a686322ce3c..81bc8b329ef1 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -9,14 +9,9 @@ */ enum hk_flags { - HK_FLAG_TIMER = BIT(HK_TYPE_TIMER), - HK_FLAG_RCU = BIT(HK_TYPE_RCU), - HK_FLAG_MISC = BIT(HK_TYPE_MISC), - HK_FLAG_TICK = BIT(HK_TYPE_TICK), HK_FLAG_DOMAIN = BIT(HK_TYPE_DOMAIN), - HK_FLAG_WQ = BIT(HK_TYPE_WQ), HK_FLAG_MANAGED_IRQ = BIT(HK_TYPE_MANAGED_IRQ), - HK_FLAG_KTHREAD = BIT(HK_TYPE_KTHREAD), + HK_FLAG_KERNEL_NOISE = BIT(HK_TYPE_KERNEL_NOISE), }; DEFINE_STATIC_KEY_FALSE(housekeeping_overridden); @@ -96,7 +91,7 @@ void __init housekeeping_init(void) static_branch_enable(&housekeeping_overridden); - if (housekeeping.flags & HK_FLAG_TICK) + if (housekeeping.flags & HK_FLAG_KERNEL_NOISE) sched_tick_offload_init(); for_each_set_bit(type, &housekeeping.flags, HK_TYPE_MAX) { @@ -120,7 +115,7 @@ static int __init housekeeping_setup(char *str, unsigned long flags) unsigned int first_cpu; int err = 0; - if ((flags & HK_FLAG_TICK) && !(housekeeping.flags & HK_FLAG_TICK)) { + if ((flags & HK_FLAG_KERNEL_NOISE) && !(housekeeping.flags & HK_FLAG_KERNEL_NOISE)) { if (!IS_ENABLED(CONFIG_NO_HZ_FULL)) { pr_warn("Housekeeping: nohz unsupported." 
" Build with CONFIG_NO_HZ_FULL\n"); @@ -176,7 +171,7 @@ static int __init housekeeping_setup(char *str, unsigned long flags) housekeeping_setup_type(type, housekeeping_staging); } - if ((flags & HK_FLAG_TICK) && !(housekeeping.flags & HK_FLAG_TICK)) + if ((flags & HK_FLAG_KERNEL_NOISE) && !(housekeeping.flags & HK_FLAG_KERNEL_NOISE)) tick_nohz_full_setup(non_housekeeping_mask); housekeeping.flags |= flags; @@ -194,8 +189,7 @@ static int __init housekeeping_nohz_full_setup(char *str) { unsigned long flags; - flags = HK_FLAG_TICK | HK_FLAG_WQ | HK_FLAG_TIMER | HK_FLAG_RCU | - HK_FLAG_MISC | HK_FLAG_KTHREAD; + flags = HK_FLAG_KERNEL_NOISE; return housekeeping_setup(str, flags); } @@ -214,8 +208,7 @@ static int __init housekeeping_isolcpus_setup(char *str) */ if (!strncmp(str, "nohz,", 5)) { str += 5; - flags |= HK_FLAG_TICK | HK_FLAG_WQ | HK_FLAG_TIMER | - HK_FLAG_RCU | HK_FLAG_MISC | HK_FLAG_KTHREAD; + flags |= HK_FLAG_KERNEL_NOISE; continue; } From c907cd44a108eff7005a2b5689bb91f50637df8b Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 30 Oct 2024 13:52:53 -0400 Subject: [PATCH 052/224] sched: Unify HK_TYPE_{TIMER|TICK|MISC} to HK_TYPE_KERNEL_NOISE As all the non-domain and non-managed_irq housekeeping types have been unified to HK_TYPE_KERNEL_NOISE, replace all these references in the scheduler to use HK_TYPE_KERNEL_NOISE. Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Acked-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20241030175253.125248-5-longman@redhat.com --- kernel/sched/core.c | 12 ++++++------ kernel/sched/fair.c | 5 +++-- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1dee3f5ef940..5fbec67d48b2 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1170,13 +1170,13 @@ int get_nohz_timer_target(void) struct sched_domain *sd; const struct cpumask *hk_mask; - if (housekeeping_cpu(cpu, HK_TYPE_TIMER)) { + if (housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE)) { if (!idle_cpu(cpu)) return cpu; default_cpu = cpu; } - hk_mask = housekeeping_cpumask(HK_TYPE_TIMER); + hk_mask = housekeeping_cpumask(HK_TYPE_KERNEL_NOISE); guard(rcu)(); @@ -1191,7 +1191,7 @@ int get_nohz_timer_target(void) } if (default_cpu == -1) - default_cpu = housekeeping_any_cpu(HK_TYPE_TIMER); + default_cpu = housekeeping_any_cpu(HK_TYPE_KERNEL_NOISE); return default_cpu; } @@ -5634,7 +5634,7 @@ void sched_tick(void) unsigned long hw_pressure; u64 resched_latency; - if (housekeeping_cpu(cpu, HK_TYPE_TICK)) + if (housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE)) arch_scale_freq_tick(); sched_clock_tick(); @@ -5773,7 +5773,7 @@ static void sched_tick_start(int cpu) int os; struct tick_work *twork; - if (housekeeping_cpu(cpu, HK_TYPE_TICK)) + if (housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE)) return; WARN_ON_ONCE(!tick_work_cpu); @@ -5794,7 +5794,7 @@ static void sched_tick_stop(int cpu) struct tick_work *twork; int os; - if (housekeeping_cpu(cpu, HK_TYPE_TICK)) + if (housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE)) return; WARN_ON_ONCE(!tick_work_cpu); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ef302263f5b3..d5127d9beaea 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -12203,7 +12203,7 @@ static inline int find_new_ilb(void) const struct cpumask *hk_mask; int ilb_cpu; - hk_mask = housekeeping_cpumask(HK_TYPE_MISC); + hk_mask = housekeeping_cpumask(HK_TYPE_KERNEL_NOISE); for_each_cpu_and(ilb_cpu, nohz.idle_cpus_mask, hk_mask) { @@ -12221,7 +12221,8 @@ static inline int find_new_ilb(void) * Kick a CPU 
to do the NOHZ balancing, if it is time for it, via a cross-CPU * SMP function call (IPI). * - * We pick the first idle CPU in the HK_TYPE_MISC housekeeping set (if there is one). + * We pick the first idle CPU in the HK_TYPE_KERNEL_NOISE housekeeping set + * (if there is one). */ static void kick_ilb(unsigned int flags) { From c3cb6c158c64dc39838208d51dcd06d1990b371d Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 11 Oct 2024 19:08:50 +0200 Subject: [PATCH 053/224] objtool: Allow arch code to discover jump table size In preparation for adding support for annotated jump tables, where ELF relocations and symbols are used to describe the locations of jump tables in the executable, refactor the jump table discovery logic so the table size can be returned from arch_find_switch_table(). Signed-off-by: Ard Biesheuvel Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20241011170847.334429-12-ardb+git@google.com --- tools/objtool/arch/loongarch/special.c | 3 ++- tools/objtool/arch/powerpc/special.c | 3 ++- tools/objtool/arch/x86/special.c | 4 +++- tools/objtool/check.c | 31 ++++++++++++++++--------- tools/objtool/include/objtool/check.h | 5 +++- tools/objtool/include/objtool/special.h | 3 ++- 6 files changed, 33 insertions(+), 16 deletions(-) diff --git a/tools/objtool/arch/loongarch/special.c b/tools/objtool/arch/loongarch/special.c index 9bba1e9318e0..87230ed570fd 100644 --- a/tools/objtool/arch/loongarch/special.c +++ b/tools/objtool/arch/loongarch/special.c @@ -9,7 +9,8 @@ bool arch_support_alt_relocation(struct special_alt *special_alt, } struct reloc *arch_find_switch_table(struct objtool_file *file, - struct instruction *insn) + struct instruction *insn, + unsigned long *table_size) { return NULL; } diff --git a/tools/objtool/arch/powerpc/special.c b/tools/objtool/arch/powerpc/special.c index d33868147196..51610689abf7 100644 --- a/tools/objtool/arch/powerpc/special.c +++ b/tools/objtool/arch/powerpc/special.c @@ -13,7 +13,8 @@ bool arch_support_alt_relocation(struct special_alt *special_alt, } struct reloc *arch_find_switch_table(struct objtool_file *file, - struct instruction *insn) + struct instruction *insn, + unsigned long *table_size) { exit(-1); } diff --git a/tools/objtool/arch/x86/special.c b/tools/objtool/arch/x86/special.c index 4ea0f9815fda..9c1c9df09aaa 100644 --- a/tools/objtool/arch/x86/special.c +++ b/tools/objtool/arch/x86/special.c @@ -109,7 +109,8 @@ bool arch_support_alt_relocation(struct special_alt *special_alt, * NOTE: MITIGATION_RETPOLINE made it harder still to decode dynamic jumps. 
*/ struct reloc *arch_find_switch_table(struct objtool_file *file, - struct instruction *insn) + struct instruction *insn, + unsigned long *table_size) { struct reloc *text_reloc, *rodata_reloc; struct section *table_sec; @@ -158,5 +159,6 @@ struct reloc *arch_find_switch_table(struct objtool_file *file, if (reloc_type(text_reloc) == R_X86_64_PC32) file->ignore_unreachables = true; + *table_size = 0; return rodata_reloc; } diff --git a/tools/objtool/check.c b/tools/objtool/check.c index bfb407f3ac96..e92c5564d9ca 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -150,6 +150,15 @@ static inline struct reloc *insn_jump_table(struct instruction *insn) return NULL; } +static inline unsigned long insn_jump_table_size(struct instruction *insn) +{ + if (insn->type == INSN_JUMP_DYNAMIC || + insn->type == INSN_CALL_DYNAMIC) + return insn->_jump_table_size; + + return 0; +} + static bool is_jump_table_jump(struct instruction *insn) { struct alt_group *alt_group = insn->alt_group; @@ -1937,6 +1946,7 @@ out: static int add_jump_table(struct objtool_file *file, struct instruction *insn, struct reloc *next_table) { + unsigned long table_size = insn_jump_table_size(insn); struct symbol *pfunc = insn_func(insn)->pfunc; struct reloc *table = insn_jump_table(insn); struct instruction *dest_insn; @@ -1951,6 +1961,8 @@ static int add_jump_table(struct objtool_file *file, struct instruction *insn, for_each_reloc_from(table->sec, reloc) { /* Check for the end of the table: */ + if (table_size && reloc_offset(reloc) - reloc_offset(table) >= table_size) + break; if (reloc != table && reloc == next_table) break; @@ -1995,12 +2007,12 @@ static int add_jump_table(struct objtool_file *file, struct instruction *insn, * find_jump_table() - Given a dynamic jump, find the switch jump table * associated with it. 
*/ -static struct reloc *find_jump_table(struct objtool_file *file, - struct symbol *func, - struct instruction *insn) +static void find_jump_table(struct objtool_file *file, struct symbol *func, + struct instruction *insn) { struct reloc *table_reloc; struct instruction *dest_insn, *orig_insn = insn; + unsigned long table_size; /* * Backward search using the @first_jump_src links, these help avoid @@ -2021,17 +2033,17 @@ static struct reloc *find_jump_table(struct objtool_file *file, insn->jump_dest->offset > orig_insn->offset)) break; - table_reloc = arch_find_switch_table(file, insn); + table_reloc = arch_find_switch_table(file, insn, &table_size); if (!table_reloc) continue; dest_insn = find_insn(file, table_reloc->sym->sec, reloc_addend(table_reloc)); if (!dest_insn || !insn_func(dest_insn) || insn_func(dest_insn)->pfunc != func) continue; - return table_reloc; + orig_insn->_jump_table = table_reloc; + orig_insn->_jump_table_size = table_size; + break; } - - return NULL; } /* @@ -2042,7 +2054,6 @@ static void mark_func_jump_tables(struct objtool_file *file, struct symbol *func) { struct instruction *insn, *last = NULL; - struct reloc *reloc; func_for_each_insn(file, func, insn) { if (!last) @@ -2065,9 +2076,7 @@ static void mark_func_jump_tables(struct objtool_file *file, if (insn->type != INSN_JUMP_DYNAMIC) continue; - reloc = find_jump_table(file, func, insn); - if (reloc) - insn->_jump_table = reloc; + find_jump_table(file, func, insn); } } diff --git a/tools/objtool/include/objtool/check.h b/tools/objtool/include/objtool/check.h index daa46f1f0965..e1cd13cd28a3 100644 --- a/tools/objtool/include/objtool/check.h +++ b/tools/objtool/include/objtool/check.h @@ -71,7 +71,10 @@ struct instruction { struct instruction *first_jump_src; union { struct symbol *_call_dest; - struct reloc *_jump_table; + struct { + struct reloc *_jump_table; + unsigned long _jump_table_size; + }; }; struct alternative *alts; struct symbol *sym; diff --git a/tools/objtool/include/objtool/special.h b/tools/objtool/include/objtool/special.h index 86d4af9c5aa9..e7ee7ffccefd 100644 --- a/tools/objtool/include/objtool/special.h +++ b/tools/objtool/include/objtool/special.h @@ -38,5 +38,6 @@ bool arch_support_alt_relocation(struct special_alt *special_alt, struct instruction *insn, struct reloc *reloc); struct reloc *arch_find_switch_table(struct objtool_file *file, - struct instruction *insn); + struct instruction *insn, + unsigned long *table_size); #endif /* _SPECIAL_H */ From 5daececd4ff533ab316ab360aba0bda1bf01961d Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Sat, 30 Nov 2024 13:26:44 +0100 Subject: [PATCH 054/224] x86/boot/compressed: Remove unused header includes from kaslr.c Nothing is using the linux/ namespace headers anymore. Remove them. 
Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20241130122644.GAZ0sEhD3Bm_9ZAIuc@fat_crate.local --- arch/x86/boot/compressed/kaslr.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index f4d82379bf44..f03d59ea6e40 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -25,10 +25,6 @@ #include "efi.h" #include -#include -#include -#include -#include #include #include From 6f8b79683dfb37ee0661cf4c13a72f024c29f65c Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 19 Nov 2024 12:42:34 +0200 Subject: [PATCH 055/224] genirq: Move irq_thread_fn() further up in the code In preparation for reusing irq_thread_fn(), move it further up in the code. No functional change intended. Signed-off-by: Andy Shevchenko Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20241119104339.2112455-2-andriy.shevchenko@linux.intel.com --- kernel/irq/manage.c | 37 +++++++++++++++++-------------------- 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index f0803d6bd296..230f4701f18e 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1181,14 +1181,29 @@ out_unlock: chip_bus_sync_unlock(desc); } +/* + * Interrupts explicitly requested as threaded interrupts want to be + * preemptible - many of them need to sleep and wait for slow busses to + * complete. + */ +static irqreturn_t irq_thread_fn(struct irq_desc *desc, struct irqaction *action) +{ + irqreturn_t ret = action->thread_fn(action->irq, action->dev_id); + + if (ret == IRQ_HANDLED) + atomic_inc(&desc->threads_handled); + + irq_finalize_oneshot(desc, action); + return ret; +} + /* * Interrupts which are not explicitly requested as threaded * interrupts rely on the implicit bh/preempt disable of the hard irq * context. So we need to disable bh here to avoid deadlocks and other * side effects. */ -static irqreturn_t -irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action) +static irqreturn_t irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action) { irqreturn_t ret; @@ -1206,24 +1221,6 @@ irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action) return ret; } -/* - * Interrupts explicitly requested as threaded interrupts want to be - * preemptible - many of them need to sleep and wait for slow busses to - * complete. - */ -static irqreturn_t irq_thread_fn(struct irq_desc *desc, - struct irqaction *action) -{ - irqreturn_t ret; - - ret = action->thread_fn(action->irq, action->dev_id); - if (ret == IRQ_HANDLED) - atomic_inc(&desc->threads_handled); - - irq_finalize_oneshot(desc, action); - return ret; -} - void wake_threads_waitq(struct irq_desc *desc) { if (atomic_dec_and_test(&desc->threads_active)) From 429f49ad361cd999ca221d8b562ae2552b7c3e2c Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 19 Nov 2024 12:42:35 +0200 Subject: [PATCH 056/224] genirq: Reuse irq_thread_fn() for forced thread case irq_forced_thread_fn() uses the same action callback as the non-forced variant but with different locking decorations. Reuse irq_thread_fn() here to make that clear. 
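To make the reuse concrete, here is a sketch of the resulting shape of the function (mirroring the hunk that follows, not new code): the forced variant reduces to the bh/irq-disable decorations wrapped around the shared helper:

	static irqreturn_t irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action)
	{
		irqreturn_t ret;

		local_bh_disable();
		if (!IS_ENABLED(CONFIG_PREEMPT_RT))
			local_irq_disable();
		ret = irq_thread_fn(desc, action);	/* shared invocation and accounting */
		if (!IS_ENABLED(CONFIG_PREEMPT_RT))
			local_irq_enable();
		local_bh_enable();
		return ret;
	}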
Signed-off-by: Andy Shevchenko Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20241119104339.2112455-3-andriy.shevchenko@linux.intel.com --- kernel/irq/manage.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 230f4701f18e..f300bb6be3bd 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1210,11 +1210,7 @@ static irqreturn_t irq_forced_thread_fn(struct irq_desc *desc, struct irqaction local_bh_disable(); if (!IS_ENABLED(CONFIG_PREEMPT_RT)) local_irq_disable(); - ret = action->thread_fn(action->irq, action->dev_id); - if (ret == IRQ_HANDLED) - atomic_inc(&desc->threads_handled); - - irq_finalize_oneshot(desc, action); + ret = irq_thread_fn(desc, action); if (!IS_ENABLED(CONFIG_PREEMPT_RT)) local_irq_enable(); local_bh_enable(); From 6a5abeea9c72e1d2c538622b4cf66c80cc816fd3 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Mon, 2 Dec 2024 09:31:39 +0200 Subject: [PATCH 057/224] x86/mtrr: Rename mtrr_overwrite_state() to guest_force_mtrr_state() Rename the helper to better reflect its function. Suggested-by: Dave Hansen Signed-off-by: Kirill A. Shutemov Signed-off-by: Dave Hansen Acked-by: Dave Hansen Link: https://lore.kernel.org/all/20241202073139.448208-1-kirill.shutemov%40linux.intel.com --- arch/x86/hyperv/ivm.c | 2 +- arch/x86/include/asm/mtrr.h | 10 +++++----- arch/x86/kernel/cpu/mtrr/generic.c | 6 +++--- arch/x86/kernel/cpu/mtrr/mtrr.c | 2 +- arch/x86/kernel/kvm.c | 2 +- arch/x86/xen/enlighten_pv.c | 4 ++-- 6 files changed, 13 insertions(+), 13 deletions(-) diff --git a/arch/x86/hyperv/ivm.c b/arch/x86/hyperv/ivm.c index 60fc3ed72830..90aabe1fd3b6 100644 --- a/arch/x86/hyperv/ivm.c +++ b/arch/x86/hyperv/ivm.c @@ -664,7 +664,7 @@ void __init hv_vtom_init(void) x86_platform.guest.enc_status_change_finish = hv_vtom_set_host_visibility; /* Set WB as the default cache mode. */ - mtrr_overwrite_state(NULL, 0, MTRR_TYPE_WRBACK); + guest_force_mtrr_state(NULL, 0, MTRR_TYPE_WRBACK); } #endif /* defined(CONFIG_AMD_MEM_ENCRYPT) || defined(CONFIG_INTEL_TDX_GUEST) */ diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h index 4218248083d9..c69e269937c5 100644 --- a/arch/x86/include/asm/mtrr.h +++ b/arch/x86/include/asm/mtrr.h @@ -58,8 +58,8 @@ struct mtrr_state_type { */ # ifdef CONFIG_MTRR void mtrr_bp_init(void); -void mtrr_overwrite_state(struct mtrr_var_range *var, unsigned int num_var, - mtrr_type def_type); +void guest_force_mtrr_state(struct mtrr_var_range *var, unsigned int num_var, + mtrr_type def_type); extern u8 mtrr_type_lookup(u64 addr, u64 end, u8 *uniform); extern void mtrr_save_fixed_ranges(void *); extern void mtrr_save_state(void); @@ -75,9 +75,9 @@ void mtrr_disable(void); void mtrr_enable(void); void mtrr_generic_set_state(void); # else -static inline void mtrr_overwrite_state(struct mtrr_var_range *var, - unsigned int num_var, - mtrr_type def_type) +static inline void guest_force_mtrr_state(struct mtrr_var_range *var, + unsigned int num_var, + mtrr_type def_type) { } diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 7b29ebda024f..2fdfda2b60e4 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -423,7 +423,7 @@ void __init mtrr_copy_map(void) } /** - * mtrr_overwrite_state - set static MTRR state + * guest_force_mtrr_state - set static MTRR state for a guest * * Used to set MTRR state via different means (e.g. with data obtained from * a hypervisor). 
@@ -436,8 +436,8 @@ void __init mtrr_copy_map(void) * @num_var: length of the @var array * @def_type: default caching type */ -void mtrr_overwrite_state(struct mtrr_var_range *var, unsigned int num_var, - mtrr_type def_type) +void guest_force_mtrr_state(struct mtrr_var_range *var, unsigned int num_var, + mtrr_type def_type) { unsigned int i; diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.c b/arch/x86/kernel/cpu/mtrr/mtrr.c index 989d368be04f..ecbda0341a8a 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.c +++ b/arch/x86/kernel/cpu/mtrr/mtrr.c @@ -625,7 +625,7 @@ void mtrr_save_state(void) static int __init mtrr_init_finalize(void) { /* - * Map might exist if mtrr_overwrite_state() has been called or if + * Map might exist if guest_force_mtrr_state() has been called or if * mtrr_enabled() returns true. */ mtrr_copy_map(); diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 21e9e4845354..7a422a6c5983 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -983,7 +983,7 @@ static void __init kvm_init_platform(void) x86_platform.apic_post_init = kvm_apic_init; /* Set WB as the default cache mode for SEV-SNP and TDX */ - mtrr_overwrite_state(NULL, 0, MTRR_TYPE_WRBACK); + guest_force_mtrr_state(NULL, 0, MTRR_TYPE_WRBACK); } #if defined(CONFIG_AMD_MEM_ENCRYPT) diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index d6818c6cafda..633469fab536 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -171,7 +171,7 @@ static void __init xen_set_mtrr_data(void) /* Only overwrite MTRR state if any MTRR could be got from Xen. */ if (reg) - mtrr_overwrite_state(var, reg, MTRR_TYPE_UNCACHABLE); + guest_force_mtrr_state(var, reg, MTRR_TYPE_UNCACHABLE); #endif } @@ -195,7 +195,7 @@ static void __init xen_pv_init_platform(void) if (xen_initial_domain()) xen_set_mtrr_data(); else - mtrr_overwrite_state(NULL, 0, MTRR_TYPE_WRBACK); + guest_force_mtrr_state(NULL, 0, MTRR_TYPE_WRBACK); /* Adjust nr_cpu_ids before "enumeration" happens */ xen_smp_count_cpus(); From cd9ce8217345bd13035a0d3edaaecec4244d0ddd Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Mon, 2 Dec 2024 09:24:31 +0200 Subject: [PATCH 058/224] x86/tdx: Disable unnecessary virtualization exceptions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Originally, #VE was defined as the TDX behavior in order to support paravirtualization of x86 features that can’t be virtualized by the TDX module. The intention is that if guest software wishes to use such a feature, it implements some logic to support this. This logic resides in the #VE exception handler, and it may work in cooperation with the host VMM. Theoretically, the guest TD’s #VE handler was supposed to act as a "TDX enlightenment agent" inside the TD. However, in practice, the #VE handler is simplistic: - #VE on CPUID is handled by returning all-0 to the code which executed CPUID. In many cases, an all-0 value is not the correct value, and may cause improper operation. - #VE on RDMSR is handled by requesting the MSR value from the host VMM. This is prone to security issues since the host VMM is untrusted. It may also be functionally incorrect in case the expected operation is to paravirtualize some CPU functionality. Newer TDX modules provide a "REDUCE_VE" feature. When enabled, it drastically cuts cases when guests receive #VE on MSR and CPUID accesses. Basically, instead of punting the problem to the VMM, the TDX module fills in good data. What the TDX module provides is obviously highly specific to the MSR or CPUID. This is all spelled out in excruciating detail in the TDX specs. Enable REDUCE_VE. It makes TDX guest behaviour less odd, and closer to how a normal CPU behaves. Note that enabling the feature doesn't eliminate the need for a #VE handler for CPUID and MSR accesses. Some MSRs still generate #VE (notably APIC-related ones) and the kernel needs the CPUID #VE handler to ask the VMM for leaves in the hypervisor range. [ dhansen: changelog tweaks, rename/rework VE reduction function ] Signed-off-by: Kirill A. Shutemov Signed-off-by: Dave Hansen Reviewed-by: Nikolay Borisov Link: https://lore.kernel.org/all/20241202072431.447380-1-kirill.shutemov%40linux.intel.com --- arch/x86/coco/tdx/tdx.c | 17 ++++++++++++++++- arch/x86/include/asm/shared/tdx.h | 1 + 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c index 0d9b090b4880..c0ebe8cc147e 100644 --- a/arch/x86/coco/tdx/tdx.c +++ b/arch/x86/coco/tdx/tdx.c @@ -274,6 +274,20 @@ static void enable_cpu_topology_enumeration(void) tdg_vm_wr(TDCS_TD_CTLS, TD_CTLS_ENUM_TOPOLOGY, TD_CTLS_ENUM_TOPOLOGY); } +static void reduce_unnecessary_ve(void) +{ + u64 err = tdg_vm_wr(TDCS_TD_CTLS, TD_CTLS_REDUCE_VE, TD_CTLS_REDUCE_VE); + + if (err == TDX_SUCCESS) + return; + + /* + * Enabling REDUCE_VE includes ENUM_TOPOLOGY. Only try to + * enable ENUM_TOPOLOGY if REDUCE_VE was not successful. + */ + enable_cpu_topology_enumeration(); +} + static void tdx_setup(u64 *cc_mask) { struct tdx_module_args args = {}; @@ -305,7 +319,8 @@ static void tdx_setup(u64 *cc_mask) tdg_vm_wr(TDCS_NOTIFY_ENABLES, 0, -1ULL); disable_sept_ve(td_attr); - enable_cpu_topology_enumeration(); + + reduce_unnecessary_ve(); } /* diff --git a/arch/x86/include/asm/shared/tdx.h b/arch/x86/include/asm/shared/tdx.h index 89f7fcade8ae..a878c7e8347b 100644 --- a/arch/x86/include/asm/shared/tdx.h +++ b/arch/x86/include/asm/shared/tdx.h @@ -31,6 +31,7 @@ /* TDCS_TD_CTLS bits */ #define TD_CTLS_PENDING_VE_DISABLE BIT_ULL(0) #define TD_CTLS_ENUM_TOPOLOGY BIT_ULL(1) +#define TD_CTLS_REDUCE_VE BIT_ULL(3) /* TDX hypercall Leaf IDs */ #define TDVMCALL_MAP_GPA 0x10001 From 09d35045cd0f4265cf1dfe18ef83285fdc294688 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 5 Dec 2024 12:28:06 +0100 Subject: [PATCH 059/224] x86/sev: Avoid WARN()s and panic()s in early boot code Using WARN() or panic() while executing from the early 1:1 mapping is unlikely to do anything useful: the string literals are passed using their kernel virtual addresses which are not even mapped yet. But even if they were, calling into the printk() machinery from the early 1:1 mapped code is not going to get very far. So drop the WARN()s entirely, and replace panic() with a deadloop. Signed-off-by: Ard Biesheuvel Signed-off-by: Ingo Molnar Cc: Linus Torvalds Cc: "H. 
Peter Anvin" Link: https://lore.kernel.org/r/20241205112804.3416920-10-ardb+git@google.com --- arch/x86/coco/sev/core.c | 15 +++++---------- arch/x86/coco/sev/shared.c | 9 +++++---- 2 files changed, 10 insertions(+), 14 deletions(-) diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c index c5b0148b8c0a..499b41953e3c 100644 --- a/arch/x86/coco/sev/core.c +++ b/arch/x86/coco/sev/core.c @@ -777,15 +777,10 @@ early_set_pages_state(unsigned long vaddr, unsigned long paddr, val = sev_es_rd_ghcb_msr(); - if (WARN(GHCB_RESP_CODE(val) != GHCB_MSR_PSC_RESP, - "Wrong PSC response code: 0x%x\n", - (unsigned int)GHCB_RESP_CODE(val))) + if (GHCB_RESP_CODE(val) != GHCB_MSR_PSC_RESP) goto e_term; - if (WARN(GHCB_MSR_PSC_RESP_VAL(val), - "Failed to change page state to '%s' paddr 0x%lx error 0x%llx\n", - op == SNP_PAGE_STATE_PRIVATE ? "private" : "shared", - paddr, GHCB_MSR_PSC_RESP_VAL(val))) + if (GHCB_MSR_PSC_RESP_VAL(val)) goto e_term; /* Page validation must be performed after changing to private */ @@ -821,7 +816,7 @@ void __head early_snp_set_memory_private(unsigned long vaddr, unsigned long padd early_set_pages_state(vaddr, paddr, npages, SNP_PAGE_STATE_PRIVATE); } -void __init early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, +void __head early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, unsigned long npages) { /* @@ -2361,8 +2356,8 @@ static __head void svsm_setup(struct cc_blob_sev_info *cc_info) call.rax = SVSM_CORE_CALL(SVSM_CORE_REMAP_CA); call.rcx = pa; ret = svsm_perform_call_protocol(&call); - if (ret) - panic("Can't remap the SVSM CA, ret=%d, rax_out=0x%llx\n", ret, call.rax_out); + while (ret) + cpu_relax(); /* too early to panic */ RIP_REL_REF(boot_svsm_caa) = (struct svsm_ca *)pa; RIP_REL_REF(boot_svsm_caa_pa) = pa; diff --git a/arch/x86/coco/sev/shared.c b/arch/x86/coco/sev/shared.c index 71de53194089..afb7ffc355fe 100644 --- a/arch/x86/coco/sev/shared.c +++ b/arch/x86/coco/sev/shared.c @@ -1243,7 +1243,7 @@ static void svsm_pval_terminate(struct svsm_pvalidate_call *pc, int ret, u64 svs __pval_terminate(pfn, action, page_size, ret, svsm_ret); } -static void svsm_pval_4k_page(unsigned long paddr, bool validate) +static void __head svsm_pval_4k_page(unsigned long paddr, bool validate) { struct svsm_pvalidate_call *pc; struct svsm_call call = {}; @@ -1275,12 +1275,13 @@ static void svsm_pval_4k_page(unsigned long paddr, bool validate) ret = svsm_perform_call_protocol(&call); if (ret) - svsm_pval_terminate(pc, ret, call.rax_out); + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PVALIDATE); native_local_irq_restore(flags); } -static void pvalidate_4k_page(unsigned long vaddr, unsigned long paddr, bool validate) +static void __head pvalidate_4k_page(unsigned long vaddr, unsigned long paddr, + bool validate) { int ret; @@ -1293,7 +1294,7 @@ static void pvalidate_4k_page(unsigned long vaddr, unsigned long paddr, bool val } else { ret = pvalidate(vaddr, RMP_PG_SIZE_4K, validate); if (ret) - __pval_terminate(PHYS_PFN(paddr), validate, RMP_PG_SIZE_4K, ret, 0); + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PVALIDATE); } } From 093562198e1a6360672954293753f4c6cb9a3316 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 5 Dec 2024 12:28:07 +0100 Subject: [PATCH 060/224] x86/boot/64: Determine VA/PA offset before entering C code Implicit absolute symbol references (e.g., taking the address of a global variable) must be avoided in the C code that runs from the early 1:1 mapping of the kernel, given that this is a practice that violates 
assumptions on the part of the toolchain. I.e., RIP-relative and absolute references are expected to produce the same values, and so the compiler is free to choose either. However, the code currently assumes that RIP-relative references are never emitted here. So an explicit virtual-to-physical offset needs to be used instead to derive the kernel virtual addresses of _text and _end, rather than simply taking the addresses and assuming that the compiler will not choose to use a RIP-relative reference in this particular case. Currently, phys_base is already used to perform such calculations, but it is derived from the kernel virtual address of _text, which is taken using an implicit absolute symbol reference. So instead, derive this VA-to-PA offset in asm code, and pass it to the C startup code. Signed-off-by: Ard Biesheuvel Signed-off-by: Ingo Molnar Cc: Linus Torvalds Cc: "H. Peter Anvin" Link: https://lore.kernel.org/r/20241205112804.3416920-11-ardb+git@google.com --- arch/x86/include/asm/setup.h | 2 +- arch/x86/kernel/head64.c | 8 +++++--- arch/x86/kernel/head_64.S | 12 +++++++++--- 3 files changed, 15 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 0667b2a88614..85f4fde3515c 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -49,7 +49,7 @@ extern unsigned long saved_video_mode; extern void reserve_standard_io_resources(void); extern void i386_reserve_resources(void); -extern unsigned long __startup_64(unsigned long physaddr, struct boot_params *bp); +extern unsigned long __startup_64(unsigned long p2v_offset, struct boot_params *bp); extern void startup_64_setup_gdt_idt(void); extern void early_setup_idt(void); extern void __init do_early_exception(struct pt_regs *regs, int trapnr); diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 4b9d4557fc94..a7cd4053eeb3 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -138,12 +138,14 @@ static unsigned long __head sme_postprocess_startup(struct boot_params *bp, pmdv * doesn't have to generate PC-relative relocations when accessing globals from * that function. Clang actually does not generate them, which leads to * boot-time crashes. To work around this problem, every global pointer must - * be accessed using RIP_REL_REF(). + * be accessed using RIP_REL_REF(). Kernel virtual addresses can be determined + * by subtracting p2v_offset from the RIP-relative address. */ -unsigned long __head __startup_64(unsigned long physaddr, +unsigned long __head __startup_64(unsigned long p2v_offset, struct boot_params *bp) { pmd_t (*early_pgts)[PTRS_PER_PMD] = RIP_REL_REF(early_dynamic_pgts); + unsigned long physaddr = (unsigned long)&RIP_REL_REF(_text); unsigned long pgtable_flags; unsigned long load_delta; pgdval_t *pgd; @@ -163,7 +165,7 @@ unsigned long __head __startup_64(unsigned long physaddr, * Compute the delta between the address I am compiled to run at * and the address I am actually running at. */ - load_delta = physaddr - (unsigned long)(_text - __START_KERNEL_map); + load_delta = __START_KERNEL_map + p2v_offset; RIP_REL_REF(phys_base) = load_delta; /* Is the address not 2M aligned?
*/ diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 56163e2124cf..31345e0ba006 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -94,13 +94,19 @@ SYM_CODE_START_NOALIGN(startup_64) /* Sanitize CPU configuration */ call verify_cpu + /* + * Derive the kernel's physical-to-virtual offset from the physical and + * virtual addresses of common_startup_64(). + */ + leaq common_startup_64(%rip), %rdi + subq .Lcommon_startup_64(%rip), %rdi + /* * Perform pagetable fixups. Additionally, if SME is active, encrypt * the kernel and retrieve the modifier (SME encryption mask if SME * is active) to be added to the initial pgdir entry that will be * programmed into CR3. */ - leaq _text(%rip), %rdi movq %r15, %rsi call __startup_64 @@ -128,11 +134,11 @@ SYM_CODE_START_NOALIGN(startup_64) /* Branch to the common startup code at its kernel virtual address */ ANNOTATE_RETPOLINE_SAFE - jmp *0f(%rip) + jmp *.Lcommon_startup_64(%rip) SYM_CODE_END(startup_64) __INITRODATA -0: .quad common_startup_64 +SYM_DATA_LOCAL(.Lcommon_startup_64, .quad common_startup_64) .text SYM_CODE_START(secondary_startup_64) From 0d9b9a328cb605419ed046d341dc2a3d66ee0256 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 5 Dec 2024 12:28:08 +0100 Subject: [PATCH 061/224] x86/boot/64: Avoid intentional absolute symbol references in .head.text The code in .head.text executes from a 1:1 mapping and cannot generally refer to global variables using their kernel virtual addresses. However, there are some occurrences of such references that are valid: the kernel virtual addresses of _text and _end are needed to populate the page tables correctly, and some other section markers are used in a similar way. To avoid the need for making exceptions to the rule that .head.text must not contain any absolute symbol references, derive these addresses from the RIP-relative 1:1 mapped physical addresses, which can be safely determined using RIP_REL_REF(). Signed-off-by: Ard Biesheuvel Signed-off-by: Ingo Molnar Cc: Linus Torvalds Cc: "H. Peter Anvin" Link: https://lore.kernel.org/r/20241205112804.3416920-12-ardb+git@google.com --- arch/x86/kernel/head64.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index a7cd4053eeb3..54f9a8faf212 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -91,9 +91,11 @@ static inline bool check_la57_support(void) return true; } -static unsigned long __head sme_postprocess_startup(struct boot_params *bp, pmdval_t *pmd) +static unsigned long __head sme_postprocess_startup(struct boot_params *bp, + pmdval_t *pmd, + unsigned long p2v_offset) { - unsigned long vaddr, vaddr_end; + unsigned long paddr, paddr_end; int i; /* Encrypt the kernel and related (if SME is active) */ @@ -106,10 +108,10 @@ static unsigned long __head sme_postprocess_startup(struct boot_params *bp, pmdv * attribute. */ if (sme_get_me_mask()) { - vaddr = (unsigned long)__start_bss_decrypted; - vaddr_end = (unsigned long)__end_bss_decrypted; + paddr = (unsigned long)&RIP_REL_REF(__start_bss_decrypted); + paddr_end = (unsigned long)&RIP_REL_REF(__end_bss_decrypted); - for (; vaddr < vaddr_end; vaddr += PMD_SIZE) { + for (; paddr < paddr_end; paddr += PMD_SIZE) { /* * On SNP, transition the page to shared in the RMP table so that * it is consistent with the page table attribute change. 
@@ -118,11 +120,11 @@ static unsigned long __head sme_postprocess_startup(struct boot_params *bp, pmdv * mapping (kernel .text). PVALIDATE, by way of * early_snp_set_memory_shared(), requires a valid virtual * address but the kernel is currently running off of the identity - * mapping so use __pa() to get a *currently* valid virtual address. + * mapping so use the PA to get a *currently* valid virtual address. */ - early_snp_set_memory_shared(__pa(vaddr), __pa(vaddr), PTRS_PER_PMD); + early_snp_set_memory_shared(paddr, paddr, PTRS_PER_PMD); - i = pmd_index(vaddr); + i = pmd_index(paddr - p2v_offset); pmd[i] -= sme_get_me_mask(); } } @@ -146,6 +148,7 @@ unsigned long __head __startup_64(unsigned long p2v_offset, { pmd_t (*early_pgts)[PTRS_PER_PMD] = RIP_REL_REF(early_dynamic_pgts); unsigned long physaddr = (unsigned long)&RIP_REL_REF(_text); + unsigned long va_text, va_end; unsigned long pgtable_flags; unsigned long load_delta; pgdval_t *pgd; @@ -172,6 +175,9 @@ unsigned long __head __startup_64(unsigned long p2v_offset, if (load_delta & ~PMD_MASK) for (;;); + va_text = physaddr - p2v_offset; + va_end = (unsigned long)&RIP_REL_REF(_end) - p2v_offset; + /* Include the SME encryption mask in the fixup value */ load_delta += sme_get_me_mask(); @@ -232,7 +238,7 @@ unsigned long __head __startup_64(unsigned long p2v_offset, pmd_entry += sme_get_me_mask(); pmd_entry += physaddr; - for (i = 0; i < DIV_ROUND_UP(_end - _text, PMD_SIZE); i++) { + for (i = 0; i < DIV_ROUND_UP(va_end - va_text, PMD_SIZE); i++) { int idx = i + (physaddr >> PMD_SHIFT); pmd[idx % PTRS_PER_PMD] = pmd_entry + i * PMD_SIZE; @@ -257,11 +263,11 @@ unsigned long __head __startup_64(unsigned long p2v_offset, pmd = &RIP_REL_REF(level2_kernel_pgt)->pmd; /* invalidate pages before the kernel image */ - for (i = 0; i < pmd_index((unsigned long)_text); i++) + for (i = 0; i < pmd_index(va_text); i++) pmd[i] &= ~_PAGE_PRESENT; /* fixup pages that are part of the kernel image */ - for (; i <= pmd_index((unsigned long)_end); i++) + for (; i <= pmd_index(va_end); i++) if (pmd[i] & _PAGE_PRESENT) pmd[i] += load_delta; @@ -269,7 +275,7 @@ unsigned long __head __startup_64(unsigned long p2v_offset, for (; i < PTRS_PER_PMD; i++) pmd[i] &= ~_PAGE_PRESENT; - return sme_postprocess_startup(bp, pmd); + return sme_postprocess_startup(bp, pmd, p2v_offset); } /* Wipe all early page tables except for the kernel symbol map */ From 3b6f99a94b04b389292590840d96342b7dd08941 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 5 Dec 2024 12:28:09 +0100 Subject: [PATCH 062/224] x86/boot: Disable UBSAN in early boot code The early boot code runs from a 1:1 mapping of memory, and may execute before the kernel virtual mapping is even up. This means absolute symbol references cannot be permitted in this code. UBSAN injects references to global data structures into the code, and without -fPIC, those references are emitted as absolute references to kernel virtual addresses. Accessing those will fault before the kernel virtual mapping is up, so UBSAN needs to be disabled in early boot code. Signed-off-by: Ard Biesheuvel Signed-off-by: Ingo Molnar Cc: Linus Torvalds Cc: "H. 
Peter Anvin" Link: https://lore.kernel.org/r/20241205112804.3416920-13-ardb+git@google.com --- arch/x86/coco/sev/shared.c | 7 ++++--- arch/x86/include/asm/init.h | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/x86/coco/sev/shared.c b/arch/x86/coco/sev/shared.c index afb7ffc355fe..96023bd978cc 100644 --- a/arch/x86/coco/sev/shared.c +++ b/arch/x86/coco/sev/shared.c @@ -498,7 +498,7 @@ static const struct snp_cpuid_table *snp_cpuid_get_table(void) * * Return: XSAVE area size on success, 0 otherwise. */ -static u32 snp_cpuid_calc_xsave_size(u64 xfeatures_en, bool compacted) +static u32 __head snp_cpuid_calc_xsave_size(u64 xfeatures_en, bool compacted) { const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table(); u64 xfeatures_found = 0; @@ -576,8 +576,9 @@ static void snp_cpuid_hv(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpui sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_CPUID_HV); } -static int snp_cpuid_postprocess(struct ghcb *ghcb, struct es_em_ctxt *ctxt, - struct cpuid_leaf *leaf) +static int __head +snp_cpuid_postprocess(struct ghcb *ghcb, struct es_em_ctxt *ctxt, + struct cpuid_leaf *leaf) { struct cpuid_leaf leaf_hv = *leaf; diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h index 14d72727d7ee..0e82ebc5d1e1 100644 --- a/arch/x86/include/asm/init.h +++ b/arch/x86/include/asm/init.h @@ -2,7 +2,7 @@ #ifndef _ASM_X86_INIT_H #define _ASM_X86_INIT_H -#define __head __section(".head.text") +#define __head __section(".head.text") __no_sanitize_undefined struct x86_mapping_info { void *(*alloc_pgt_page)(void *); /* allocate buf for page table */ From 35350eb689e68897d996b762832782e2e791eb74 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 5 Dec 2024 12:28:10 +0100 Subject: [PATCH 063/224] x86/kernel: Move ENTRY_TEXT to the start of the image Since commit: 7734a0f31e99 ("x86/boot: Robustify calling startup_{32,64}() from the decompressor code") it is no longer necessary for .head.text to appear at the start of the image. Since ENTRY_TEXT needs to appear PMD-aligned, it is easier to just place it at the start of the image, rather than line it up with the end of the .text section. The amount of padding required should be the same, but this arrangement also permits .head.text to be split off and emitted separately, which is needed by a subsequent change. Signed-off-by: Ard Biesheuvel Signed-off-by: Ingo Molnar Cc: Linus Torvalds Cc: "H. 
Peter Anvin" Link: https://lore.kernel.org/r/20241205112804.3416920-14-ardb+git@google.com --- arch/x86/kernel/vmlinux.lds.S | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index fab3ac9a4574..1ce7889cd12b 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -121,19 +121,6 @@ SECTIONS .text : AT(ADDR(.text) - LOAD_OFFSET) { _text = .; _stext = .; - /* bootstrapping code */ - HEAD_TEXT - TEXT_TEXT - SCHED_TEXT - LOCK_TEXT - KPROBES_TEXT - SOFTIRQENTRY_TEXT -#ifdef CONFIG_MITIGATION_RETPOLINE - *(.text..__x86.indirect_thunk) - *(.text..__x86.return_thunk) -#endif - STATIC_CALL_TEXT - ALIGN_ENTRY_TEXT_BEGIN *(.text..__x86.rethunk_untrain) ENTRY_TEXT @@ -147,6 +134,19 @@ SECTIONS *(.text..__x86.rethunk_safe) #endif ALIGN_ENTRY_TEXT_END + + /* bootstrapping code */ + HEAD_TEXT + TEXT_TEXT + SCHED_TEXT + LOCK_TEXT + KPROBES_TEXT + SOFTIRQENTRY_TEXT +#ifdef CONFIG_MITIGATION_RETPOLINE + *(.text..__x86.indirect_thunk) + *(.text..__x86.return_thunk) +#endif + STATIC_CALL_TEXT *(.gnu.warning) } :text = 0xcccccccc From a6a4ae9c3f3a8894c54476cc842069f82af8361c Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 5 Dec 2024 12:28:11 +0100 Subject: [PATCH 064/224] x86/boot: Move .head.text into its own output section In order to be able to double check that vmlinux is emitted without absolute symbol references in .head.text, it needs to be distinguishable from the rest of .text in the ELF metadata. So move .head.text into its own ELF section. Signed-off-by: Ard Biesheuvel Signed-off-by: Ingo Molnar Cc: Linus Torvalds Cc: "H. Peter Anvin" Link: https://lore.kernel.org/r/20241205112804.3416920-15-ardb+git@google.com --- arch/x86/kernel/vmlinux.lds.S | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 1ce7889cd12b..56cdf13611e3 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -135,8 +135,6 @@ SECTIONS #endif ALIGN_ENTRY_TEXT_END - /* bootstrapping code */ - HEAD_TEXT TEXT_TEXT SCHED_TEXT LOCK_TEXT @@ -151,6 +149,11 @@ SECTIONS } :text = 0xcccccccc + /* bootstrapping code */ + .head.text : AT(ADDR(.head.text) - LOAD_OFFSET) { + HEAD_TEXT + } :text = 0xcccccccc + /* End of text section, which should occupy whole number of pages */ _etext = .; . = ALIGN(PAGE_SIZE); From faf0ed487415f76fe4acf7980ce360901f5e1698 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 5 Dec 2024 12:28:12 +0100 Subject: [PATCH 065/224] x86/boot: Reject absolute references in .head.text The .head.text section used to contain asm code that bootstrapped the page tables and switched to the kernel virtual address space before executing C code. The asm code carefully avoided dereferencing absolute symbol references, as those will fault before the page tables are installed. Today, the .head.text section contains lots of C code too, and getting the compiler to reason about absolute addresses taken from, e.g., section markers such as _text[] or _end[] but never use such absolute references to access global variables [*] is intractible. So instead, forbid the use of absolute references in .head.text entirely, and rely on explicit arithmetic involving VA-to-PA offsets generated by the asm startup code to construct virtual addresses where needed (e.g., to construct the page tables). 
Note that the 'relocs' tool is only used on the core kernel image when building a relocatable image, but this is the default, and so adding the check there is sufficient to catch new occurrences of code that uses absolute references before the kernel mapping is up. [*] it is feasible when using PIC codegen but there is strong pushback to using this for all of the core kernel, and using it only for .head.text is not straightforward. Signed-off-by: Ard Biesheuvel Signed-off-by: Ingo Molnar Cc: Linus Torvalds Cc: "H. Peter Anvin" Link: https://lore.kernel.org/r/20241205112804.3416920-16-ardb+git@google.com --- arch/x86/tools/relocs.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c index 27441e5863b2..e937be979ec8 100644 --- a/arch/x86/tools/relocs.c +++ b/arch/x86/tools/relocs.c @@ -841,10 +841,10 @@ static int is_percpu_sym(ElfW(Sym) *sym, const char *symname) static int do_reloc64(struct section *sec, Elf_Rel *rel, ElfW(Sym) *sym, const char *symname) { + int headtext = !strcmp(sec_name(sec->shdr.sh_info), ".head.text"); unsigned r_type = ELF64_R_TYPE(rel->r_info); ElfW(Addr) offset = rel->r_offset; int shn_abs = (sym->st_shndx == SHN_ABS) && !is_reloc(S_REL, symname); - if (sym->st_shndx == SHN_UNDEF) return 0; @@ -900,6 +900,12 @@ static int do_reloc64(struct section *sec, Elf_Rel *rel, ElfW(Sym) *sym, break; } + if (headtext) { + die("Absolute reference to symbol '%s' not permitted in .head.text\n", + symname); + break; + } + /* * Relocation offsets for 64 bit kernels are output * as 32 bits and sign extended back to 64 bits when From 63a48181fbcddefe5fb4c6618938bb64c543945b Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Tue, 3 Dec 2024 11:35:58 -0500 Subject: [PATCH 066/224] smp/scf: Evaluate local cond_func() before IPI side-effects In smp_call_function_many_cond(), the local cond_func() is evaluated after triggering the remote CPU IPIs. If cond_func() depends on loading shared state updated by the other CPUs' IPI handler func(), then triggering the remote CPUs' IPIs before evaluating cond_func() may have unexpected consequences. One example scenario is evaluating a jiffies delay in cond_func(), which is updated by func() in the IPI handlers. This situation can prevent execution of periodic cleanup code on the local CPU. Signed-off-by: Mathieu Desnoyers Signed-off-by: Ingo Molnar Reviewed-by: Rik van Riel Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lore.kernel.org/r/20241203163558.3455535-1-mathieu.desnoyers@efficios.com --- kernel/smp.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/smp.c b/kernel/smp.c index 27dc31a146a3..f104c8e83fc4 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -815,7 +815,8 @@ static void smp_call_function_many_cond(const struct cpumask *mask, WARN_ON_ONCE(!in_task()); /* Check if we need local execution. */ - if ((scf_flags & SCF_RUN_LOCAL) && cpumask_test_cpu(this_cpu, mask)) + if ((scf_flags & SCF_RUN_LOCAL) && cpumask_test_cpu(this_cpu, mask) && + (!cond_func || cond_func(this_cpu, info))) run_local = true; /* Check if we need remote execution, i.e., any CPU excluding this one.
*/ @@ -868,7 +869,7 @@ static void smp_call_function_many_cond(const struct cpumask *mask, send_call_function_ipi_mask(cfd->cpumask_ipi); } - if (run_local && (!cond_func || cond_func(this_cpu, info))) { + if (run_local) { unsigned long flags; local_irq_save(flags); From 564ea84c8c14b007d7838bfb1327295b873573be Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Mon, 2 Dec 2024 09:24:58 +0200 Subject: [PATCH 067/224] x86/tdx: Dump attributes and TD_CTLS on boot Dump TD configuration on boot. Attributes and TD_CTLS define TD behavior. This information is useful for tracking down bugs. The output ends up looking like this in practice: [ 0.000000] tdx: Guest detected [ 0.000000] tdx: Attributes: SEPT_VE_DISABLE [ 0.000000] tdx: TD_CTLS: PENDING_VE_DISABLE ENUM_TOPOLOGY VIRT_CPUID2 REDUCE_VE Signed-off-by: Kirill A. Shutemov Signed-off-by: Dave Hansen Reviewed-by: Nikolay Borisov Link: https://lore.kernel.org/all/20241202072458.447455-1-kirill.shutemov%40linux.intel.com --- arch/x86/coco/tdx/Makefile | 2 +- arch/x86/coco/tdx/debug.c | 69 +++++++++++++++++++++++++++++++ arch/x86/coco/tdx/tdx.c | 27 ++++++++---- arch/x86/include/asm/shared/tdx.h | 39 +++++++++++++++-- arch/x86/include/asm/tdx.h | 3 ++ 5 files changed, 128 insertions(+), 12 deletions(-) create mode 100644 arch/x86/coco/tdx/debug.c diff --git a/arch/x86/coco/tdx/Makefile b/arch/x86/coco/tdx/Makefile index 2c7dcbf1458b..b3c47d3700e2 100644 --- a/arch/x86/coco/tdx/Makefile +++ b/arch/x86/coco/tdx/Makefile @@ -1,3 +1,3 @@ # SPDX-License-Identifier: GPL-2.0 -obj-y += tdx.o tdx-shared.o tdcall.o +obj-y += debug.o tdcall.o tdx.o tdx-shared.o diff --git a/arch/x86/coco/tdx/debug.c b/arch/x86/coco/tdx/debug.c new file mode 100644 index 000000000000..cef847c8bb67 --- /dev/null +++ b/arch/x86/coco/tdx/debug.c @@ -0,0 +1,69 @@ +// SPDX-License-Identifier: GPL-2.0 + +#undef pr_fmt +#define pr_fmt(fmt) "tdx: " fmt + +#include +#include +#include + +#define DEF_TDX_ATTR_NAME(_name) [TDX_ATTR_##_name##_BIT] = __stringify(_name) + +static __initdata const char *tdx_attributes[] = { + DEF_TDX_ATTR_NAME(DEBUG), + DEF_TDX_ATTR_NAME(HGS_PLUS_PROF), + DEF_TDX_ATTR_NAME(PERF_PROF), + DEF_TDX_ATTR_NAME(PMT_PROF), + DEF_TDX_ATTR_NAME(ICSSD), + DEF_TDX_ATTR_NAME(LASS), + DEF_TDX_ATTR_NAME(SEPT_VE_DISABLE), + DEF_TDX_ATTR_NAME(MIGRTABLE), + DEF_TDX_ATTR_NAME(PKS), + DEF_TDX_ATTR_NAME(KL), + DEF_TDX_ATTR_NAME(TPA), + DEF_TDX_ATTR_NAME(PERFMON), +}; + +#define DEF_TD_CTLS_NAME(_name) [TD_CTLS_##_name##_BIT] = __stringify(_name) + +static __initdata const char *tdcs_td_ctls[] = { + DEF_TD_CTLS_NAME(PENDING_VE_DISABLE), + DEF_TD_CTLS_NAME(ENUM_TOPOLOGY), + DEF_TD_CTLS_NAME(VIRT_CPUID2), + DEF_TD_CTLS_NAME(REDUCE_VE), + DEF_TD_CTLS_NAME(LOCK), +}; + +void __init tdx_dump_attributes(u64 td_attr) +{ + pr_info("Attributes:"); + + for (int i = 0; i < ARRAY_SIZE(tdx_attributes); i++) { + if (!tdx_attributes[i]) + continue; + if (td_attr & BIT(i)) + pr_cont(" %s", tdx_attributes[i]); + td_attr &= ~BIT(i); + } + + if (td_attr) + pr_cont(" unknown:%#llx", td_attr); + pr_cont("\n"); + +} + +void __init tdx_dump_td_ctls(u64 td_ctls) +{ + pr_info("TD_CTLS:"); + + for (int i = 0; i < ARRAY_SIZE(tdcs_td_ctls); i++) { + if (!tdcs_td_ctls[i]) + continue; + if (td_ctls & BIT(i)) + pr_cont(" %s", tdcs_td_ctls[i]); + td_ctls &= ~BIT(i); + } + if (td_ctls) + pr_cont(" unknown:%#llx", td_ctls); + pr_cont("\n"); +} diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c index c0ebe8cc147e..32809a06dab4 100644 --- a/arch/x86/coco/tdx/tdx.c +++ b/arch/x86/coco/tdx/tdx.c 
@@ -32,9 +32,6 @@ #define VE_GET_PORT_NUM(e) ((e) >> 16) #define VE_IS_IO_STRING(e) ((e) & BIT(4)) -#define ATTR_DEBUG BIT(0) -#define ATTR_SEPT_VE_DISABLE BIT(28) - /* TDX Module call error codes */ #define TDCALL_RETURN_CODE(a) ((a) >> 32) #define TDCALL_INVALID_OPERAND 0xc0000100 @@ -200,14 +197,14 @@ static void __noreturn tdx_panic(const char *msg) * * TDX 1.0 does not allow the guest to disable SEPT #VE on its own. The VMM * controls if the guest will receive such #VE with TD attribute - * ATTR_SEPT_VE_DISABLE. + * TDX_ATTR_SEPT_VE_DISABLE. * * Newer TDX modules allow the guest to control if it wants to receive SEPT * violation #VEs. * * Check if the feature is available and disable SEPT #VE if possible. * - * If the TD is allowed to disable/enable SEPT #VEs, the ATTR_SEPT_VE_DISABLE + * If the TD is allowed to disable/enable SEPT #VEs, the TDX_ATTR_SEPT_VE_DISABLE * attribute is no longer reliable. It reflects the initial state of the * control for the TD, but it will not be updated if someone (e.g. bootloader) * changes it before the kernel starts. Kernel must check TDCS_TD_CTLS bit to @@ -216,14 +213,14 @@ static void __noreturn tdx_panic(const char *msg) static void disable_sept_ve(u64 td_attr) { const char *msg = "TD misconfiguration: SEPT #VE has to be disabled"; - bool debug = td_attr & ATTR_DEBUG; + bool debug = td_attr & TDX_ATTR_DEBUG; u64 config, controls; /* Is this TD allowed to disable SEPT #VE */ tdg_vm_rd(TDCS_CONFIG_FLAGS, &config); if (!(config & TDCS_CONFIG_FLEXIBLE_PENDING_VE)) { /* No SEPT #VE controls for the guest: check the attribute */ - if (td_attr & ATTR_SEPT_VE_DISABLE) + if (td_attr & TDX_ATTR_SEPT_VE_DISABLE) return; /* Relax SEPT_VE_DISABLE check for debug TD for backtraces */ @@ -1040,6 +1037,20 @@ static void tdx_kexec_finish(void) } } +static __init void tdx_announce(void) +{ + struct tdx_module_args args = {}; + u64 controls; + + pr_info("Guest detected\n"); + + tdcall(TDG_VP_INFO, &args); + tdx_dump_attributes(args.rdx); + + tdg_vm_rd(TDCS_TD_CTLS, &controls); + tdx_dump_td_ctls(controls); +} + void __init tdx_early_init(void) { u64 cc_mask; @@ -1109,5 +1120,5 @@ void __init tdx_early_init(void) */ x86_cpuinit.parallel_bringup = false; - pr_info("Guest detected\n"); + tdx_announce(); } diff --git a/arch/x86/include/asm/shared/tdx.h b/arch/x86/include/asm/shared/tdx.h index a878c7e8347b..fcbbef484a78 100644 --- a/arch/x86/include/asm/shared/tdx.h +++ b/arch/x86/include/asm/shared/tdx.h @@ -19,6 +19,32 @@ #define TDG_VM_RD 7 #define TDG_VM_WR 8 +/* TDX attributes */ +#define TDX_ATTR_DEBUG_BIT 0 +#define TDX_ATTR_DEBUG BIT_ULL(TDX_ATTR_DEBUG_BIT) +#define TDX_ATTR_HGS_PLUS_PROF_BIT 4 +#define TDX_ATTR_HGS_PLUS_PROF BIT_ULL(TDX_ATTR_HGS_PLUS_PROF_BIT) +#define TDX_ATTR_PERF_PROF_BIT 5 +#define TDX_ATTR_PERF_PROF BIT_ULL(TDX_ATTR_PERF_PROF_BIT) +#define TDX_ATTR_PMT_PROF_BIT 6 +#define TDX_ATTR_PMT_PROF BIT_ULL(TDX_ATTR_PMT_PROF_BIT) +#define TDX_ATTR_ICSSD_BIT 16 +#define TDX_ATTR_ICSSD BIT_ULL(TDX_ATTR_ICSSD_BIT) +#define TDX_ATTR_LASS_BIT 27 +#define TDX_ATTR_LASS BIT_ULL(TDX_ATTR_LASS_BIT) +#define TDX_ATTR_SEPT_VE_DISABLE_BIT 28 +#define TDX_ATTR_SEPT_VE_DISABLE BIT_ULL(TDX_ATTR_SEPT_VE_DISABLE_BIT) +#define TDX_ATTR_MIGRTABLE_BIT 29 +#define TDX_ATTR_MIGRTABLE BIT_ULL(TDX_ATTR_MIGRTABLE_BIT) +#define TDX_ATTR_PKS_BIT 30 +#define TDX_ATTR_PKS BIT_ULL(TDX_ATTR_PKS_BIT) +#define TDX_ATTR_KL_BIT 31 +#define TDX_ATTR_KL BIT_ULL(TDX_ATTR_KL_BIT) +#define TDX_ATTR_TPA_BIT 62 +#define TDX_ATTR_TPA BIT_ULL(TDX_ATTR_TPA_BIT) +#define TDX_ATTR_PERFMON_BIT 
63 +#define TDX_ATTR_PERFMON BIT_ULL(TDX_ATTR_PERFMON_BIT) + /* TDX TD-Scope Metadata. To be used by TDG.VM.WR and TDG.VM.RD */ #define TDCS_CONFIG_FLAGS 0x1110000300000016 #define TDCS_TD_CTLS 0x1110000300000017 @@ -29,9 +55,16 @@ #define TDCS_CONFIG_FLEXIBLE_PENDING_VE BIT_ULL(1) /* TDCS_TD_CTLS bits */ -#define TD_CTLS_PENDING_VE_DISABLE BIT_ULL(0) -#define TD_CTLS_ENUM_TOPOLOGY BIT_ULL(1) -#define TD_CTLS_REDUCE_VE BIT_ULL(3) +#define TD_CTLS_PENDING_VE_DISABLE_BIT 0 +#define TD_CTLS_PENDING_VE_DISABLE BIT_ULL(TD_CTLS_PENDING_VE_DISABLE_BIT) +#define TD_CTLS_ENUM_TOPOLOGY_BIT 1 +#define TD_CTLS_ENUM_TOPOLOGY BIT_ULL(TD_CTLS_ENUM_TOPOLOGY_BIT) +#define TD_CTLS_VIRT_CPUID2_BIT 2 +#define TD_CTLS_VIRT_CPUID2 BIT_ULL(TD_CTLS_VIRT_CPUID2_BIT) +#define TD_CTLS_REDUCE_VE_BIT 3 +#define TD_CTLS_REDUCE_VE BIT_ULL(TD_CTLS_REDUCE_VE_BIT) +#define TD_CTLS_LOCK_BIT 63 +#define TD_CTLS_LOCK BIT_ULL(TD_CTLS_LOCK_BIT) /* TDX hypercall Leaf IDs */ #define TDVMCALL_MAP_GPA 0x10001 diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h index eba178996d84..b4b16dafd55e 100644 --- a/arch/x86/include/asm/tdx.h +++ b/arch/x86/include/asm/tdx.h @@ -66,6 +66,9 @@ int tdx_mcall_get_report0(u8 *reportdata, u8 *tdreport); u64 tdx_hcall_get_quote(u8 *buf, size_t size); +void __init tdx_dump_attributes(u64 td_attr); +void __init tdx_dump_td_ctls(u64 td_ctls); + #else static inline void tdx_early_init(void) { }; From 953753db887f9d70f70f61d6ecbe5cf209107672 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Thu, 5 Dec 2024 10:46:30 -0500 Subject: [PATCH 068/224] x86/mm/tlb: Also remove local CPU from mm_cpumask if stale The code in flush_tlb_func() that removes a remote CPU from the cpumask if it is no longer running the target mm is also needed on the originating CPU of a TLB flush, now that CPUs are no longer cleared from the mm_cpumask at context switch time. Flushing the TLB when we are not running the target mm is harmless, because the CPU's tlb_gen only gets updated to match the mm_tlb_gen, but it does hit this warning: WARN_ON_ONCE(local_tlb_gen > mm_tlb_gen); [ 210.343902][ T4668] WARNING: CPU: 38 PID: 4668 at arch/x86/mm/tlb.c:815 flush_tlb_func (arch/x86/mm/tlb.c:815) Removing both local and remote CPUs from the mm_cpumask when doing a flush for an mm that is not currently loaded avoids that warning. Reported-by: kernel test robot Tested-by: kernel test robot Signed-off-by: Rik van Riel Signed-off-by: Ingo Molnar Cc: Dave Hansen Cc: Mathieu Desnoyers Cc: Andy Lutomirski Cc: Peter Zijlstra Cc: Linus Torvalds Link: https://lore.kernel.org/r/20241205104630.755706ca@fangorn Closes: https://lore.kernel.org/oe-lkp/202412051551.690e9656-lkp@intel.com --- arch/x86/mm/tlb.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 1aac4fa90d3d..3c30817ec6a2 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -756,13 +756,13 @@ static void flush_tlb_func(void *info) if (!local) { inc_irq_stat(irq_tlb_count); count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); + } - /* Can only happen on remote CPUs */ - if (f->mm && f->mm != loaded_mm) { - cpumask_clear_cpu(raw_smp_processor_id(), mm_cpumask(f->mm)); - trace_tlb_flush(TLB_REMOTE_WRONG_CPU, 0); - return; - } + /* The CPU was left in the mm_cpumask of the target mm. Clear it.
*/ + if (f->mm && f->mm != loaded_mm) { + cpumask_clear_cpu(raw_smp_processor_id(), mm_cpumask(f->mm)); + trace_tlb_flush(TLB_REMOTE_WRONG_CPU, 0); + return; } if (unlikely(loaded_mm == &init_mm)) From 6db2526c1d694c91c6e05e2f186c085e9460f202 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Wed, 4 Dec 2024 21:03:16 -0500 Subject: [PATCH 069/224] x86/mm/tlb: Only trim the mm_cpumask once a second Setting and clearing CPU bits in the mm_cpumask is only ever done by the CPU itself, from the context switch code or the TLB flush code. Synchronization is handled by switch_mm_irqs_off() blocking interrupts. Sending TLB flush IPIs to CPUs that are in the mm_cpumask but are no longer running the program causes a regression in the will-it-scale tlbflush2 test. This test is contrived, but a large regression here might cause a small regression in some real world workload. Instead of always sending IPIs to CPUs that are in the mm_cpumask, but no longer running the program, send these IPIs only once a second. The rest of the time we can skip over CPUs where the loaded_mm is different from the target mm. Reported-by: kernel test robot Signed-off-by: Rik van Riel Signed-off-by: Ingo Molnar Cc: Dave Hansen Cc: Andy Lutomirski Cc: Mathieu Desnoyers Cc: Peter Zijlstra Cc: Linus Torvalds Link: https://lore.kernel.org/r/20241204210316.612ee573@fangorn Closes: https://lore.kernel.org/oe-lkp/202411282207.6bd28eae-lkp@intel.com/ --- arch/x86/include/asm/mmu.h | 2 ++ arch/x86/include/asm/mmu_context.h | 1 + arch/x86/include/asm/tlbflush.h | 1 + arch/x86/mm/tlb.c | 35 +++++++++++++++++++++++++++--- 4 files changed, 36 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h index ce4677b8b735..3b496cdcb74b 100644 --- a/arch/x86/include/asm/mmu.h +++ b/arch/x86/include/asm/mmu.h @@ -37,6 +37,8 @@ typedef struct { */ atomic64_t tlb_gen; + unsigned long next_trim_cpumask; + #ifdef CONFIG_MODIFY_LDT_SYSCALL struct rw_semaphore ldt_usr_sem; struct ldt_struct *ldt; diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 2886cb668d7f..795fdd53bd0a 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -151,6 +151,7 @@ static inline int init_new_context(struct task_struct *tsk, mm->context.ctx_id = atomic64_inc_return(&last_mm_ctx_id); atomic64_set(&mm->context.tlb_gen, 0); + mm->context.next_trim_cpumask = jiffies + HZ; #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS if (cpu_feature_enabled(X86_FEATURE_OSPKE)) { diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 69e79fff41b8..02fc2aa06e9e 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -222,6 +222,7 @@ struct flush_tlb_info { unsigned int initiating_cpu; u8 stride_shift; u8 freed_tables; + u8 trim_cpumask; }; void flush_tlb_local(void); diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 3c30817ec6a2..458a5d5be594 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -892,9 +892,36 @@ done: nr_invalidate); } -static bool tlb_is_not_lazy(int cpu, void *data) +static bool should_flush_tlb(int cpu, void *data) { - return !per_cpu(cpu_tlbstate_shared.is_lazy, cpu); + struct flush_tlb_info *info = data; + + /* Lazy TLB will get flushed at the next context switch. */ + if (per_cpu(cpu_tlbstate_shared.is_lazy, cpu)) + return false; + + /* No mm means kernel memory flush. */ + if (!info->mm) + return true; + + /* The target mm is loaded, and the CPU is not lazy.
*/ + if (per_cpu(cpu_tlbstate.loaded_mm, cpu) == info->mm) + return true; + + /* In cpumask, but not the loaded mm? Periodically remove by flushing. */ + if (info->trim_cpumask) + return true; + + return false; +} + +static bool should_trim_cpumask(struct mm_struct *mm) +{ + if (time_after(jiffies, READ_ONCE(mm->context.next_trim_cpumask))) { + WRITE_ONCE(mm->context.next_trim_cpumask, jiffies + HZ); + return true; + } + return false; } DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state_shared, cpu_tlbstate_shared); @@ -928,7 +955,7 @@ STATIC_NOPV void native_flush_tlb_multi(const struct cpumask *cpumask, if (info->freed_tables) on_each_cpu_mask(cpumask, flush_tlb_func, (void *)info, true); else - on_each_cpu_cond_mask(tlb_is_not_lazy, flush_tlb_func, + on_each_cpu_cond_mask(should_flush_tlb, flush_tlb_func, (void *)info, 1, cpumask); } @@ -979,6 +1006,7 @@ static struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm, info->freed_tables = freed_tables; info->new_tlb_gen = new_tlb_gen; info->initiating_cpu = smp_processor_id(); + info->trim_cpumask = 0; return info; } @@ -1021,6 +1049,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, * flush_tlb_func_local() directly in this case. */ if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) { + info->trim_cpumask = should_trim_cpumask(mm); flush_tlb_multi(mm_cpumask(mm), info); } else if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) { lockdep_assert_irqs_enabled(); From 207bdf7f72ae8b1764de294ae59bdf5b015082bd Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 5 Dec 2024 15:05:08 +0000 Subject: [PATCH 070/224] x86/kexec: Clean up and document register use in relocate_kernel_64.S Add more comments explaining what each register contains, and save the preserve_context flag to a non-clobbered register sooner, to keep things simpler. No change in behavior intended. Signed-off-by: David Woodhouse Signed-off-by: Ingo Molnar Acked-by: Kai Huang Cc: Baoquan He Cc: Vivek Goyal Cc: Dave Young Cc: Eric Biederman Cc: Ard Biesheuvel Cc: "H. Peter Anvin" Link: https://lore.kernel.org/r/20241205153343.3275139-3-dwmw2@infradead.org --- arch/x86/kernel/relocate_kernel_64.S | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S index 1236f25fc8d1..92478e2e254f 100644 --- a/arch/x86/kernel/relocate_kernel_64.S +++ b/arch/x86/kernel/relocate_kernel_64.S @@ -100,6 +100,9 @@ SYM_CODE_START_NOALIGN(relocate_kernel) movq %r10, CP_PA_SWAP_PAGE(%r11) movq %rdi, CP_PA_BACKUP_PAGES_MAP(%r11) + /* Save the preserve_context to %r11 as swap_pages clobbers %rcx. */ + movq %rcx, %r11 + /* Switch to the identity mapped page tables */ movq %r9, %cr3 @@ -116,6 +119,14 @@ SYM_CODE_END(relocate_kernel) SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) UNWIND_HINT_END_OF_STACK + /* + * %rdi indirection page + * %rdx start address + * %r11 preserve_context + * %r12 host_mem_enc_active + * %r13 original CR4 when relocate_kernel() was invoked + */ + /* set return address to 0 if not preserving context */ pushq $0 /* store the start address on the stack */ @@ -170,8 +181,6 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) wbinvd .Lsme_off: - /* Save the preserve_context to %r11 as swap_pages clobbers %rcx. 
*/ - movq %rcx, %r11 call swap_pages /* @@ -183,13 +192,14 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) movq %cr3, %rax movq %rax, %cr3 + testq %r11, %r11 /* preserve_context */ + jnz .Lrelocate + /* * set all of the registers to known values * leave %rsp alone */ - testq %r11, %r11 - jnz .Lrelocate xorl %eax, %eax xorl %ebx, %ebx xorl %ecx, %ecx From 46d4e205e22c89841552b05663d34e57e9a66611 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 5 Dec 2024 15:05:09 +0000 Subject: [PATCH 071/224] x86/kexec: Use named labels in swap_pages in relocate_kernel_64.S Make the code a little more readable. Signed-off-by: David Woodhouse Signed-off-by: Ingo Molnar Acked-by: Kai Huang Cc: Baoquan He Cc: Vivek Goyal Cc: Dave Young Cc: Eric Biederman Cc: Ard Biesheuvel Cc: "H. Peter Anvin" Link: https://lore.kernel.org/r/20241205153343.3275139-4-dwmw2@infradead.org --- arch/x86/kernel/relocate_kernel_64.S | 30 ++++++++++++++-------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S index 92478e2e254f..fea650f92606 100644 --- a/arch/x86/kernel/relocate_kernel_64.S +++ b/arch/x86/kernel/relocate_kernel_64.S @@ -279,31 +279,31 @@ SYM_CODE_START_LOCAL_NOALIGN(swap_pages) movq %rdi, %rcx /* Put the indirection_page in %rcx */ xorl %edi, %edi xorl %esi, %esi - jmp 1f + jmp .Lstart /* Should start with an indirection record */ -0: /* top, read another word for the indirection page */ +.Lloop: /* top, read another word for the indirection page */ movq (%rbx), %rcx addq $8, %rbx -1: +.Lstart: testb $0x1, %cl /* is it a destination page? */ - jz 2f + jz .Lnotdest movq %rcx, %rdi andq $0xfffffffffffff000, %rdi - jmp 0b -2: + jmp .Lloop +.Lnotdest: testb $0x2, %cl /* is it an indirection page? */ - jz 2f + jz .Lnotind movq %rcx, %rbx andq $0xfffffffffffff000, %rbx - jmp 0b -2: + jmp .Lloop +.Lnotind: testb $0x4, %cl /* is it the done indicator? */ - jz 2f - jmp 3f -2: + jz .Lnotdone + jmp .Ldone +.Lnotdone: testb $0x8, %cl /* is it the source indicator? */ - jz 0b /* Ignore it otherwise */ + jz .Lloop /* Ignore it otherwise */ movq %rcx, %rsi /* For ever source page do a copy */ andq $0xfffffffffffff000, %rsi @@ -328,8 +328,8 @@ SYM_CODE_START_LOCAL_NOALIGN(swap_pages) rep ; movsq lea PAGE_SIZE(%rax), %rsi - jmp 0b -3: + jmp .Lloop +.Ldone: ANNOTATE_UNRET_SAFE ret int3 From 9e5683e2d0b5584c51993908c5d0afa78e613492 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 5 Dec 2024 15:05:10 +0000 Subject: [PATCH 072/224] x86/kexec: Only swap pages for ::preserve_context mode There's no need to swap pages (which involves three memcopies for each page) in the plain kexec case. Just do a single copy from source to destination page. Signed-off-by: David Woodhouse Signed-off-by: Ingo Molnar Cc: Baoquan He Cc: Vivek Goyal Cc: Dave Young Cc: Eric Biederman Cc: Ard Biesheuvel Cc: "H. 
Peter Anvin" Link: https://lore.kernel.org/r/20241205153343.3275139-5-dwmw2@infradead.org --- arch/x86/kernel/relocate_kernel_64.S | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S index fea650f92606..ca7f1e1d5b11 100644 --- a/arch/x86/kernel/relocate_kernel_64.S +++ b/arch/x86/kernel/relocate_kernel_64.S @@ -310,6 +310,9 @@ SYM_CODE_START_LOCAL_NOALIGN(swap_pages) movq %rdi, %rdx /* Save destination page to %rdx */ movq %rsi, %rax /* Save source page to %rax */ + testq %r11, %r11 /* Only actually swap for ::preserve_context */ + jz .Lnoswap + /* copy source page to swap page */ movq %r10, %rdi movl $512, %ecx @@ -324,6 +327,7 @@ SYM_CODE_START_LOCAL_NOALIGN(swap_pages) /* copy swap page to destination page */ movq %rdx, %rdi movq %r10, %rsi +.Lnoswap: movl $512, %ecx rep ; movsq From 4b5bc2ec9a239bce261ffeafdd63571134102323 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 5 Dec 2024 15:05:11 +0000 Subject: [PATCH 073/224] x86/kexec: Allocate PGD for x86_64 transition page tables separately Now that the following fix: d0ceea662d45 ("x86/mm: Add _PAGE_NOPTISHADOW bit to avoid updating userspace page tables") stops kernel_ident_mapping_init() from scribbling over the end of a 4KiB PGD by assuming the following 4KiB will be a userspace PGD, there's no good reason for the kexec PGD to be part of a single 8KiB allocation with the control_code_page. ( It's not clear that that was the reason for x86_64 kexec doing it that way in the first place either; there were no comments to that effect and it seems to have been the case even before PTI came along. It looks like it was just a happy accident which prevented memory corruption on kexec. ) Either way, it definitely isn't needed now. Just allocate the PGD separately on x86_64, like i386 already does. Signed-off-by: David Woodhouse Signed-off-by: Ingo Molnar Cc: Baoquan He Cc: Vivek Goyal Cc: Dave Young Cc: Eric Biederman Cc: Ard Biesheuvel Cc: "H. Peter Anvin" Link: https://lore.kernel.org/r/20241205153343.3275139-6-dwmw2@infradead.org --- arch/x86/include/asm/kexec.h | 18 +++++++++--- arch/x86/kernel/machine_kexec_64.c | 45 ++++++++++++++++-------------- 2 files changed, 38 insertions(+), 25 deletions(-) diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index ae5482a2f0ca..ccb8ff37fa9d 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -16,6 +16,7 @@ # define PAGES_NR 4 #endif +# define KEXEC_CONTROL_PAGE_SIZE 4096 # define KEXEC_CONTROL_CODE_MAX_SIZE 2048 #ifndef __ASSEMBLY__ @@ -43,7 +44,6 @@ struct kimage; /* Maximum address we can use for the control code buffer */ # define KEXEC_CONTROL_MEMORY_LIMIT TASK_SIZE -# define KEXEC_CONTROL_PAGE_SIZE 4096 /* The native architecture */ # define KEXEC_ARCH KEXEC_ARCH_386 @@ -58,9 +58,6 @@ struct kimage; /* Maximum address we can use for the control pages */ # define KEXEC_CONTROL_MEMORY_LIMIT (MAXMEM-1) -/* Allocate one page for the pdp and the second for the code */ -# define KEXEC_CONTROL_PAGE_SIZE (4096UL + 4096UL) - /* The native architecture */ # define KEXEC_ARCH KEXEC_ARCH_X86_64 #endif @@ -145,6 +142,19 @@ struct kimage_arch { }; #else struct kimage_arch { + /* + * This is a kimage control page, as it must not overlap with either + * source or destination address ranges. + */ + pgd_t *pgd; + /* + * The virtual mapping of the control code page itself is used only + * during the transition, while the current kernel's pages are all + * in place. 
Thus the intermediate page table pages used to map it + * are not control pages, but instead just normal pages obtained + * with get_zeroed_page(). And have to be tracked (below) so that + * they can be freed. + */ p4d_t *p4d; pud_t *pud; pmd_t *pmd; diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 9c9ac606893e..7223c38a8708 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -146,7 +146,8 @@ static void free_transition_pgtable(struct kimage *image) image->arch.pte = NULL; } -static int init_transition_pgtable(struct kimage *image, pgd_t *pgd) +static int init_transition_pgtable(struct kimage *image, pgd_t *pgd, + unsigned long control_page) { pgprot_t prot = PAGE_KERNEL_EXEC_NOENC; unsigned long vaddr, paddr; @@ -157,7 +158,7 @@ static int init_transition_pgtable(struct kimage *image, pgd_t *pgd) pte_t *pte; vaddr = (unsigned long)relocate_kernel; - paddr = __pa(page_address(image->control_code_page)+PAGE_SIZE); + paddr = control_page; pgd += pgd_index(vaddr); if (!pgd_present(*pgd)) { p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL); @@ -216,7 +217,7 @@ static void *alloc_pgt_page(void *data) return p; } -static int init_pgtable(struct kimage *image, unsigned long start_pgtable) +static int init_pgtable(struct kimage *image, unsigned long control_page) { struct x86_mapping_info info = { .alloc_pgt_page = alloc_pgt_page, @@ -225,12 +226,12 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable) .kernpg_flag = _KERNPG_TABLE_NOENC, }; unsigned long mstart, mend; - pgd_t *level4p; int result; int i; - level4p = (pgd_t *)__va(start_pgtable); - clear_page(level4p); + image->arch.pgd = alloc_pgt_page(image); + if (!image->arch.pgd) + return -ENOMEM; if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) { info.page_flag |= _PAGE_ENC; @@ -244,8 +245,8 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable) mstart = pfn_mapped[i].start << PAGE_SHIFT; mend = pfn_mapped[i].end << PAGE_SHIFT; - result = kernel_ident_mapping_init(&info, - level4p, mstart, mend); + result = kernel_ident_mapping_init(&info, image->arch.pgd, + mstart, mend); if (result) return result; } @@ -260,8 +261,8 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable) mstart = image->segment[i].mem; mend = mstart + image->segment[i].memsz; - result = kernel_ident_mapping_init(&info, - level4p, mstart, mend); + result = kernel_ident_mapping_init(&info, image->arch.pgd, + mstart, mend); if (result) return result; @@ -271,15 +272,19 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable) * Prepare EFI systab and ACPI tables for kexec kernel since they are * not covered by pfn_mapped. */ - result = map_efi_systab(&info, level4p); + result = map_efi_systab(&info, image->arch.pgd); if (result) return result; - result = map_acpi_tables(&info, level4p); + result = map_acpi_tables(&info, image->arch.pgd); if (result) return result; - return init_transition_pgtable(image, level4p); + /* + * This must be last because the intermediate page table pages it + * allocates will not be control pages and may overlap the image. 
+ */ + return init_transition_pgtable(image, image->arch.pgd, control_page); } static void load_segments(void) @@ -296,14 +301,14 @@ static void load_segments(void) int machine_kexec_prepare(struct kimage *image) { - unsigned long start_pgtable; + unsigned long control_page; int result; /* Calculate the offsets */ - start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT; + control_page = page_to_pfn(image->control_code_page) << PAGE_SHIFT; /* Setup the identity mapped 64bit page table */ - result = init_pgtable(image, start_pgtable); + result = init_pgtable(image, control_page); if (result) return result; @@ -357,13 +362,12 @@ void machine_kexec(struct kimage *image) #endif } - control_page = page_address(image->control_code_page) + PAGE_SIZE; + control_page = page_address(image->control_code_page); __memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE); page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page); page_list[VA_CONTROL_PAGE] = (unsigned long)control_page; - page_list[PA_TABLE_PAGE] = - (unsigned long)__pa(page_address(image->control_code_page)); + page_list[PA_TABLE_PAGE] = (unsigned long)__pa(image->arch.pgd); if (image->type == KEXEC_TYPE_DEFAULT) page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page) @@ -573,8 +577,7 @@ static void kexec_mark_crashkres(bool protect) /* Don't touch the control code page used in crash_kexec().*/ control = PFN_PHYS(page_to_pfn(kexec_crash_image->control_code_page)); - /* Control code page is located in the 2nd page. */ - kexec_mark_range(crashk_res.start, control + PAGE_SIZE - 1, protect); + kexec_mark_range(crashk_res.start, control - 1, protect); control += KEXEC_CONTROL_PAGE_SIZE; kexec_mark_range(control, crashk_res.end, protect); } From 6a750b4c009936f352aaac0366f5f10fcf51e81b Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 5 Dec 2024 15:05:12 +0000 Subject: [PATCH 074/224] x86/kexec: Copy control page into place in machine_kexec_prepare() There's no need for this to wait until the actual machine_kexec() invocation; future changes will need to make the control page read-only and executable, so all writes should be completed before machine_kexec_prepare() returns. Signed-off-by: David Woodhouse Signed-off-by: Ingo Molnar Cc: Baoquan He Cc: Vivek Goyal Cc: Dave Young Cc: Eric Biederman Cc: Ard Biesheuvel Cc: "H. 
Peter Anvin" Link: https://lore.kernel.org/r/20241205153343.3275139-7-dwmw2@infradead.org --- arch/x86/kernel/machine_kexec_64.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 7223c38a8708..3a4cbac1a0c6 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -301,17 +301,16 @@ static void load_segments(void) int machine_kexec_prepare(struct kimage *image) { - unsigned long control_page; + void *control_page = page_address(image->control_code_page); int result; - /* Calculate the offsets */ - control_page = page_to_pfn(image->control_code_page) << PAGE_SHIFT; - /* Setup the identity mapped 64bit page table */ - result = init_pgtable(image, control_page); + result = init_pgtable(image, __pa(control_page)); if (result) return result; + __memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE); + return 0; } @@ -363,7 +362,6 @@ void machine_kexec(struct kimage *image) } control_page = page_address(image->control_code_page); - __memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE); page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page); page_list[VA_CONTROL_PAGE] = (unsigned long)control_page; From eeebbde57113730db7b3ec7380ada61a0193d27c Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 5 Dec 2024 15:05:13 +0000 Subject: [PATCH 075/224] x86/kexec: Invoke copy of relocate_kernel() instead of the original This currently calls set_memory_x() from machine_kexec_prepare() just like the 32-bit version does. That's actually a bit earlier than I'd like, as it leaves the page RWX all the time the image is even *loaded*. Subsequent commits will eliminate all the writes to the page between the point it's marked executable in machine_kexec_prepare() the time that relocate_kernel() is running and has switched to the identmap %cr3, so that it can be ROX. But that can't happen until it's moved to the .data section of the kernel, and *that* can't happen until we start executing the copy instead of executing it in place in the kernel .text. So break the circular dependency in those commits by letting it be RWX for now. Signed-off-by: David Woodhouse Signed-off-by: Ingo Molnar Cc: Baoquan He Cc: Vivek Goyal Cc: Dave Young Cc: Eric Biederman Cc: Ard Biesheuvel Cc: "H. Peter Anvin" Link: https://lore.kernel.org/r/20241205153343.3275139-8-dwmw2@infradead.org --- arch/x86/kernel/machine_kexec_64.c | 30 ++++++++++++++++++++++------ arch/x86/kernel/relocate_kernel_64.S | 5 ++++- 2 files changed, 28 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 3a4cbac1a0c6..9567347f7a9b 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -157,7 +157,12 @@ static int init_transition_pgtable(struct kimage *image, pgd_t *pgd, pmd_t *pmd; pte_t *pte; - vaddr = (unsigned long)relocate_kernel; + /* + * For the transition to the identity mapped page tables, the control + * code page also needs to be mapped at the virtual address it starts + * off running from. 
+ */ + vaddr = (unsigned long)__va(control_page); paddr = control_page; pgd += pgd_index(vaddr); if (!pgd_present(*pgd)) { p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL); @@ -311,11 +316,17 @@ int machine_kexec_prepare(struct kimage *image) __memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE); + set_memory_x((unsigned long)control_page, 1); + return 0; } void machine_kexec_cleanup(struct kimage *image) { + void *control_page = page_address(image->control_code_page); + + set_memory_nx((unsigned long)control_page, 1); + free_transition_pgtable(image); } @@ -325,6 +336,11 @@ void machine_kexec_cleanup(struct kimage *image) */ void machine_kexec(struct kimage *image) { + unsigned long (*relocate_kernel_ptr)(unsigned long indirection_page, + unsigned long page_list, + unsigned long start_address, + unsigned int preserve_context, + unsigned int host_mem_enc_active); unsigned long page_list[PAGES_NR]; unsigned int host_mem_enc_active; int save_ftrace_enabled; @@ -371,6 +387,8 @@ void machine_kexec(struct kimage *image) page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page) << PAGE_SHIFT); + relocate_kernel_ptr = control_page; + /* * The segment registers are funny things, they have both a * visible and an invisible part. Whenever the visible part is @@ -390,11 +408,11 @@ void machine_kexec(struct kimage *image) native_gdt_invalidate(); /* now call it */ - image->start = relocate_kernel((unsigned long)image->head, - (unsigned long)page_list, - image->start, - image->preserve_context, - host_mem_enc_active); + image->start = relocate_kernel_ptr((unsigned long)image->head, + (unsigned long)page_list, + image->start, + image->preserve_context, + host_mem_enc_active); #ifdef CONFIG_KEXEC_JUMP if (image->preserve_context) diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S index ca7f1e1d5b11..d0a87b39db6a 100644 --- a/arch/x86/kernel/relocate_kernel_64.S +++ b/arch/x86/kernel/relocate_kernel_64.S @@ -39,6 +39,7 @@ #define CP_PA_TABLE_PAGE DATA(0x20) #define CP_PA_SWAP_PAGE DATA(0x28) #define CP_PA_BACKUP_PAGES_MAP DATA(0x30) +#define CP_VA_CONTROL_PAGE DATA(0x38) .text .align PAGE_SIZE @@ -99,6 +100,7 @@ SYM_CODE_START_NOALIGN(relocate_kernel) movq %r9, CP_PA_TABLE_PAGE(%r11) movq %r10, CP_PA_SWAP_PAGE(%r11) movq %rdi, CP_PA_BACKUP_PAGES_MAP(%r11) + movq %r11, CP_VA_CONTROL_PAGE(%r11) /* Save the preserve_context to %r11 as swap_pages clobbers %rcx. */ movq %rcx, %r11 @@ -235,7 +237,8 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) movq %rax, %cr3 lea PAGE_SIZE(%r8), %rsp call swap_pages - movq $virtual_mapped, %rax + movq CP_VA_CONTROL_PAGE(%r8), %rax + addq $(virtual_mapped - relocate_kernel), %rax pushq %rax ANNOTATE_UNRET_SAFE ret From cb33ff9e063c1230d557d97ff6e87d097821d517 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 5 Dec 2024 15:05:14 +0000 Subject: [PATCH 076/224] x86/kexec: Move relocate_kernel to kernel .data section Now that the copy is executed instead of the original, the relocate_kernel page can live in the kernel's .data section. This will allow subsequent commits to actually add real data to it and clean up the code somewhat as well as making the control page ROX. Signed-off-by: David Woodhouse Signed-off-by: Ingo Molnar Cc: Baoquan He Cc: Vivek Goyal Cc: Dave Young Cc: Eric Biederman Cc: Ard Biesheuvel Cc: "H.
Peter Anvin" Link: https://lore.kernel.org/r/20241205153343.3275139-9-dwmw2@infradead.org --- arch/x86/include/asm/sections.h | 1 + arch/x86/kernel/callthunks.c | 6 ++++++ arch/x86/kernel/machine_kexec_64.c | 4 +++- arch/x86/kernel/relocate_kernel_64.S | 7 +------ arch/x86/kernel/vmlinux.lds.S | 13 +++++++++++++ 5 files changed, 24 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/sections.h b/arch/x86/include/asm/sections.h index 3fa87e5e11ab..30e8ee7006f9 100644 --- a/arch/x86/include/asm/sections.h +++ b/arch/x86/include/asm/sections.h @@ -5,6 +5,7 @@ #include #include +extern char __relocate_kernel_start[], __relocate_kernel_end[]; extern char __brk_base[], __brk_limit[]; extern char __end_rodata_aligned[]; diff --git a/arch/x86/kernel/callthunks.c b/arch/x86/kernel/callthunks.c index 465647456753..51c3e0049152 100644 --- a/arch/x86/kernel/callthunks.c +++ b/arch/x86/kernel/callthunks.c @@ -139,9 +139,15 @@ static bool skip_addr(void *dest) return true; #endif #ifdef CONFIG_KEXEC_CORE +# ifdef CONFIG_X86_64 + if (dest >= (void *)__relocate_kernel_start && + dest < (void *)__relocate_kernel_end) + return true; +# else if (dest >= (void *)relocate_kernel && dest < (void*)relocate_kernel + KEXEC_CONTROL_CODE_MAX_SIZE) return true; +# endif #endif #ifdef CONFIG_XEN if (dest >= (void *)hypercall_page && diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 9567347f7a9b..23dffdc070dd 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -307,6 +307,8 @@ static void load_segments(void) int machine_kexec_prepare(struct kimage *image) { void *control_page = page_address(image->control_code_page); + unsigned long reloc_start = (unsigned long)__relocate_kernel_start; + unsigned long reloc_end = (unsigned long)__relocate_kernel_end; int result; /* Setup the identity mapped 64bit page table */ @@ -314,7 +316,7 @@ int machine_kexec_prepare(struct kimage *image) if (result) return result; - __memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE); + __memcpy(control_page, __relocate_kernel_start, reloc_end - reloc_start); set_memory_x((unsigned long)control_page, 1); diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S index d0a87b39db6a..267004441665 100644 --- a/arch/x86/kernel/relocate_kernel_64.S +++ b/arch/x86/kernel/relocate_kernel_64.S @@ -41,10 +41,8 @@ #define CP_PA_BACKUP_PAGES_MAP DATA(0x30) #define CP_VA_CONTROL_PAGE DATA(0x38) - .text - .align PAGE_SIZE + .section .text.relocate_kernel,"ax"; .code64 -SYM_CODE_START_NOALIGN(relocate_range) SYM_CODE_START_NOALIGN(relocate_kernel) UNWIND_HINT_END_OF_STACK ANNOTATE_NOENDBR @@ -341,6 +339,3 @@ SYM_CODE_START_LOCAL_NOALIGN(swap_pages) ret int3 SYM_CODE_END(swap_pages) - - .skip KEXEC_CONTROL_CODE_MAX_SIZE - (. - relocate_kernel), 0xcc -SYM_CODE_END(relocate_range); diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 56cdf13611e3..78ce1a0a408f 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -28,6 +28,7 @@ #include #include #include +#include #undef i386 /* in case the preprocessor is a 32bit one */ @@ -95,7 +96,18 @@ const_pcpu_hot = pcpu_hot; #define BSS_DECRYPTED #endif +#if defined(CONFIG_X86_64) && defined(CONFIG_KEXEC_CORE) +#define KEXEC_RELOCATE_KERNEL \ + . 
= ALIGN(0x100); \ + __relocate_kernel_start = .; \ + *(.text.relocate_kernel); \ + __relocate_kernel_end = .; +ASSERT(__relocate_kernel_end - __relocate_kernel_start <= KEXEC_CONTROL_CODE_MAX_SIZE, + "relocate_kernel code too large!") +#else +#define KEXEC_RELOCATE_KERNEL +#endif PHDRS { text PT_LOAD FLAGS(5); /* R_E */ data PT_LOAD FLAGS(6); /* RW_ */ @@ -184,6 +196,7 @@ SECTIONS DATA_DATA CONSTRUCTORS + KEXEC_RELOCATE_KERNEL /* rarely changed data like cpu maps */ READ_MOSTLY_DATA(INTERNODE_CACHE_BYTES) From 8dbec5c77bc32f04583d3973c8178a74e72fdf18 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 5 Dec 2024 15:05:15 +0000 Subject: [PATCH 077/224] x86/kexec: Add data section to relocate_kernel Now that the relocate_kernel page is handled sanely by a linker script we can have actual data, and just use %rip-relative addressing to access it. Signed-off-by: David Woodhouse Signed-off-by: Ingo Molnar Cc: Baoquan He Cc: Vivek Goyal Cc: Dave Young Cc: Eric Biederman Cc: Ard Biesheuvel Cc: "H. Peter Anvin" Link: https://lore.kernel.org/r/20241205153343.3275139-10-dwmw2@infradead.org --- arch/x86/kernel/machine_kexec_64.c | 8 +++- arch/x86/kernel/relocate_kernel_64.S | 62 ++++++++++++++-------------- arch/x86/kernel/vmlinux.lds.S | 1 + 3 files changed, 38 insertions(+), 33 deletions(-) diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 23dffdc070dd..63dca5c595f6 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -343,6 +343,7 @@ void machine_kexec(struct kimage *image) unsigned long start_address, unsigned int preserve_context, unsigned int host_mem_enc_active); + unsigned long reloc_start = (unsigned long)__relocate_kernel_start; unsigned long page_list[PAGES_NR]; unsigned int host_mem_enc_active; int save_ftrace_enabled; @@ -389,7 +390,12 @@ void machine_kexec(struct kimage *image) page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page) << PAGE_SHIFT); - relocate_kernel_ptr = control_page; + /* + * Allow for the possibility that relocate_kernel might not be at + * the very start of the page. + */ + relocate_kernel_ptr = control_page + (unsigned long)relocate_kernel - + reloc_start; /* * The segment registers are funny things, they have both a diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S index 267004441665..f13866a068b0 100644 --- a/arch/x86/kernel/relocate_kernel_64.S +++ b/arch/x86/kernel/relocate_kernel_64.S @@ -23,23 +23,21 @@ #define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) /* - * control_page + KEXEC_CONTROL_CODE_MAX_SIZE - * ~ control_page + PAGE_SIZE are used as data storage and stack for - * jumping back + * The .text.relocate_kernel and .data.relocate_kernel sections are copied + * into the control page, and the remainder of the page is used as the stack. 
*/ -#define DATA(offset) (KEXEC_CONTROL_CODE_MAX_SIZE+(offset)) + .section .data.relocate_kernel,"a"; /* Minimal CPU state */ -#define RSP DATA(0x0) -#define CR0 DATA(0x8) -#define CR3 DATA(0x10) -#define CR4 DATA(0x18) - -/* other data */ -#define CP_PA_TABLE_PAGE DATA(0x20) -#define CP_PA_SWAP_PAGE DATA(0x28) -#define CP_PA_BACKUP_PAGES_MAP DATA(0x30) -#define CP_VA_CONTROL_PAGE DATA(0x38) +SYM_DATA_LOCAL(saved_rsp, .quad 0) +SYM_DATA_LOCAL(saved_cr0, .quad 0) +SYM_DATA_LOCAL(saved_cr3, .quad 0) +SYM_DATA_LOCAL(saved_cr4, .quad 0) + /* other data */ +SYM_DATA_LOCAL(va_control_page, .quad 0) +SYM_DATA_LOCAL(pa_table_page, .quad 0) +SYM_DATA_LOCAL(pa_swap_page, .quad 0) +SYM_DATA_LOCAL(pa_backup_pages_map, .quad 0) .section .text.relocate_kernel,"ax"; .code64 @@ -63,14 +61,13 @@ SYM_CODE_START_NOALIGN(relocate_kernel) pushq %r15 pushf - movq PTR(VA_CONTROL_PAGE)(%rsi), %r11 - movq %rsp, RSP(%r11) + movq %rsp, saved_rsp(%rip) movq %cr0, %rax - movq %rax, CR0(%r11) + movq %rax, saved_cr0(%rip) movq %cr3, %rax - movq %rax, CR3(%r11) + movq %rax, saved_cr3(%rip) movq %cr4, %rax - movq %rax, CR4(%r11) + movq %rax, saved_cr4(%rip) /* Save CR4. Required to enable the right paging mode later. */ movq %rax, %r13 @@ -83,10 +80,11 @@ SYM_CODE_START_NOALIGN(relocate_kernel) movq %r8, %r12 /* - * get physical address of control page now + * get physical and virtual address of control page now * this is impossible after page table switch */ movq PTR(PA_CONTROL_PAGE)(%rsi), %r8 + movq PTR(VA_CONTROL_PAGE)(%rsi), %r11 /* get physical address of page table now too */ movq PTR(PA_TABLE_PAGE)(%rsi), %r9 @@ -95,10 +93,10 @@ SYM_CODE_START_NOALIGN(relocate_kernel) movq PTR(PA_SWAP_PAGE)(%rsi), %r10 /* save some information for jumping back */ - movq %r9, CP_PA_TABLE_PAGE(%r11) - movq %r10, CP_PA_SWAP_PAGE(%r11) - movq %rdi, CP_PA_BACKUP_PAGES_MAP(%r11) - movq %r11, CP_VA_CONTROL_PAGE(%r11) + movq %r9, pa_table_page(%rip) + movq %r10, pa_swap_page(%rip) + movq %rdi, pa_backup_pages_map(%rip) + movq %r11, va_control_page(%rip) /* Save the preserve_context to %r11 as swap_pages clobbers %rcx. */ movq %rcx, %r11 @@ -229,13 +227,13 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) /* get the re-entry point of the peer system */ movq 0(%rsp), %rbp leaq relocate_kernel(%rip), %r8 - movq CP_PA_SWAP_PAGE(%r8), %r10 - movq CP_PA_BACKUP_PAGES_MAP(%r8), %rdi - movq CP_PA_TABLE_PAGE(%r8), %rax + movq pa_swap_page(%rip), %r10 + movq pa_backup_pages_map(%rip), %rdi + movq pa_table_page(%rip), %rax movq %rax, %cr3 lea PAGE_SIZE(%r8), %rsp call swap_pages - movq CP_VA_CONTROL_PAGE(%r8), %rax + movq va_control_page(%rip), %rax addq $(virtual_mapped - relocate_kernel), %rax pushq %rax ANNOTATE_UNRET_SAFE @@ -246,11 +244,11 @@ SYM_CODE_END(identity_mapped) SYM_CODE_START_LOCAL_NOALIGN(virtual_mapped) UNWIND_HINT_END_OF_STACK ANNOTATE_NOENDBR // RET target, above - movq RSP(%r8), %rsp - movq CR4(%r8), %rax + movq saved_rsp(%rip), %rsp + movq saved_cr4(%rip), %rax movq %rax, %cr4 - movq CR3(%r8), %rax - movq CR0(%r8), %r8 + movq saved_cr3(%rip), %rax + movq saved_cr0(%rip), %r8 movq %rax, %cr3 movq %r8, %cr0 diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 78ce1a0a408f..0c893997f023 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -101,6 +101,7 @@ const_pcpu_hot = pcpu_hot; . 
= ALIGN(0x100); \ __relocate_kernel_start = .; \ *(.text.relocate_kernel); \ + *(.data.relocate_kernel); \ __relocate_kernel_end = .; ASSERT(__relocate_kernel_end - __relocate_kernel_start <= KEXEC_CONTROL_CODE_MAX_SIZE, From b3adabae8a96fee62184f4236bf60313b35244e9 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 5 Dec 2024 15:05:16 +0000 Subject: [PATCH 078/224] x86/kexec: Drop page_list argument from relocate_kernel() The kernel's virtual mapping of the relocate_kernel page currently needs to be RWX because it is written to before the %cr3 switch. Now that the relocate_kernel page has its own .data section and local variables, it can also have *global* variables. So eliminate the separate page_list argument, and write the same information directly to variables in the relocate_kernel page instead. This way, the relocate_kernel code itself doesn't need to copy it. Signed-off-by: David Woodhouse Signed-off-by: Ingo Molnar Cc: Baoquan He Cc: Vivek Goyal Cc: Dave Young Cc: Eric Biederman Cc: Ard Biesheuvel Cc: "H. Peter Anvin" Link: https://lore.kernel.org/r/20241205153343.3275139-11-dwmw2@infradead.org --- arch/x86/include/asm/kexec.h | 12 ++++------ arch/x86/kernel/machine_kexec_64.c | 18 ++++++-------- arch/x86/kernel/relocate_kernel_64.S | 36 ++++++++++------------------ 3 files changed, 24 insertions(+), 42 deletions(-) diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index ccb8ff37fa9d..48e4f44f794f 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -8,12 +8,6 @@ # define PA_PGD 2 # define PA_SWAP_PAGE 3 # define PAGES_NR 4 -#else -# define PA_CONTROL_PAGE 0 -# define VA_CONTROL_PAGE 1 -# define PA_TABLE_PAGE 2 -# define PA_SWAP_PAGE 3 -# define PAGES_NR 4 #endif # define KEXEC_CONTROL_PAGE_SIZE 4096 @@ -60,6 +54,10 @@ struct kimage; /* The native architecture */ # define KEXEC_ARCH KEXEC_ARCH_X86_64 + +extern unsigned long kexec_va_control_page; +extern unsigned long kexec_pa_table_page; +extern unsigned long kexec_pa_swap_page; #endif /* @@ -122,7 +120,7 @@ relocate_kernel(unsigned long indirection_page, #else unsigned long relocate_kernel(unsigned long indirection_page, - unsigned long page_list, + unsigned long pa_control_page, unsigned long start_address, unsigned int preserve_context, unsigned int host_mem_enc_active); diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 63dca5c595f6..c9fd60f8f806 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -315,6 +315,11 @@ int machine_kexec_prepare(struct kimage *image) result = init_pgtable(image, __pa(control_page)); if (result) return result; + kexec_va_control_page = (unsigned long)control_page; + kexec_pa_table_page = (unsigned long)__pa(image->arch.pgd); + + if (image->type == KEXEC_TYPE_DEFAULT) + kexec_pa_swap_page = page_to_pfn(image->swap_page) << PAGE_SHIFT; __memcpy(control_page, __relocate_kernel_start, reloc_end - reloc_start); @@ -339,12 +344,11 @@ void machine_kexec_cleanup(struct kimage *image) void machine_kexec(struct kimage *image) { unsigned long (*relocate_kernel_ptr)(unsigned long indirection_page, - unsigned long page_list, + unsigned long pa_control_page, unsigned long start_address, unsigned int preserve_context, unsigned int host_mem_enc_active); unsigned long reloc_start = (unsigned long)__relocate_kernel_start; - unsigned long page_list[PAGES_NR]; unsigned int host_mem_enc_active; int save_ftrace_enabled; void *control_page; @@ -382,14 +386,6 @@ void machine_kexec(struct 
kimage *image) control_page = page_address(image->control_code_page); - page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page); - page_list[VA_CONTROL_PAGE] = (unsigned long)control_page; - page_list[PA_TABLE_PAGE] = (unsigned long)__pa(image->arch.pgd); - - if (image->type == KEXEC_TYPE_DEFAULT) - page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page) - << PAGE_SHIFT); - /* * Allow for the possibility that relocate_kernel might not be at * the very start of the page. @@ -417,7 +413,7 @@ void machine_kexec(struct kimage *image) /* now call it */ image->start = relocate_kernel_ptr((unsigned long)image->head, - (unsigned long)page_list, + virt_to_phys(control_page), image->start, image->preserve_context, host_mem_enc_active); diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S index f13866a068b0..d52c3bb25b5e 100644 --- a/arch/x86/kernel/relocate_kernel_64.S +++ b/arch/x86/kernel/relocate_kernel_64.S @@ -34,9 +34,9 @@ SYM_DATA_LOCAL(saved_cr0, .quad 0) SYM_DATA_LOCAL(saved_cr3, .quad 0) SYM_DATA_LOCAL(saved_cr4, .quad 0) /* other data */ -SYM_DATA_LOCAL(va_control_page, .quad 0) -SYM_DATA_LOCAL(pa_table_page, .quad 0) -SYM_DATA_LOCAL(pa_swap_page, .quad 0) +SYM_DATA(kexec_va_control_page, .quad 0) +SYM_DATA(kexec_pa_table_page, .quad 0) +SYM_DATA(kexec_pa_swap_page, .quad 0) SYM_DATA_LOCAL(pa_backup_pages_map, .quad 0) .section .text.relocate_kernel,"ax"; @@ -46,7 +46,7 @@ SYM_CODE_START_NOALIGN(relocate_kernel) ANNOTATE_NOENDBR /* * %rdi indirection_page - * %rsi page_list + * %rsi pa_control_page * %rdx start address * %rcx preserve_context * %r8 host_mem_enc_active @@ -79,31 +79,19 @@ SYM_CODE_START_NOALIGN(relocate_kernel) /* Save SME active flag */ movq %r8, %r12 - /* - * get physical and virtual address of control page now - * this is impossible after page table switch - */ - movq PTR(PA_CONTROL_PAGE)(%rsi), %r8 - movq PTR(VA_CONTROL_PAGE)(%rsi), %r11 - - /* get physical address of page table now too */ - movq PTR(PA_TABLE_PAGE)(%rsi), %r9 - - /* get physical address of swap page now */ - movq PTR(PA_SWAP_PAGE)(%rsi), %r10 - - /* save some information for jumping back */ - movq %r9, pa_table_page(%rip) - movq %r10, pa_swap_page(%rip) + /* save indirection list for jumping back */ movq %rdi, pa_backup_pages_map(%rip) - movq %r11, va_control_page(%rip) /* Save the preserve_context to %r11 as swap_pages clobbers %rcx. 
*/ movq %rcx, %r11 /* Switch to the identity mapped page tables */ + movq kexec_pa_table_page(%rip), %r9 movq %r9, %cr3 + /* Physical address of control page */ + movq %rsi, %r8 + /* setup a new stack at the end of the physical control page */ lea PAGE_SIZE(%r8), %rsp @@ -227,13 +215,13 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) /* get the re-entry point of the peer system */ movq 0(%rsp), %rbp leaq relocate_kernel(%rip), %r8 - movq pa_swap_page(%rip), %r10 + movq kexec_pa_swap_page(%rip), %r10 movq pa_backup_pages_map(%rip), %rdi - movq pa_table_page(%rip), %rax + movq kexec_pa_table_page(%rip), %rax movq %rax, %cr3 lea PAGE_SIZE(%r8), %rsp call swap_pages - movq va_control_page(%rip), %rax + movq kexec_va_control_page(%rip), %rax addq $(virtual_mapped - relocate_kernel), %rax pushq %rax ANNOTATE_UNRET_SAFE From b7155dfd4999211247cce40be2665c71235ab094 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 5 Dec 2024 15:05:17 +0000 Subject: [PATCH 079/224] x86/kexec: Eliminate writes through kernel mapping of relocate_kernel page All writes to the relocate_kernel control page are now done *after* the %cr3 switch via simple %rip-relative addressing, which means the DATA() macro with its pointer arithmetic can also now be removed. Signed-off-by: David Woodhouse Signed-off-by: Ingo Molnar Cc: Baoquan He Cc: Vivek Goyal Cc: Dave Young Cc: Eric Biederman Cc: Ard Biesheuvel Cc: "H. Peter Anvin" Link: https://lore.kernel.org/r/20241205153343.3275139-12-dwmw2@infradead.org --- arch/x86/kernel/relocate_kernel_64.S | 29 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S index d52c3bb25b5e..739041c5bca3 100644 --- a/arch/x86/kernel/relocate_kernel_64.S +++ b/arch/x86/kernel/relocate_kernel_64.S @@ -61,21 +61,24 @@ SYM_CODE_START_NOALIGN(relocate_kernel) pushq %r15 pushf - movq %rsp, saved_rsp(%rip) - movq %cr0, %rax - movq %rax, saved_cr0(%rip) - movq %cr3, %rax - movq %rax, saved_cr3(%rip) - movq %cr4, %rax - movq %rax, saved_cr4(%rip) - - /* Save CR4. Required to enable the right paging mode later. */ - movq %rax, %r13 - /* zero out flags, and disable interrupts */ pushq $0 popfq + /* Switch to the identity mapped page tables */ + movq %cr3, %rax + movq kexec_pa_table_page(%rip), %r9 + movq %r9, %cr3 + + /* Save %rsp and CRs. */ + movq %rsp, saved_rsp(%rip) + movq %rax, saved_cr3(%rip) + movq %cr0, %rax + movq %rax, saved_cr0(%rip) + /* Leave CR4 in %r13 to enable the right paging mode later. */ + movq %cr4, %r13 + movq %r13, saved_cr4(%rip) + /* Save SME active flag */ movq %r8, %r12 @@ -85,10 +88,6 @@ SYM_CODE_START_NOALIGN(relocate_kernel) /* Save the preserve_context to %r11 as swap_pages clobbers %rcx. */ movq %rcx, %r11 - /* Switch to the identity mapped page tables */ - movq kexec_pa_table_page(%rip), %r9 - movq %r9, %cr3 - /* Physical address of control page */ movq %rsi, %r8 From 93e489ad7a4694bb2fe8110f5012f85bd3eee65a Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 5 Dec 2024 15:05:18 +0000 Subject: [PATCH 080/224] x86/kexec: Clean up register usage in relocate_kernel() The memory encryption flag is passed in %r8 because that's where the calling convention puts it. Instead of moving it to %r12 and then using %r8 for other things, just leave it in %r8 and use other registers instead. Signed-off-by: David Woodhouse Signed-off-by: Ingo Molnar Cc: Baoquan He Cc: Vivek Goyal Cc: Dave Young Cc: Eric Biederman Cc: Ard Biesheuvel Cc: "H. 
Peter Anvin" Link: https://lore.kernel.org/r/20241205153343.3275139-13-dwmw2@infradead.org --- arch/x86/kernel/relocate_kernel_64.S | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S index 739041c5bca3..8bc86a1e056a 100644 --- a/arch/x86/kernel/relocate_kernel_64.S +++ b/arch/x86/kernel/relocate_kernel_64.S @@ -79,24 +79,18 @@ SYM_CODE_START_NOALIGN(relocate_kernel) movq %cr4, %r13 movq %r13, saved_cr4(%rip) - /* Save SME active flag */ - movq %r8, %r12 - /* save indirection list for jumping back */ movq %rdi, pa_backup_pages_map(%rip) /* Save the preserve_context to %r11 as swap_pages clobbers %rcx. */ movq %rcx, %r11 - /* Physical address of control page */ - movq %rsi, %r8 - /* setup a new stack at the end of the physical control page */ - lea PAGE_SIZE(%r8), %rsp + lea PAGE_SIZE(%rsi), %rsp /* jump to identity mapped page */ - addq $(identity_mapped - relocate_kernel), %r8 - pushq %r8 + addq $(identity_mapped - relocate_kernel), %rsi + pushq %rsi ANNOTATE_UNRET_SAFE ret int3 @@ -107,8 +101,9 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) /* * %rdi indirection page * %rdx start address + * %r8 host_mem_enc_active + * %r9 page table page * %r11 preserve_context - * %r12 host_mem_enc_active * %r13 original CR4 when relocate_kernel() was invoked */ @@ -161,7 +156,7 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) * entries that will conflict with the now unencrypted memory * used by kexec. Flush the caches before copying the kernel. */ - testq %r12, %r12 + testq %r8, %r8 jz .Lsme_off wbinvd .Lsme_off: From 5a82223e0743fb36bcb99657772513739d1a9936 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 5 Dec 2024 15:05:19 +0000 Subject: [PATCH 081/224] x86/kexec: Mark relocate_kernel page as ROX instead of RWX All writes to the page now happen before it gets marked as executable (or after it's already switched to the identmap page tables where it's OK to be RWX). Signed-off-by: David Woodhouse Signed-off-by: Ingo Molnar Cc: Baoquan He Cc: Vivek Goyal Cc: Dave Young Cc: Eric Biederman Cc: Ard Biesheuvel Cc: "H. Peter Anvin" Cc: Linus Torvalds Link: https://lore.kernel.org/r/20241205153343.3275139-14-dwmw2@infradead.org --- arch/x86/kernel/machine_kexec_64.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index c9fd60f8f806..9232ad1562c8 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -323,7 +323,7 @@ int machine_kexec_prepare(struct kimage *image) __memcpy(control_page, __relocate_kernel_start, reloc_end - reloc_start); - set_memory_x((unsigned long)control_page, 1); + set_memory_rox((unsigned long)control_page, 1); return 0; } @@ -333,6 +333,7 @@ void machine_kexec_cleanup(struct kimage *image) void *control_page = page_address(image->control_code_page); set_memory_nx((unsigned long)control_page, 1); + set_memory_rw((unsigned long)control_page, 1); free_transition_pgtable(image); } From 7a470e826d7521bec6af789deab31cfa4fd05af3 Mon Sep 17 00:00:00 2001 From: Sohil Mehta Date: Thu, 7 Nov 2024 23:30:00 +0000 Subject: [PATCH 082/224] x86/cpufeatures: Free up unused feature bits Linux defined feature bits X86_FEATURE_P3 and X86_FEATURE_P4 are not used anywhere. Commit f31d731e4467 ("x86: use X86_FEATURE_NOPL in alternatives") got rid of the last usage in 2008. Remove the related mappings and code. 
Just like all X86_FEATURE bits, the raw bit numbers can be exposed to userspace via MODULE_DEVICE_TABLE(). There is a very small theoretical chance of userspace getting confused if these bits got reassigned and changed logical meaning. But these bits were never used for a device table, so it's highly unlikely this will ever happen in practice. [ dhansen: clarify userspace visibility of these bits ] Signed-off-by: Sohil Mehta Signed-off-by: Dave Hansen Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/all/20241107233000.2742619-1-sohil.mehta%40intel.com --- arch/x86/include/asm/cpufeatures.h | 4 ++-- arch/x86/kernel/cpu/intel.c | 5 ----- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 17b6590748c0..f725ccc77b01 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -83,8 +83,8 @@ #define X86_FEATURE_CENTAUR_MCR ( 3*32+ 3) /* "centaur_mcr" Centaur MCRs (= MTRRs) */ #define X86_FEATURE_K8 ( 3*32+ 4) /* Opteron, Athlon64 */ #define X86_FEATURE_ZEN5 ( 3*32+ 5) /* CPU based on Zen5 microarchitecture */ -#define X86_FEATURE_P3 ( 3*32+ 6) /* P3 */ -#define X86_FEATURE_P4 ( 3*32+ 7) /* P4 */ +/* Free ( 3*32+ 6) */ +/* Free ( 3*32+ 7) */ #define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* "constant_tsc" TSC ticks at a constant rate */ #define X86_FEATURE_UP ( 3*32+ 9) /* "up" SMP kernel running on UP */ #define X86_FEATURE_ART ( 3*32+10) /* "art" Always running timer (ART) */ diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index d1de300af173..5a9fbe962135 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -597,11 +597,6 @@ static void init_intel(struct cpuinfo_x86 *c) if (p) strcpy(c->x86_model_id, p); } - - if (c->x86 == 15) - set_cpu_cap(c, X86_FEATURE_P4); - if (c->x86 == 6) - set_cpu_cap(c, X86_FEATURE_P3); #endif /* Work around errata */ From 29188c16006176caee6cb6729103be51a29c1a93 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Tue, 3 Dec 2024 08:15:50 +0100 Subject: [PATCH 083/224] x86/paravirt: Remove the WBINVD callback The pv_ops::cpu.wbinvd paravirt callback is a leftover of lguest times. Today it is no longer needed, as all users use the native WBINVD implementation. Remove the callback and rename native_wbinvd() to wbinvd(). 
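For illustration, the end state of the rename is the single native definition from the special_insns.h hunk below; every configuration now compiles wbinvd() down to the bare instruction, with no pv_ops indirection left on the call path:

	static __always_inline void wbinvd(void)
	{
		asm volatile("wbinvd": : :"memory");
	}

Callers such as pseudo_lock_fn() and stop_this_cpu() only change spelling from native_wbinvd() to wbinvd().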
Signed-off-by: Juergen Gross Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20241203071550.26487-1-jgross@suse.com --- arch/x86/include/asm/paravirt.h | 7 ------- arch/x86/include/asm/paravirt_types.h | 2 -- arch/x86/include/asm/special_insns.h | 8 +------- arch/x86/kernel/cpu/resctrl/pseudo_lock.c | 2 +- arch/x86/kernel/paravirt.c | 6 ------ arch/x86/kernel/process.c | 4 ++-- arch/x86/xen/enlighten_pv.c | 2 -- 7 files changed, 4 insertions(+), 27 deletions(-) diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index d4eb9e1d61b8..041aff51eb50 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -180,13 +180,6 @@ static inline void halt(void) PVOP_VCALL0(irq.halt); } -extern noinstr void pv_native_wbinvd(void); - -static __always_inline void wbinvd(void) -{ - PVOP_ALT_VCALL0(cpu.wbinvd, "wbinvd", ALT_NOT_XEN); -} - static inline u64 paravirt_read_msr(unsigned msr) { return PVOP_CALL1(u64, cpu.read_msr, msr); diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 8d4fbe1be489..fea56b04f436 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -86,8 +86,6 @@ struct pv_cpu_ops { void (*update_io_bitmap)(void); #endif - void (*wbinvd)(void); - /* cpuid emulation, mostly so that caps bits can be disabled */ void (*cpuid)(unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx); diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index aec6e2d3aa1d..fab7c8af27a4 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -115,7 +115,7 @@ static inline void wrpkru(u32 pkru) } #endif -static __always_inline void native_wbinvd(void) +static __always_inline void wbinvd(void) { asm volatile("wbinvd": : :"memory"); } @@ -167,12 +167,6 @@ static inline void __write_cr4(unsigned long x) { native_write_cr4(x); } - -static __always_inline void wbinvd(void) -{ - native_wbinvd(); -} - #endif /* CONFIG_PARAVIRT_XXL */ static __always_inline void clflush(volatile void *__p) diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c index 972e6b6b0481..b72f7e91387e 100644 --- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c @@ -459,7 +459,7 @@ static int pseudo_lock_fn(void *_rdtgrp) * increase likelihood that allocated cache portion will be filled * with associated memory. */ - native_wbinvd(); + wbinvd(); /* * Always called with interrupts enabled. 
By disabling interrupts diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index fec381533555..927e33e6843a 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -116,11 +116,6 @@ static noinstr void pv_native_set_debugreg(int regno, unsigned long val) native_set_debugreg(regno, val); } -noinstr void pv_native_wbinvd(void) -{ - native_wbinvd(); -} - static noinstr void pv_native_safe_halt(void) { native_safe_halt(); @@ -148,7 +143,6 @@ struct paravirt_patch_template pv_ops = { .cpu.read_cr0 = native_read_cr0, .cpu.write_cr0 = native_write_cr0, .cpu.write_cr4 = native_write_cr4, - .cpu.wbinvd = pv_native_wbinvd, .cpu.read_msr = native_read_msr, .cpu.write_msr = native_write_msr, .cpu.read_msr_safe = native_read_msr_safe, diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index f63f8fd00a91..58ead05a1c29 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -825,7 +825,7 @@ void __noreturn stop_this_cpu(void *dummy) * X86_FEATURE_SME due to cmdline options. */ if (c->extended_cpuid_level >= 0x8000001f && (cpuid_eax(0x8000001f) & BIT(0))) - native_wbinvd(); + wbinvd(); /* * This brings a cache line back and dirties it, but @@ -846,7 +846,7 @@ void __noreturn stop_this_cpu(void *dummy) /* * Use native_halt() so that memory contents don't change * (stack usage and variables) after possibly issuing the - * native_wbinvd() above. + * wbinvd() above. */ native_halt(); } diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index d6818c6cafda..fd2169063480 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -1161,8 +1161,6 @@ static const typeof(pv_ops) xen_cpu_ops __initconst = { .write_cr4 = xen_write_cr4, - .wbinvd = pv_native_wbinvd, - .read_msr = xen_read_msr, .write_msr = xen_write_msr, From a3eaa2be7004ed7ce5cf8939c660e44a15fc3665 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Mon, 2 Dec 2024 20:43:24 +0100 Subject: [PATCH 084/224] x86/sysfs: Constify 'struct bin_attribute' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The sysfs core now allows instances of 'struct bin_attribute' to be moved into read-only memory. Make use of that to protect them against accidental or malicious modifications. 
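The conversion pattern, simplified here from the ksysfs.c hunk below, makes the attribute object const and switches to the _new callback and group fields that accept const attributes:

	static const struct bin_attribute boot_params_data_attr = {
		.attr = { .name = "data", .mode = S_IRUGO, },
		.read_new = boot_params_data_read,
		.size = sizeof(boot_params),
	};

	static const struct bin_attribute *const boot_params_data_attrs[] = {
		&boot_params_data_attr,
		NULL,
	};

The read callback itself differs only in taking a const struct bin_attribute * argument.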
Signed-off-by: Thomas Weißschuh Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20241202-sysfs-const-bin_attr-x86-v1-1-b767d5f0ac5c@weissschuh.net --- arch/x86/kernel/ksysfs.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/ksysfs.c b/arch/x86/kernel/ksysfs.c index 257892fcefa7..b68d4be9464e 100644 --- a/arch/x86/kernel/ksysfs.c +++ b/arch/x86/kernel/ksysfs.c @@ -28,19 +28,19 @@ static ssize_t version_show(struct kobject *kobj, static struct kobj_attribute boot_params_version_attr = __ATTR_RO(version); static ssize_t boot_params_data_read(struct file *fp, struct kobject *kobj, - struct bin_attribute *bin_attr, + const struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { memcpy(buf, (void *)&boot_params + off, count); return count; } -static struct bin_attribute boot_params_data_attr = { +static const struct bin_attribute boot_params_data_attr = { .attr = { .name = "data", .mode = S_IRUGO, }, - .read = boot_params_data_read, + .read_new = boot_params_data_read, .size = sizeof(boot_params), }; @@ -49,14 +49,14 @@ static struct attribute *boot_params_version_attrs[] = { NULL, }; -static struct bin_attribute *boot_params_data_attrs[] = { +static const struct bin_attribute *const boot_params_data_attrs[] = { &boot_params_data_attr, NULL, }; static const struct attribute_group boot_params_attr_group = { .attrs = boot_params_version_attrs, - .bin_attrs = boot_params_data_attrs, + .bin_attrs_new = boot_params_data_attrs, }; static int kobj_to_setup_data_nr(struct kobject *kobj, int *nr) @@ -172,7 +172,7 @@ static ssize_t type_show(struct kobject *kobj, static ssize_t setup_data_data_read(struct file *fp, struct kobject *kobj, - struct bin_attribute *bin_attr, + const struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { @@ -250,7 +250,7 @@ static struct bin_attribute data_attr __ro_after_init = { .name = "data", .mode = S_IRUGO, }, - .read = setup_data_data_read, + .read_new = setup_data_data_read, }; static struct attribute *setup_data_type_attrs[] = { @@ -258,14 +258,14 @@ static struct attribute *setup_data_type_attrs[] = { NULL, }; -static struct bin_attribute *setup_data_data_attrs[] = { +static const struct bin_attribute *const setup_data_data_attrs[] = { &data_attr, NULL, }; static const struct attribute_group setup_data_attr_group = { .attrs = setup_data_type_attrs, - .bin_attrs = setup_data_data_attrs, + .bin_attrs_new = setup_data_data_attrs, }; static int __init create_setup_data_node(struct kobject *parent, From 9d93db0d1881c9e37e1528cd796e20ff13b7692c Mon Sep 17 00:00:00 2001 From: Gautam Somani Date: Sun, 1 Dec 2024 03:41:02 +0900 Subject: [PATCH 085/224] x86/mm/selftests: Fix typo in lam.c Change the spelling from metadate -> metadata Signed-off-by: Gautam Somani Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20241130184102.2182-1-gautamsomani@gmail.com --- tools/testing/selftests/x86/lam.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/x86/lam.c b/tools/testing/selftests/x86/lam.c index 0ea4f6813930..4d4a76532dc9 100644 --- a/tools/testing/selftests/x86/lam.c +++ b/tools/testing/selftests/x86/lam.c @@ -237,7 +237,7 @@ static uint64_t set_metadata(uint64_t src, unsigned long lam) * both pointers should point to the same address. * * @return: - * 0: value on the pointer with metadate and value on original are same + * 0: value on the pointer with metadata and value on original are same * 1: not same. 
*/ static int handle_lam_test(void *src, unsigned int lam) From dd4059634dab548c904eeae2660ba3c8f7ce843c Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Mon, 2 Dec 2024 09:31:39 +0200 Subject: [PATCH 086/224] x86/mtrr: Rename mtrr_overwrite_state() to guest_force_mtrr_state() Rename the helper to better reflect its function. Suggested-by: Dave Hansen Signed-off-by: Kirill A. Shutemov Signed-off-by: Ingo Molnar Acked-by: Dave Hansen Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20241202073139.448208-1-kirill.shutemov@linux.intel.com --- arch/x86/hyperv/ivm.c | 2 +- arch/x86/include/asm/mtrr.h | 10 +++++----- arch/x86/kernel/cpu/mtrr/generic.c | 6 +++--- arch/x86/kernel/cpu/mtrr/mtrr.c | 2 +- arch/x86/kernel/kvm.c | 2 +- arch/x86/xen/enlighten_pv.c | 4 ++-- 6 files changed, 13 insertions(+), 13 deletions(-) diff --git a/arch/x86/hyperv/ivm.c b/arch/x86/hyperv/ivm.c index 60fc3ed72830..90aabe1fd3b6 100644 --- a/arch/x86/hyperv/ivm.c +++ b/arch/x86/hyperv/ivm.c @@ -664,7 +664,7 @@ void __init hv_vtom_init(void) x86_platform.guest.enc_status_change_finish = hv_vtom_set_host_visibility; /* Set WB as the default cache mode. */ - mtrr_overwrite_state(NULL, 0, MTRR_TYPE_WRBACK); + guest_force_mtrr_state(NULL, 0, MTRR_TYPE_WRBACK); } #endif /* defined(CONFIG_AMD_MEM_ENCRYPT) || defined(CONFIG_INTEL_TDX_GUEST) */ diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h index 4218248083d9..c69e269937c5 100644 --- a/arch/x86/include/asm/mtrr.h +++ b/arch/x86/include/asm/mtrr.h @@ -58,8 +58,8 @@ struct mtrr_state_type { */ # ifdef CONFIG_MTRR void mtrr_bp_init(void); -void mtrr_overwrite_state(struct mtrr_var_range *var, unsigned int num_var, - mtrr_type def_type); +void guest_force_mtrr_state(struct mtrr_var_range *var, unsigned int num_var, + mtrr_type def_type); extern u8 mtrr_type_lookup(u64 addr, u64 end, u8 *uniform); extern void mtrr_save_fixed_ranges(void *); extern void mtrr_save_state(void); @@ -75,9 +75,9 @@ void mtrr_disable(void); void mtrr_enable(void); void mtrr_generic_set_state(void); # else -static inline void mtrr_overwrite_state(struct mtrr_var_range *var, - unsigned int num_var, - mtrr_type def_type) +static inline void guest_force_mtrr_state(struct mtrr_var_range *var, + unsigned int num_var, + mtrr_type def_type) { } diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 7b29ebda024f..2fdfda2b60e4 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -423,7 +423,7 @@ void __init mtrr_copy_map(void) } /** - * mtrr_overwrite_state - set static MTRR state + * guest_force_mtrr_state - set static MTRR state for a guest * * Used to set MTRR state via different means (e.g. with data obtained from * a hypervisor). 
@@ -436,8 +436,8 @@ void __init mtrr_copy_map(void) * @num_var: length of the @var array * @def_type: default caching type */ -void mtrr_overwrite_state(struct mtrr_var_range *var, unsigned int num_var, - mtrr_type def_type) +void guest_force_mtrr_state(struct mtrr_var_range *var, unsigned int num_var, + mtrr_type def_type) { unsigned int i; diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.c b/arch/x86/kernel/cpu/mtrr/mtrr.c index 989d368be04f..ecbda0341a8a 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.c +++ b/arch/x86/kernel/cpu/mtrr/mtrr.c @@ -625,7 +625,7 @@ void mtrr_save_state(void) static int __init mtrr_init_finalize(void) { /* - * Map might exist if mtrr_overwrite_state() has been called or if + * Map might exist if guest_force_mtrr_state() has been called or if * mtrr_enabled() returns true. */ mtrr_copy_map(); diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 21e9e4845354..7a422a6c5983 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -983,7 +983,7 @@ static void __init kvm_init_platform(void) x86_platform.apic_post_init = kvm_apic_init; /* Set WB as the default cache mode for SEV-SNP and TDX */ - mtrr_overwrite_state(NULL, 0, MTRR_TYPE_WRBACK); + guest_force_mtrr_state(NULL, 0, MTRR_TYPE_WRBACK); } #if defined(CONFIG_AMD_MEM_ENCRYPT) diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index d6818c6cafda..633469fab536 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -171,7 +171,7 @@ static void __init xen_set_mtrr_data(void) /* Only overwrite MTRR state if any MTRR could be got from Xen. */ if (reg) - mtrr_overwrite_state(var, reg, MTRR_TYPE_UNCACHABLE); + guest_force_mtrr_state(var, reg, MTRR_TYPE_UNCACHABLE); #endif } @@ -195,7 +195,7 @@ static void __init xen_pv_init_platform(void) if (xen_initial_domain()) xen_set_mtrr_data(); else - mtrr_overwrite_state(NULL, 0, MTRR_TYPE_WRBACK); + guest_force_mtrr_state(NULL, 0, MTRR_TYPE_WRBACK); /* Adjust nr_cpu_ids before "enumeration" happens */ xen_smp_count_cpus(); From 095ac6fa19500fecd7c62e755dee45bb303d4d43 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Sat, 23 Nov 2024 19:42:19 +0800 Subject: [PATCH 087/224] x86/ioremap: Simplify setup_data mapping variants memremap_is_setup_data() and early_memremap_is_setup_data() share completely the same process and handling, except for the differing memremap/unmap invocations. Add a helper __memremap_is_setup_data() extracting the common part and simplify a lot of code while at it. Mark __memremap_is_setup_data() as __ref to suppress this section mismatch warning: WARNING: modpost: vmlinux: section mismatch in reference: __memremap_is_setup_data+0x5f (section: .text) -> early_memunmap (section: .init.text) [ bp: Massage a bit. ] Signed-off-by: Baoquan He Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20241123114221.149383-2-bhe@redhat.com --- arch/x86/mm/ioremap.c | 110 ++++++++++++++---------------------------- 1 file changed, 37 insertions(+), 73 deletions(-) diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 8d29163568a7..fe44e8180bdd 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -632,71 +632,9 @@ static bool memremap_is_efi_data(resource_size_t phys_addr, * Examine the physical address to determine if it is boot data by checking * it against the boot params setup_data chain. 
*/ -static bool memremap_is_setup_data(resource_size_t phys_addr, - unsigned long size) -{ - struct setup_indirect *indirect; - struct setup_data *data; - u64 paddr, paddr_next; - - paddr = boot_params.hdr.setup_data; - while (paddr) { - unsigned int len; - - if (phys_addr == paddr) - return true; - - data = memremap(paddr, sizeof(*data), - MEMREMAP_WB | MEMREMAP_DEC); - if (!data) { - pr_warn("failed to memremap setup_data entry\n"); - return false; - } - - paddr_next = data->next; - len = data->len; - - if ((phys_addr > paddr) && - (phys_addr < (paddr + sizeof(struct setup_data) + len))) { - memunmap(data); - return true; - } - - if (data->type == SETUP_INDIRECT) { - memunmap(data); - data = memremap(paddr, sizeof(*data) + len, - MEMREMAP_WB | MEMREMAP_DEC); - if (!data) { - pr_warn("failed to memremap indirect setup_data\n"); - return false; - } - - indirect = (struct setup_indirect *)data->data; - - if (indirect->type != SETUP_INDIRECT) { - paddr = indirect->addr; - len = indirect->len; - } - } - - memunmap(data); - - if ((phys_addr > paddr) && (phys_addr < (paddr + len))) - return true; - - paddr = paddr_next; - } - - return false; -} - -/* - * Examine the physical address to determine if it is boot data by checking - * it against the boot params setup_data chain (early boot version). - */ -static bool __init early_memremap_is_setup_data(resource_size_t phys_addr, - unsigned long size) +static bool __ref __memremap_is_setup_data(resource_size_t phys_addr, bool early) { + unsigned int setup_data_sz = sizeof(struct setup_data); struct setup_indirect *indirect; struct setup_data *data; u64 paddr, paddr_next; @@ -708,29 +646,40 @@ static bool __init early_memremap_is_setup_data(resource_size_t phys_addr, if (phys_addr == paddr) return true; - data = early_memremap_decrypted(paddr, sizeof(*data)); + if (early) + data = early_memremap_decrypted(paddr, setup_data_sz); + else + data = memremap(paddr, setup_data_sz, MEMREMAP_WB | MEMREMAP_DEC); if (!data) { - pr_warn("failed to early memremap setup_data entry\n"); + pr_warn("failed to remap setup_data entry\n"); return false; } - size = sizeof(*data); + size = setup_data_sz; paddr_next = data->next; len = data->len; if ((phys_addr > paddr) && - (phys_addr < (paddr + sizeof(struct setup_data) + len))) { - early_memunmap(data, sizeof(*data)); + (phys_addr < (paddr + setup_data_sz + len))) { + if (early) + early_memunmap(data, setup_data_sz); + else + memunmap(data); return true; } if (data->type == SETUP_INDIRECT) { size += len; - early_memunmap(data, sizeof(*data)); - data = early_memremap_decrypted(paddr, size); + if (early) { + early_memunmap(data, setup_data_sz); + data = early_memremap_decrypted(paddr, size); + } else { + memunmap(data); + data = memremap(paddr, size, MEMREMAP_WB | MEMREMAP_DEC); + } if (!data) { - pr_warn("failed to early memremap indirect setup_data\n"); + pr_warn("failed to remap indirect setup_data\n"); return false; } @@ -742,7 +691,10 @@ static bool __init early_memremap_is_setup_data(resource_size_t phys_addr, } } - early_memunmap(data, size); + if (early) + early_memunmap(data, size); + else + memunmap(data); if ((phys_addr > paddr) && (phys_addr < (paddr + len))) return true; @@ -753,6 +705,18 @@ static bool __init early_memremap_is_setup_data(resource_size_t phys_addr, return false; } +static bool memremap_is_setup_data(resource_size_t phys_addr, + unsigned long size) +{ + return __memremap_is_setup_data(phys_addr, false); +} + +static bool __init early_memremap_is_setup_data(resource_size_t phys_addr, + unsigned 
long size) +{ + return __memremap_is_setup_data(phys_addr, true); +} + /* * Architecture function to determine if RAM remap is allowed. By default, a * RAM remap will map the data as encrypted. Determine if a RAM remap should From 525077ae7145cc868b69282f85bed2be8ecd1ed5 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Sat, 23 Nov 2024 19:42:21 +0800 Subject: [PATCH 088/224] x86/ioremap: Remove unused size parameter in remapping functions The size parameter of functions memremap_is_efi_data(), memremap_is_setup_data() and early_memremap_is_setup_data() is not used. Remove it. [ bp: Massage commit message. ] Signed-off-by: Baoquan He Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20241123114221.149383-4-bhe@redhat.com --- arch/x86/mm/ioremap.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index fe44e8180bdd..38ff7791a9c7 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -593,8 +593,7 @@ static bool memremap_should_map_decrypted(resource_size_t phys_addr, * Examine the physical address to determine if it is EFI data. Check * it against the boot params structure and EFI tables and memory types. */ -static bool memremap_is_efi_data(resource_size_t phys_addr, - unsigned long size) +static bool memremap_is_efi_data(resource_size_t phys_addr) { u64 paddr; @@ -705,14 +704,12 @@ static bool __ref __memremap_is_setup_data(resource_size_t phys_addr, bool early return false; } -static bool memremap_is_setup_data(resource_size_t phys_addr, - unsigned long size) +static bool memremap_is_setup_data(resource_size_t phys_addr) { return __memremap_is_setup_data(phys_addr, false); } -static bool __init early_memremap_is_setup_data(resource_size_t phys_addr, - unsigned long size) +static bool __init early_memremap_is_setup_data(resource_size_t phys_addr) { return __memremap_is_setup_data(phys_addr, true); } @@ -735,8 +732,8 @@ bool arch_memremap_can_ram_remap(resource_size_t phys_addr, unsigned long size, return false; if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) { - if (memremap_is_setup_data(phys_addr, size) || - memremap_is_efi_data(phys_addr, size)) + if (memremap_is_setup_data(phys_addr) || + memremap_is_efi_data(phys_addr)) return false; } @@ -761,8 +758,8 @@ pgprot_t __init early_memremap_pgprot_adjust(resource_size_t phys_addr, encrypted_prot = true; if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) { - if (early_memremap_is_setup_data(phys_addr, size) || - memremap_is_efi_data(phys_addr, size)) + if (early_memremap_is_setup_data(phys_addr) || + memremap_is_efi_data(phys_addr)) encrypted_prot = false; } From 7b8a702d943827130cc00ae36075eff5500f86f1 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Mon, 2 Dec 2024 18:45:58 +0100 Subject: [PATCH 089/224] sched/fair: Rename h_nr_running into h_nr_queued With the delayed dequeue feature, a sleeping sched_entity remains queued in the rq until its lag has elapsed but can't run. Rename h_nr_running into h_nr_queued to reflect this new behavior. 
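After the rename, h_nr_queued counts every queued entity, including delayed-dequeue ones that cannot actually run; where only runnable entities matter, the count is derived by subtraction, as in the pelt.c and sched.h hunks below:

	cfs_rq->h_nr_queued - cfs_rq->h_nr_delayed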
Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Link: https://lore.kernel.org/r/20241202174606.4074512-4-vincent.guittot@linaro.org --- kernel/sched/core.c | 4 +- kernel/sched/debug.c | 6 +-- kernel/sched/fair.c | 88 ++++++++++++++++++++++---------------------- kernel/sched/pelt.c | 4 +- kernel/sched/sched.h | 4 +- 5 files changed, 53 insertions(+), 53 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2167d38f4d65..84902936a620 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1343,7 +1343,7 @@ bool sched_can_stop_tick(struct rq *rq) if (scx_enabled() && !scx_can_stop_tick(rq)) return false; - if (rq->cfs.h_nr_running > 1) + if (rq->cfs.h_nr_queued > 1) return false; /* @@ -6020,7 +6020,7 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) * opportunity to pull in more work from other CPUs. */ if (likely(!sched_class_above(prev->sched_class, &fair_sched_class) && - rq->nr_running == rq->cfs.h_nr_running)) { + rq->nr_running == rq->cfs.h_nr_queued)) { p = pick_next_task_fair(rq, prev, rf); if (unlikely(p == RETRY_TASK)) diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index a1be00a988bf..08d6c2b7caa3 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -379,7 +379,7 @@ static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubu return -EINVAL; } - if (rq->cfs.h_nr_running) { + if (rq->cfs.h_nr_queued) { update_rq_clock(rq); dl_server_stop(&rq->fair_server); } @@ -392,7 +392,7 @@ static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubu printk_deferred("Fair server disabled in CPU %d, system may crash due to starvation.\n", cpu_of(rq)); - if (rq->cfs.h_nr_running) + if (rq->cfs.h_nr_queued) dl_server_start(&rq->fair_server); } @@ -844,7 +844,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) spread = right_vruntime - left_vruntime; SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", SPLIT_NS(spread)); SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); - SEQ_printf(m, " .%-30s: %d\n", "h_nr_running", cfs_rq->h_nr_running); + SEQ_printf(m, " .%-30s: %d\n", "h_nr_queued", cfs_rq->h_nr_queued); SEQ_printf(m, " .%-30s: %d\n", "h_nr_delayed", cfs_rq->h_nr_delayed); SEQ_printf(m, " .%-30s: %d\n", "idle_nr_running", cfs_rq->idle_nr_running); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1f73cb408b29..d6a9447e5e23 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2128,7 +2128,7 @@ static void update_numa_stats(struct task_numa_env *env, ns->load += cpu_load(rq); ns->runnable += cpu_runnable(rq); ns->util += cpu_util_cfs(cpu); - ns->nr_running += rq->cfs.h_nr_running; + ns->nr_running += rq->cfs.h_nr_queued; ns->compute_capacity += capacity_of(cpu); if (find_idle && idle_core < 0 && !rq->nr_running && idle_cpu(cpu)) { @@ -5394,7 +5394,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * When enqueuing a sched_entity, we must: * - Update loads to have both entity and cfs_rq synced with now. * - For group_entity, update its runnable_weight to reflect the new - * h_nr_running of its group cfs_rq. + * h_nr_queued of its group cfs_rq. 
* - For group_entity, update its weight to reflect the new share of * its group cfs_rq * - Add its new weight to cfs_rq->load.weight @@ -5531,7 +5531,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * When dequeuing a sched_entity, we must: * - Update loads to have both entity and cfs_rq synced with now. * - For group_entity, update its runnable_weight to reflect the new - * h_nr_running of its group cfs_rq. + * h_nr_queued of its group cfs_rq. * - Subtract its previous weight from cfs_rq->load.weight. * - For group entity, update its weight to reflect the new share * of its group cfs_rq. @@ -5930,8 +5930,8 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) struct rq *rq = rq_of(cfs_rq); struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; - long task_delta, idle_task_delta, delayed_delta, dequeue = 1; - long rq_h_nr_running = rq->cfs.h_nr_running; + long queued_delta, idle_task_delta, delayed_delta, dequeue = 1; + long rq_h_nr_queued = rq->cfs.h_nr_queued; raw_spin_lock(&cfs_b->lock); /* This will start the period timer if necessary */ @@ -5961,7 +5961,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq); rcu_read_unlock(); - task_delta = cfs_rq->h_nr_running; + queued_delta = cfs_rq->h_nr_queued; idle_task_delta = cfs_rq->idle_h_nr_running; delayed_delta = cfs_rq->h_nr_delayed; for_each_sched_entity(se) { @@ -5983,9 +5983,9 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) dequeue_entity(qcfs_rq, se, flags); if (cfs_rq_is_idle(group_cfs_rq(se))) - idle_task_delta = cfs_rq->h_nr_running; + idle_task_delta = cfs_rq->h_nr_queued; - qcfs_rq->h_nr_running -= task_delta; + qcfs_rq->h_nr_queued -= queued_delta; qcfs_rq->idle_h_nr_running -= idle_task_delta; qcfs_rq->h_nr_delayed -= delayed_delta; @@ -6006,18 +6006,18 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) se_update_runnable(se); if (cfs_rq_is_idle(group_cfs_rq(se))) - idle_task_delta = cfs_rq->h_nr_running; + idle_task_delta = cfs_rq->h_nr_queued; - qcfs_rq->h_nr_running -= task_delta; + qcfs_rq->h_nr_queued -= queued_delta; qcfs_rq->idle_h_nr_running -= idle_task_delta; qcfs_rq->h_nr_delayed -= delayed_delta; } /* At this point se is NULL and we are at root level*/ - sub_nr_running(rq, task_delta); + sub_nr_running(rq, queued_delta); /* Stop the fair server if throttling resulted in no runnable tasks */ - if (rq_h_nr_running && !rq->cfs.h_nr_running) + if (rq_h_nr_queued && !rq->cfs.h_nr_queued) dl_server_stop(&rq->fair_server); done: /* @@ -6036,8 +6036,8 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) struct rq *rq = rq_of(cfs_rq); struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; - long task_delta, idle_task_delta, delayed_delta; - long rq_h_nr_running = rq->cfs.h_nr_running; + long queued_delta, idle_task_delta, delayed_delta; + long rq_h_nr_queued = rq->cfs.h_nr_queued; se = cfs_rq->tg->se[cpu_of(rq)]; @@ -6070,7 +6070,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) goto unthrottle_throttle; } - task_delta = cfs_rq->h_nr_running; + queued_delta = cfs_rq->h_nr_queued; idle_task_delta = cfs_rq->idle_h_nr_running; delayed_delta = cfs_rq->h_nr_delayed; for_each_sched_entity(se) { @@ -6086,9 +6086,9 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP); if (cfs_rq_is_idle(group_cfs_rq(se))) - idle_task_delta = cfs_rq->h_nr_running; + idle_task_delta = cfs_rq->h_nr_queued; - qcfs_rq->h_nr_running += task_delta; + 
qcfs_rq->h_nr_queued += queued_delta; qcfs_rq->idle_h_nr_running += idle_task_delta; qcfs_rq->h_nr_delayed += delayed_delta; @@ -6104,9 +6104,9 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) se_update_runnable(se); if (cfs_rq_is_idle(group_cfs_rq(se))) - idle_task_delta = cfs_rq->h_nr_running; + idle_task_delta = cfs_rq->h_nr_queued; - qcfs_rq->h_nr_running += task_delta; + qcfs_rq->h_nr_queued += queued_delta; qcfs_rq->idle_h_nr_running += idle_task_delta; qcfs_rq->h_nr_delayed += delayed_delta; @@ -6116,11 +6116,11 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) } /* Start the fair server if un-throttling resulted in new runnable tasks */ - if (!rq_h_nr_running && rq->cfs.h_nr_running) + if (!rq_h_nr_queued && rq->cfs.h_nr_queued) dl_server_start(&rq->fair_server); /* At this point se is NULL and we are at root level*/ - add_nr_running(rq, task_delta); + add_nr_running(rq, queued_delta); unthrottle_throttle: assert_list_leaf_cfs_rq(rq); @@ -6830,7 +6830,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) SCHED_WARN_ON(task_rq(p) != rq); - if (rq->cfs.h_nr_running > 1) { + if (rq->cfs.h_nr_queued > 1) { u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; u64 slice = se->slice; s64 delta = slice - ran; @@ -6973,7 +6973,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) int idle_h_nr_running = task_has_idle_policy(p); int h_nr_delayed = 0; int task_new = !(flags & ENQUEUE_WAKEUP); - int rq_h_nr_running = rq->cfs.h_nr_running; + int rq_h_nr_queued = rq->cfs.h_nr_queued; u64 slice = 0; /* @@ -7021,7 +7021,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) enqueue_entity(cfs_rq, se, flags); slice = cfs_rq_min_slice(cfs_rq); - cfs_rq->h_nr_running++; + cfs_rq->h_nr_queued++; cfs_rq->idle_h_nr_running += idle_h_nr_running; cfs_rq->h_nr_delayed += h_nr_delayed; @@ -7045,7 +7045,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) se->slice = slice; slice = cfs_rq_min_slice(cfs_rq); - cfs_rq->h_nr_running++; + cfs_rq->h_nr_queued++; cfs_rq->idle_h_nr_running += idle_h_nr_running; cfs_rq->h_nr_delayed += h_nr_delayed; @@ -7057,7 +7057,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) goto enqueue_throttle; } - if (!rq_h_nr_running && rq->cfs.h_nr_running) { + if (!rq_h_nr_queued && rq->cfs.h_nr_queued) { /* Account for idle runtime */ if (!rq->nr_running) dl_server_update_idle_time(rq, rq->curr); @@ -7104,19 +7104,19 @@ static void set_next_buddy(struct sched_entity *se); static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) { bool was_sched_idle = sched_idle_rq(rq); - int rq_h_nr_running = rq->cfs.h_nr_running; + int rq_h_nr_queued = rq->cfs.h_nr_queued; bool task_sleep = flags & DEQUEUE_SLEEP; bool task_delayed = flags & DEQUEUE_DELAYED; struct task_struct *p = NULL; int idle_h_nr_running = 0; - int h_nr_running = 0; + int h_nr_queued = 0; int h_nr_delayed = 0; struct cfs_rq *cfs_rq; u64 slice = 0; if (entity_is_task(se)) { p = task_of(se); - h_nr_running = 1; + h_nr_queued = 1; idle_h_nr_running = task_has_idle_policy(p); if (!task_sleep && !task_delayed) h_nr_delayed = !!se->sched_delayed; @@ -7135,12 +7135,12 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) break; } - cfs_rq->h_nr_running -= h_nr_running; + cfs_rq->h_nr_queued -= h_nr_queued; cfs_rq->idle_h_nr_running -= idle_h_nr_running; cfs_rq->h_nr_delayed -= h_nr_delayed; if (cfs_rq_is_idle(cfs_rq)) - idle_h_nr_running = h_nr_running; + idle_h_nr_running = h_nr_queued; 
/* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) @@ -7174,21 +7174,21 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) se->slice = slice; slice = cfs_rq_min_slice(cfs_rq); - cfs_rq->h_nr_running -= h_nr_running; + cfs_rq->h_nr_queued -= h_nr_queued; cfs_rq->idle_h_nr_running -= idle_h_nr_running; cfs_rq->h_nr_delayed -= h_nr_delayed; if (cfs_rq_is_idle(cfs_rq)) - idle_h_nr_running = h_nr_running; + idle_h_nr_running = h_nr_queued; /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) return 0; } - sub_nr_running(rq, h_nr_running); + sub_nr_running(rq, h_nr_queued); - if (rq_h_nr_running && !rq->cfs.h_nr_running) + if (rq_h_nr_queued && !rq->cfs.h_nr_queued) dl_server_stop(&rq->fair_server); /* balance early to pull high priority tasks */ @@ -10316,7 +10316,7 @@ sched_reduced_capacity(struct rq *rq, struct sched_domain *sd) * When there is more than 1 task, the group_overloaded case already * takes care of cpu with reduced capacity */ - if (rq->cfs.h_nr_running != 1) + if (rq->cfs.h_nr_queued != 1) return false; return check_cpu_capacity(rq, sd); @@ -10351,7 +10351,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->group_load += load; sgs->group_util += cpu_util_cfs(i); sgs->group_runnable += cpu_runnable(rq); - sgs->sum_h_nr_running += rq->cfs.h_nr_running; + sgs->sum_h_nr_running += rq->cfs.h_nr_queued; nr_running = rq->nr_running; sgs->sum_nr_running += nr_running; @@ -10666,7 +10666,7 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd, sgs->group_util += cpu_util_without(i, p); sgs->group_runnable += cpu_runnable_without(rq, p); local = task_running_on_cpu(i, p); - sgs->sum_h_nr_running += rq->cfs.h_nr_running - local; + sgs->sum_h_nr_running += rq->cfs.h_nr_queued - local; nr_running = rq->nr_running - local; sgs->sum_nr_running += nr_running; @@ -11448,7 +11448,7 @@ static struct rq *sched_balance_find_src_rq(struct lb_env *env, if (rt > env->fbq_type) continue; - nr_running = rq->cfs.h_nr_running; + nr_running = rq->cfs.h_nr_queued; if (!nr_running) continue; @@ -11607,7 +11607,7 @@ static int need_active_balance(struct lb_env *env) * available on dst_cpu. */ if (env->idle && - (env->src_rq->cfs.h_nr_running == 1)) { + (env->src_rq->cfs.h_nr_queued == 1)) { if ((check_cpu_capacity(env->src_rq, sd)) && (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100)) return 1; @@ -12348,7 +12348,7 @@ static void nohz_balancer_kick(struct rq *rq) * If there's a runnable CFS task and the current CPU has reduced * capacity, kick the ILB to see if there's a better CPU to run on: */ - if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) { + if (rq->cfs.h_nr_queued >= 1 && check_cpu_capacity(rq, sd)) { flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; goto unlock; } @@ -12835,11 +12835,11 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) * have been enqueued in the meantime. Since we're not going idle, * pretend we pulled a task. */ - if (this_rq->cfs.h_nr_running && !pulled_task) + if (this_rq->cfs.h_nr_queued && !pulled_task) pulled_task = 1; /* Is there a task of a high priority class? 
*/ - if (this_rq->nr_running != this_rq->cfs.h_nr_running) + if (this_rq->nr_running != this_rq->cfs.h_nr_queued) pulled_task = -1; out: @@ -13526,7 +13526,7 @@ int sched_group_set_idle(struct task_group *tg, long idle) parent_cfs_rq->idle_nr_running--; } - idle_task_delta = grp_cfs_rq->h_nr_running - + idle_task_delta = grp_cfs_rq->h_nr_queued - grp_cfs_rq->idle_h_nr_running; if (!cfs_rq_is_idle(grp_cfs_rq)) idle_task_delta *= -1; diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index fee75cc2c47b..2bad0b508dfc 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -275,7 +275,7 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load) * * group: [ see update_cfs_group() ] * se_weight() = tg->weight * grq->load_avg / tg->load_avg - * se_runnable() = grq->h_nr_running + * se_runnable() = grq->h_nr_queued * * runnable_sum = se_runnable() * runnable = grq->runnable_sum * runnable_avg = runnable_sum @@ -321,7 +321,7 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq) { if (___update_load_sum(now, &cfs_rq->avg, scale_load_down(cfs_rq->load.weight), - cfs_rq->h_nr_running - cfs_rq->h_nr_delayed, + cfs_rq->h_nr_queued - cfs_rq->h_nr_delayed, cfs_rq->curr != NULL)) { ___update_load_avg(&cfs_rq->avg, 1); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 99d19c605e4f..b011081aff97 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -646,7 +646,7 @@ struct balance_callback { struct cfs_rq { struct load_weight load; unsigned int nr_running; - unsigned int h_nr_running; /* SCHED_{NORMAL,BATCH,IDLE} */ + unsigned int h_nr_queued; /* SCHED_{NORMAL,BATCH,IDLE} */ unsigned int idle_nr_running; /* SCHED_IDLE */ unsigned int idle_h_nr_running; /* SCHED_IDLE */ unsigned int h_nr_delayed; @@ -902,7 +902,7 @@ static inline void se_update_runnable(struct sched_entity *se) if (!entity_is_task(se)) { struct cfs_rq *cfs_rq = se->my_q; - se->runnable_weight = cfs_rq->h_nr_running - cfs_rq->h_nr_delayed; + se->runnable_weight = cfs_rq->h_nr_queued - cfs_rq->h_nr_delayed; } } From c2a295bffeaf9461ecba76dc9e4780c898c94f03 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Mon, 2 Dec 2024 18:45:59 +0100 Subject: [PATCH 090/224] sched/fair: Add new cfs_rq.h_nr_runnable With the delayed dequeue feature, a sleeping sched_entity remains queued in the rq until its lag has elapsed. As a result, it also stays visible in the statistics that are used to balance the system, in particular the field cfs.h_nr_queued when the sched_entity is associated with a task. Create a new h_nr_runnable that tracks only queued and runnable tasks. 
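With the new counter, every queued entity is accounted as either runnable or delayed, so the resulting invariant can be sketched as follows (illustrative only; the patch adds no such assertion):

	SCHED_WARN_ON(cfs_rq->h_nr_queued !=
		      cfs_rq->h_nr_runnable + cfs_rq->h_nr_delayed);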
Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Link: https://lore.kernel.org/r/20241202174606.4074512-5-vincent.guittot@linaro.org --- kernel/sched/debug.c | 1 + kernel/sched/fair.c | 20 ++++++++++++++++++-- kernel/sched/sched.h | 1 + 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 08d6c2b7caa3..fd711cc4d44c 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -844,6 +844,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) spread = right_vruntime - left_vruntime; SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", SPLIT_NS(spread)); SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); + SEQ_printf(m, " .%-30s: %d\n", "h_nr_runnable", cfs_rq->h_nr_runnable); SEQ_printf(m, " .%-30s: %d\n", "h_nr_queued", cfs_rq->h_nr_queued); SEQ_printf(m, " .%-30s: %d\n", "h_nr_delayed", cfs_rq->h_nr_delayed); SEQ_printf(m, " .%-30s: %d\n", "idle_nr_running", diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d6a9447e5e23..ed01e72b2b77 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5469,6 +5469,7 @@ static void set_delayed(struct sched_entity *se) for_each_sched_entity(se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); + cfs_rq->h_nr_runnable--; cfs_rq->h_nr_delayed++; if (cfs_rq_throttled(cfs_rq)) break; @@ -5481,6 +5482,7 @@ static void clear_delayed(struct sched_entity *se) for_each_sched_entity(se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); + cfs_rq->h_nr_runnable++; cfs_rq->h_nr_delayed--; if (cfs_rq_throttled(cfs_rq)) break; @@ -5930,7 +5932,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) struct rq *rq = rq_of(cfs_rq); struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; - long queued_delta, idle_task_delta, delayed_delta, dequeue = 1; + long queued_delta, runnable_delta, idle_task_delta, delayed_delta, dequeue = 1; long rq_h_nr_queued = rq->cfs.h_nr_queued; raw_spin_lock(&cfs_b->lock); @@ -5962,6 +5964,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) rcu_read_unlock(); queued_delta = cfs_rq->h_nr_queued; + runnable_delta = cfs_rq->h_nr_runnable; idle_task_delta = cfs_rq->idle_h_nr_running; delayed_delta = cfs_rq->h_nr_delayed; for_each_sched_entity(se) { @@ -5986,6 +5989,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) idle_task_delta = cfs_rq->h_nr_queued; qcfs_rq->h_nr_queued -= queued_delta; + qcfs_rq->h_nr_runnable -= runnable_delta; qcfs_rq->idle_h_nr_running -= idle_task_delta; qcfs_rq->h_nr_delayed -= delayed_delta; @@ -6009,6 +6013,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) idle_task_delta = cfs_rq->h_nr_queued; qcfs_rq->h_nr_queued -= queued_delta; + qcfs_rq->h_nr_runnable -= runnable_delta; qcfs_rq->idle_h_nr_running -= idle_task_delta; qcfs_rq->h_nr_delayed -= delayed_delta; } @@ -6036,7 +6041,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) struct rq *rq = rq_of(cfs_rq); struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; - long queued_delta, idle_task_delta, delayed_delta; + long queued_delta, runnable_delta, idle_task_delta, delayed_delta; long rq_h_nr_queued = rq->cfs.h_nr_queued; se = cfs_rq->tg->se[cpu_of(rq)]; @@ -6071,6 +6076,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) } queued_delta = cfs_rq->h_nr_queued; + runnable_delta = cfs_rq->h_nr_runnable; idle_task_delta = cfs_rq->idle_h_nr_running; delayed_delta = cfs_rq->h_nr_delayed; for_each_sched_entity(se) { @@ -6089,6 +6095,7 @@ void 
unthrottle_cfs_rq(struct cfs_rq *cfs_rq) idle_task_delta = cfs_rq->h_nr_queued; qcfs_rq->h_nr_queued += queued_delta; + qcfs_rq->h_nr_runnable += runnable_delta; qcfs_rq->idle_h_nr_running += idle_task_delta; qcfs_rq->h_nr_delayed += delayed_delta; @@ -6107,6 +6114,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) idle_task_delta = cfs_rq->h_nr_queued; qcfs_rq->h_nr_queued += queued_delta; + qcfs_rq->h_nr_runnable += runnable_delta; qcfs_rq->idle_h_nr_running += idle_task_delta; qcfs_rq->h_nr_delayed += delayed_delta; @@ -7021,6 +7029,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) enqueue_entity(cfs_rq, se, flags); slice = cfs_rq_min_slice(cfs_rq); + if (!h_nr_delayed) + cfs_rq->h_nr_runnable++; cfs_rq->h_nr_queued++; cfs_rq->idle_h_nr_running += idle_h_nr_running; cfs_rq->h_nr_delayed += h_nr_delayed; @@ -7045,6 +7055,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) se->slice = slice; slice = cfs_rq_min_slice(cfs_rq); + if (!h_nr_delayed) + cfs_rq->h_nr_runnable++; cfs_rq->h_nr_queued++; cfs_rq->idle_h_nr_running += idle_h_nr_running; cfs_rq->h_nr_delayed += h_nr_delayed; @@ -7135,6 +7147,8 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) break; } + if (!h_nr_delayed) + cfs_rq->h_nr_runnable -= h_nr_queued; cfs_rq->h_nr_queued -= h_nr_queued; cfs_rq->idle_h_nr_running -= idle_h_nr_running; cfs_rq->h_nr_delayed -= h_nr_delayed; @@ -7174,6 +7188,8 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) se->slice = slice; slice = cfs_rq_min_slice(cfs_rq); + if (!h_nr_delayed) + cfs_rq->h_nr_runnable -= h_nr_queued; cfs_rq->h_nr_queued -= h_nr_queued; cfs_rq->idle_h_nr_running -= idle_h_nr_running; cfs_rq->h_nr_delayed -= h_nr_delayed; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b011081aff97..869d5d3521f2 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -647,6 +647,7 @@ struct cfs_rq { struct load_weight load; unsigned int nr_running; unsigned int h_nr_queued; /* SCHED_{NORMAL,BATCH,IDLE} */ + unsigned int h_nr_runnable; /* SCHED_{NORMAL,BATCH,IDLE} */ unsigned int idle_nr_running; /* SCHED_IDLE */ unsigned int idle_h_nr_running; /* SCHED_IDLE */ unsigned int h_nr_delayed; From 1a49104496d38cdcb7d9106ec23773a52c7a7e82 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Mon, 2 Dec 2024 18:46:00 +0100 Subject: [PATCH 091/224] sched/fair: Use the new cfs_rq.h_nr_runnable Use the new h_nr_runnable that tracks only queued and runnable tasks in the statistics that are used to balance the system: - PELT runnable_avg - deciding if a group is overloaded or has spare capacity - numa stats - reduced capacity management - load balance - nohz kick Note that rq->nr_running still counts the delayed dequeue tasks, as delayed dequeue is a fair-class feature that is meaningless at the core scheduler level.
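[ Editorial note: to make the switch concrete, here is a condensed before/after view of the sched_reduced_capacity() check from the diff below; the test now ignores delayed-dequeue tasks:

	/* before: counts tasks that are merely queued, even if delayed */
	if (rq->cfs.h_nr_queued != 1)
		return false;

	/* after: counts only tasks that are actually ready to run */
	if (rq->cfs.h_nr_runnable != 1)
		return false;

	return check_cpu_capacity(rq, sd);

]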
Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Link: https://lore.kernel.org/r/20241202174606.4074512-6-vincent.guittot@linaro.org --- kernel/sched/fair.c | 18 +++++++++--------- kernel/sched/pelt.c | 4 ++-- kernel/sched/sched.h | 7 ++----- 3 files changed, 13 insertions(+), 16 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ed01e72b2b77..3a8bdfbf4867 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2128,7 +2128,7 @@ static void update_numa_stats(struct task_numa_env *env, ns->load += cpu_load(rq); ns->runnable += cpu_runnable(rq); ns->util += cpu_util_cfs(cpu); - ns->nr_running += rq->cfs.h_nr_queued; + ns->nr_running += rq->cfs.h_nr_runnable; ns->compute_capacity += capacity_of(cpu); if (find_idle && idle_core < 0 && !rq->nr_running && idle_cpu(cpu)) { @@ -5394,7 +5394,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * When enqueuing a sched_entity, we must: * - Update loads to have both entity and cfs_rq synced with now. * - For group_entity, update its runnable_weight to reflect the new - * h_nr_queued of its group cfs_rq. + * h_nr_runnable of its group cfs_rq. * - For group_entity, update its weight to reflect the new share of * its group cfs_rq * - Add its new weight to cfs_rq->load.weight @@ -5533,7 +5533,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * When dequeuing a sched_entity, we must: * - Update loads to have both entity and cfs_rq synced with now. * - For group_entity, update its runnable_weight to reflect the new - * h_nr_queued of its group cfs_rq. + * h_nr_runnable of its group cfs_rq. * - Subtract its previous weight from cfs_rq->load.weight. * - For group entity, update its weight to reflect the new share * of its group cfs_rq. @@ -10332,7 +10332,7 @@ sched_reduced_capacity(struct rq *rq, struct sched_domain *sd) * When there is more than 1 task, the group_overloaded case already * takes care of cpu with reduced capacity */ - if (rq->cfs.h_nr_queued != 1) + if (rq->cfs.h_nr_runnable != 1) return false; return check_cpu_capacity(rq, sd); @@ -10367,7 +10367,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->group_load += load; sgs->group_util += cpu_util_cfs(i); sgs->group_runnable += cpu_runnable(rq); - sgs->sum_h_nr_running += rq->cfs.h_nr_queued; + sgs->sum_h_nr_running += rq->cfs.h_nr_runnable; nr_running = rq->nr_running; sgs->sum_nr_running += nr_running; @@ -10682,7 +10682,7 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd, sgs->group_util += cpu_util_without(i, p); sgs->group_runnable += cpu_runnable_without(rq, p); local = task_running_on_cpu(i, p); - sgs->sum_h_nr_running += rq->cfs.h_nr_queued - local; + sgs->sum_h_nr_running += rq->cfs.h_nr_runnable - local; nr_running = rq->nr_running - local; sgs->sum_nr_running += nr_running; @@ -11464,7 +11464,7 @@ static struct rq *sched_balance_find_src_rq(struct lb_env *env, if (rt > env->fbq_type) continue; - nr_running = rq->cfs.h_nr_queued; + nr_running = rq->cfs.h_nr_runnable; if (!nr_running) continue; @@ -11623,7 +11623,7 @@ static int need_active_balance(struct lb_env *env) * available on dst_cpu. 
*/ if (env->idle && - (env->src_rq->cfs.h_nr_queued == 1)) { + (env->src_rq->cfs.h_nr_runnable == 1)) { if ((check_cpu_capacity(env->src_rq, sd)) && (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100)) return 1; @@ -12364,7 +12364,7 @@ static void nohz_balancer_kick(struct rq *rq) * If there's a runnable CFS task and the current CPU has reduced * capacity, kick the ILB to see if there's a better CPU to run on: */ - if (rq->cfs.h_nr_queued >= 1 && check_cpu_capacity(rq, sd)) { + if (rq->cfs.h_nr_runnable >= 1 && check_cpu_capacity(rq, sd)) { flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; goto unlock; } diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index 2bad0b508dfc..7a8534a2deff 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -275,7 +275,7 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load) * * group: [ see update_cfs_group() ] * se_weight() = tg->weight * grq->load_avg / tg->load_avg - * se_runnable() = grq->h_nr_queued + * se_runnable() = grq->h_nr_runnable * * runnable_sum = se_runnable() * runnable = grq->runnable_sum * runnable_avg = runnable_sum @@ -321,7 +321,7 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq) { if (___update_load_sum(now, &cfs_rq->avg, scale_load_down(cfs_rq->load.weight), - cfs_rq->h_nr_queued - cfs_rq->h_nr_delayed, + cfs_rq->h_nr_runnable, cfs_rq->curr != NULL)) { ___update_load_avg(&cfs_rq->avg, 1); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 869d5d3521f2..4374c660f5c7 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -900,11 +900,8 @@ struct dl_rq { static inline void se_update_runnable(struct sched_entity *se) { - if (!entity_is_task(se)) { - struct cfs_rq *cfs_rq = se->my_q; - - se->runnable_weight = cfs_rq->h_nr_queued - cfs_rq->h_nr_delayed; - } + if (!entity_is_task(se)) + se->runnable_weight = se->my_q->h_nr_runnable; } static inline long se_runnable(struct sched_entity *se) From 9216582b0bfb17889eebcf96fb41cd67a3d71133 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Mon, 2 Dec 2024 18:46:01 +0100 Subject: [PATCH 092/224] sched/fair: Remove unused cfs_rq.h_nr_delayed h_nr_delayed is not used anymore.
We now have: - h_nr_runnable which tracks tasks ready to run - h_nr_queued which tracks enqueued tasks either ready to run or delayed dequeue Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Link: https://lore.kernel.org/r/20241202174606.4074512-7-vincent.guittot@linaro.org --- kernel/sched/debug.c | 1 - kernel/sched/fair.c | 40 ++++++++++++---------------------------- kernel/sched/sched.h | 1 - 3 files changed, 12 insertions(+), 30 deletions(-) diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index fd711cc4d44c..56be3651605d 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -846,7 +846,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); SEQ_printf(m, " .%-30s: %d\n", "h_nr_runnable", cfs_rq->h_nr_runnable); SEQ_printf(m, " .%-30s: %d\n", "h_nr_queued", cfs_rq->h_nr_queued); - SEQ_printf(m, " .%-30s: %d\n", "h_nr_delayed", cfs_rq->h_nr_delayed); SEQ_printf(m, " .%-30s: %d\n", "idle_nr_running", cfs_rq->idle_nr_running); SEQ_printf(m, " .%-30s: %d\n", "idle_h_nr_running", diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3a8bdfbf4867..5c2f049ca3bf 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5470,7 +5470,6 @@ static void set_delayed(struct sched_entity *se) struct cfs_rq *cfs_rq = cfs_rq_of(se); cfs_rq->h_nr_runnable--; - cfs_rq->h_nr_delayed++; if (cfs_rq_throttled(cfs_rq)) break; } @@ -5483,7 +5482,6 @@ static void clear_delayed(struct sched_entity *se) struct cfs_rq *cfs_rq = cfs_rq_of(se); cfs_rq->h_nr_runnable++; - cfs_rq->h_nr_delayed--; if (cfs_rq_throttled(cfs_rq)) break; } @@ -5932,7 +5930,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) struct rq *rq = rq_of(cfs_rq); struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; - long queued_delta, runnable_delta, idle_task_delta, delayed_delta, dequeue = 1; + long queued_delta, runnable_delta, idle_task_delta, dequeue = 1; long rq_h_nr_queued = rq->cfs.h_nr_queued; raw_spin_lock(&cfs_b->lock); @@ -5966,7 +5964,6 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) queued_delta = cfs_rq->h_nr_queued; runnable_delta = cfs_rq->h_nr_runnable; idle_task_delta = cfs_rq->idle_h_nr_running; - delayed_delta = cfs_rq->h_nr_delayed; for_each_sched_entity(se) { struct cfs_rq *qcfs_rq = cfs_rq_of(se); int flags; @@ -5991,7 +5988,6 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) qcfs_rq->h_nr_queued -= queued_delta; qcfs_rq->h_nr_runnable -= runnable_delta; qcfs_rq->idle_h_nr_running -= idle_task_delta; - qcfs_rq->h_nr_delayed -= delayed_delta; if (qcfs_rq->load.weight) { /* Avoid re-evaluating load for this entity: */ @@ -6015,7 +6011,6 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) qcfs_rq->h_nr_queued -= queued_delta; qcfs_rq->h_nr_runnable -= runnable_delta; qcfs_rq->idle_h_nr_running -= idle_task_delta; - qcfs_rq->h_nr_delayed -= delayed_delta; } /* At this point se is NULL and we are at root level*/ @@ -6041,7 +6036,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) struct rq *rq = rq_of(cfs_rq); struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; - long queued_delta, runnable_delta, idle_task_delta, delayed_delta; + long queued_delta, runnable_delta, idle_task_delta; long rq_h_nr_queued = rq->cfs.h_nr_queued; se = cfs_rq->tg->se[cpu_of(rq)]; @@ -6078,7 +6073,6 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) queued_delta = cfs_rq->h_nr_queued; runnable_delta = 
cfs_rq->h_nr_runnable; idle_task_delta = cfs_rq->idle_h_nr_running; - delayed_delta = cfs_rq->h_nr_delayed; for_each_sched_entity(se) { struct cfs_rq *qcfs_rq = cfs_rq_of(se); @@ -6097,7 +6091,6 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) qcfs_rq->h_nr_queued += queued_delta; qcfs_rq->h_nr_runnable += runnable_delta; qcfs_rq->idle_h_nr_running += idle_task_delta; - qcfs_rq->h_nr_delayed += delayed_delta; /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(qcfs_rq)) @@ -6116,7 +6109,6 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) qcfs_rq->h_nr_queued += queued_delta; qcfs_rq->h_nr_runnable += runnable_delta; qcfs_rq->idle_h_nr_running += idle_task_delta; - qcfs_rq->h_nr_delayed += delayed_delta; /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(qcfs_rq)) @@ -6979,7 +6971,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; int idle_h_nr_running = task_has_idle_policy(p); - int h_nr_delayed = 0; + int h_nr_runnable = 1; int task_new = !(flags & ENQUEUE_WAKEUP); int rq_h_nr_queued = rq->cfs.h_nr_queued; u64 slice = 0; @@ -7006,8 +6998,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (p->in_iowait) cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT); - if (task_new) - h_nr_delayed = !!se->sched_delayed; + if (task_new && se->sched_delayed) + h_nr_runnable = 0; for_each_sched_entity(se) { if (se->on_rq) { @@ -7029,11 +7021,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) enqueue_entity(cfs_rq, se, flags); slice = cfs_rq_min_slice(cfs_rq); - if (!h_nr_delayed) - cfs_rq->h_nr_runnable++; + cfs_rq->h_nr_runnable += h_nr_runnable; cfs_rq->h_nr_queued++; cfs_rq->idle_h_nr_running += idle_h_nr_running; - cfs_rq->h_nr_delayed += h_nr_delayed; if (cfs_rq_is_idle(cfs_rq)) idle_h_nr_running = 1; @@ -7055,11 +7045,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) se->slice = slice; slice = cfs_rq_min_slice(cfs_rq); - if (!h_nr_delayed) - cfs_rq->h_nr_runnable++; + cfs_rq->h_nr_runnable += h_nr_runnable; cfs_rq->h_nr_queued++; cfs_rq->idle_h_nr_running += idle_h_nr_running; - cfs_rq->h_nr_delayed += h_nr_delayed; if (cfs_rq_is_idle(cfs_rq)) idle_h_nr_running = 1; @@ -7122,7 +7110,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) struct task_struct *p = NULL; int idle_h_nr_running = 0; int h_nr_queued = 0; - int h_nr_delayed = 0; + int h_nr_runnable = 0; struct cfs_rq *cfs_rq; u64 slice = 0; @@ -7130,8 +7118,8 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) p = task_of(se); h_nr_queued = 1; idle_h_nr_running = task_has_idle_policy(p); - if (!task_sleep && !task_delayed) - h_nr_delayed = !!se->sched_delayed; + if (task_sleep || task_delayed || !se->sched_delayed) + h_nr_runnable = 1; } else { cfs_rq = group_cfs_rq(se); slice = cfs_rq_min_slice(cfs_rq); @@ -7147,11 +7135,9 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) break; } - if (!h_nr_delayed) - cfs_rq->h_nr_runnable -= h_nr_queued; + cfs_rq->h_nr_runnable -= h_nr_runnable; cfs_rq->h_nr_queued -= h_nr_queued; cfs_rq->idle_h_nr_running -= idle_h_nr_running; - cfs_rq->h_nr_delayed -= h_nr_delayed; if (cfs_rq_is_idle(cfs_rq)) idle_h_nr_running = h_nr_queued; @@ -7188,11 +7174,9 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) se->slice = slice; slice = cfs_rq_min_slice(cfs_rq); - if (!h_nr_delayed) - cfs_rq->h_nr_runnable -= 
h_nr_queued; + cfs_rq->h_nr_runnable -= h_nr_runnable; cfs_rq->h_nr_queued -= h_nr_queued; cfs_rq->idle_h_nr_running -= idle_h_nr_running; - cfs_rq->h_nr_delayed -= h_nr_delayed; if (cfs_rq_is_idle(cfs_rq)) idle_h_nr_running = h_nr_queued; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 4374c660f5c7..d3ce5e99b025 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -650,7 +650,6 @@ struct cfs_rq { unsigned int h_nr_runnable; /* SCHED_{NORMAL,BATCH,IDLE} */ unsigned int idle_nr_running; /* SCHED_IDLE */ unsigned int idle_h_nr_running; /* SCHED_IDLE */ - unsigned int h_nr_delayed; s64 avg_vruntime; u64 avg_load; From 31898e7b87dd2833eb5dd6aa60ab2a5880c4c12f Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Mon, 2 Dec 2024 18:46:02 +0100 Subject: [PATCH 093/224] sched/fair: Rename cfs_rq.idle_h_nr_running into h_nr_idle Use same naming convention as others starting with h_nr_* and rename idle_h_nr_running into h_nr_idle. The "running" is not correct anymore as it includes delayed dequeue tasks as well. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Link: https://lore.kernel.org/r/20241202174606.4074512-8-vincent.guittot@linaro.org --- kernel/sched/debug.c | 3 +-- kernel/sched/fair.c | 52 ++++++++++++++++++++++---------------------- kernel/sched/sched.h | 2 +- 3 files changed, 28 insertions(+), 29 deletions(-) diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 56be3651605d..e21b66b6ee10 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -848,8 +848,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SEQ_printf(m, " .%-30s: %d\n", "h_nr_queued", cfs_rq->h_nr_queued); SEQ_printf(m, " .%-30s: %d\n", "idle_nr_running", cfs_rq->idle_nr_running); - SEQ_printf(m, " .%-30s: %d\n", "idle_h_nr_running", - cfs_rq->idle_h_nr_running); + SEQ_printf(m, " .%-30s: %d\n", "h_nr_idle", cfs_rq->h_nr_idle); SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); #ifdef CONFIG_SMP SEQ_printf(m, " .%-30s: %lu\n", "load_avg", diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5c2f049ca3bf..2ef33784cbf5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5930,7 +5930,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) struct rq *rq = rq_of(cfs_rq); struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; - long queued_delta, runnable_delta, idle_task_delta, dequeue = 1; + long queued_delta, runnable_delta, idle_delta, dequeue = 1; long rq_h_nr_queued = rq->cfs.h_nr_queued; raw_spin_lock(&cfs_b->lock); @@ -5963,7 +5963,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) queued_delta = cfs_rq->h_nr_queued; runnable_delta = cfs_rq->h_nr_runnable; - idle_task_delta = cfs_rq->idle_h_nr_running; + idle_delta = cfs_rq->h_nr_idle; for_each_sched_entity(se) { struct cfs_rq *qcfs_rq = cfs_rq_of(se); int flags; @@ -5983,11 +5983,11 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) dequeue_entity(qcfs_rq, se, flags); if (cfs_rq_is_idle(group_cfs_rq(se))) - idle_task_delta = cfs_rq->h_nr_queued; + idle_delta = cfs_rq->h_nr_queued; qcfs_rq->h_nr_queued -= queued_delta; qcfs_rq->h_nr_runnable -= runnable_delta; - qcfs_rq->idle_h_nr_running -= idle_task_delta; + qcfs_rq->h_nr_idle -= idle_delta; if (qcfs_rq->load.weight) { /* Avoid re-evaluating load for this entity: */ @@ -6006,11 +6006,11 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) se_update_runnable(se); if (cfs_rq_is_idle(group_cfs_rq(se))) - idle_task_delta = 
cfs_rq->h_nr_queued; + idle_delta = cfs_rq->h_nr_queued; qcfs_rq->h_nr_queued -= queued_delta; qcfs_rq->h_nr_runnable -= runnable_delta; - qcfs_rq->idle_h_nr_running -= idle_task_delta; + qcfs_rq->h_nr_idle -= idle_delta; } /* At this point se is NULL and we are at root level*/ @@ -6036,7 +6036,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) struct rq *rq = rq_of(cfs_rq); struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; - long queued_delta, runnable_delta, idle_task_delta; + long queued_delta, runnable_delta, idle_delta; long rq_h_nr_queued = rq->cfs.h_nr_queued; se = cfs_rq->tg->se[cpu_of(rq)]; @@ -6072,7 +6072,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) queued_delta = cfs_rq->h_nr_queued; runnable_delta = cfs_rq->h_nr_runnable; - idle_task_delta = cfs_rq->idle_h_nr_running; + idle_delta = cfs_rq->h_nr_idle; for_each_sched_entity(se) { struct cfs_rq *qcfs_rq = cfs_rq_of(se); @@ -6086,11 +6086,11 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP); if (cfs_rq_is_idle(group_cfs_rq(se))) - idle_task_delta = cfs_rq->h_nr_queued; + idle_delta = cfs_rq->h_nr_queued; qcfs_rq->h_nr_queued += queued_delta; qcfs_rq->h_nr_runnable += runnable_delta; - qcfs_rq->idle_h_nr_running += idle_task_delta; + qcfs_rq->h_nr_idle += idle_delta; /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(qcfs_rq)) @@ -6104,11 +6104,11 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) se_update_runnable(se); if (cfs_rq_is_idle(group_cfs_rq(se))) - idle_task_delta = cfs_rq->h_nr_queued; + idle_delta = cfs_rq->h_nr_queued; qcfs_rq->h_nr_queued += queued_delta; qcfs_rq->h_nr_runnable += runnable_delta; - qcfs_rq->idle_h_nr_running += idle_task_delta; + qcfs_rq->h_nr_idle += idle_delta; /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(qcfs_rq)) @@ -6918,7 +6918,7 @@ static inline void check_update_overutilized_status(struct rq *rq) { } /* Runqueue only has SCHED_IDLE tasks enqueued */ static int sched_idle_rq(struct rq *rq) { - return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running && + return unlikely(rq->nr_running == rq->cfs.h_nr_idle && rq->nr_running); } @@ -6970,7 +6970,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; - int idle_h_nr_running = task_has_idle_policy(p); + int h_nr_idle = task_has_idle_policy(p); int h_nr_runnable = 1; int task_new = !(flags & ENQUEUE_WAKEUP); int rq_h_nr_queued = rq->cfs.h_nr_queued; @@ -7023,10 +7023,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) cfs_rq->h_nr_runnable += h_nr_runnable; cfs_rq->h_nr_queued++; - cfs_rq->idle_h_nr_running += idle_h_nr_running; + cfs_rq->h_nr_idle += h_nr_idle; if (cfs_rq_is_idle(cfs_rq)) - idle_h_nr_running = 1; + h_nr_idle = 1; /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) @@ -7047,10 +7047,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) cfs_rq->h_nr_runnable += h_nr_runnable; cfs_rq->h_nr_queued++; - cfs_rq->idle_h_nr_running += idle_h_nr_running; + cfs_rq->h_nr_idle += h_nr_idle; if (cfs_rq_is_idle(cfs_rq)) - idle_h_nr_running = 1; + h_nr_idle = 1; /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) @@ -7108,7 +7108,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) bool task_sleep = flags & DEQUEUE_SLEEP; bool task_delayed = flags & DEQUEUE_DELAYED; struct task_struct *p = 
NULL; - int idle_h_nr_running = 0; + int h_nr_idle = 0; int h_nr_queued = 0; int h_nr_runnable = 0; struct cfs_rq *cfs_rq; @@ -7117,7 +7117,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) if (entity_is_task(se)) { p = task_of(se); h_nr_queued = 1; - idle_h_nr_running = task_has_idle_policy(p); + h_nr_idle = task_has_idle_policy(p); if (task_sleep || task_delayed || !se->sched_delayed) h_nr_runnable = 1; } else { @@ -7137,10 +7137,10 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) cfs_rq->h_nr_runnable -= h_nr_runnable; cfs_rq->h_nr_queued -= h_nr_queued; - cfs_rq->idle_h_nr_running -= idle_h_nr_running; + cfs_rq->h_nr_idle -= h_nr_idle; if (cfs_rq_is_idle(cfs_rq)) - idle_h_nr_running = h_nr_queued; + h_nr_idle = h_nr_queued; /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) @@ -7176,10 +7176,10 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) cfs_rq->h_nr_runnable -= h_nr_runnable; cfs_rq->h_nr_queued -= h_nr_queued; - cfs_rq->idle_h_nr_running -= idle_h_nr_running; + cfs_rq->h_nr_idle -= h_nr_idle; if (cfs_rq_is_idle(cfs_rq)) - idle_h_nr_running = h_nr_queued; + h_nr_idle = h_nr_queued; /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) @@ -13527,7 +13527,7 @@ int sched_group_set_idle(struct task_group *tg, long idle) } idle_task_delta = grp_cfs_rq->h_nr_queued - - grp_cfs_rq->idle_h_nr_running; + grp_cfs_rq->h_nr_idle; if (!cfs_rq_is_idle(grp_cfs_rq)) idle_task_delta *= -1; @@ -13537,7 +13537,7 @@ int sched_group_set_idle(struct task_group *tg, long idle) if (!se->on_rq) break; - cfs_rq->idle_h_nr_running += idle_task_delta; + cfs_rq->h_nr_idle += idle_task_delta; /* Already accounted at parent level and above. */ if (cfs_rq_is_idle(cfs_rq)) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index d3ce5e99b025..afe5cb93db89 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -649,7 +649,7 @@ struct cfs_rq { unsigned int h_nr_queued; /* SCHED_{NORMAL,BATCH,IDLE} */ unsigned int h_nr_runnable; /* SCHED_{NORMAL,BATCH,IDLE} */ unsigned int idle_nr_running; /* SCHED_IDLE */ - unsigned int idle_h_nr_running; /* SCHED_IDLE */ + unsigned int h_nr_idle; /* SCHED_IDLE */ s64 avg_vruntime; u64 avg_load; From 43eef7c3a4a65e258244d63a8992d0a8d70e5974 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Mon, 2 Dec 2024 18:46:03 +0100 Subject: [PATCH 094/224] sched/fair: Remove unused cfs_rq.idle_nr_running The cfs_rq.idle_nr_running field is not used anywhere, so we can remove the useless associated computation. Its last user went away in commit 5e963f2bd465 ("sched/fair: Commit to EEVDF").
Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Link: https://lore.kernel.org/r/20241202174606.4074512-9-vincent.guittot@linaro.org --- kernel/sched/debug.c | 2 -- kernel/sched/fair.c | 14 +------------- kernel/sched/sched.h | 1 - 3 files changed, 1 insertion(+), 16 deletions(-) diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index e21b66b6ee10..e300ee4d7956 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -846,8 +846,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); SEQ_printf(m, " .%-30s: %d\n", "h_nr_runnable", cfs_rq->h_nr_runnable); SEQ_printf(m, " .%-30s: %d\n", "h_nr_queued", cfs_rq->h_nr_queued); - SEQ_printf(m, " .%-30s: %d\n", "idle_nr_running", - cfs_rq->idle_nr_running); SEQ_printf(m, " .%-30s: %d\n", "h_nr_idle", cfs_rq->h_nr_idle); SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); #ifdef CONFIG_SMP diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2ef33784cbf5..8afa0a4ed09f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3674,8 +3674,6 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) } #endif cfs_rq->nr_running++; - if (se_is_idle(se)) - cfs_rq->idle_nr_running++; } static void @@ -3689,8 +3687,6 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) } #endif cfs_rq->nr_running--; - if (se_is_idle(se)) - cfs_rq->idle_nr_running--; } /* @@ -13507,7 +13503,7 @@ int sched_group_set_idle(struct task_group *tg, long idle) for_each_possible_cpu(i) { struct rq *rq = cpu_rq(i); struct sched_entity *se = tg->se[i]; - struct cfs_rq *parent_cfs_rq, *grp_cfs_rq = tg->cfs_rq[i]; + struct cfs_rq *grp_cfs_rq = tg->cfs_rq[i]; bool was_idle = cfs_rq_is_idle(grp_cfs_rq); long idle_task_delta; struct rq_flags rf; @@ -13518,14 +13514,6 @@ int sched_group_set_idle(struct task_group *tg, long idle) if (WARN_ON_ONCE(was_idle == cfs_rq_is_idle(grp_cfs_rq))) goto next_cpu; - if (se->on_rq) { - parent_cfs_rq = cfs_rq_of(se); - if (cfs_rq_is_idle(grp_cfs_rq)) - parent_cfs_rq->idle_nr_running++; - else - parent_cfs_rq->idle_nr_running--; - } - idle_task_delta = grp_cfs_rq->h_nr_queued - grp_cfs_rq->h_nr_idle; if (!cfs_rq_is_idle(grp_cfs_rq)) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index afe5cb93db89..9a9220aad9fc 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -648,7 +648,6 @@ struct cfs_rq { unsigned int nr_running; unsigned int h_nr_queued; /* SCHED_{NORMAL,BATCH,IDLE} */ unsigned int h_nr_runnable; /* SCHED_{NORMAL,BATCH,IDLE} */ - unsigned int idle_nr_running; /* SCHED_IDLE */ unsigned int h_nr_idle; /* SCHED_IDLE */ s64 avg_vruntime; From 736c55a02c477ad31c57ae4c69130f437855e051 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Mon, 2 Dec 2024 18:46:04 +0100 Subject: [PATCH 095/224] sched/fair: Rename cfs_rq.nr_running into nr_queued Rename cfs_rq.nr_running into cfs_rq.nr_queued which better reflects the reality as the value includes both the ready to run tasks and the delayed dequeue tasks. 
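[ Editorial note: a condensed summary of the cfs_rq counter naming scheme once this series is applied (editorial, not from the patch):

	/*
	 * nr_queued       - sched_entities queued on this cfs_rq, including
	 *                   delayed-dequeue entities
	 * h_nr_queued     - tasks queued in this hierarchy, whether ready to
	 *                   run or delayed
	 * h_nr_runnable   - tasks in this hierarchy that are ready to run
	 * h_nr_idle       - queued tasks with the SCHED_IDLE policy
	 */

]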
Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Link: https://lore.kernel.org/r/20241202174606.4074512-10-vincent.guittot@linaro.org --- kernel/sched/debug.c | 2 +- kernel/sched/fair.c | 38 +++++++++++++++++++------------------- kernel/sched/sched.h | 4 ++-- 3 files changed, 22 insertions(+), 22 deletions(-) diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index e300ee4d7956..5e8e84a2bcb1 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -843,7 +843,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SPLIT_NS(right_vruntime)); spread = right_vruntime - left_vruntime; SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", SPLIT_NS(spread)); - SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); + SEQ_printf(m, " .%-30s: %d\n", "nr_queued", cfs_rq->nr_queued); SEQ_printf(m, " .%-30s: %d\n", "h_nr_runnable", cfs_rq->h_nr_runnable); SEQ_printf(m, " .%-30s: %d\n", "h_nr_queued", cfs_rq->h_nr_queued); SEQ_printf(m, " .%-30s: %d\n", "h_nr_idle", cfs_rq->h_nr_idle); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8afa0a4ed09f..84c0191a8ffa 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -915,7 +915,7 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq) * We can safely skip eligibility check if there is only one entity * in this cfs_rq, saving some cycles. */ - if (cfs_rq->nr_running == 1) + if (cfs_rq->nr_queued == 1) return curr && curr->on_rq ? curr : se; if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr))) @@ -1247,7 +1247,7 @@ static void update_curr(struct cfs_rq *cfs_rq) account_cfs_rq_runtime(cfs_rq, delta_exec); - if (cfs_rq->nr_running == 1) + if (cfs_rq->nr_queued == 1) return; if (resched || did_preempt_short(cfs_rq, curr)) { @@ -3673,7 +3673,7 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) list_add(&se->group_node, &rq->cfs_tasks); } #endif - cfs_rq->nr_running++; + cfs_rq->nr_queued++; } static void @@ -3686,7 +3686,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) list_del_init(&se->group_node); } #endif - cfs_rq->nr_running--; + cfs_rq->nr_queued--; } /* @@ -5220,7 +5220,7 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) { - return !cfs_rq->nr_running; + return !cfs_rq->nr_queued; } #define UPDATE_TG 0x0 @@ -5276,7 +5276,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * * EEVDF: placement strategy #1 / #2 */ - if (sched_feat(PLACE_LAG) && cfs_rq->nr_running && se->vlag) { + if (sched_feat(PLACE_LAG) && cfs_rq->nr_queued && se->vlag) { struct sched_entity *curr = cfs_rq->curr; unsigned long load; @@ -5423,7 +5423,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) __enqueue_entity(cfs_rq, se); se->on_rq = 1; - if (cfs_rq->nr_running == 1) { + if (cfs_rq->nr_queued == 1) { check_enqueue_throttle(cfs_rq); if (!throttled_hierarchy(cfs_rq)) { list_add_leaf_cfs_rq(cfs_rq); @@ -5565,7 +5565,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (flags & DEQUEUE_DELAYED) finish_delayed_dequeue_entity(se); - if (cfs_rq->nr_running == 0) + if (cfs_rq->nr_queued == 0) update_idle_cfs_rq_clock_pelt(cfs_rq); return true; @@ -5913,7 +5913,7 @@ static int tg_throttle_down(struct task_group *tg, void *data) list_del_leaf_cfs_rq(cfs_rq); SCHED_WARN_ON(cfs_rq->throttled_clock_self); - if (cfs_rq->nr_running) + if 
(cfs_rq->nr_queued) cfs_rq->throttled_clock_self = rq_clock(rq); } cfs_rq->throttle_count++; @@ -6022,7 +6022,7 @@ done: */ cfs_rq->throttled = 1; SCHED_WARN_ON(cfs_rq->throttled_clock); - if (cfs_rq->nr_running) + if (cfs_rq->nr_queued) cfs_rq->throttled_clock = rq_clock(rq); return true; } @@ -6122,7 +6122,7 @@ unthrottle_throttle: assert_list_leaf_cfs_rq(rq); /* Determine whether we need to wake up potentially idle CPU: */ - if (rq->curr == rq->idle && rq->cfs.nr_running) + if (rq->curr == rq->idle && rq->cfs.nr_queued) resched_curr(rq); } @@ -6423,7 +6423,7 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) if (!cfs_bandwidth_used()) return; - if (!cfs_rq->runtime_enabled || cfs_rq->nr_running) + if (!cfs_rq->runtime_enabled || cfs_rq->nr_queued) return; __return_cfs_rq_runtime(cfs_rq); @@ -6941,14 +6941,14 @@ requeue_delayed_entity(struct sched_entity *se) if (sched_feat(DELAY_ZERO)) { update_entity_lag(cfs_rq, se); if (se->vlag > 0) { - cfs_rq->nr_running--; + cfs_rq->nr_queued--; if (se != cfs_rq->curr) __dequeue_entity(cfs_rq, se); se->vlag = 0; place_entity(cfs_rq, se, 0); if (se != cfs_rq->curr) __enqueue_entity(cfs_rq, se); - cfs_rq->nr_running++; + cfs_rq->nr_queued++; } } @@ -8873,7 +8873,7 @@ static struct task_struct *pick_task_fair(struct rq *rq) again: cfs_rq = &rq->cfs; - if (!cfs_rq->nr_running) + if (!cfs_rq->nr_queued) return NULL; do { @@ -8990,7 +8990,7 @@ static struct task_struct *__pick_next_task_fair(struct rq *rq, struct task_stru static bool fair_server_has_tasks(struct sched_dl_entity *dl_se) { - return !!dl_se->rq->cfs.nr_running; + return !!dl_se->rq->cfs.nr_queued; } static struct task_struct *fair_server_pick_task(struct sched_dl_entity *dl_se) @@ -9780,7 +9780,7 @@ static bool __update_blocked_fair(struct rq *rq, bool *done) if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) { update_tg_load_avg(cfs_rq); - if (cfs_rq->nr_running == 0) + if (cfs_rq->nr_queued == 0) update_idle_cfs_rq_clock_pelt(cfs_rq); if (cfs_rq == &rq->cfs) @@ -12949,7 +12949,7 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr) * MIN_NR_TASKS_DURING_FORCEIDLE - 1 tasks and use that to check * if we need to give up the CPU. 
*/ - if (rq->core->core_forceidle_count && rq->cfs.nr_running == 1 && + if (rq->core->core_forceidle_count && rq->cfs.nr_queued == 1 && __entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE)) resched_curr(rq); } @@ -13093,7 +13093,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) if (!task_on_rq_queued(p)) return; - if (rq->cfs.nr_running == 1) + if (rq->cfs.nr_queued == 1) return; /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 9a9220aad9fc..aef716c41edb 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -645,7 +645,7 @@ struct balance_callback { /* CFS-related fields in a runqueue */ struct cfs_rq { struct load_weight load; - unsigned int nr_running; + unsigned int nr_queued; unsigned int h_nr_queued; /* SCHED_{NORMAL,BATCH,IDLE} */ unsigned int h_nr_runnable; /* SCHED_{NORMAL,BATCH,IDLE} */ unsigned int h_nr_idle; /* SCHED_IDLE */ @@ -2565,7 +2565,7 @@ static inline bool sched_rt_runnable(struct rq *rq) static inline bool sched_fair_runnable(struct rq *rq) { - return rq->cfs.nr_running > 0; + return rq->cfs.nr_queued > 0; } extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); From 61b82dfb6b7e1f951fd1e95198a2aee2ccf6a167 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Mon, 2 Dec 2024 18:46:05 +0100 Subject: [PATCH 096/224] sched/fair: Do not try to migrate delayed dequeue task Migrating a delayed dequeued task doesn't help in balancing the number of runnable tasks in the system. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Link: https://lore.kernel.org/r/20241202174606.4074512-11-vincent.guittot@linaro.org --- kernel/sched/fair.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 84c0191a8ffa..2aa1d0cb6821 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9391,11 +9391,15 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) /* * We do not migrate tasks that are: - * 1) throttled_lb_pair, or - * 2) cannot be migrated to this CPU due to cpus_ptr, or - * 3) running (obviously), or - * 4) are cache-hot on their current CPU. + * 1) delayed dequeued unless we migrate load, or + * 2) throttled_lb_pair, or + * 3) cannot be migrated to this CPU due to cpus_ptr, or + * 4) running (obviously), or + * 5) are cache-hot on their current CPU. 
*/ + if ((p->se.sched_delayed) && (env->migration_type != migrate_load)) return 0; + if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) return 0; From 0429489e092851f066b08deed9ce0f3910515383 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Mon, 2 Dec 2024 18:46:06 +0100 Subject: [PATCH 097/224] sched/fair: Fix variable declaration position Move the variable declaration to the beginning of the function. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Link: https://lore.kernel.org/r/20241202174606.4074512-12-vincent.guittot@linaro.org --- kernel/sched/fair.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2aa1d0cb6821..04db7e4b2607 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5494,6 +5494,7 @@ static bool dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { bool sleep = flags & DEQUEUE_SLEEP; + int action = UPDATE_TG; update_curr(cfs_rq); clear_buddies(cfs_rq, se); @@ -5519,7 +5520,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) } } - int action = UPDATE_TG; if (entity_is_task(se) && task_on_rq_migrating(task_of(se))) action |= DO_DETACH; @@ -5627,6 +5627,8 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags); static struct sched_entity * pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq) { + struct sched_entity *se; + /* * Enabling NEXT_BUDDY will affect latency but not fairness. */ @@ -5637,7 +5639,7 @@ pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq) return cfs_rq->next; } - struct sched_entity *se = pick_eevdf(cfs_rq); + se = pick_eevdf(cfs_rq); if (se->sched_delayed) { dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED); /* From 95d9fed3a2aea85fe9551c2f007e186d4abb4a2a Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 2 Dec 2024 19:35:30 +0200 Subject: [PATCH 098/224] sched/fair: Mark m*_vruntime() with __maybe_unused When max_vruntime() is unused, it prevents kernel builds with clang, `make W=1` and CONFIG_WERROR=y: kernel/sched/fair.c:526:19: error: unused function 'max_vruntime' [-Werror,-Wunused-function] 526 | static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime) | ^~~~~~~~~~~~ Fix this by marking them with __maybe_unused (all cases for the sake of symmetry). See also commit 6863f5643dd7 ("kbuild: allow Clang to find unused static inline functions for W=1 build").
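[ Editorial note: a minimal standalone illustration, not from the patch, of what __maybe_unused does; the attribute marks a function whose lack of callers is intentional, so clang's -Wunused-function stays quiet under `make W=1` with CONFIG_WERROR=y:

	/* no "unused function" warning is emitted even if nothing calls this */
	static inline __maybe_unused u64 demo_identity(u64 v)
	{
		return v;
	}

demo_identity() is a made-up name used only for this example. ]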
Signed-off-by: Andy Shevchenko Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20241202173546.634433-1-andriy.shevchenko@linux.intel.com --- kernel/sched/fair.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 04db7e4b2607..b505d3dba2c8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -523,7 +523,7 @@ void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec); * Scheduling class tree data structure manipulation methods: */ -static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime) +static inline __maybe_unused u64 max_vruntime(u64 max_vruntime, u64 vruntime) { s64 delta = (s64)(vruntime - max_vruntime); if (delta > 0) @@ -532,7 +532,7 @@ static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime) return max_vruntime; } -static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime) +static inline __maybe_unused u64 min_vruntime(u64 min_vruntime, u64 vruntime) { s64 delta = (s64)(vruntime - min_vruntime); if (delta < 0) From 2a77e4be12cb58bbf774e7c717c8bb80e128b7a4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 29 Nov 2024 11:15:41 +0100 Subject: [PATCH 099/224] sched/fair: Untangle NEXT_BUDDY and pick_next_task() There are 3 sites using set_next_buddy() and only one is conditional on NEXT_BUDDY, the other two sites are unconditional; to note: - yield_to_task() - cgroup dequeue / pick optimization However, having NEXT_BUDDY control both the wakeup-preemption and the picking side of things means it's nearly useless. Fixes: 147f3efaa241 ("sched/fair: Implement an EEVDF-like scheduling policy") Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20241129101541.GA33464@noisy.programming.kicks-ass.net --- kernel/sched/fair.c | 4 ++-- kernel/sched/features.h | 9 +++++++++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b505d3dba2c8..2c4ebfc82917 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5630,9 +5630,9 @@ pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq) struct sched_entity *se; /* - * Enabling NEXT_BUDDY will affect latency but not fairness. + * Picking the ->next buddy will affect latency but not fairness. */ - if (sched_feat(NEXT_BUDDY) && + if (sched_feat(PICK_BUDDY) && cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) { /* ->next will never be delayed */ SCHED_WARN_ON(cfs_rq->next->sched_delayed); diff --git a/kernel/sched/features.h b/kernel/sched/features.h index a3d331dd2d8f..3c12d9f93331 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -31,6 +31,15 @@ SCHED_FEAT(PREEMPT_SHORT, true) */ SCHED_FEAT(NEXT_BUDDY, false) +/* + * Allow completely ignoring cfs_rq->next; which can be set from various + * places: + * - NEXT_BUDDY (wakeup preemption) + * - yield_to_task() + * - cgroup dequeue / pick + */ +SCHED_FEAT(PICK_BUDDY, true) + /* * Consider buddies to be cache hot, decreases the likeliness of a * cache buddy being migrated away, increases cache locality. From 2ff913ab3f472321ac1931b663314edd6c211a0c Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 5 Dec 2024 16:24:14 -0800 Subject: [PATCH 100/224] uprobes: Simplify session consumer tracking In practice, each return_instance will typically contain either zero or one return_consumer, depending on whether it has any uprobe session consumer attached or not.
It's highly unlikely that more than one uprobe session consumer will be attached to any given uprobe, so there is no need to optimize for that case. But the way we currently do memory allocation and accounting is by pre-allocating the space for 4 session consumers in a contiguous block of memory next to the struct return_instance fixed part. This is unnecessarily wasteful. This patch changes this to keep struct return_instance fixed-sized with one pre-allocated return_consumer, while (in a highly unlikely scenario) allowing for more session consumers in a separate dynamically allocated and reallocated array. We also simplify accounting a bit by not maintaining a separate temporary capacity for the consumers array, and, instead, relying on krealloc() to be a no-op if the underlying memory can accommodate a slightly bigger allocation (but again, it's a very uncommon scenario to even have to do this reallocation). All this gets rid of ri_size(), simplifies push_consumer() and removes the confusing ri->consumers_cnt re-assignment, while keeping this singular preallocated consumer logic contained within a few simple preexisting helpers. Having a fixed-sized struct return_instance simplifies and speeds up return_instance reuse that we ultimately add later in this patch set; see follow-up patches. Signed-off-by: Andrii Nakryiko Signed-off-by: Ingo Molnar Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Oleg Nesterov Link: https://lore.kernel.org/r/20241206002417.3295533-2-andrii@kernel.org --- include/linux/uprobes.h | 10 ++++-- kernel/events/uprobes.c | 72 +++++++++++++++++++++-------------------- 2 files changed, 45 insertions(+), 37 deletions(-) diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index e0a4c2082245..1d449978558d 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -154,12 +154,18 @@ struct return_instance { unsigned long stack; /* stack pointer */ unsigned long orig_ret_vaddr; /* original return address */ bool chained; /* true, if instance is nested */ - int consumers_cnt; + int cons_cnt; /* total number of session consumers */ struct return_instance *next; /* keep as stack */ struct rcu_head rcu; - struct return_consumer consumers[] __counted_by(consumers_cnt); + /* singular pre-allocated return_consumer instance for common case */ + struct return_consumer consumer; + /* + * extra return_consumer instances for rare cases of multiple session consumers, + * contains (cons_cnt - 1) elements + */ + struct return_consumer *extra_consumers; } ____cacheline_aligned; enum rp_check { diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index daf4314961ab..6beac52239be 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1899,6 +1899,7 @@ static struct return_instance *free_ret_instance(struct return_instance *ri, boo hprobe_finalize(&ri->hprobe, hstate); } + kfree(ri->extra_consumers); kfree_rcu(ri, rcu); return next; } @@ -1974,32 +1975,34 @@ static struct uprobe_task *get_utask(void) return current->utask; } -static size_t ri_size(int consumers_cnt) -{ - struct return_instance *ri; - - return sizeof(*ri) + sizeof(ri->consumers[0]) * consumers_cnt; -} - -#define DEF_CNT 4 - static struct return_instance *alloc_return_instance(void) { struct return_instance *ri; - ri = kzalloc(ri_size(DEF_CNT), GFP_KERNEL); + ri = kzalloc(sizeof(*ri), GFP_KERNEL); if (!ri) return ZERO_SIZE_PTR; - ri->consumers_cnt = DEF_CNT; return ri; } static struct return_instance *dup_return_instance(struct return_instance *old) { - size_t size = ri_size(old->consumers_cnt); +
struct return_instance *ri; - return kmemdup(old, size, GFP_KERNEL); + ri = kmemdup(old, sizeof(*ri), GFP_KERNEL); + + if (unlikely(old->cons_cnt > 1)) { + ri->extra_consumers = kmemdup(old->extra_consumers, + sizeof(ri->extra_consumers[0]) * (old->cons_cnt - 1), + GFP_KERNEL); + if (!ri->extra_consumers) { + kfree(ri); + return NULL; + } + } + + return ri; } static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask) @@ -2369,25 +2372,28 @@ static struct uprobe *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swb return uprobe; } -static struct return_instance* -push_consumer(struct return_instance *ri, int idx, __u64 id, __u64 cookie) +static struct return_instance *push_consumer(struct return_instance *ri, __u64 id, __u64 cookie) { + struct return_consumer *ric; + if (unlikely(ri == ZERO_SIZE_PTR)) return ri; - if (unlikely(idx >= ri->consumers_cnt)) { - struct return_instance *old_ri = ri; - - ri->consumers_cnt += DEF_CNT; - ri = krealloc(old_ri, ri_size(old_ri->consumers_cnt), GFP_KERNEL); - if (!ri) { - kfree(old_ri); + if (unlikely(ri->cons_cnt > 0)) { + ric = krealloc(ri->extra_consumers, sizeof(*ric) * ri->cons_cnt, GFP_KERNEL); + if (!ric) { + kfree(ri->extra_consumers); + kfree_rcu(ri, rcu); return ZERO_SIZE_PTR; } + ri->extra_consumers = ric; } - ri->consumers[idx].id = id; - ri->consumers[idx].cookie = cookie; + ric = likely(ri->cons_cnt == 0) ? &ri->consumer : &ri->extra_consumers[ri->cons_cnt - 1]; + ric->id = id; + ric->cookie = cookie; + + ri->cons_cnt++; return ri; } @@ -2395,14 +2401,17 @@ static struct return_consumer * return_consumer_find(struct return_instance *ri, int *iter, int id) { struct return_consumer *ric; - int idx = *iter; + int idx; - for (ric = &ri->consumers[idx]; idx < ri->consumers_cnt; idx++, ric++) { + for (idx = *iter; idx < ri->cons_cnt; idx++) + { + ric = likely(idx == 0) ? &ri->consumer : &ri->extra_consumers[idx - 1]; if (ric->id == id) { *iter = idx + 1; return ric; } } + return NULL; } @@ -2416,7 +2425,6 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) struct uprobe_consumer *uc; bool has_consumers = false, remove = true; struct return_instance *ri = NULL; - int push_idx = 0; current->utask->auprobe = &uprobe->arch; @@ -2441,18 +2449,12 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) ri = alloc_return_instance(); if (session) - ri = push_consumer(ri, push_idx++, uc->id, cookie); + ri = push_consumer(ri, uc->id, cookie); } current->utask->auprobe = NULL; - if (!ZERO_OR_NULL_PTR(ri)) { - /* - * The push_idx value has the final number of return consumers, - * and ri->consumers_cnt has number of allocated consumers. - */ - ri->consumers_cnt = push_idx; + if (!ZERO_OR_NULL_PTR(ri)) prepare_uretprobe(uprobe, regs, ri); - } if (remove && has_consumers) { down_read(&uprobe->register_rwsem); From 636666a1c73313a0cc9a0a6671c29e2d6ebe16fb Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 5 Dec 2024 16:24:15 -0800 Subject: [PATCH 101/224] uprobes: Decouple return_instance list traversal and freeing free_ret_instance() has two unrelated responsibilities: actually cleaning up return_instance's resources and freeing memory, and also helping with utask->return_instances list traversal by returning the next alive pointer. There is no reason why these two aspects have to be mixed together, so turn free_ret_instance() into void-returning function and make callers do list traversal on their own. 
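[ Editorial note: the resulting caller-side pattern, condensed from the uprobe_free_utask() hunk in the diff below:

	ri = utask->return_instances;
	while (ri) {
		ri_next = ri->next;	/* grab the next pointer before freeing */
		free_ret_instance(ri, true /* cleanup_hprobe */);
		ri = ri_next;
	}

]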
We'll use this simplification in the next patch that will guarantee that to-be-freed return_instance isn't reachable from utask->return_instances list. Signed-off-by: Andrii Nakryiko Signed-off-by: Ingo Molnar Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Oleg Nesterov Link: https://lore.kernel.org/r/20241206002417.3295533-3-andrii@kernel.org --- kernel/events/uprobes.c | 37 +++++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 6beac52239be..cca1fe4a3fb1 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1888,10 +1888,8 @@ unsigned long uprobe_get_trap_addr(struct pt_regs *regs) return instruction_pointer(regs); } -static struct return_instance *free_ret_instance(struct return_instance *ri, bool cleanup_hprobe) +static void free_ret_instance(struct return_instance *ri, bool cleanup_hprobe) { - struct return_instance *next = ri->next; - if (cleanup_hprobe) { enum hprobe_state hstate; @@ -1901,7 +1899,6 @@ static struct return_instance *free_ret_instance(struct return_instance *ri, boo kfree(ri->extra_consumers); kfree_rcu(ri, rcu); - return next; } /* @@ -1911,7 +1908,7 @@ static struct return_instance *free_ret_instance(struct return_instance *ri, boo void uprobe_free_utask(struct task_struct *t) { struct uprobe_task *utask = t->utask; - struct return_instance *ri; + struct return_instance *ri, *ri_next; if (!utask) return; @@ -1921,8 +1918,11 @@ void uprobe_free_utask(struct task_struct *t) timer_delete_sync(&utask->ri_timer); ri = utask->return_instances; - while (ri) - ri = free_ret_instance(ri, true /* cleanup_hprobe */); + while (ri) { + ri_next = ri->next; + free_ret_instance(ri, true /* cleanup_hprobe */); + ri = ri_next; + } kfree(utask); t->utask = NULL; @@ -2111,12 +2111,15 @@ unsigned long uprobe_get_trampoline_vaddr(void) static void cleanup_return_instances(struct uprobe_task *utask, bool chained, struct pt_regs *regs) { - struct return_instance *ri = utask->return_instances; + struct return_instance *ri = utask->return_instances, *ri_next; enum rp_check ctx = chained ? RP_CHECK_CHAIN_CALL : RP_CHECK_CALL; while (ri && !arch_uretprobe_is_alive(ri, ctx, regs)) { - ri = free_ret_instance(ri, true /* cleanup_hprobe */); + ri_next = ri->next; utask->depth--; + + free_ret_instance(ri, true /* cleanup_hprobe */); + ri = ri_next; } rcu_assign_pointer(utask->return_instances, ri); } @@ -2508,7 +2511,7 @@ static struct return_instance *find_next_ret_chain(struct return_instance *ri) void uprobe_handle_trampoline(struct pt_regs *regs) { struct uprobe_task *utask; - struct return_instance *ri, *next; + struct return_instance *ri, *ri_next, *next_chain; struct uprobe *uprobe; enum hprobe_state hstate; bool valid; @@ -2528,8 +2531,8 @@ void uprobe_handle_trampoline(struct pt_regs *regs) * or NULL; the latter case means that nobody but ri->func * could hit this trampoline on return. TODO: sigaltstack(). 
*/ - next = find_next_ret_chain(ri); - valid = !next || arch_uretprobe_is_alive(next, RP_CHECK_RET, regs); + next_chain = find_next_ret_chain(ri); + valid = !next_chain || arch_uretprobe_is_alive(next_chain, RP_CHECK_RET, regs); instruction_pointer_set(regs, ri->orig_ret_vaddr); do { @@ -2541,7 +2544,9 @@ void uprobe_handle_trampoline(struct pt_regs *regs) * trampoline addresses on the stack are replaced with correct * original return addresses */ - rcu_assign_pointer(utask->return_instances, ri->next); + ri_next = ri->next; + rcu_assign_pointer(utask->return_instances, ri_next); + utask->depth--; uprobe = hprobe_consume(&ri->hprobe, &hstate); if (valid) @@ -2549,9 +2554,9 @@ void uprobe_handle_trampoline(struct pt_regs *regs) hprobe_finalize(&ri->hprobe, hstate); /* We already took care of hprobe, no need to waste more time on that. */ - ri = free_ret_instance(ri, false /* !cleanup_hprobe */); - utask->depth--; - } while (ri != next); + free_ret_instance(ri, false /* !cleanup_hprobe */); + ri = ri_next; + } while (ri != next_chain); } while (!valid); return; From 0cf981de7687b26ccc9bd4e6daa8fa6b177f91a9 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 5 Dec 2024 16:24:16 -0800 Subject: [PATCH 102/224] uprobes: Ensure return_instance is detached from the list before freeing Ensure that by the time we call free_ret_instance() to clean up an instance of struct return_instance it isn't reachable from utask->return_instances anymore. free_ret_instance() is called in a few different situations, all but one of which already are fine w.r.t. return_instance visibility: - uprobe_free_utask() guarantees that ri_timer() won't be called (through timer_delete_sync() call), and so there is no need to unlink anything, because entire utask is being freed; - uprobe_handle_trampoline() is already unlinking to-be-freed return_instance with rcu_assign_pointer() before calling free_ret_instance(). Only cleanup_return_instances() violates this property, which so far is not causing problems due to RCU-delayed freeing of return_instance, which we'll change in the next patch. So make sure we unlink return_instance before passing it into free_ret_instance(), as otherwise reuse will be unsafe. Signed-off-by: Andrii Nakryiko Signed-off-by: Ingo Molnar Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Oleg Nesterov Link: https://lore.kernel.org/r/20241206002417.3295533-4-andrii@kernel.org --- kernel/events/uprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index cca1fe4a3fb1..2345aeb63d3b 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -2116,12 +2116,12 @@ static void cleanup_return_instances(struct uprobe_task *utask, bool chained, while (ri && !arch_uretprobe_is_alive(ri, ctx, regs)) { ri_next = ri->next; + rcu_assign_pointer(utask->return_instances, ri_next); utask->depth--; free_ret_instance(ri, true /* cleanup_hprobe */); ri = ri_next; } - rcu_assign_pointer(utask->return_instances, ri); } static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs, From 8622e45b5da17e777e0e45f16296072494452318 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 5 Dec 2024 16:24:17 -0800 Subject: [PATCH 103/224] uprobes: Reuse return_instances between multiple uretprobes within task MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of constantly allocating and freeing very short-lived struct return_instance, reuse it as much as possible within current task. 
For that, store a linked list of reusable return_instances within current->utask. The only complication is that ri_timer() might still be processing such a return_instance. And so while the main uretprobe processing logic might already be done with the return_instance and would be OK to immediately reuse it for the next uretprobe instance, it's not correct to unconditionally reuse it just like that. Instead we make sure that ri_timer() can't possibly be processing it by using seqcount_t, with ri_timer() being "a writer", while free_ret_instance() being "a reader". If, after we unlink return instance from utask->return_instances list, we know that ri_timer() hasn't gotten to processing utask->return_instances yet, then we can be sure that immediate return_instance reuse is OK, and so we put it onto utask->ri_pool for future (potentially, almost immediate) reuse. This change shows improvements both in single CPU performance (by avoiding the relatively expensive kmalloc/free combo) and in terms of multi-CPU scalability, where you can see that per-CPU throughput doesn't decline as steeply with an increased number of CPUs (a decline that profiling previously attributed to kmalloc()/free()): BASELINE (latest perf/core) =========================== uretprobe-nop ( 1 cpus): 1.898 ± 0.002M/s ( 1.898M/s/cpu) uretprobe-nop ( 2 cpus): 3.574 ± 0.011M/s ( 1.787M/s/cpu) uretprobe-nop ( 3 cpus): 5.279 ± 0.066M/s ( 1.760M/s/cpu) uretprobe-nop ( 4 cpus): 6.824 ± 0.047M/s ( 1.706M/s/cpu) uretprobe-nop ( 5 cpus): 8.339 ± 0.060M/s ( 1.668M/s/cpu) uretprobe-nop ( 6 cpus): 9.812 ± 0.047M/s ( 1.635M/s/cpu) uretprobe-nop ( 7 cpus): 11.030 ± 0.048M/s ( 1.576M/s/cpu) uretprobe-nop ( 8 cpus): 12.453 ± 0.126M/s ( 1.557M/s/cpu) uretprobe-nop (10 cpus): 14.838 ± 0.044M/s ( 1.484M/s/cpu) uretprobe-nop (12 cpus): 17.092 ± 0.115M/s ( 1.424M/s/cpu) uretprobe-nop (14 cpus): 19.576 ± 0.022M/s ( 1.398M/s/cpu) uretprobe-nop (16 cpus): 22.264 ± 0.015M/s ( 1.391M/s/cpu) uretprobe-nop (24 cpus): 33.534 ± 0.078M/s ( 1.397M/s/cpu) uretprobe-nop (32 cpus): 43.262 ± 0.127M/s ( 1.352M/s/cpu) uretprobe-nop (40 cpus): 53.252 ± 0.080M/s ( 1.331M/s/cpu) uretprobe-nop (48 cpus): 55.778 ± 0.045M/s ( 1.162M/s/cpu) uretprobe-nop (56 cpus): 56.850 ± 0.227M/s ( 1.015M/s/cpu) uretprobe-nop (64 cpus): 62.005 ± 0.077M/s ( 0.969M/s/cpu) uretprobe-nop (72 cpus): 66.445 ± 0.236M/s ( 0.923M/s/cpu) uretprobe-nop (80 cpus): 68.353 ± 0.180M/s ( 0.854M/s/cpu) THIS PATCHSET (on top of latest perf/core) ========================================== uretprobe-nop ( 1 cpus): 2.253 ± 0.004M/s ( 2.253M/s/cpu) uretprobe-nop ( 2 cpus): 4.281 ± 0.003M/s ( 2.140M/s/cpu) uretprobe-nop ( 3 cpus): 6.389 ± 0.027M/s ( 2.130M/s/cpu) uretprobe-nop ( 4 cpus): 8.328 ± 0.005M/s ( 2.082M/s/cpu) uretprobe-nop ( 5 cpus): 10.353 ± 0.001M/s ( 2.071M/s/cpu) uretprobe-nop ( 6 cpus): 12.513 ± 0.010M/s ( 2.086M/s/cpu) uretprobe-nop ( 7 cpus): 14.525 ± 0.017M/s ( 2.075M/s/cpu) uretprobe-nop ( 8 cpus): 15.633 ± 0.013M/s ( 1.954M/s/cpu) uretprobe-nop (10 cpus): 19.532 ± 0.011M/s ( 1.953M/s/cpu) uretprobe-nop (12 cpus): 21.405 ± 0.009M/s ( 1.784M/s/cpu) uretprobe-nop (14 cpus): 24.857 ± 0.020M/s ( 1.776M/s/cpu) uretprobe-nop (16 cpus): 26.466 ± 0.018M/s ( 1.654M/s/cpu) uretprobe-nop (24 cpus): 40.513 ± 0.222M/s ( 1.688M/s/cpu) uretprobe-nop (32 cpus): 54.180 ± 0.074M/s ( 1.693M/s/cpu) uretprobe-nop (40 cpus): 66.100 ± 0.082M/s ( 1.652M/s/cpu) uretprobe-nop (48 cpus): 70.544 ± 0.068M/s ( 1.470M/s/cpu) uretprobe-nop (56 cpus): 74.494 ± 0.055M/s ( 1.330M/s/cpu) uretprobe-nop (64 cpus): 79.317 ± 0.029M/s
( 1.239M/s/cpu) uretprobe-nop (72 cpus): 84.875 ± 0.020M/s ( 1.179M/s/cpu) uretprobe-nop (80 cpus): 92.318 ± 0.224M/s ( 1.154M/s/cpu) For reference, with uprobe-nop we hit the following throughput: uprobe-nop (80 cpus): 143.485 ± 0.035M/s ( 1.794M/s/cpu) So now uretprobe stays a bit closer to that performance. Signed-off-by: Andrii Nakryiko Signed-off-by: Ingo Molnar Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Oleg Nesterov Link: https://lore.kernel.org/r/20241206002417.3295533-5-andrii@kernel.org --- include/linux/uprobes.h | 6 ++- kernel/events/uprobes.c | 83 ++++++++++++++++++++++++++++++++++------- 2 files changed, 75 insertions(+), 14 deletions(-) diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 1d449978558d..b1df7d792fa1 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -16,6 +16,7 @@ #include #include #include +#include <linux/seqlock.h> struct uprobe; struct vm_area_struct; @@ -124,6 +125,10 @@ struct uprobe_task { unsigned int depth; struct return_instance *return_instances; + struct return_instance *ri_pool; + struct timer_list ri_timer; + seqcount_t ri_seqcount; + union { struct { struct arch_uprobe_task autask; @@ -137,7 +142,6 @@ struct uprobe_task { }; struct uprobe *active_uprobe; - struct timer_list ri_timer; unsigned long xol_vaddr; struct arch_uprobe *auprobe; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 2345aeb63d3b..1af950208c2b 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1888,8 +1888,34 @@ unsigned long uprobe_get_trap_addr(struct pt_regs *regs) return instruction_pointer(regs); } -static void free_ret_instance(struct return_instance *ri, bool cleanup_hprobe) +static void ri_pool_push(struct uprobe_task *utask, struct return_instance *ri) { + ri->cons_cnt = 0; + ri->next = utask->ri_pool; + utask->ri_pool = ri; +} + +static struct return_instance *ri_pool_pop(struct uprobe_task *utask) +{ + struct return_instance *ri = utask->ri_pool; + + if (likely(ri)) + utask->ri_pool = ri->next; + + return ri; +} + +static void ri_free(struct return_instance *ri) +{ + kfree(ri->extra_consumers); + kfree_rcu(ri, rcu); +} + +static void free_ret_instance(struct uprobe_task *utask, + struct return_instance *ri, bool cleanup_hprobe) +{ + unsigned seq; + if (cleanup_hprobe) { enum hprobe_state hstate; @@ -1897,8 +1923,22 @@ static void free_ret_instance(struct return_instance *ri, bool cleanup_hprobe) hprobe_finalize(&ri->hprobe, hstate); } - kfree(ri->extra_consumers); - kfree_rcu(ri, rcu); + /* + * At this point return_instance is unlinked from utask's + * return_instances list and this has become visible to ri_timer(). + * If seqcount now indicates that ri_timer's return instance + * processing loop isn't active, we can return ri into the pool of + * to-be-reused return instances for future uretprobes. If ri_timer() + * happens to be running right now, though, we fall back to safety and + * just perform RCU-delayed freeing of ri.
+ */ + if (raw_seqcount_try_begin(&utask->ri_seqcount, seq)) { + /* immediate reuse of ri without RCU GP is OK */ + ri_pool_push(utask, ri); + } else { + /* we might be racing with ri_timer(), so play it safe */ + ri_free(ri); + } } /* @@ -1920,7 +1960,15 @@ void uprobe_free_utask(struct task_struct *t) ri = utask->return_instances; while (ri) { ri_next = ri->next; - free_ret_instance(ri, true /* cleanup_hprobe */); + free_ret_instance(utask, ri, true /* cleanup_hprobe */); + ri = ri_next; + } + + /* free_ret_instance() above might add to ri_pool, so this loop should come last */ + ri = utask->ri_pool; + while (ri) { + ri_next = ri->next; + ri_free(ri); ri = ri_next; } @@ -1943,8 +1991,12 @@ static void ri_timer(struct timer_list *timer) /* RCU protects return_instance from freeing. */ guard(rcu)(); + write_seqcount_begin(&utask->ri_seqcount); + for_each_ret_instance_rcu(ri, utask->return_instances) hprobe_expire(&ri->hprobe, false); + + write_seqcount_end(&utask->ri_seqcount); } static struct uprobe_task *alloc_utask(void) @@ -1956,6 +2008,7 @@ static struct uprobe_task *alloc_utask(void) return NULL; timer_setup(&utask->ri_timer, ri_timer, 0); + seqcount_init(&utask->ri_seqcount); return utask; } @@ -1975,10 +2028,14 @@ static struct uprobe_task *get_utask(void) return current->utask; } -static struct return_instance *alloc_return_instance(void) +static struct return_instance *alloc_return_instance(struct uprobe_task *utask) { struct return_instance *ri; + ri = ri_pool_pop(utask); + if (ri) + return ri; + ri = kzalloc(sizeof(*ri), GFP_KERNEL); if (!ri) return ZERO_SIZE_PTR; @@ -2119,7 +2176,7 @@ static void cleanup_return_instances(struct uprobe_task *utask, bool chained, rcu_assign_pointer(utask->return_instances, ri_next); utask->depth--; - free_ret_instance(ri, true /* cleanup_hprobe */); + free_ret_instance(utask, ri, true /* cleanup_hprobe */); ri = ri_next; } } @@ -2186,7 +2243,7 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs, return; free: - kfree(ri); + ri_free(ri); } /* Prepare to single-step probed instruction out of line. */ @@ -2385,8 +2442,7 @@ static struct return_instance *push_consumer(struct return_instance *ri, __u64 i if (unlikely(ri->cons_cnt > 0)) { ric = krealloc(ri->extra_consumers, sizeof(*ric) * ri->cons_cnt, GFP_KERNEL); if (!ric) { - kfree(ri->extra_consumers); - kfree_rcu(ri, rcu); + ri_free(ri); return ZERO_SIZE_PTR; } ri->extra_consumers = ric; @@ -2428,8 +2484,9 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) struct uprobe_consumer *uc; bool has_consumers = false, remove = true; struct return_instance *ri = NULL; + struct uprobe_task *utask = current->utask; - current->utask->auprobe = &uprobe->arch; + utask->auprobe = &uprobe->arch; list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) { bool session = uc->handler && uc->ret_handler; @@ -2449,12 +2506,12 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) continue; if (!ri) - ri = alloc_return_instance(); + ri = alloc_return_instance(utask); if (session) ri = push_consumer(ri, uc->id, cookie); } - current->utask->auprobe = NULL; + utask->auprobe = NULL; if (!ZERO_OR_NULL_PTR(ri)) prepare_uretprobe(uprobe, regs, ri); @@ -2554,7 +2611,7 @@ void uprobe_handle_trampoline(struct pt_regs *regs) hprobe_finalize(&ri->hprobe, hstate); /* We already took care of hprobe, no need to waste more time on that. 
*/ - free_ret_instance(ri, false /* !cleanup_hprobe */); + free_ret_instance(utask, ri, false /* !cleanup_hprobe */); ri = ri_next; } while (ri != next_chain); } while (!valid); From 6057b90ecc84f232dd32a047a086a4c4c271765f Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 3 Dec 2024 10:04:40 -0800 Subject: [PATCH 104/224] perf/core: Export perf_exclude_event() While at it, rename the same function in s390 cpum_sf PMU. Signed-off-by: Namhyung Kim Signed-off-by: Ingo Molnar Tested-by: Ravi Bangoria Reviewed-by: Ravi Bangoria Acked-by: Thomas Richter Link: https://lore.kernel.org/r/20241203180441.1634709-2-namhyung@kernel.org --- arch/s390/kernel/perf_cpum_sf.c | 6 +++--- include/linux/perf_event.h | 6 ++++++ kernel/events/core.c | 3 +-- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c index 1e99514fb7ae..5f60248cb468 100644 --- a/arch/s390/kernel/perf_cpum_sf.c +++ b/arch/s390/kernel/perf_cpum_sf.c @@ -981,7 +981,7 @@ static void cpumsf_pmu_disable(struct pmu *pmu) cpuhw->flags &= ~PMU_F_ENABLED; } -/* perf_exclude_event() - Filter event +/* perf_event_exclude() - Filter event * @event: The perf event * @regs: pt_regs structure * @sde_regs: Sample-data-entry (sde) regs structure @@ -990,7 +990,7 @@ static void cpumsf_pmu_disable(struct pmu *pmu) * * Return non-zero if the event shall be excluded. */ -static int perf_exclude_event(struct perf_event *event, struct pt_regs *regs, +static int perf_event_exclude(struct perf_event *event, struct pt_regs *regs, struct perf_sf_sde_regs *sde_regs) { if (event->attr.exclude_user && user_mode(regs)) @@ -1073,7 +1073,7 @@ static int perf_push_sample(struct perf_event *event, data.tid_entry.pid = basic->hpp & LPP_PID_MASK; overflow = 0; - if (perf_exclude_event(event, ®s, sde_regs)) + if (perf_event_exclude(event, ®s, sde_regs)) goto out; if (perf_event_overflow(event, &data, ®s)) { overflow = 1; diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index bf831b1485ff..8333f132f4a9 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1690,6 +1690,8 @@ static inline int perf_allow_tracepoint(struct perf_event_attr *attr) return security_perf_event_open(attr, PERF_SECURITY_TRACEPOINT); } +extern int perf_exclude_event(struct perf_event *event, struct pt_regs *regs); + extern void perf_event_init(void); extern void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, struct pt_regs *regs, @@ -1895,6 +1897,10 @@ static inline u64 perf_event_pause(struct perf_event *event, bool reset) { return 0; } +static inline int perf_exclude_event(struct perf_event *event, struct pt_regs *regs) +{ + return 0; +} #endif #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL) diff --git a/kernel/events/core.c b/kernel/events/core.c index e9f698c08dc1..b2bc67791f84 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -10039,8 +10039,7 @@ static void perf_swevent_event(struct perf_event *event, u64 nr, perf_swevent_overflow(event, 0, data, regs); } -static int perf_exclude_event(struct perf_event *event, - struct pt_regs *regs) +int perf_exclude_event(struct perf_event *event, struct pt_regs *regs) { if (event->hw.state & PERF_HES_STOPPED) return 1; From d29e744c71673a71da8f8522799ee02744cad6c9 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 3 Dec 2024 10:04:41 -0800 Subject: [PATCH 105/224] perf/x86: Relax privilege filter restriction on AMD IBS While IBS is available for per-thread profiling, still regular 
users cannot open an event due to the default paranoid setting (2) which doesn't allow unprivileged users to get kernel samples. That means it needs to set the exclude_kernel bit in the attribute, but the IBS driver would reject it since it has PERF_PMU_CAP_NO_EXCLUDE. This is not what we want and I've been getting requests to fix this issue. This should be done in the hardware, but until we get the HW fix we may allow exclude_{kernel,user,hv} in the attribute and silently drop the samples in the PMU IRQ handler. It won't guarantee the sampling frequency, and it may even miss some samples with a fixed period too. Not ideal, but that'd still be helpful to regular users. To minimize the confusion, let's add a 'swfilt' bit to attr.config2 which is exposed in the sysfs format directory so that users can figure out if the kernel supports the privilege filters in software. $ perf record -e ibs_op/swfilt=1/u true This uses perf_exclude_event() which checks regs->cs. But it should be fine because set_linear_ip() also updates the CS according to the RIP provided by IBS. Signed-off-by: Namhyung Kim Signed-off-by: Ingo Molnar Tested-by: Ravi Bangoria Reviewed-by: Ravi Bangoria Link: https://lore.kernel.org/r/20241203180441.1634709-3-namhyung@kernel.org --- arch/x86/events/amd/ibs.c | 59 +++++++++++++++++++++++++++------------ 1 file changed, 41 insertions(+), 18 deletions(-) diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index f02939655b2a..e7a8b8758e08 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -31,6 +31,8 @@ static u32 ibs_caps; #define IBS_FETCH_CONFIG_MASK (IBS_FETCH_RAND_EN | IBS_FETCH_MAX_CNT) #define IBS_OP_CONFIG_MASK IBS_OP_MAX_CNT +/* attr.config2 */ +#define IBS_SW_FILTER_MASK 1 /* * IBS states: @@ -290,6 +292,16 @@ static int perf_ibs_init(struct perf_event *event) if (has_branch_stack(event)) return -EOPNOTSUPP; + /* handle exclude_{user,kernel} in the IRQ handler */ + if (event->attr.exclude_host || event->attr.exclude_guest || + event->attr.exclude_idle) + return -EINVAL; + + if (!(event->attr.config2 & IBS_SW_FILTER_MASK) && + (event->attr.exclude_kernel || event->attr.exclude_user || + event->attr.exclude_hv)) + return -EINVAL; + ret = validate_group(event); if (ret) return ret; @@ -550,24 +562,14 @@ static struct attribute *attrs_empty[] = { NULL, }; -static struct attribute_group empty_format_group = { - .name = "format", - .attrs = attrs_empty, -}; - static struct attribute_group empty_caps_group = { .name = "caps", .attrs = attrs_empty, }; -static const struct attribute_group *empty_attr_groups[] = { - &empty_format_group, - &empty_caps_group, - NULL, -}; - PMU_FORMAT_ATTR(rand_en, "config:57"); PMU_FORMAT_ATTR(cnt_ctl, "config:19"); +PMU_FORMAT_ATTR(swfilt, "config2:0"); PMU_EVENT_ATTR_STRING(l3missonly, fetch_l3missonly, "config:59"); PMU_EVENT_ATTR_STRING(l3missonly, op_l3missonly, "config:16"); PMU_EVENT_ATTR_STRING(zen4_ibs_extensions, zen4_ibs_extensions, "1"); @@ -578,8 +580,9 @@ zen4_ibs_extensions_is_visible(struct kobject *kobj, struct attribute *attr, int return ibs_caps & IBS_CAPS_ZEN4 ?
attr->mode : 0; } -static struct attribute *rand_en_attrs[] = { +static struct attribute *fetch_attrs[] = { &format_attr_rand_en.attr, + &format_attr_swfilt.attr, NULL, }; @@ -593,9 +596,9 @@ static struct attribute *zen4_ibs_extensions_attrs[] = { NULL, }; -static struct attribute_group group_rand_en = { +static struct attribute_group group_fetch_formats = { .name = "format", - .attrs = rand_en_attrs, + .attrs = fetch_attrs, }; static struct attribute_group group_fetch_l3missonly = { @@ -611,7 +614,7 @@ static struct attribute_group group_zen4_ibs_extensions = { }; static const struct attribute_group *fetch_attr_groups[] = { - &group_rand_en, + &group_fetch_formats, &empty_caps_group, NULL, }; @@ -628,6 +631,11 @@ cnt_ctl_is_visible(struct kobject *kobj, struct attribute *attr, int i) return ibs_caps & IBS_CAPS_OPCNT ? attr->mode : 0; } +static struct attribute *op_attrs[] = { + &format_attr_swfilt.attr, + NULL, +}; + static struct attribute *cnt_ctl_attrs[] = { &format_attr_cnt_ctl.attr, NULL, @@ -638,6 +646,11 @@ static struct attribute *op_l3missonly_attrs[] = { NULL, }; +static struct attribute_group group_op_formats = { + .name = "format", + .attrs = op_attrs, +}; + static struct attribute_group group_cnt_ctl = { .name = "format", .attrs = cnt_ctl_attrs, @@ -650,6 +663,12 @@ static struct attribute_group group_op_l3missonly = { .is_visible = zen4_ibs_extensions_is_visible, }; +static const struct attribute_group *op_attr_groups[] = { + &group_op_formats, + &empty_caps_group, + NULL, +}; + static const struct attribute_group *op_attr_update[] = { &group_cnt_ctl, &group_op_l3missonly, @@ -667,7 +686,6 @@ static struct perf_ibs perf_ibs_fetch = { .start = perf_ibs_start, .stop = perf_ibs_stop, .read = perf_ibs_read, - .capabilities = PERF_PMU_CAP_NO_EXCLUDE, }, .msr = MSR_AMD64_IBSFETCHCTL, .config_mask = IBS_FETCH_CONFIG_MASK, @@ -691,7 +709,6 @@ static struct perf_ibs perf_ibs_op = { .start = perf_ibs_start, .stop = perf_ibs_stop, .read = perf_ibs_read, - .capabilities = PERF_PMU_CAP_NO_EXCLUDE, }, .msr = MSR_AMD64_IBSOPCTL, .config_mask = IBS_OP_CONFIG_MASK, @@ -1111,6 +1128,12 @@ fail: regs.flags |= PERF_EFLAGS_EXACT; } + if ((event->attr.config2 & IBS_SW_FILTER_MASK) && + perf_exclude_event(event, ®s)) { + throttle = perf_event_account_interrupt(event); + goto out; + } + if (event->attr.sample_type & PERF_SAMPLE_RAW) { raw = (struct perf_raw_record){ .frag = { @@ -1227,7 +1250,7 @@ static __init int perf_ibs_op_init(void) if (ibs_caps & IBS_CAPS_ZEN4) perf_ibs_op.config_mask |= IBS_OP_L3MISSONLY; - perf_ibs_op.pmu.attr_groups = empty_attr_groups; + perf_ibs_op.pmu.attr_groups = op_attr_groups; perf_ibs_op.pmu.attr_update = op_attr_update; return perf_ibs_pmu_init(&perf_ibs_op, "ibs_op"); From 02c56362a7d3eccc209d5c00d73a06513d2504d5 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 6 Dec 2024 10:34:36 -0800 Subject: [PATCH 106/224] uprobes: Guard against kmemdup() failing in dup_return_instance() If kmemdup() failed to alloc memory, don't proceed with extra_consumers copy. 
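The shape of the bug and of the fix, as a generic self-contained sketch (hypothetical struct, with plain malloc()/memcpy() standing in for kmemdup()): every allocation in a duplication chain has to be checked before the next step dereferences its result.

	#include <stdlib.h>
	#include <string.h>

	struct blob {
		size_t len;
		char *extra;
	};

	struct blob *dup_blob(const struct blob *old)
	{
		struct blob *b = malloc(sizeof(*b));

		if (!b)			/* bail out before touching b->extra */
			return NULL;
		*b = *old;
		if (old->extra) {
			b->extra = malloc(old->len);
			if (!b->extra) {
				free(b);
				return NULL;
			}
			memcpy(b->extra, old->extra, old->len);
		}
		return b;
	}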
Fixes: e62f2d492728 ("uprobes: Simplify session consumer tracking") Signed-off-by: Andrii Nakryiko Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20241206183436.968068-1-andrii@kernel.org --- kernel/events/uprobes.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 1af950208c2b..1f75a2f91206 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -2048,6 +2048,8 @@ static struct return_instance *dup_return_instance(struct return_instance *old) struct return_instance *ri; ri = kmemdup(old, sizeof(*ri), GFP_KERNEL); + if (!ri) + return NULL; if (unlikely(old->cons_cnt > 1)) { ri->extra_consumers = kmemdup(old->extra_consumers, From 135eef38d7e081303fd9cdb982b37fcad32f9be0 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 7 Aug 2024 18:02:08 +0200 Subject: [PATCH 107/224] x86/resctrl: Use kthread_run_on_cpu() Use the proper API instead of open coding it. Signed-off-by: Frederic Weisbecker Signed-off-by: Borislav Petkov (AMD) Acked-by: Reinette Chatre Link: https://lore.kernel.org/r/20240807160228.26206-3-frederic@kernel.org --- arch/x86/kernel/cpu/resctrl/pseudo_lock.c | 28 +++++++---------------- 1 file changed, 8 insertions(+), 20 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c index 972e6b6b0481..6c60c16a9dd0 100644 --- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c @@ -1205,20 +1205,14 @@ static int pseudo_lock_measure_cycles(struct rdtgroup *rdtgrp, int sel) plr->cpu = cpu; if (sel == 1) - thread = kthread_create_on_node(measure_cycles_lat_fn, plr, - cpu_to_node(cpu), - "pseudo_lock_measure/%u", - cpu); + thread = kthread_run_on_cpu(measure_cycles_lat_fn, plr, + cpu, "pseudo_lock_measure/%u"); else if (sel == 2) - thread = kthread_create_on_node(measure_l2_residency, plr, - cpu_to_node(cpu), - "pseudo_lock_measure/%u", - cpu); + thread = kthread_run_on_cpu(measure_l2_residency, plr, + cpu, "pseudo_lock_measure/%u"); else if (sel == 3) - thread = kthread_create_on_node(measure_l3_residency, plr, - cpu_to_node(cpu), - "pseudo_lock_measure/%u", - cpu); + thread = kthread_run_on_cpu(measure_l3_residency, plr, + cpu, "pseudo_lock_measure/%u"); else goto out; @@ -1226,8 +1220,6 @@ static int pseudo_lock_measure_cycles(struct rdtgroup *rdtgrp, int sel) ret = PTR_ERR(thread); goto out; } - kthread_bind(thread, cpu); - wake_up_process(thread); ret = wait_event_interruptible(plr->lock_thread_wq, plr->thread_done == 1); @@ -1315,18 +1307,14 @@ int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp) plr->thread_done = 0; - thread = kthread_create_on_node(pseudo_lock_fn, rdtgrp, - cpu_to_node(plr->cpu), - "pseudo_lock/%u", plr->cpu); + thread = kthread_run_on_cpu(pseudo_lock_fn, rdtgrp, + plr->cpu, "pseudo_lock/%u"); if (IS_ERR(thread)) { ret = PTR_ERR(thread); rdt_last_cmd_printf("Locking thread returned error %d\n", ret); goto out_cstates; } - kthread_bind(thread, plr->cpu); - wake_up_process(thread); - ret = wait_event_interruptible(plr->lock_thread_wq, plr->thread_done == 1); if (ret < 0) { From 2937f9c361f7a8b230cd599e4af5264798bf4ce7 Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Fri, 6 Dec 2024 08:31:41 -0800 Subject: [PATCH 108/224] x86/resctrl: Introduce resctrl_file_fflags_init() to initialize fflags thread_throttle_mode_init() and mbm_config_rftype_init() both initialize fflags for resctrl files. Adding new files will involve adding another function to initialize the fflags. 
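Generically, the consolidation described next boils down to one lookup-plus-assign helper that is handed both the file name and the flags; a self-contained sketch with a hypothetical file table (not the resctrl code):

	#include <stddef.h>
	#include <string.h>

	struct rft {
		const char *name;
		unsigned long fflags;
	};

	struct rft files[] = {
		{ "thread_throttle_mode", 0 },
		{ "mbm_total_bytes_config", 0 },
		{ "mbm_local_bytes_config", 0 },
	};

	/* One parameterized initializer instead of one function per file. */
	void file_fflags_init(const char *name, unsigned long fflags)
	{
		for (size_t i = 0; i < sizeof(files) / sizeof(files[0]); i++) {
			if (!strcmp(files[i].name, name)) {
				files[i].fflags = fflags;
				return;
			}
		}
	}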
This can be simplified by adding a new function resctrl_file_fflags_init() and passing the file name and flags to be initialized. Consolidate fflags initialization into resctrl_file_fflags_init() and remove thread_throttle_mode_init() and mbm_config_rftype_init(). [ Tony: Drop __init attribute so resctrl_file_fflags_init() can be used at run time. ] Signed-off-by: Babu Moger Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/r/20241206163148.83828-2-tony.luck@intel.com --- arch/x86/kernel/cpu/resctrl/core.c | 4 +++- arch/x86/kernel/cpu/resctrl/internal.h | 3 +-- arch/x86/kernel/cpu/resctrl/monitor.c | 6 ++++-- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 15 ++------------- 4 files changed, 10 insertions(+), 18 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index b681c2e07dbf..f3ee5859b69d 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -234,7 +234,9 @@ static __init bool __get_mem_config_intel(struct rdt_resource *r) r->membw.throttle_mode = THREAD_THROTTLE_PER_THREAD; else r->membw.throttle_mode = THREAD_THROTTLE_MAX; - thread_throttle_mode_init(); + + resctrl_file_fflags_init("thread_throttle_mode", + RFTYPE_CTRL_INFO | RFTYPE_RES_MB); r->alloc_capable = true; diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 955999aecfca..faaff9d64102 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -647,8 +647,7 @@ void cqm_handle_limbo(struct work_struct *work); bool has_busy_rmid(struct rdt_mon_domain *d); void __check_limbo(struct rdt_mon_domain *d, bool force_free); void rdt_domain_reconfigure_cdp(struct rdt_resource *r); -void __init thread_throttle_mode_init(void); -void __init mbm_config_rftype_init(const char *config); +void resctrl_file_fflags_init(const char *config, unsigned long fflags); void rdt_staged_configs_clear(void); bool closid_allocated(unsigned int closid); int resctrl_find_cleanest_closid(void); diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 5fcb3d635d91..69bdc11bacf8 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -1224,11 +1224,13 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r) if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) { mbm_total_event.configurable = true; - mbm_config_rftype_init("mbm_total_bytes_config"); + resctrl_file_fflags_init("mbm_total_bytes_config", + RFTYPE_MON_INFO | RFTYPE_RES_CACHE); } if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL)) { mbm_local_event.configurable = true; - mbm_config_rftype_init("mbm_local_bytes_config"); + resctrl_file_fflags_init("mbm_local_bytes_config", + RFTYPE_MON_INFO | RFTYPE_RES_CACHE); } } diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index d906a1cd8491..d333570e893d 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -2020,24 +2020,13 @@ static struct rftype *rdtgroup_get_rftype_by_name(const char *name) return NULL; } -void __init thread_throttle_mode_init(void) -{ - struct rftype *rft; - - rft = rdtgroup_get_rftype_by_name("thread_throttle_mode"); - if (!rft) - return; - - rft->fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB; -} - -void __init mbm_config_rftype_init(const char *config) +void resctrl_file_fflags_init(const char *config, unsigned long fflags) { struct rftype *rft; rft 
= rdtgroup_get_rftype_by_name(config); if (rft) - rft->fflags = RFTYPE_MON_INFO | RFTYPE_RES_CACHE; + rft->fflags = fflags; } /** From 3b49c37a2f4657730dd38a050b9d221363889dea Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 6 Dec 2024 08:31:42 -0800 Subject: [PATCH 109/224] x86/resctrl: Prepare for per-CTRL_MON group mba_MBps control Resctrl uses local memory bandwidth event as input to the feedback loop when the mba_MBps mount option is used. This means that this mount option cannot be used on systems that only support monitoring of total bandwidth. Prepare to allow users to choose the input event independently for each CTRL_MON group by adding a global variable "mba_mbps_default_event" used to set the default event for each CTRL_MON group, and a new field "mba_mbps_event" in struct rdtgroup to track which event is used for each CTRL_MON group. Notes: 1) Both of these are only used when the user mounts the filesystem with the "mba_MBps" option. 2) Only check for support of local bandwidth event when initializing mba_mbps_default_event. Support for total bandwidth event can be added after other routines in resctrl have been updated to handle total bandwidth event. [ bp: Move mba_mbps_default_event extern into the arch header. ] Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Tested-by: Babu Moger Link: https://lore.kernel.org/r/20241206163148.83828-3-tony.luck@intel.com --- arch/x86/kernel/cpu/resctrl/core.c | 3 +++ arch/x86/kernel/cpu/resctrl/internal.h | 4 +++- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 13 +++++++++++++ 3 files changed, 19 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index f3ee5859b69d..94bf559966d6 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -963,6 +963,9 @@ static __init bool get_rdt_mon_resources(void) if (!rdt_mon_features) return false; + if (is_mbm_local_enabled()) + mba_mbps_default_event = QOS_L3_MBM_LOCAL_EVENT_ID; + return !rdt_get_mon_l3_config(r); } diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index faaff9d64102..542d01c055aa 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -283,6 +283,7 @@ struct pseudo_lock_region { * monitor only or ctrl_mon group * @mon: mongroup related data * @mode: mode of resource group + * @mba_mbps_event: input monitoring event id when mba_sc is enabled * @plr: pseudo-locked region */ struct rdtgroup { @@ -295,6 +296,7 @@ struct rdtgroup { enum rdt_group_type type; struct mongroup mon; enum rdtgrp_mode mode; + enum resctrl_event_id mba_mbps_event; struct pseudo_lock_region *plr; }; @@ -508,6 +510,7 @@ extern struct mutex rdtgroup_mutex; extern struct rdt_hw_resource rdt_resources_all[]; extern struct rdtgroup rdtgroup_default; extern struct dentry *debugfs_resctrl; +extern enum resctrl_event_id mba_mbps_default_event; enum resctrl_res_level { RDT_RESOURCE_L3, @@ -651,5 +654,4 @@ void resctrl_file_fflags_init(const char *config, unsigned long fflags); void rdt_staged_configs_clear(void); bool closid_allocated(unsigned int closid); int resctrl_find_cleanest_closid(void); - #endif /* _ASM_X86_RESCTRL_INTERNAL_H */ diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index d333570e893d..8a52b25ce26b 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -65,6 +65,15 @@ static void 
rdtgroup_destroy_root(void); struct dentry *debugfs_resctrl; +/* + * Memory bandwidth monitoring event to use for the default CTRL_MON group + * and each new CTRL_MON group created by the user. Only relevant when + * the filesystem is mounted with the "mba_MBps" option so it does not + * matter that it remains uninitialized on systems that do not support + * the "mba_MBps" option. + */ +enum resctrl_event_id mba_mbps_default_event; + static bool resctrl_debug; void rdt_last_cmd_clear(void) @@ -2353,6 +2362,8 @@ static int set_mba_sc(bool mba_sc) r->membw.mba_sc = mba_sc; + rdtgroup_default.mba_mbps_event = mba_mbps_default_event; + list_for_each_entry(d, &r->ctrl_domains, hdr.list) { for (i = 0; i < num_closid; i++) d->mbps_val[i] = MBA_MAX_MBPS; @@ -3611,6 +3622,8 @@ static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn, rdt_last_cmd_puts("kernfs subdir error\n"); goto out_del_list; } + if (is_mba_sc(NULL)) + rdtgrp->mba_mbps_event = mba_mbps_default_event; } goto out_unlock; From 481d363748b2df881df21569f3697b3c7fcf8fc1 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 6 Dec 2024 08:31:43 -0800 Subject: [PATCH 110/224] x86/resctrl: Modify update_mba_bw() to use per CTRL_MON group event update_mba_bw() hard codes use of the memory bandwidth local event which prevents more flexible options from being deployed. Change this function to use the event specified in the rdtgroup that is being processed. Mount time checks for the "mba_MBps" option ensure that local memory bandwidth is enabled. So drop the redundant is_mbm_local_enabled() check. Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Tested-by: Babu Moger Link: https://lore.kernel.org/r/20241206163148.83828-4-tony.luck@intel.com --- arch/x86/kernel/cpu/resctrl/monitor.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 69bdc11bacf8..adb18f088979 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -752,20 +752,20 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_mon_domain *dom_mbm) u32 closid, rmid, cur_msr_val, new_msr_val; struct mbm_state *pmbm_data, *cmbm_data; struct rdt_ctrl_domain *dom_mba; + enum resctrl_event_id evt_id; struct rdt_resource *r_mba; - u32 cur_bw, user_bw, idx; struct list_head *head; struct rdtgroup *entry; - - if (!is_mbm_local_enabled()) - return; + u32 cur_bw, user_bw; r_mba = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl; + evt_id = rgrp->mba_mbps_event; closid = rgrp->closid; rmid = rgrp->mon.rmid; - idx = resctrl_arch_rmid_idx_encode(closid, rmid); - pmbm_data = &dom_mbm->mbm_local[idx]; + pmbm_data = get_mbm_state(dom_mbm, closid, rmid, evt_id); + if (WARN_ON_ONCE(!pmbm_data)) + return; dom_mba = get_ctrl_domain_from_cpu(smp_processor_id(), r_mba); if (!dom_mba) { @@ -784,7 +784,9 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_mon_domain *dom_mbm) */ head = &rgrp->mon.crdtgrp_list; list_for_each_entry(entry, head, mon.crdtgrp_list) { - cmbm_data = &dom_mbm->mbm_local[entry->mon.rmid]; + cmbm_data = get_mbm_state(dom_mbm, entry->closid, entry->mon.rmid, evt_id); + if (WARN_ON_ONCE(!cmbm_data)) + return; cur_bw += cmbm_data->prev_bw; } From 35aafa1d41cee0d3d50164561bca34befc1d9ce3 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 9 Dec 2024 10:41:06 +0100 Subject: [PATCH 111/224] x86/boot/64: Fix spurious undefined reference when CONFIG_X86_5LEVEL=n, on 
GCC-12 In __startup_64(), the bool 'la57' can only assume the 'true' value if CONFIG_X86_5LEVEL is enabled in the build, and generally, the compiler can make this inference at build time, and elide any references to the symbol 'level4_kernel_pgt', which may be undefined if 'la57' is false. As it turns out, GCC 12 gets this wrong sometimes, and gives up with a build error: ld: arch/x86/kernel/head64.o: in function `__startup_64': head64.c:(.head.text+0xbd): undefined reference to `level4_kernel_pgt' even though the reference is in unreachable code. Fix this by duplicating the IS_ENABLED(CONFIG_X86_5LEVEL) in the conditional that tests the value of 'la57'. Reported-by: kernel test robot Signed-off-by: Ard Biesheuvel Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20241209094105.762857-2-ardb+git@google.com Closes: https://lore.kernel.org/oe-kbuild-all/202412060403.efD8Kgb7-lkp@intel.com/ --- arch/x86/kernel/head64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 54f9a8faf212..22c9ba305ac1 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -186,7 +186,7 @@ unsigned long __head __startup_64(unsigned long p2v_offset, pgd = &RIP_REL_REF(early_top_pgt)->pgd; pgd[pgd_index(__START_KERNEL_map)] += load_delta; - if (la57) { + if (IS_ENABLED(CONFIG_X86_5LEVEL) && la57) { p4d = (p4dval_t *)&RIP_REL_REF(level4_kernel_pgt); p4d[MAX_PTRS_PER_P4D - 1] += load_delta; From 7d5265ffcd8b41da5e09066360540d6e0716e9cd Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Tue, 12 Nov 2024 10:28:26 -0500 Subject: [PATCH 112/224] rseq: Validate read-only fields under DEBUG_RSEQ config The rseq uapi requires cooperation between users of the rseq fields to ensure that all libraries and applications using rseq within a process do not interfere with each other. This is especially important for fields which are meant to be read-only from user-space, as documented in uapi/linux/rseq.h: - cpu_id_start, - cpu_id, - node_id, - mm_cid. Storing to those fields from a user-space library prevents any sharing of the rseq ABI with other libraries and applications, as other users are not aware that the content of those fields has been altered by a third-party library. This is unfortunately the current behavior of tcmalloc: it purposefully overlaps part of a cached value with the cpu_id_start upper bits to get notified about preemption, because the kernel clears those upper bits before returning to user-space. This behavior does not conform to the rseq uapi header ABI. This prevents tcmalloc from using rseq when rseq is registered by the GNU C library 2.35+. It requires tcmalloc users to disable glibc rseq registration with a glibc tunable, which is a sad state of affairs. Considering that tcmalloc and the GNU C library are the two first upstream projects using rseq, and that they are already incompatible due to use of this hack, adding kernel-level validation of all read-only fields content is necessary to ensure future users of rseq abide by the rseq ABI requirements. Validate that user-space does not corrupt the read-only fields and conform to the rseq uapi header ABI when the kernel is built with CONFIG_DEBUG_RSEQ=y. This is done by storing a copy of the read-only fields in the task_struct, and validating the prior values present in user-space before updating them. If the values do not match, print a warning on the console (printk_ratelimited()). 
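The core of the mechanism — keep a shadow copy, compare before each update, warn on mismatch — can be sketched in a few lines of self-contained userspace C (hypothetical names; the real kernel code must additionally read the fields from user memory with unsafe_get_user() and rate-limit the warning):

	#include <stdio.h>
	#include <string.h>

	struct ro_fields {
		unsigned int cpu_id_start, cpu_id, node_id, mm_cid;
	};

	struct ro_fields shadow;	/* private copy of what was last written */

	void update_ro_fields(struct ro_fields *shared, struct ro_fields next)
	{
		/* Detect a third party having stored to the read-only fields. */
		if (memcmp(shared, &shadow, sizeof(shadow)))
			fprintf(stderr, "read-only fields were corrupted\n");

		*shared = next;		/* publish the new values */
		shadow = next;		/* remember them for the next check */
	}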
This is a first step to identify misuses of the rseq ABI by printing a warning on the console. After giving some time to userspace to correct its use of rseq, the plan is to eventually terminate offending processes with SIGSEGV. This change is expected to produce warnings for the upstream tcmalloc implementation, but tcmalloc developers mentioned they were open to adapting their implementation to this kernel-level change. Signed-off-by: Mathieu Desnoyers Signed-off-by: Peter Zijlstra (Intel) Link: https://github.com/google/tcmalloc/issues/144 --- include/linux/sched.h | 9 ++++ kernel/rseq.c | 98 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 107 insertions(+) diff --git a/include/linux/sched.h b/include/linux/sched.h index d380bffee2ef..b5916be49f62 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1367,6 +1367,15 @@ struct task_struct { * with respect to preemption. */ unsigned long rseq_event_mask; +# ifdef CONFIG_DEBUG_RSEQ + /* + * This is a place holder to save a copy of the rseq fields for + * validation of read-only fields. The struct rseq has a + * variable-length array at the end, so it cannot be used + * directly. Reserve a size large enough for the known fields. + */ + char rseq_fields[sizeof(struct rseq)]; +# endif #endif #ifdef CONFIG_SCHED_MM_CID diff --git a/kernel/rseq.c b/kernel/rseq.c index 9de6e35fe679..e04bb30a2eb8 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -13,6 +13,7 @@ #include #include #include +#include <linux/ratelimit.h> #include #define CREATE_TRACE_POINTS @@ -25,6 +26,78 @@ RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL | \ RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE) +#ifdef CONFIG_DEBUG_RSEQ +static struct rseq *rseq_kernel_fields(struct task_struct *t) +{ + return (struct rseq *) t->rseq_fields; +} + +static int rseq_validate_ro_fields(struct task_struct *t) +{ + static DEFINE_RATELIMIT_STATE(_rs, + DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + u32 cpu_id_start, cpu_id, node_id, mm_cid; + struct rseq __user *rseq = t->rseq; + + /* + * Validate fields which are required to be read-only by + * user-space. + */ + if (!user_read_access_begin(rseq, t->rseq_len)) + goto efault; + unsafe_get_user(cpu_id_start, &rseq->cpu_id_start, efault_end); + unsafe_get_user(cpu_id, &rseq->cpu_id, efault_end); + unsafe_get_user(node_id, &rseq->node_id, efault_end); + unsafe_get_user(mm_cid, &rseq->mm_cid, efault_end); + user_read_access_end(); + + if ((cpu_id_start != rseq_kernel_fields(t)->cpu_id_start || + cpu_id != rseq_kernel_fields(t)->cpu_id || + node_id != rseq_kernel_fields(t)->node_id || + mm_cid != rseq_kernel_fields(t)->mm_cid) && __ratelimit(&_rs)) { + + pr_warn("Detected rseq corruption for pid: %d, name: %s\n" + "\tcpu_id_start: %u ?= %u\n" + "\tcpu_id: %u ?= %u\n" + "\tnode_id: %u ?= %u\n" + "\tmm_cid: %u ?= %u\n", + t->pid, t->comm, + cpu_id_start, rseq_kernel_fields(t)->cpu_id_start, + cpu_id, rseq_kernel_fields(t)->cpu_id, + node_id, rseq_kernel_fields(t)->node_id, + mm_cid, rseq_kernel_fields(t)->mm_cid); + } + + /* For now, only print a console warning on mismatch.
*/ + return 0; + +efault_end: + user_read_access_end(); +efault: + return -EFAULT; +} + +static void rseq_set_ro_fields(struct task_struct *t, u32 cpu_id_start, u32 cpu_id, + u32 node_id, u32 mm_cid) +{ + rseq_kernel_fields(t)->cpu_id_start = cpu_id; + rseq_kernel_fields(t)->cpu_id = cpu_id; + rseq_kernel_fields(t)->node_id = node_id; + rseq_kernel_fields(t)->mm_cid = mm_cid; +} +#else +static int rseq_validate_ro_fields(struct task_struct *t) +{ + return 0; +} + +static void rseq_set_ro_fields(struct task_struct *t, u32 cpu_id_start, u32 cpu_id, + u32 node_id, u32 mm_cid) +{ +} +#endif + /* * * Restartable sequences are a lightweight interface that allows @@ -92,6 +165,11 @@ static int rseq_update_cpu_node_id(struct task_struct *t) u32 node_id = cpu_to_node(cpu_id); u32 mm_cid = task_mm_cid(t); + /* + * Validate read-only rseq fields. + */ + if (rseq_validate_ro_fields(t)) + goto efault; WARN_ON_ONCE((int) mm_cid < 0); if (!user_write_access_begin(rseq, t->rseq_len)) goto efault; @@ -105,6 +183,7 @@ static int rseq_update_cpu_node_id(struct task_struct *t) * t->rseq_len != ORIG_RSEQ_SIZE. */ user_write_access_end(); + rseq_set_ro_fields(t, cpu_id, cpu_id, node_id, mm_cid); trace_rseq_update(t); return 0; @@ -119,6 +198,11 @@ static int rseq_reset_rseq_cpu_node_id(struct task_struct *t) u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED, node_id = 0, mm_cid = 0; + /* + * Validate read-only rseq fields. + */ + if (rseq_validate_ro_fields(t)) + return -EFAULT; /* * Reset cpu_id_start to its initial state (0). */ @@ -141,6 +225,9 @@ static int rseq_reset_rseq_cpu_node_id(struct task_struct *t) */ if (put_user(mm_cid, &t->rseq->mm_cid)) return -EFAULT; + + rseq_set_ro_fields(t, cpu_id_start, cpu_id, node_id, mm_cid); + /* * Additional feature fields added after ORIG_RSEQ_SIZE * need to be conditionally reset only if @@ -423,6 +510,17 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, current->rseq = rseq; current->rseq_len = rseq_len; current->rseq_sig = sig; +#ifdef CONFIG_DEBUG_RSEQ + /* + * Initialize the in-kernel rseq fields copy for validation of + * read-only fields. + */ + if (get_user(rseq_kernel_fields(current)->cpu_id_start, &rseq->cpu_id_start) || + get_user(rseq_kernel_fields(current)->cpu_id, &rseq->cpu_id) || + get_user(rseq_kernel_fields(current)->node_id, &rseq->node_id) || + get_user(rseq_kernel_fields(current)->mm_cid, &rseq->mm_cid)) + return -EFAULT; +#endif /* * If rseq was previously inactive, and has just been * registered, ensure the cpu_id_start and cpu_id fields From 7675361ff9a1d9038025c05267600d0c762c0236 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 5 Dec 2024 15:59:35 -0800 Subject: [PATCH 113/224] sched: deadline: Cleanup goto label in pick_earliest_pushable_dl_task Commit 8b5e770ed7c0 ("sched/deadline: Optimize pull_dl_task()") added a goto label that would be better written as a while loop. So replace the goto with a while loop, to make it easier to read.
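As a generic before/after illustration of the transformation (a hypothetical linked-list search, not the deadline scheduler code):

	#include <stddef.h>

	struct node {
		int key;
		struct node *next;
	};

	/* Before: a goto label acting as a loop. */
	struct node *find_goto(struct node *n, int key)
	{
	again:
		if (n) {
			if (n->key == key)
				return n;
			n = n->next;
			goto again;
		}
		return NULL;
	}

	/* After: the same control flow as an explicit while loop. */
	struct node *find_while(struct node *n, int key)
	{
		while (n) {
			if (n->key == key)
				return n;
			n = n->next;
		}
		return NULL;
	}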
Reported-by: Todd Kjos Signed-off-by: John Stultz Reviewed-and-tested-by: K Prateek Nayak Signed-off-by: Peter Zijlstra (Intel) Acked-by: Juri Lelli Link: https://lore.kernel.org/r/20241206000009.1226085-1-jstultz@google.com --- kernel/sched/deadline.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 33b4646f8b24..643d101cb96a 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -2516,16 +2516,13 @@ static struct task_struct *pick_earliest_pushable_dl_task(struct rq *rq, int cpu return NULL; next_node = rb_first_cached(&rq->dl.pushable_dl_tasks_root); - -next_node: - if (next_node) { + while (next_node) { p = __node_2_pdl(next_node); if (task_is_pushable(rq, p, cpu)) return p; next_node = rb_next(next_node); - goto next_node; } return NULL; From 2c272fadb58b590eb973c6c447b039f10631f5f7 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 6 Dec 2024 08:31:44 -0800 Subject: [PATCH 114/224] x86/resctrl: Compute memory bandwidth for all supported events Switching between local and total memory bandwidth events as the input to the mba_sc feedback loop would be cumbersome and take effect slowly in the current implementation as the bandwidth is only known after two consecutive readings of the same event. Compute the bandwidth for all supported events. This doesn't add significant overhead and will make changing which event is used simple. Suggested-by: Reinette Chatre Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Tested-by: Babu Moger Link: https://lore.kernel.org/r/20241206163148.83828-5-tony.luck@intel.com --- arch/x86/kernel/cpu/resctrl/monitor.c | 72 ++++++++++++--------------- 1 file changed, 33 insertions(+), 39 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index adb18f088979..94a1d9780461 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -663,9 +663,12 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) */ static void mbm_bw_count(u32 closid, u32 rmid, struct rmid_read *rr) { - u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid); - struct mbm_state *m = &rr->d->mbm_local[idx]; u64 cur_bw, bytes, cur_bytes; + struct mbm_state *m; + + m = get_mbm_state(rr->d, closid, rmid, rr->evtid); + if (WARN_ON_ONCE(!m)) + return; cur_bytes = rr->val; bytes = cur_bytes - m->prev_bw_bytes; @@ -815,54 +818,45 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_mon_domain *dom_mbm) resctrl_arch_update_one(r_mba, dom_mba, closid, CDP_NONE, new_msr_val); } -static void mbm_update(struct rdt_resource *r, struct rdt_mon_domain *d, - u32 closid, u32 rmid) +static void mbm_update_one_event(struct rdt_resource *r, struct rdt_mon_domain *d, + u32 closid, u32 rmid, enum resctrl_event_id evtid) { struct rmid_read rr = {0}; rr.r = r; rr.d = d; + rr.evtid = evtid; + rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid); + if (IS_ERR(rr.arch_mon_ctx)) { + pr_warn_ratelimited("Failed to allocate monitor context: %ld", + PTR_ERR(rr.arch_mon_ctx)); + return; + } + + __mon_event_count(closid, rmid, &rr); /* - * This is protected from concurrent reads from user - * as both the user and we hold the global mutex. + * If the software controller is enabled, compute the + * bandwidth for this event id. 
*/ - if (is_mbm_total_enabled()) { - rr.evtid = QOS_L3_MBM_TOTAL_EVENT_ID; - rr.val = 0; - rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid); - if (IS_ERR(rr.arch_mon_ctx)) { - pr_warn_ratelimited("Failed to allocate monitor context: %ld", - PTR_ERR(rr.arch_mon_ctx)); - return; - } + if (is_mba_sc(NULL)) + mbm_bw_count(closid, rmid, &rr); - __mon_event_count(closid, rmid, &rr); + resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx); +} - resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx); - } - if (is_mbm_local_enabled()) { - rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID; - rr.val = 0; - rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid); - if (IS_ERR(rr.arch_mon_ctx)) { - pr_warn_ratelimited("Failed to allocate monitor context: %ld", - PTR_ERR(rr.arch_mon_ctx)); - return; - } +static void mbm_update(struct rdt_resource *r, struct rdt_mon_domain *d, + u32 closid, u32 rmid) +{ + /* + * This is protected from concurrent reads from user as both + * the user and overflow handler hold the global mutex. + */ + if (is_mbm_total_enabled()) + mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_TOTAL_EVENT_ID); - __mon_event_count(closid, rmid, &rr); - - /* - * Call the MBA software controller only for the - * control groups and when user has enabled - * the software controller explicitly. - */ - if (is_mba_sc(NULL)) - mbm_bw_count(closid, rmid, &rr); - - resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx); - } + if (is_mbm_local_enabled()) + mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_LOCAL_EVENT_ID); } /* From 141cb5c482b38d7e494a207f881d0fe61e4848ef Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 6 Dec 2024 08:31:45 -0800 Subject: [PATCH 115/224] x86/resctrl: Make mba_sc use total bandwidth if local is not supported The default input measurement to the mba_sc feedback loop for memory bandwidth control when the user mounts with the "mba_MBps" option is the local bandwidth event. But some systems may not support a local bandwidth event. When local bandwidth event is not supported, check for support of total bandwidth and use that instead. Relax the mount option check to allow use of the "mba_MBps" option for systems when only total bandwidth monitoring is supported. Also update the error message. 
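The event selection amounts to a simple preference order; a self-contained sketch with hypothetical helpers (not the resctrl code):

	enum mbm_event { MBM_NONE, MBM_LOCAL, MBM_TOTAL };

	int local_supported;	/* probed from the hardware in the real code */
	int total_supported;

	enum mbm_event pick_default_event(void)
	{
		if (local_supported)	/* preferred input for the feedback loop */
			return MBM_LOCAL;
		if (total_supported)	/* fall back to total bandwidth */
			return MBM_TOTAL;
		return MBM_NONE;	/* "mba_MBps" mount must be rejected */
	}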
Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Tested-by: Babu Moger Link: https://lore.kernel.org/r/20241206163148.83828-6-tony.luck@intel.com --- arch/x86/kernel/cpu/resctrl/core.c | 2 ++ arch/x86/kernel/cpu/resctrl/rdtgroup.c | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 94bf559966d6..3d1735ed8d1f 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -965,6 +965,8 @@ static __init bool get_rdt_mon_resources(void) if (is_mbm_local_enabled()) mba_mbps_default_event = QOS_L3_MBM_LOCAL_EVENT_ID; + else if (is_mbm_total_enabled()) + mba_mbps_default_event = QOS_L3_MBM_TOTAL_EVENT_ID; return !rdt_get_mon_l3_config(r); } diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 8a52b25ce26b..0659b8e2a71b 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -2341,7 +2341,7 @@ static bool supports_mba_mbps(void) struct rdt_resource *rmbm = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl; - return (is_mbm_local_enabled() && + return (is_mbm_enabled() && r->alloc_capable && is_mba_linear() && r->ctrl_scope == rmbm->mon_scope); } @@ -2768,7 +2768,7 @@ static int rdt_parse_param(struct fs_context *fc, struct fs_parameter *param) ctx->enable_cdpl2 = true; return 0; case Opt_mba_mbps: - msg = "mba_MBps requires local MBM and linear scale MBA at L3 scope"; + msg = "mba_MBps requires MBM and linear scale MBA at L3 scope"; if (!supports_mba_mbps()) return invalfc(fc, msg); ctx->enable_mba_mbps = true; From 4bf610499c429fa0bfb3fa94be450f01016224c5 Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Thu, 31 Oct 2024 11:34:01 +0100 Subject: [PATCH 116/224] x86/cpufeature: Document cpu_feature_enabled() as the default to use cpu_feature_enabled() should be used in most cases when CPU feature support needs to be tested in code. Document that. Reported-by: Sohil Mehta Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Sohil Mehta Acked-by: Dave Hansen Link: https://lore.kernel.org/r/20241031103401.GBZyNdGQ-ZyXKyzC_z@fat_crate.local --- arch/x86/include/asm/cpufeature.h | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 0b9611da6c53..de1ad09fe8d7 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -132,11 +132,12 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; x86_this_cpu_test_bit(bit, cpu_info.x86_capability)) /* - * This macro is for detection of features which need kernel - * infrastructure to be used. It may *not* directly test the CPU - * itself. Use the cpu_has() family if you want true runtime - * testing of CPU features, like in hypervisor code where you are - * supporting a possible guest feature where host support for it + * This is the default CPU features testing macro to use in code. + * + * It is for detection of features which need kernel infrastructure to be + * used. It may *not* directly test the CPU itself. Use the cpu_has() family + * if you want true runtime testing of CPU features, like in hypervisor code + * where you are supporting a possible guest feature where host support for it * is not relevant. 
*/ #define cpu_feature_enabled(bit) \ @@ -161,13 +162,6 @@ extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit); #define setup_force_cpu_bug(bit) setup_force_cpu_cap(bit) /* - * Static testing of CPU features. Used the same as boot_cpu_has(). It - * statically patches the target code for additional performance. Use - * static_cpu_has() only in fast paths, where every cycle counts. Which - * means that the boot_cpu_has() variant is already fast enough for the - * majority of cases and you should stick to using it as it is generally - * only two instructions: a RIP-relative MOV and a TEST. - * * Do not use an "m" constraint for [cap_byte] here: gcc doesn't know * that this is only used on a fallback path and will sometimes cause * it to manifest the address of boot_cpu_data in a register, fouling From ab0e7f20768af59fe161d71cc5d1de384f2a9da8 Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Mon, 2 Dec 2024 20:00:10 +0100 Subject: [PATCH 117/224] Documentation: Merge x86-specific boot options doc into kernel-parameters.txt Documentation/arch/x86/x86_64/boot-options.rst is causing unnecessary confusion by being a second place where one can put x86 boot options. Move them into the main one. Drop removed ones like "acpi=ht", while at it. Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Sohil Mehta Link: https://lore.kernel.org/r/20241202190011.11979-1-bp@kernel.org --- .../admin-guide/kernel-parameters.rst | 3 - .../admin-guide/kernel-parameters.txt | 237 ++++++++++++- .../arch/x86/x86_64/boot-options.rst | 312 ------------------ .../arch/x86/x86_64/fake-numa-for-cpusets.rst | 2 +- Documentation/arch/x86/x86_64/index.rst | 1 - arch/x86/Kconfig.debug | 2 +- arch/x86/kernel/pci-dma.c | 4 - 7 files changed, 227 insertions(+), 334 deletions(-) delete mode 100644 Documentation/arch/x86/x86_64/boot-options.rst diff --git a/Documentation/admin-guide/kernel-parameters.rst b/Documentation/admin-guide/kernel-parameters.rst index 59931f21c974..39d0e7ff0965 100644 --- a/Documentation/admin-guide/kernel-parameters.rst +++ b/Documentation/admin-guide/kernel-parameters.rst @@ -194,8 +194,6 @@ is applicable:: WDT Watchdog support is enabled. X86-32 X86-32, aka i386 architecture is enabled. X86-64 X86-64 architecture is enabled. - More X86-64 boot options can be found in - Documentation/arch/x86/x86_64/boot-options.rst. X86 Either 32-bit or 64-bit x86 (same as X86-32+X86-64) X86_UV SGI UV support is enabled. XEN Xen support is enabled @@ -213,7 +211,6 @@ Do not modify the syntax of boot loader parameters without extreme need or coordination with . There are also arch-specific kernel-parameters not documented here. -See for example . Note that ALL kernel parameters listed below are CASE SENSITIVE, and that a trailing = on the name of any parameter states that that parameter will diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index dc663c0ca670..bf7b3568d1a5 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -21,6 +21,10 @@ strictly ACPI specification compliant. rsdt -- prefer RSDT over (default) XSDT copy_dsdt -- copy DSDT to memory + nocmcff -- Disable firmware first mode for corrected + errors. This disables parsing the HEST CMC error + source to check if firmware has set the FF flag. This + may result in duplicate corrected error reports. 
nospcr -- disable console in ACPI SPCR table as default _serial_ console on ARM64 For ARM64, ONLY "acpi=off", "acpi=on", "acpi=force" or @@ -405,6 +409,8 @@ not play well with APC CPU idle - disable it if you have APC and your system crashes randomly. + apic [APIC,X86-64] Use IO-APIC. Default. + apic= [APIC,X86,EARLY] Advanced Programmable Interrupt Controller Change the output verbosity while booting Format: { quiet (default) | verbose | debug } @@ -424,6 +430,10 @@ useful so that a dump capture kernel won't be shot down by NMI + apicpmtimer Do APIC timer calibration using the pmtimer. Implies + apicmaintimer. Useful when your PIT timer is totally + broken. + autoconf= [IPV6] See Documentation/networking/ipv6.rst. @@ -1726,6 +1736,8 @@ off: Disable GDS mitigation. + gbpages [X86] Use GB pages for kernel direct mappings. + gcov_persist= [GCOV] When non-zero (default), profiling data for kernel modules is saved and remains accessible via debugfs, even when the module is unloaded/reloaded. @@ -2008,12 +2020,21 @@ idle= [X86,EARLY] Format: idle=poll, idle=halt, idle=nomwait - Poll forces a polling idle loop that can slightly - improve the performance of waking up a idle CPU, but - will use a lot of power and make the system run hot. - Not recommended. + + idle=poll: Don't do power saving in the idle loop + using HLT, but poll for rescheduling event. This will + make the CPUs eat a lot more power, but may be useful + to get slightly better performance in multiprocessor + benchmarks. It also makes some profiling using + performance counters more accurate. Please note that + on systems with MONITOR/MWAIT support (like Intel + EM64T CPUs) this option has no performance advantage + over the normal idle loop. It may also interact badly + with hyperthreading. + idle=halt: Halt is forced to be used for CPU idle. In such case C2/C3 won't be used again. + idle=nomwait: Disable mwait for CPU C-states idxd.sva= [HW] @@ -2311,20 +2332,73 @@ relaxed iommu= [X86,EARLY] + off + Don't initialize and use any kind of IOMMU. + force + Force the use of the hardware IOMMU even when + it is not actually needed (e.g. because < 3 GB + memory). + noforce + Don't force hardware IOMMU usage when it is not + needed. (default). + biomerge panic nopanic merge nomerge + soft - pt [X86] - nopt [X86] - nobypass [PPC/POWERNV] + Use software bounce buffering (SWIOTLB) (default for + Intel machines). This can be used to prevent the usage + of an available hardware IOMMU. + + [X86] + pt + [X86] + nopt + [PPC/POWERNV] + nobypass Disable IOMMU bypass, using IOMMU for PCI devices. + [X86] + AMD Gart HW IOMMU-specific options: + + + Set the size of the remapping area in bytes. + + allowed + Overwrite iommu off workarounds for specific chipsets + + fullflush + Flush IOMMU on each allocation (default). + + nofullflush + Don't use IOMMU fullflush. + + memaper[=] + Allocate an own aperture over RAM with size + 32MB< series board detected. + Selecting for reboots." In the case where you + think the quirk is in error (e.g. you have newer BIOS, + or newer board) using this option will ignore the + built-in quirk table, and use the generic default + reboot actions. + + efi + Use efi reset_system runtime service. If EFI is not + configured or the EFI reset does not work, the reboot + path attempts the reset using the keyboard controller. + + force + Don't stop other CPUs on reboot. This can make reboot + more reliable in some cases. + + kbd + Use the keyboard controller. 
cold reset (default) + + pci + Use a write to the PCI config space register 0xcf9 to + trigger reboot. + + triple + Force a triple fault (init) + + warm + Don't set the cold reboot flag + + Using warm reset will be much faster especially on big + memory systems because the BIOS will not go through + the memory check. Disadvantage is that not all + hardware will be completely reinitialized on reboot so + there may be boot problems on some systems. + + refscale.holdoff= [KNL] Set test-start holdoff period. The purpose of this parameter is to delay the start of the @@ -6101,7 +6305,16 @@ serialnumber [BUGS=X86-32] - sev=option[,option...] [X86-64] See Documentation/arch/x86/x86_64/boot-options.rst + sev=option[,option...] [X86-64] + + debug + Enable debug messages. + + nosnp + Do not enable SEV-SNP (applies to host/hypervisor + only). Setting 'nosnp' avoids the RMP check overhead + in memory accesses when users do not want to run + SEV-SNP guests. shapers= [NET] Maximal number of shapers. diff --git a/Documentation/arch/x86/x86_64/boot-options.rst b/Documentation/arch/x86/x86_64/boot-options.rst deleted file mode 100644 index d69e3cfbdba5..000000000000 --- a/Documentation/arch/x86/x86_64/boot-options.rst +++ /dev/null @@ -1,312 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -=========================== -AMD64 Specific Boot Options -=========================== - -There are many others (usually documented in driver documentation), but -only the AMD64 specific ones are listed here. - -Machine check -============= -Please see Documentation/arch/x86/x86_64/machinecheck.rst for sysfs runtime tunables. - - mce=off - Disable machine check - mce=no_cmci - Disable CMCI(Corrected Machine Check Interrupt) that - Intel processor supports. Usually this disablement is - not recommended, but it might be handy if your hardware - is misbehaving. - Note that you'll get more problems without CMCI than with - due to the shared banks, i.e. you might get duplicated - error logs. - mce=dont_log_ce - Don't make logs for corrected errors. All events reported - as corrected are silently cleared by OS. - This option will be useful if you have no interest in any - of corrected errors. - mce=ignore_ce - Disable features for corrected errors, e.g. polling timer - and CMCI. All events reported as corrected are not cleared - by OS and remained in its error banks. - Usually this disablement is not recommended, however if - there is an agent checking/clearing corrected errors - (e.g. BIOS or hardware monitoring applications), conflicting - with OS's error handling, and you cannot deactivate the agent, - then this option will be a help. - mce=no_lmce - Do not opt-in to Local MCE delivery. Use legacy method - to broadcast MCEs. - mce=bootlog - Enable logging of machine checks left over from booting. - Disabled by default on AMD Fam10h and older because some BIOS - leave bogus ones. - If your BIOS doesn't do that it's a good idea to enable though - to make sure you log even machine check events that result - in a reboot. On Intel systems it is enabled by default. - mce=nobootlog - Disable boot machine check logging. - mce=monarchtimeout (number) - monarchtimeout: - Sets the time in us to wait for other CPUs on machine checks. 0 - to disable. - mce=bios_cmci_threshold - Don't overwrite the bios-set CMCI threshold. This boot option - prevents Linux from overwriting the CMCI threshold set by the - bios. Without this option, Linux always sets the CMCI - threshold to 1. 
Enabling this may make memory predictive failure - analysis less effective if the bios sets thresholds for memory - errors since we will not see details for all errors. - mce=recovery - Force-enable recoverable machine check code paths - - nomce (for compatibility with i386) - same as mce=off - - Everything else is in sysfs now. - -APICs -===== - - apic - Use IO-APIC. Default - - noapic - Don't use the IO-APIC. - - disableapic - Don't use the local APIC - - nolapic - Don't use the local APIC (alias for i386 compatibility) - - pirq=... - See Documentation/arch/x86/i386/IO-APIC.rst - - noapictimer - Don't set up the APIC timer - - no_timer_check - Don't check the IO-APIC timer. This can work around - problems with incorrect timer initialization on some boards. - - apicpmtimer - Do APIC timer calibration using the pmtimer. Implies - apicmaintimer. Useful when your PIT timer is totally broken. - -Timing -====== - - notsc - Deprecated, use tsc=unstable instead. - - nohpet - Don't use the HPET timer. - -Idle loop -========= - - idle=poll - Don't do power saving in the idle loop using HLT, but poll for rescheduling - event. This will make the CPUs eat a lot more power, but may be useful - to get slightly better performance in multiprocessor benchmarks. It also - makes some profiling using performance counters more accurate. - Please note that on systems with MONITOR/MWAIT support (like Intel EM64T - CPUs) this option has no performance advantage over the normal idle loop. - It may also interact badly with hyperthreading. - -Rebooting -========= - - reboot=b[ios] | t[riple] | k[bd] | a[cpi] | e[fi] | p[ci] [, [w]arm | [c]old] - bios - Use the CPU reboot vector for warm reset - warm - Don't set the cold reboot flag - cold - Set the cold reboot flag - triple - Force a triple fault (init) - kbd - Use the keyboard controller. cold reset (default) - acpi - Use the ACPI RESET_REG in the FADT. If ACPI is not configured or - the ACPI reset does not work, the reboot path attempts the reset - using the keyboard controller. - efi - Use efi reset_system runtime service. If EFI is not configured or - the EFI reset does not work, the reboot path attempts the reset using - the keyboard controller. - pci - Use a write to the PCI config space register 0xcf9 to trigger reboot. - - Using warm reset will be much faster especially on big memory - systems because the BIOS will not go through the memory check. - Disadvantage is that not all hardware will be completely reinitialized - on reboot so there may be boot problems on some systems. - - reboot=force - Don't stop other CPUs on reboot. This can make reboot more reliable - in some cases. - - reboot=default - There are some built-in platform specific "quirks" - you may see: - "reboot: series board detected. Selecting for reboots." - In the case where you think the quirk is in error (e.g. you have - newer BIOS, or newer board) using this option will ignore the built-in - quirk table, and use the generic default reboot actions. - -NUMA -==== - - numa=off - Only set up a single NUMA node spanning all memory. - - numa=noacpi - Don't parse the SRAT table for NUMA setup - - numa=nohmat - Don't parse the HMAT table for NUMA setup, or soft-reserved memory - partitioning. - -ACPI -==== - - acpi=off - Don't enable ACPI - acpi=ht - Use ACPI boot table parsing, but don't enable ACPI interpreter - acpi=force - Force ACPI on (currently not needed) - acpi=strict - Disable out of spec ACPI workarounds. - acpi_sci={edge,level,high,low} - Set up ACPI SCI interrupt. 
- acpi=noirq - Don't route interrupts - acpi=nocmcff - Disable firmware first mode for corrected errors. This - disables parsing the HEST CMC error source to check if - firmware has set the FF flag. This may result in - duplicate corrected error reports. - -PCI -=== - - pci=off - Don't use PCI - pci=conf1 - Use conf1 access. - pci=conf2 - Use conf2 access. - pci=rom - Assign ROMs. - pci=assign-busses - Assign busses - pci=irqmask=MASK - Set PCI interrupt mask to MASK - pci=lastbus=NUMBER - Scan up to NUMBER busses, no matter what the mptable says. - pci=noacpi - Don't use ACPI to set up PCI interrupt routing. - -IOMMU (input/output memory management unit) -=========================================== -Multiple x86-64 PCI-DMA mapping implementations exist, for example: - - 1. : use no hardware/software IOMMU at all - (e.g. because you have < 3 GB memory). - Kernel boot message: "PCI-DMA: Disabling IOMMU" - - 2. : AMD GART based hardware IOMMU. - Kernel boot message: "PCI-DMA: using GART IOMMU" - - 3. : Software IOMMU implementation. Used - e.g. if there is no hardware IOMMU in the system and it is need because - you have >3GB memory or told the kernel to us it (iommu=soft)) - Kernel boot message: "PCI-DMA: Using software bounce buffering - for IO (SWIOTLB)" - -:: - - iommu=[][,noagp][,off][,force][,noforce] - [,memaper[=]][,merge][,fullflush][,nomerge] - [,noaperture] - -General iommu options: - - off - Don't initialize and use any kind of IOMMU. - noforce - Don't force hardware IOMMU usage when it is not needed. (default). - force - Force the use of the hardware IOMMU even when it is - not actually needed (e.g. because < 3 GB memory). - soft - Use software bounce buffering (SWIOTLB) (default for - Intel machines). This can be used to prevent the usage - of an available hardware IOMMU. - -iommu options only relevant to the AMD GART hardware IOMMU: - - - Set the size of the remapping area in bytes. - allowed - Overwrite iommu off workarounds for specific chipsets. - fullflush - Flush IOMMU on each allocation (default). - nofullflush - Don't use IOMMU fullflush. - memaper[=] - Allocate an own aperture over RAM with size 32MB<[,force,noforce] - - Prereserve that many 2K slots for the software IO bounce buffering. - force - Force all IO through the software TLB. - noforce - Do not initialize the software TLB. - - -Miscellaneous -============= - - nogbpages - Do not use GB pages for kernel direct mappings. - gbpages - Use GB pages for kernel direct mappings. - - -AMD SEV (Secure Encrypted Virtualization) -========================================= -Options relating to AMD SEV, specified via the following format: - -:: - - sev=option1[,option2] - -The available options are: - - debug - Enable debug messages. - - nosnp - Do not enable SEV-SNP (applies to host/hypervisor only). Setting - 'nosnp' avoids the RMP check overhead in memory accesses when - users do not want to run SEV-SNP guests. diff --git a/Documentation/arch/x86/x86_64/fake-numa-for-cpusets.rst b/Documentation/arch/x86/x86_64/fake-numa-for-cpusets.rst index ba74617d4999..970ee94eb551 100644 --- a/Documentation/arch/x86/x86_64/fake-numa-for-cpusets.rst +++ b/Documentation/arch/x86/x86_64/fake-numa-for-cpusets.rst @@ -18,7 +18,7 @@ For more information on the features of cpusets, see Documentation/admin-guide/cgroup-v1/cpusets.rst. There are a number of different configurations you can use for your needs. 
For more information on the numa=fake command line option and its various ways of -configuring fake nodes, see Documentation/arch/x86/x86_64/boot-options.rst. +configuring fake nodes, see Documentation/admin-guide/kernel-parameters.txt For the purposes of this introduction, we'll assume a very primitive NUMA emulation setup of "numa=fake=4*512,". This will split our system memory into diff --git a/Documentation/arch/x86/x86_64/index.rst b/Documentation/arch/x86/x86_64/index.rst index ad15e9bd623f..a0261957a08a 100644 --- a/Documentation/arch/x86/x86_64/index.rst +++ b/Documentation/arch/x86/x86_64/index.rst @@ -7,7 +7,6 @@ x86_64 Support .. toctree:: :maxdepth: 2 - boot-options uefi mm 5level-paging diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 74777a97e394..1eb4d23cdaae 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -97,7 +97,7 @@ config IOMMU_DEBUG code. When you use it make sure you have a big enough IOMMU/AGP aperture. Most of the options enabled by this can be set more finegrained using the iommu= command line - options. See Documentation/arch/x86/x86_64/boot-options.rst for more + options. See Documentation/admin-guide/kernel-parameters.txt for more details. config IOMMU_LEAK diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index f323d83e40a7..6267363e0189 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -108,10 +108,6 @@ void __init pci_iommu_alloc(void) swiotlb_init(x86_swiotlb_enable, x86_swiotlb_flags); } -/* - * See for the iommu kernel - * parameter documentation. - */ static __init int iommu_setup(char *p) { iommu_merge = 1; From 13148e22c151e871c1c00bab519f39cc6f6ea37a Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Mon, 2 Dec 2024 20:00:11 +0100 Subject: [PATCH 118/224] x86/apic: Remove "disablelapic" cmdline option The convention is "no" and there already is "nolapic". Drop the disable one. 
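Since early_param() handlers are matched by exact parameter name, a stale "disableapic" on the command line simply stops having any effect after this change. A small user-space sketch of that exact-token matching, handy for auditing boot scripts (the helper below is hypothetical, for illustration only, not kernel code):

#include <stdio.h>
#include <string.h>

/* Toy matcher: kernel early_param() handlers are registered by exact
 * name, so after this patch only "nolapic" matches anything. */
static int param_present(const char *cmdline, const char *param)
{
	size_t len = strlen(param);
	const char *p = cmdline;

	while ((p = strstr(p, param)) != NULL) {
		int starts = (p == cmdline || p[-1] == ' ');
		int ends = (p[len] == '\0' || p[len] == ' ' || p[len] == '=');

		if (starts && ends)
			return 1;
		p += len;
	}
	return 0;
}

int main(void)
{
	const char *cmdline = "root=/dev/sda1 disableapic quiet";

	printf("nolapic: %d\n", param_present(cmdline, "nolapic"));	/* 0 */
	printf("disableapic: %d\n", param_present(cmdline, "disableapic"));	/* 1: switch to "nolapic" */
	return 0;
}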
Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Sohil Mehta Link: https://lore.kernel.org/r/20241202190011.11979-2-bp@kernel.org --- arch/x86/kernel/apic/apic.c | 9 +-------- arch/x86/kernel/cpu/topology.c | 2 +- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index c5fb28e6451a..1267b2691abc 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -2582,19 +2582,12 @@ int apic_is_clustered_box(void) /* * APIC command line parameters */ -static int __init setup_disableapic(char *arg) +static int __init setup_nolapic(char *arg) { apic_is_disabled = true; setup_clear_cpu_cap(X86_FEATURE_APIC); return 0; } -early_param("disableapic", setup_disableapic); - -/* same as disableapic, for compatibility */ -static int __init setup_nolapic(char *arg) -{ - return setup_disableapic(arg); -} early_param("nolapic", setup_nolapic); static int __init parse_lapic_timer_c2_ok(char *arg) diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c index 621a151ccf7d..6ebed852a9ef 100644 --- a/arch/x86/kernel/cpu/topology.c +++ b/arch/x86/kernel/cpu/topology.c @@ -428,7 +428,7 @@ void __init topology_apply_cmdline_limits_early(void) { unsigned int possible = nr_cpu_ids; - /* 'maxcpus=0' 'nosmp' 'nolapic' 'disableapic' 'noapic' */ + /* 'maxcpus=0' 'nosmp' 'nolapic' 'noapic' */ if (!setup_max_cpus || ioapic_is_disabled || apic_is_disabled) possible = 1; From 3560a023a9b9965803e8a967ee88343879b5dc1b Mon Sep 17 00:00:00 2001 From: Raag Jadav Date: Wed, 30 Oct 2024 12:28:04 +0530 Subject: [PATCH 119/224] x86/cpu: Fix typo in x86_match_cpu()'s doc Fix typo in x86_match_cpu()'s description. [ bp: Massage commit message. ] Signed-off-by: Raag Jadav Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20241030065804.407793-1-raag.jadav@intel.com --- arch/x86/kernel/cpu/match.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c index 8e7de733320a..82e5d29a04e2 100644 --- a/arch/x86/kernel/cpu/match.c +++ b/arch/x86/kernel/cpu/match.c @@ -6,7 +6,7 @@ #include /** - * x86_match_cpu - match current CPU again an array of x86_cpu_ids + * x86_match_cpu - match current CPU against an array of x86_cpu_ids * @match: Pointer to array of x86_cpu_ids. Last entry terminated with * {}. * From bad6722e478f5b17a5ceb039dfb4c680cf2c0b48 Mon Sep 17 00:00:00 2001 From: Eliav Farber Date: Wed, 4 Dec 2024 14:20:02 +0000 Subject: [PATCH 120/224] kexec: Consolidate machine_kexec_mask_interrupts() implementation Consolidate the machine_kexec_mask_interrupts implementation into a common function located in a new file: kernel/irq/kexec.c. This removes duplicate implementations from architecture-specific files in arch/arm, arch/arm64, arch/powerpc, and arch/riscv, reducing code duplication and improving maintainability. The new implementation retains architecture-specific behavior for CONFIG_GENERIC_IRQ_KEXEC_CLEAR_VM_FORWARD, which was previously implemented for ARM64. When enabled (currently for ARM64), it clears the active state of interrupts forwarded to virtual machines (VMs) before handling other interrupt masking operations. 
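The consolidated helper keeps the ARM64-only clearing of VM-forwarded interrupts in common code without an #ifdef by testing IS_ENABLED(CONFIG_GENERIC_IRQ_KEXEC_CLEAR_VM_FORWARD), which folds to a constant 0 or 1 at compile time so the dead branch is discarded. A self-contained sketch of that mechanism, simplified from include/linux/kconfig.h to the builtin-only case (the CONFIG_ definition below is supplied by hand purely for illustration):

#include <stdio.h>

/* CONFIG_* symbols are defined to 1 when enabled; the indirection turns
 * "defined to 1" into a constant expression usable in plain C code. */
#define __ARG_PLACEHOLDER_1 0,
#define __take_second_arg(__ignored, val, ...) val
#define ____is_defined(arg1_or_junk) __take_second_arg(arg1_or_junk 1, 0)
#define ___is_defined(val) ____is_defined(__ARG_PLACEHOLDER_##val)
#define __is_defined(x) ___is_defined(x)
#define IS_ENABLED(option) __is_defined(option)

#define CONFIG_GENERIC_IRQ_KEXEC_CLEAR_VM_FORWARD 1

int main(void)
{
	/* With the define above this branch is compiled in; without it,
	 * IS_ENABLED() evaluates to 0 and the branch is discarded. */
	if (IS_ENABLED(CONFIG_GENERIC_IRQ_KEXEC_CLEAR_VM_FORWARD))
		printf("VM-forward clearing compiled in\n");
	else
		printf("branch discarded at compile time\n");
	return 0;
}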
Signed-off-by: Eliav Farber Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20241204142003.32859-2-farbere@amazon.com --- arch/arm/kernel/machine_kexec.c | 23 ------------------ arch/arm64/Kconfig | 1 + arch/arm64/kernel/machine_kexec.c | 31 ------------------------ arch/powerpc/include/asm/kexec.h | 1 - arch/powerpc/kexec/core.c | 22 ----------------- arch/powerpc/kexec/core_32.c | 1 + arch/riscv/kernel/machine_kexec.c | 23 ------------------ include/linux/irq.h | 3 +++ kernel/irq/Kconfig | 6 +++++ kernel/irq/Makefile | 2 +- kernel/irq/kexec.c | 40 +++++++++++++++++++++++++++++++ 11 files changed, 52 insertions(+), 101 deletions(-) create mode 100644 kernel/irq/kexec.c diff --git a/arch/arm/kernel/machine_kexec.c b/arch/arm/kernel/machine_kexec.c index 80ceb5bd2680..dd430477e7c1 100644 --- a/arch/arm/kernel/machine_kexec.c +++ b/arch/arm/kernel/machine_kexec.c @@ -127,29 +127,6 @@ void crash_smp_send_stop(void) cpus_stopped = 1; } -static void machine_kexec_mask_interrupts(void) -{ - unsigned int i; - struct irq_desc *desc; - - for_each_irq_desc(i, desc) { - struct irq_chip *chip; - - chip = irq_desc_get_chip(desc); - if (!chip) - continue; - - if (chip->irq_eoi && irqd_irq_inprogress(&desc->irq_data)) - chip->irq_eoi(&desc->irq_data); - - if (chip->irq_mask) - chip->irq_mask(&desc->irq_data); - - if (chip->irq_disable && !irqd_irq_disabled(&desc->irq_data)) - chip->irq_disable(&desc->irq_data); - } -} - void machine_crash_shutdown(struct pt_regs *regs) { local_irq_disable(); diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 100570a048c5..dcc3551cf6c2 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -149,6 +149,7 @@ config ARM64 select GENERIC_IDLE_POLL_SETUP select GENERIC_IOREMAP select GENERIC_IRQ_IPI + select GENERIC_IRQ_KEXEC_CLEAR_VM_FORWARD select GENERIC_IRQ_PROBE select GENERIC_IRQ_SHOW select GENERIC_IRQ_SHOW_LEVEL diff --git a/arch/arm64/kernel/machine_kexec.c b/arch/arm64/kernel/machine_kexec.c index 82e2203d86a3..6f121a0164a4 100644 --- a/arch/arm64/kernel/machine_kexec.c +++ b/arch/arm64/kernel/machine_kexec.c @@ -207,37 +207,6 @@ void machine_kexec(struct kimage *kimage) BUG(); /* Should never get here. */ } -static void machine_kexec_mask_interrupts(void) -{ - unsigned int i; - struct irq_desc *desc; - - for_each_irq_desc(i, desc) { - struct irq_chip *chip; - int ret; - - chip = irq_desc_get_chip(desc); - if (!chip) - continue; - - /* - * First try to remove the active state. If this - * fails, try to EOI the interrupt. 
- */ - ret = irq_set_irqchip_state(i, IRQCHIP_STATE_ACTIVE, false); - - if (ret && irqd_irq_inprogress(&desc->irq_data) && - chip->irq_eoi) - chip->irq_eoi(&desc->irq_data); - - if (chip->irq_mask) - chip->irq_mask(&desc->irq_data); - - if (chip->irq_disable && !irqd_irq_disabled(&desc->irq_data)) - chip->irq_disable(&desc->irq_data); - } -} - /** * machine_crash_shutdown - shutdown non-crashing cpus and save registers */ diff --git a/arch/powerpc/include/asm/kexec.h b/arch/powerpc/include/asm/kexec.h index 270ee93a0f7d..601e569303e1 100644 --- a/arch/powerpc/include/asm/kexec.h +++ b/arch/powerpc/include/asm/kexec.h @@ -61,7 +61,6 @@ struct pt_regs; extern void kexec_smp_wait(void); /* get and clear naca physid, wait for master to copy new code to 0 */ extern void default_machine_kexec(struct kimage *image); -extern void machine_kexec_mask_interrupts(void); void relocate_new_kernel(unsigned long indirection_page, unsigned long reboot_code_buffer, unsigned long start_address) __noreturn; diff --git a/arch/powerpc/kexec/core.c b/arch/powerpc/kexec/core.c index b8333a49ea5d..58a930a47422 100644 --- a/arch/powerpc/kexec/core.c +++ b/arch/powerpc/kexec/core.c @@ -22,28 +22,6 @@ #include #include -void machine_kexec_mask_interrupts(void) { - unsigned int i; - struct irq_desc *desc; - - for_each_irq_desc(i, desc) { - struct irq_chip *chip; - - chip = irq_desc_get_chip(desc); - if (!chip) - continue; - - if (chip->irq_eoi && irqd_irq_inprogress(&desc->irq_data)) - chip->irq_eoi(&desc->irq_data); - - if (chip->irq_mask) - chip->irq_mask(&desc->irq_data); - - if (chip->irq_disable && !irqd_irq_disabled(&desc->irq_data)) - chip->irq_disable(&desc->irq_data); - } -} - #ifdef CONFIG_CRASH_DUMP void machine_crash_shutdown(struct pt_regs *regs) { diff --git a/arch/powerpc/kexec/core_32.c b/arch/powerpc/kexec/core_32.c index c95f96850c9e..deb28eb44f30 100644 --- a/arch/powerpc/kexec/core_32.c +++ b/arch/powerpc/kexec/core_32.c @@ -7,6 +7,7 @@ * Copyright (C) 2005 IBM Corporation. 
*/ +#include #include #include #include diff --git a/arch/riscv/kernel/machine_kexec.c b/arch/riscv/kernel/machine_kexec.c index 3c830a6f7ef4..2306ce3e5f22 100644 --- a/arch/riscv/kernel/machine_kexec.c +++ b/arch/riscv/kernel/machine_kexec.c @@ -114,29 +114,6 @@ void machine_shutdown(void) #endif } -static void machine_kexec_mask_interrupts(void) -{ - unsigned int i; - struct irq_desc *desc; - - for_each_irq_desc(i, desc) { - struct irq_chip *chip; - - chip = irq_desc_get_chip(desc); - if (!chip) - continue; - - if (chip->irq_eoi && irqd_irq_inprogress(&desc->irq_data)) - chip->irq_eoi(&desc->irq_data); - - if (chip->irq_mask) - chip->irq_mask(&desc->irq_data); - - if (chip->irq_disable && !irqd_irq_disabled(&desc->irq_data)) - chip->irq_disable(&desc->irq_data); - } -} - /* * machine_crash_shutdown - Prepare to kexec after a kernel crash * diff --git a/include/linux/irq.h b/include/linux/irq.h index fa711f80957b..25f51bf3c351 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -694,6 +694,9 @@ extern int irq_chip_request_resources_parent(struct irq_data *data); extern void irq_chip_release_resources_parent(struct irq_data *data); #endif +/* Disable or mask interrupts during a kernel kexec */ +extern void machine_kexec_mask_interrupts(void); + /* Handling of unhandled and spurious interrupts: */ extern void note_interrupt(struct irq_desc *desc, irqreturn_t action_ret); diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 529adb1f5859..875f25ed6f71 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -141,6 +141,12 @@ config GENERIC_IRQ_DEBUGFS If you don't know what to do here, say N. +# Clear forwarded VM interrupts during kexec. +# This option ensures the kernel clears active states for interrupts +# forwarded to virtual machines (VMs) during a machine kexec. +config GENERIC_IRQ_KEXEC_CLEAR_VM_FORWARD + bool + endmenu config GENERIC_IRQ_MULTI_HANDLER diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index f19d3080bf11..c0f44c06d69d 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 -obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o +obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o kexec.o obj-$(CONFIG_IRQ_TIMINGS) += timings.o ifeq ($(CONFIG_TEST_IRQ_TIMINGS),y) CFLAGS_timings.o += -DDEBUG diff --git a/kernel/irq/kexec.c b/kernel/irq/kexec.c new file mode 100644 index 000000000000..0f9548c1708d --- /dev/null +++ b/kernel/irq/kexec.c @@ -0,0 +1,40 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include + +#include "internals.h" + +void machine_kexec_mask_interrupts(void) +{ + struct irq_desc *desc; + unsigned int i; + + for_each_irq_desc(i, desc) { + struct irq_chip *chip; + int check_eoi = 1; + + chip = irq_desc_get_chip(desc); + if (!chip) + continue; + + if (IS_ENABLED(CONFIG_GENERIC_IRQ_KEXEC_CLEAR_VM_FORWARD)) { + /* + * First try to remove the active state from an interrupt which is forwarded + * to a VM. If the interrupt is not forwarded, try to EOI the interrupt. 
+ */ + check_eoi = irq_set_irqchip_state(i, IRQCHIP_STATE_ACTIVE, false); + } + + if (check_eoi && chip->irq_eoi && irqd_irq_inprogress(&desc->irq_data)) + chip->irq_eoi(&desc->irq_data); + + if (chip->irq_mask) + chip->irq_mask(&desc->irq_data); + + if (chip->irq_disable && !irqd_irq_disabled(&desc->irq_data)) + chip->irq_disable(&desc->irq_data); + } +} From b4706d814921cc2df7bb59ad8f9ee84855a4f0c4 Mon Sep 17 00:00:00 2001 From: Eliav Farber Date: Wed, 4 Dec 2024 14:20:03 +0000 Subject: [PATCH 121/224] genirq/kexec: Prevent redundant IRQ masking by checking state before shutdown During machine kexec, machine_kexec_mask_interrupts() is responsible for disabling or masking all interrupts. While irq_disable() is only invoked when the interrupt is not yet disabled, it unconditionally invokes the irq_mask() callback for every interrupt descriptor, even when the interrupt is already masked or not even started up yet. A specific issue was observed in the crash kernel flow after unbinding a device (prior to kexec) that used a GPIO as an IRQ source. The warning was triggered by the gpiochip_disable_irq() function, which attempts to clear the FLAG_IRQ_IS_ENABLED flag when FLAG_USED_AS_IRQ was not set. This issue surfaced after commit a8173820f441 ("gpio: gpiolib: Allow GPIO IRQs to lazy disable") introduced lazy disablement for GPIO IRQs. It replaced disable/enable hooks with mask/unmask hooks. Unlike the disable hook, the mask hook doesn't handle already-masked IRQs. When a GPIO-IRQ driver is unbound, the IRQ is released, triggering __irq_disable() and irq_state_set_masked(). A subsequent call to machine_kexec_mask_interrupts() re-invokes chip->irq_mask(). This results in a call chain including gpiochip_irq_mask() and gpiochip_disable_irq(). Since FLAG_USED_AS_IRQ was cleared earlier, the warning is triggered. Replace the direct invocation of the irq_mask() and irq_disable() callbacks with a call to irq_shutdown(), which handles these cases correctly and avoids the masking altogether when the interrupt has never been started up. Signed-off-by: Eliav Farber Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20241204142003.32859-3-farbere@amazon.com --- kernel/irq/kexec.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/kernel/irq/kexec.c b/kernel/irq/kexec.c index 0f9548c1708d..1a3deffe6b5b 100644 --- a/kernel/irq/kexec.c +++ b/kernel/irq/kexec.c @@ -17,7 +17,7 @@ void machine_kexec_mask_interrupts(void) int check_eoi = 1; chip = irq_desc_get_chip(desc); - if (!chip) + if (!chip || !irqd_is_started(&desc->irq_data)) continue; if (IS_ENABLED(CONFIG_GENERIC_IRQ_KEXEC_CLEAR_VM_FORWARD)) { @@ -31,10 +31,6 @@ void machine_kexec_mask_interrupts(void) if (check_eoi && chip->irq_eoi && irqd_irq_inprogress(&desc->irq_data)) chip->irq_eoi(&desc->irq_data); - if (chip->irq_mask) - chip->irq_mask(&desc->irq_data); - - if (chip->irq_disable && !irqd_irq_disabled(&desc->irq_data)) - chip->irq_disable(&desc->irq_data); + irq_shutdown(desc); } } From f5cd0e316f14d79c9eb0cf8fe7e60cee3a657aa8 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 6 Dec 2024 08:31:46 -0800 Subject: [PATCH 122/224] x86/resctrl: Add "mba_MBps_event" file to CTRL_MON directories The "mba_MBps" mount option provides an alternate method to control memory bandwidth. Instead of specifying allowable bandwidth as a percentage of maximum possible, the user provides a MiB/s limit value.
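For orientation, a user-space sketch of how the interface is exercised; it assumes resctrl is mounted with the mba_MBps option at /sys/fs/resctrl and uses the root CTRL_MON group, and the write at the end relies on the handler added by the next patch in this series:

#include <stdio.h>

int main(void)
{
	/* Assumes: mount -t resctrl resctrl -o mba_MBps /sys/fs/resctrl
	 * The file only exists while "mba_MBps" is in use. */
	const char *path = "/sys/fs/resctrl/mba_MBps_event";
	char event[64];
	FILE *f;

	/* Show which monitoring event feeds the feedback loop. */
	f = fopen(path, "r");
	if (!f || !fgets(event, sizeof(event), f)) {
		perror(path);
		return 1;
	}
	fclose(f);
	printf("feedback-loop input event: %s", event);

	/* Switch the feedback loop to total bandwidth; the write
	 * handler requires the trailing newline. */
	f = fopen(path, "w");
	if (!f || fputs("mbm_total_bytes\n", f) == EOF) {
		perror(path);
		return 1;
	}
	return fclose(f) ? 1 : 0;
}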
In preparation to allow the user to pick the memory bandwidth monitoring event used as input to the feedback loop, provide a file in each CTRL_MON group directory that shows the event currently in use. Note that this file is only visible when the "mba_MBps" mount option is in use. Suggested-by: Reinette Chatre Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Tested-by: Babu Moger Link: https://lore.kernel.org/r/20241206163148.83828-7-tony.luck@intel.com --- arch/x86/kernel/cpu/resctrl/ctrlmondata.c | 30 +++++++++++++++++++++++ arch/x86/kernel/cpu/resctrl/internal.h | 2 ++ arch/x86/kernel/cpu/resctrl/rdtgroup.c | 10 ++++++++ 3 files changed, 42 insertions(+) diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index 200d89a64027..5fa37b4ecc7a 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -518,6 +518,36 @@ static int smp_mon_event_count(void *arg) return 0; } +int rdtgroup_mba_mbps_event_show(struct kernfs_open_file *of, + struct seq_file *s, void *v) +{ + struct rdtgroup *rdtgrp; + int ret = 0; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + + if (rdtgrp) { + switch (rdtgrp->mba_mbps_event) { + case QOS_L3_MBM_LOCAL_EVENT_ID: + seq_puts(s, "mbm_local_bytes\n"); + break; + case QOS_L3_MBM_TOTAL_EVENT_ID: + seq_puts(s, "mbm_total_bytes\n"); + break; + default: + pr_warn_once("Bad event %d\n", rdtgrp->mba_mbps_event); + ret = -EINVAL; + break; + } + } else { + ret = -ENOENT; + } + + rdtgroup_kn_unlock(of->kn); + + return ret; +} + void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, struct rdt_mon_domain *d, struct rdtgroup *rdtgrp, cpumask_t *cpumask, int evtid, int first) diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 542d01c055aa..1bd61edbaf12 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -610,6 +610,8 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off); int rdtgroup_schemata_show(struct kernfs_open_file *of, struct seq_file *s, void *v); +int rdtgroup_mba_mbps_event_show(struct kernfs_open_file *of, + struct seq_file *s, void *v); bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_ctrl_domain *d, unsigned long cbm, int closid, bool exclusive); unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, struct rdt_ctrl_domain *d, diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 0659b8e2a71b..6eb930b8bdfd 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -1950,6 +1950,12 @@ static struct rftype res_common_files[] = { .seq_show = rdtgroup_schemata_show, .fflags = RFTYPE_CTRL_BASE, }, + { + .name = "mba_MBps_event", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdtgroup_mba_mbps_event_show, + }, { .name = "mode", .mode = 0644, @@ -2355,6 +2361,7 @@ static int set_mba_sc(bool mba_sc) struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl; u32 num_closid = resctrl_arch_get_num_closid(r); struct rdt_ctrl_domain *d; + unsigned long fflags; int i; if (!supports_mba_mbps() || mba_sc == is_mba_sc(r)) @@ -2369,6 +2376,9 @@ static int set_mba_sc(bool mba_sc) d->mbps_val[i] = MBA_MAX_MBPS; } + fflags = mba_sc ? 
RFTYPE_CTRL_BASE | RFTYPE_MON_BASE : 0; + resctrl_file_fflags_init("mba_MBps_event", fflags); + return 0; } From 8e931105acae688ff0fc8f875a6c05e5aed8ab79 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 6 Dec 2024 08:31:47 -0800 Subject: [PATCH 123/224] x86/resctrl: Add write option to "mba_MBps_event" file The "mba_MBps" mount option provides an alternate method to control memory bandwidth. Instead of specifying allowable bandwidth as a percentage of maximum possible, the user provides a MiB/s limit value. There is a file in each CTRL_MON group directory that shows the event currently in use. Allow writing that file to choose a different event. A user can choose any of the memory bandwidth monitoring events listed in /sys/fs/resctrl/info/L3_mon/mon_features independently for each CTRL_MON group by writing to each of the "mba_MBps_event" files. Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Tested-by: Babu Moger Link: https://lore.kernel.org/r/20241206163148.83828-8-tony.luck@intel.com --- arch/x86/kernel/cpu/resctrl/ctrlmondata.c | 40 +++++++++++++++++++++++ arch/x86/kernel/cpu/resctrl/internal.h | 2 ++ arch/x86/kernel/cpu/resctrl/rdtgroup.c | 3 +- 3 files changed, 44 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index 5fa37b4ecc7a..536351159cc2 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -518,6 +518,46 @@ static int smp_mon_event_count(void *arg) return 0; } +ssize_t rdtgroup_mba_mbps_event_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct rdtgroup *rdtgrp; + int ret = 0; + + /* Valid input requires a trailing newline */ + if (nbytes == 0 || buf[nbytes - 1] != '\n') + return -EINVAL; + buf[nbytes - 1] = '\0'; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (!rdtgrp) { + rdtgroup_kn_unlock(of->kn); + return -ENOENT; + } + rdt_last_cmd_clear(); + + if (!strcmp(buf, "mbm_local_bytes")) { + if (is_mbm_local_enabled()) + rdtgrp->mba_mbps_event = QOS_L3_MBM_LOCAL_EVENT_ID; + else + ret = -EINVAL; + } else if (!strcmp(buf, "mbm_total_bytes")) { + if (is_mbm_total_enabled()) + rdtgrp->mba_mbps_event = QOS_L3_MBM_TOTAL_EVENT_ID; + else + ret = -EINVAL; + } else { + ret = -EINVAL; + } + + if (ret) + rdt_last_cmd_printf("Unsupported event id '%s'\n", buf); + + rdtgroup_kn_unlock(of->kn); + + return ret ?: nbytes; +} + int rdtgroup_mba_mbps_event_show(struct kernfs_open_file *of, struct seq_file *s, void *v) { diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 1bd61edbaf12..20c898f09b7e 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -610,6 +610,8 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off); int rdtgroup_schemata_show(struct kernfs_open_file *of, struct seq_file *s, void *v); +ssize_t rdtgroup_mba_mbps_event_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off); int rdtgroup_mba_mbps_event_show(struct kernfs_open_file *of, struct seq_file *s, void *v); bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_ctrl_domain *d, diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 6eb930b8bdfd..6419e04d8a7b 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -1952,8 +1952,9 @@ static struct rftype 
res_common_files[] = { }, { .name = "mba_MBps_event", - .mode = 0444, + .mode = 0644, .kf_ops = &rdtgroup_kf_single_ops, + .write = rdtgroup_mba_mbps_event_write, .seq_show = rdtgroup_mba_mbps_event_show, }, { From faf6ef673787956ec4d33ac8bf56f8ea929abf37 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 6 Dec 2024 08:31:48 -0800 Subject: [PATCH 124/224] x86/resctrl: Document the new "mba_MBps_event" file Add a section to document a new read/write file that shows/sets the memory bandwidth event used to control bandwidth used by each CTRL_MON group. Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Tested-by: Babu Moger Link: https://lore.kernel.org/r/20241206163148.83828-9-tony.luck@intel.com --- Documentation/arch/x86/resctrl.rst | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/Documentation/arch/x86/resctrl.rst b/Documentation/arch/x86/resctrl.rst index a824affd741d..6768fc1fad16 100644 --- a/Documentation/arch/x86/resctrl.rst +++ b/Documentation/arch/x86/resctrl.rst @@ -384,6 +384,16 @@ When monitoring is enabled all MON groups will also contain: Available only with debug option. The identifier used by hardware for the monitor group. On x86 this is the RMID. +When the "mba_MBps" mount option is used all CTRL_MON groups will also contain: + +"mba_MBps_event": + Reading this file shows which memory bandwidth event is used + as input to the software feedback loop that keeps memory bandwidth + below the value specified in the schemata file. Writing the + name of one of the supported memory bandwidth events found in + /sys/fs/resctrl/info/L3_MON/mon_features changes the input + event. + Resource allocation rules ------------------------- From 3e43c60eb3e3779e88635d45400f7387ec732c07 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 2 Dec 2024 14:50:46 -0600 Subject: [PATCH 125/224] x86/sev: Prepare for using the RMPREAD instruction to access the RMP The RMPREAD instruction returns an architecture defined format of an RMP entry. This is the preferred method for examining RMP entries. In preparation for using the RMPREAD instruction, convert the existing code that directly accesses the RMP to map the raw RMP information into the architecture defined format. RMPREAD output returns a status bit for the 2MB region status. If the input page address is 2MB aligned and any other pages within the 2MB region are assigned, then 2MB region status will be set to 1. Otherwise, the 2MB region status will be set to 0. For systems that do not support RMPREAD, calculating this value would require looping over all of the RMP table entries within that range until one is found with the assigned bit set. Since this bit is not defined in the current format, and so not used today, do not incur the overhead associated with calculating it. Signed-off-by: Tom Lendacky Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Nikunj A Dadhania Reviewed-by: Neeraj Upadhyay Link: https://lore.kernel.org/r/da49d5af1eb7f9039f35f14a32ca091efb2dd818.1733172653.git.thomas.lendacky@amd.com --- arch/x86/virt/svm/sev.c | 144 ++++++++++++++++++++++++++++------------ 1 file changed, 100 insertions(+), 44 deletions(-) diff --git a/arch/x86/virt/svm/sev.c b/arch/x86/virt/svm/sev.c index 9a6a943d8e41..cf64e9384ea0 100644 --- a/arch/x86/virt/svm/sev.c +++ b/arch/x86/virt/svm/sev.c @@ -31,10 +31,29 @@ #include /* - * The RMP entry format is not architectural. The format is defined in PPR - * Family 19h Model 01h, Rev B1 processor. 
+ * The RMP entry information as returned by the RMPREAD instruction. */ struct rmpentry { + u64 gpa; + u8 assigned :1, + rsvd1 :7; + u8 pagesize :1, + hpage_region_status :1, + rsvd2 :6; + u8 immutable :1, + rsvd3 :7; + u8 rsvd4; + u32 asid; +} __packed; + +/* + * The raw RMP entry format is not architectural. The format is defined in PPR + * Family 19h Model 01h, Rev B1 processor. This format represents the actual + * entry in the RMP table memory. The bitfield definitions are used for machines + * without the RMPREAD instruction (Zen3 and Zen4), otherwise the "hi" and "lo" + * fields are only used for dumping the raw data. + */ +struct rmpentry_raw { union { struct { u64 assigned : 1, @@ -62,7 +81,7 @@ struct rmpentry { #define PFN_PMD_MASK GENMASK_ULL(63, PMD_SHIFT - PAGE_SHIFT) static u64 probed_rmp_base, probed_rmp_size; -static struct rmpentry *rmptable __ro_after_init; +static struct rmpentry_raw *rmptable __ro_after_init; static u64 rmptable_max_pfn __ro_after_init; static LIST_HEAD(snp_leaked_pages_list); @@ -249,8 +268,8 @@ skip_enable: rmptable_start += RMPTABLE_CPU_BOOKKEEPING_SZ; rmptable_size = probed_rmp_size - RMPTABLE_CPU_BOOKKEEPING_SZ; - rmptable = (struct rmpentry *)rmptable_start; - rmptable_max_pfn = rmptable_size / sizeof(struct rmpentry) - 1; + rmptable = (struct rmpentry_raw *)rmptable_start; + rmptable_max_pfn = rmptable_size / sizeof(struct rmpentry_raw) - 1; cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/rmptable_init:online", __snp_enable, NULL); @@ -272,48 +291,77 @@ nosnp: */ device_initcall(snp_rmptable_init); -static struct rmpentry *get_rmpentry(u64 pfn) +static struct rmpentry_raw *get_raw_rmpentry(u64 pfn) { - if (WARN_ON_ONCE(pfn > rmptable_max_pfn)) - return ERR_PTR(-EFAULT); - - return &rmptable[pfn]; -} - -static struct rmpentry *__snp_lookup_rmpentry(u64 pfn, int *level) -{ - struct rmpentry *large_entry, *entry; - - if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) + if (!rmptable) return ERR_PTR(-ENODEV); - entry = get_rmpentry(pfn); - if (IS_ERR(entry)) - return entry; + if (unlikely(pfn > rmptable_max_pfn)) + return ERR_PTR(-EFAULT); + + return rmptable + pfn; +} + +static int get_rmpentry(u64 pfn, struct rmpentry *e) +{ + struct rmpentry_raw *e_raw; + + e_raw = get_raw_rmpentry(pfn); + if (IS_ERR(e_raw)) + return PTR_ERR(e_raw); + + /* + * Map the raw RMP table entry onto the RMPREAD output format. + * The 2MB region status indicator (hpage_region_status field) is not + * calculated, since the overhead could be significant and the field + * is not used. + */ + memset(e, 0, sizeof(*e)); + e->gpa = e_raw->gpa << PAGE_SHIFT; + e->asid = e_raw->asid; + e->assigned = e_raw->assigned; + e->pagesize = e_raw->pagesize; + e->immutable = e_raw->immutable; + + return 0; +} + +static int __snp_lookup_rmpentry(u64 pfn, struct rmpentry *e, int *level) +{ + struct rmpentry e_large; + int ret; + + if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) + return -ENODEV; + + ret = get_rmpentry(pfn, e); + if (ret) + return ret; /* * Find the authoritative RMP entry for a PFN. This can be either a 4K * RMP entry or a special large RMP entry that is authoritative for a * whole 2M area. 
*/ - large_entry = get_rmpentry(pfn & PFN_PMD_MASK); - if (IS_ERR(large_entry)) - return large_entry; + ret = get_rmpentry(pfn & PFN_PMD_MASK, &e_large); + if (ret) + return ret; - *level = RMP_TO_PG_LEVEL(large_entry->pagesize); + *level = RMP_TO_PG_LEVEL(e_large.pagesize); - return entry; + return 0; } int snp_lookup_rmpentry(u64 pfn, bool *assigned, int *level) { - struct rmpentry *e; + struct rmpentry e; + int ret; - e = __snp_lookup_rmpentry(pfn, level); - if (IS_ERR(e)) - return PTR_ERR(e); + ret = __snp_lookup_rmpentry(pfn, &e, level); + if (ret) + return ret; - *assigned = !!e->assigned; + *assigned = !!e.assigned; return 0; } EXPORT_SYMBOL_GPL(snp_lookup_rmpentry); @@ -326,20 +374,28 @@ EXPORT_SYMBOL_GPL(snp_lookup_rmpentry); */ static void dump_rmpentry(u64 pfn) { + struct rmpentry_raw *e_raw; u64 pfn_i, pfn_end; - struct rmpentry *e; - int level; + struct rmpentry e; + int level, ret; - e = __snp_lookup_rmpentry(pfn, &level); - if (IS_ERR(e)) { - pr_err("Failed to read RMP entry for PFN 0x%llx, error %ld\n", - pfn, PTR_ERR(e)); + ret = __snp_lookup_rmpentry(pfn, &e, &level); + if (ret) { + pr_err("Failed to read RMP entry for PFN 0x%llx, error %d\n", + pfn, ret); return; } - if (e->assigned) { + if (e.assigned) { + e_raw = get_raw_rmpentry(pfn); + if (IS_ERR(e_raw)) { + pr_err("Failed to read RMP contents for PFN 0x%llx, error %ld\n", + pfn, PTR_ERR(e_raw)); + return; + } + pr_info("PFN 0x%llx, RMP entry: [0x%016llx - 0x%016llx]\n", - pfn, e->lo, e->hi); + pfn, e_raw->lo, e_raw->hi); return; } @@ -358,16 +414,16 @@ static void dump_rmpentry(u64 pfn) pfn, pfn_i, pfn_end); while (pfn_i < pfn_end) { - e = __snp_lookup_rmpentry(pfn_i, &level); - if (IS_ERR(e)) { - pr_err("Error %ld reading RMP entry for PFN 0x%llx\n", - PTR_ERR(e), pfn_i); + e_raw = get_raw_rmpentry(pfn_i); + if (IS_ERR(e_raw)) { + pr_err("Error %ld reading RMP contents for PFN 0x%llx\n", + PTR_ERR(e_raw), pfn_i); pfn_i++; continue; } - if (e->lo || e->hi) - pr_info("PFN: 0x%llx, [0x%016llx - 0x%016llx]\n", pfn_i, e->lo, e->hi); + if (e_raw->lo || e_raw->hi) + pr_info("PFN: 0x%llx, [0x%016llx - 0x%016llx]\n", pfn_i, e_raw->lo, e_raw->hi); pfn_i++; } } From 0cbc0258415814c86eb6db50237ae3d90fbf3b3d Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 2 Dec 2024 14:50:47 -0600 Subject: [PATCH 126/224] x86/sev: Add support for the RMPREAD instruction The RMPREAD instruction returns an architecture defined format of an RMP table entry. This is the preferred method for examining RMP entries. The instruction is advertised in CPUID 0x8000001f_EAX[21]. Use this instruction when available. 
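A minimal user-space sketch of the feature check (leaf 0x8000001f, EAX bit 21, taken from the commit text); it only probes the CPUID flag and does not attempt to execute RMPREAD itself:

#include <cpuid.h>
#include <stdbool.h>
#include <stdio.h>

static bool cpu_has_rmpread(void)
{
	unsigned int eax, ebx, ecx, edx;

	/* __get_cpuid() validates the maximum extended leaf first. */
	if (!__get_cpuid(0x8000001f, &eax, &ebx, &ecx, &edx))
		return false;

	return eax & (1u << 21);
}

int main(void)
{
	printf("RMPREAD %ssupported\n", cpu_has_rmpread() ? "" : "not ");
	return 0;
}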
Signed-off-by: Tom Lendacky Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Nikunj A Dadhania Reviewed-by: Neeraj Upadhyay Reviewed-by: Ashish Kalra Link: https://lore.kernel.org/r/72c734ac8b324bbc0c839b2c093a11af4a8881fa.1733172653.git.thomas.lendacky@amd.com --- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/virt/svm/sev.c | 12 ++++++++++++ 2 files changed, 13 insertions(+) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 17b6590748c0..5535edc6e8d7 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -451,6 +451,7 @@ #define X86_FEATURE_V_TSC_AUX (19*32+ 9) /* Virtual TSC_AUX */ #define X86_FEATURE_SME_COHERENT (19*32+10) /* AMD hardware-enforced cache coherency */ #define X86_FEATURE_DEBUG_SWAP (19*32+14) /* "debug_swap" AMD SEV-ES full debug state swap support */ +#define X86_FEATURE_RMPREAD (19*32+21) /* RMPREAD instruction */ #define X86_FEATURE_SVSM (19*32+28) /* "svsm" SVSM present */ /* AMD-defined Extended Feature 2 EAX, CPUID level 0x80000021 (EAX), word 20 */ diff --git a/arch/x86/virt/svm/sev.c b/arch/x86/virt/svm/sev.c index cf64e9384ea0..18191cbd78c5 100644 --- a/arch/x86/virt/svm/sev.c +++ b/arch/x86/virt/svm/sev.c @@ -306,6 +306,18 @@ static int get_rmpentry(u64 pfn, struct rmpentry *e) { struct rmpentry_raw *e_raw; + if (cpu_feature_enabled(X86_FEATURE_RMPREAD)) { + int ret; + + /* Binutils version 2.44 supports the RMPREAD mnemonic. */ + asm volatile(".byte 0xf2, 0x0f, 0x01, 0xfd" + : "=a" (ret) + : "a" (pfn << PAGE_SHIFT), "c" (e) + : "memory", "cc"); + + return ret; + } + e_raw = get_raw_rmpentry(pfn); if (IS_ERR(e_raw)) return PTR_ERR(e_raw); From 4972808d6f4a2b4c10eb3035d769f2e1a003da2f Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 2 Dec 2024 14:50:48 -0600 Subject: [PATCH 127/224] x86/sev: Require the RMPREAD instruction after Zen4 Limit usage of the non-architectural RMP format to Zen3/Zen4 processors. The RMPREAD instruction, with architectural defined output, is available and should be used for RMP access beyond Zen4. Signed-off-by: Tom Lendacky Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Nikunj A Dadhania Reviewed-by: Neeraj Upadhyay Reviewed-by: Ashish Kalra Link: https://lore.kernel.org/r/5be0093e091778a151266ea853352f62f838eb99.1733172653.git.thomas.lendacky@amd.com --- arch/x86/kernel/cpu/amd.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 79d2e17f6582..b9592c60166e 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -355,10 +355,15 @@ static void bsp_determine_snp(struct cpuinfo_x86 *c) /* * RMP table entry format is not architectural and is defined by the * per-processor PPR. Restrict SNP support on the known CPU models - * for which the RMP table entry format is currently defined for. + * for which the RMP table entry format is currently defined or for + * processors which support the architecturally defined RMPREAD + * instruction. 
*/ if (!cpu_has(c, X86_FEATURE_HYPERVISOR) && - c->x86 >= 0x19 && snp_probe_rmptable_info()) { + (cpu_feature_enabled(X86_FEATURE_ZEN3) || + cpu_feature_enabled(X86_FEATURE_ZEN4) || + cpu_feature_enabled(X86_FEATURE_RMPREAD)) && + snp_probe_rmptable_info()) { cc_platform_set(CC_ATTR_HOST_SEV_SNP); } else { setup_clear_cpu_cap(X86_FEATURE_SEV_SNP); From e2f3d40df82eeb70f6c3602418bca63c54183776 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 2 Dec 2024 14:50:49 -0600 Subject: [PATCH 128/224] x86/sev: Move the SNP probe routine out of the way To make patch review easier for the segmented RMP support, move the SNP probe function out from in between the initialization-related routines. No functional change. Signed-off-by: Tom Lendacky Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Nikunj A Dadhania Reviewed-by: Neeraj Upadhyay Link: https://lore.kernel.org/r/6c2975bbf132d567dd12e1435be1d18c0bf9131c.1733172653.git.thomas.lendacky@amd.com --- arch/x86/virt/svm/sev.c | 60 ++++++++++++++++++++--------------------- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/arch/x86/virt/svm/sev.c b/arch/x86/virt/svm/sev.c index 18191cbd78c5..0df378951ac7 100644 --- a/arch/x86/virt/svm/sev.c +++ b/arch/x86/virt/svm/sev.c @@ -135,36 +135,6 @@ static __init void snp_enable(void *arg) __snp_enable(smp_processor_id()); } -#define RMP_ADDR_MASK GENMASK_ULL(51, 13) - -bool snp_probe_rmptable_info(void) -{ - u64 rmp_sz, rmp_base, rmp_end; - - rdmsrl(MSR_AMD64_RMP_BASE, rmp_base); - rdmsrl(MSR_AMD64_RMP_END, rmp_end); - - if (!(rmp_base & RMP_ADDR_MASK) || !(rmp_end & RMP_ADDR_MASK)) { - pr_err("Memory for the RMP table has not been reserved by BIOS\n"); - return false; - } - - if (rmp_base > rmp_end) { - pr_err("RMP configuration not valid: base=%#llx, end=%#llx\n", rmp_base, rmp_end); - return false; - } - - rmp_sz = rmp_end - rmp_base + 1; - - probed_rmp_base = rmp_base; - probed_rmp_size = rmp_sz; - - pr_info("RMP table physical range [0x%016llx - 0x%016llx]\n", - rmp_base, rmp_end); - - return true; -} - static void __init __snp_fixup_e820_tables(u64 pa) { if (IS_ALIGNED(pa, PMD_SIZE)) @@ -291,6 +261,36 @@ nosnp: */ device_initcall(snp_rmptable_init); +#define RMP_ADDR_MASK GENMASK_ULL(51, 13) + +bool snp_probe_rmptable_info(void) +{ + u64 rmp_sz, rmp_base, rmp_end; + + rdmsrl(MSR_AMD64_RMP_BASE, rmp_base); + rdmsrl(MSR_AMD64_RMP_END, rmp_end); + + if (!(rmp_base & RMP_ADDR_MASK) || !(rmp_end & RMP_ADDR_MASK)) { + pr_err("Memory for the RMP table has not been reserved by BIOS\n"); + return false; + } + + if (rmp_base > rmp_end) { + pr_err("RMP configuration not valid: base=%#llx, end=%#llx\n", rmp_base, rmp_end); + return false; + } + + rmp_sz = rmp_end - rmp_base + 1; + + probed_rmp_base = rmp_base; + probed_rmp_size = rmp_sz; + + pr_info("RMP table physical range [0x%016llx - 0x%016llx]\n", + rmp_base, rmp_end); + + return true; +} + static struct rmpentry_raw *get_raw_rmpentry(u64 pfn) { if (!rmptable) From ac517965a5a12d685f1e7a7f77e64503167f87d5 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 2 Dec 2024 14:50:50 -0600 Subject: [PATCH 129/224] x86/sev: Map only the RMP table entries instead of the full RMP range In preparation for support of a segmented RMP table, map only the RMP table entries. The RMP bookkeeping area is only ever accessed when first enabling SNP and does not need to remain mapped. To accomplish this, split the initialization of the RMP bookkeeping area and the initialization of the RMP entry area. 
The RMP bookkeeping area will be mapped only while it is being initialized. Signed-off-by: Tom Lendacky Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Nikunj A Dadhania Reviewed-by: Neeraj Upadhyay Reviewed-by: Ashish Kalra Link: https://lore.kernel.org/r/22f179998d319834f49c13a8c01187fbf0fd308d.1733172653.git.thomas.lendacky@amd.com --- arch/x86/virt/svm/sev.c | 36 +++++++++++++++++++++++++++++++----- 1 file changed, 31 insertions(+), 5 deletions(-) diff --git a/arch/x86/virt/svm/sev.c b/arch/x86/virt/svm/sev.c index 0df378951ac7..2899c2e28db9 100644 --- a/arch/x86/virt/svm/sev.c +++ b/arch/x86/virt/svm/sev.c @@ -173,6 +173,23 @@ void __init snp_fixup_e820_tables(void) __snp_fixup_e820_tables(probed_rmp_base + probed_rmp_size); } +static bool __init clear_rmptable_bookkeeping(void) +{ + void *bk; + + bk = memremap(probed_rmp_base, RMPTABLE_CPU_BOOKKEEPING_SZ, MEMREMAP_WB); + if (!bk) { + pr_err("Failed to map RMP bookkeeping area\n"); + return false; + } + + memset(bk, 0, RMPTABLE_CPU_BOOKKEEPING_SZ); + + memunmap(bk); + + return true; +} + /* * Do the necessary preparations which are verified by the firmware as * described in the SNP_INIT_EX firmware command description in the SNP @@ -210,12 +227,17 @@ static int __init snp_rmptable_init(void) goto nosnp; } - rmptable_start = memremap(probed_rmp_base, probed_rmp_size, MEMREMAP_WB); + /* Map only the RMP entries */ + rmptable_start = memremap(probed_rmp_base + RMPTABLE_CPU_BOOKKEEPING_SZ, + probed_rmp_size - RMPTABLE_CPU_BOOKKEEPING_SZ, + MEMREMAP_WB); if (!rmptable_start) { pr_err("Failed to map RMP table\n"); goto nosnp; } + rmptable_size = probed_rmp_size - RMPTABLE_CPU_BOOKKEEPING_SZ; + /* * Check if SEV-SNP is already enabled, this can happen in case of * kexec boot. @@ -224,7 +246,14 @@ static int __init snp_rmptable_init(void) if (val & MSR_AMD64_SYSCFG_SNP_EN) goto skip_enable; - memset(rmptable_start, 0, probed_rmp_size); + /* Zero out the RMP bookkeeping area */ + if (!clear_rmptable_bookkeeping()) { + memunmap(rmptable_start); + goto nosnp; + } + + /* Zero out the RMP entries */ + memset(rmptable_start, 0, rmptable_size); /* Flush the caches to ensure that data is written before SNP is enabled. */ wbinvd_on_all_cpus(); @@ -235,9 +264,6 @@ static int __init snp_rmptable_init(void) on_each_cpu(snp_enable, NULL, 1); skip_enable: - rmptable_start += RMPTABLE_CPU_BOOKKEEPING_SZ; - rmptable_size = probed_rmp_size - RMPTABLE_CPU_BOOKKEEPING_SZ; - rmptable = (struct rmpentry_raw *)rmptable_start; rmptable_max_pfn = rmptable_size / sizeof(struct rmpentry_raw) - 1; From 0f14af0d1d7df0086b1be98d2cea1cad4b8c826f Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 2 Dec 2024 14:50:51 -0600 Subject: [PATCH 130/224] x86/sev: Treat the contiguous RMP table as a single RMP segment In preparation for support of a segmented RMP table, treat the contiguous RMP table as a segmented RMP table with a single segment covering all of memory. By treating a contiguous RMP table as a single segment, much of the code that initializes and accesses the RMP can be re-used. Segmented RMP tables can have up to 512 segment entries. Each segment will have metadata associated with it to identify the segment location, the segment size, etc. The segment data and the physical address are used to determine the index of the segment within the table and then the RMP entry within the segment. For an actual segmented RMP table environment, much of the segment information will come from a configuration MSR. 
For the contiguous RMP, though, much of the information will be statically defined. [ bp: Touchups, explain array_index_nospec() usage. ] Signed-off-by: Tom Lendacky Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Nikunj A Dadhania Reviewed-by: Neeraj Upadhyay Link: https://lore.kernel.org/r/8c40fbc9c5217f0d79b37cf861eff03ab0330bef.1733172653.git.thomas.lendacky@amd.com --- arch/x86/virt/svm/sev.c | 199 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 180 insertions(+), 19 deletions(-) diff --git a/arch/x86/virt/svm/sev.c b/arch/x86/virt/svm/sev.c index 2899c2e28db9..e50b71c67fab 100644 --- a/arch/x86/virt/svm/sev.c +++ b/arch/x86/virt/svm/sev.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -77,12 +78,42 @@ struct rmpentry_raw { */ #define RMPTABLE_CPU_BOOKKEEPING_SZ 0x4000 +/* + * For a non-segmented RMP table, use the maximum physical addressing as the + * segment size in order to always arrive at index 0 in the table. + */ +#define RMPTABLE_NON_SEGMENTED_SHIFT 52 + +struct rmp_segment_desc { + struct rmpentry_raw *rmp_entry; + u64 max_index; + u64 size; +}; + +/* + * Segmented RMP Table support. + * - The segment size is used for two purposes: + * - Identify the amount of memory covered by an RMP segment + * - Quickly locate an RMP segment table entry for a physical address + * + * - The RMP segment table contains pointers to an RMP table that covers + * a specific portion of memory. There can be up to 512 8-byte entries, + * one pages worth. + */ +static struct rmp_segment_desc **rmp_segment_table __ro_after_init; +static unsigned int rst_max_index __ro_after_init = 512; + +static unsigned int rmp_segment_shift; +static u64 rmp_segment_size; +static u64 rmp_segment_mask; + +#define RST_ENTRY_INDEX(x) ((x) >> rmp_segment_shift) +#define RMP_ENTRY_INDEX(x) ((u64)(PHYS_PFN((x) & rmp_segment_mask))) + /* Mask to apply to a PFN to get the first PFN of a 2MB page */ #define PFN_PMD_MASK GENMASK_ULL(63, PMD_SHIFT - PAGE_SHIFT) static u64 probed_rmp_base, probed_rmp_size; -static struct rmpentry_raw *rmptable __ro_after_init; -static u64 rmptable_max_pfn __ro_after_init; static LIST_HEAD(snp_leaked_pages_list); static DEFINE_SPINLOCK(snp_leaked_pages_list_lock); @@ -190,6 +221,92 @@ static bool __init clear_rmptable_bookkeeping(void) return true; } +static bool __init alloc_rmp_segment_desc(u64 segment_pa, u64 segment_size, u64 pa) +{ + u64 rst_index, rmp_segment_size_max; + struct rmp_segment_desc *desc; + void *rmp_segment; + + /* Calculate the maximum size an RMP can be (16 bytes/page mapped) */ + rmp_segment_size_max = PHYS_PFN(rmp_segment_size) << 4; + + /* Validate the RMP segment size */ + if (segment_size > rmp_segment_size_max) { + pr_err("Invalid RMP size 0x%llx for configured segment size 0x%llx\n", + segment_size, rmp_segment_size_max); + return false; + } + + /* Validate the RMP segment table index */ + rst_index = RST_ENTRY_INDEX(pa); + if (rst_index >= rst_max_index) { + pr_err("Invalid RMP segment base address 0x%llx for configured segment size 0x%llx\n", + pa, rmp_segment_size); + return false; + } + + if (rmp_segment_table[rst_index]) { + pr_err("RMP segment descriptor already exists at index %llu\n", rst_index); + return false; + } + + rmp_segment = memremap(segment_pa, segment_size, MEMREMAP_WB); + if (!rmp_segment) { + pr_err("Failed to map RMP segment addr 0x%llx size 0x%llx\n", + segment_pa, segment_size); + return false; + } + + desc = kzalloc(sizeof(*desc), GFP_KERNEL); + if (!desc) { + memunmap(rmp_segment); + return false; + } 
+ + desc->rmp_entry = rmp_segment; + desc->max_index = segment_size / sizeof(*desc->rmp_entry); + desc->size = segment_size; + + rmp_segment_table[rst_index] = desc; + + return true; +} + +static void __init free_rmp_segment_table(void) +{ + unsigned int i; + + for (i = 0; i < rst_max_index; i++) { + struct rmp_segment_desc *desc; + + desc = rmp_segment_table[i]; + if (!desc) + continue; + + memunmap(desc->rmp_entry); + + kfree(desc); + } + + free_page((unsigned long)rmp_segment_table); + + rmp_segment_table = NULL; +} + +/* Allocate the table used to index into the RMP segments */ +static bool __init alloc_rmp_segment_table(void) +{ + struct page *page; + + page = alloc_page(__GFP_ZERO); + if (!page) + return false; + + rmp_segment_table = page_address(page); + + return true; +} + /* * Do the necessary preparations which are verified by the firmware as * described in the SNP_INIT_EX firmware command description in the SNP @@ -197,8 +314,8 @@ static bool __init clear_rmptable_bookkeeping(void) */ static int __init snp_rmptable_init(void) { - u64 max_rmp_pfn, calc_rmp_sz, rmptable_size, rmp_end, val; - void *rmptable_start; + u64 max_rmp_pfn, calc_rmp_sz, rmptable_segment, rmptable_size, rmp_end, val; + unsigned int i; if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) return 0; @@ -227,17 +344,18 @@ static int __init snp_rmptable_init(void) goto nosnp; } + if (!alloc_rmp_segment_table()) + goto nosnp; + /* Map only the RMP entries */ - rmptable_start = memremap(probed_rmp_base + RMPTABLE_CPU_BOOKKEEPING_SZ, - probed_rmp_size - RMPTABLE_CPU_BOOKKEEPING_SZ, - MEMREMAP_WB); - if (!rmptable_start) { - pr_err("Failed to map RMP table\n"); + rmptable_segment = probed_rmp_base + RMPTABLE_CPU_BOOKKEEPING_SZ; + rmptable_size = probed_rmp_size - RMPTABLE_CPU_BOOKKEEPING_SZ; + + if (!alloc_rmp_segment_desc(rmptable_segment, rmptable_size, 0)) { + free_rmp_segment_table(); goto nosnp; } - rmptable_size = probed_rmp_size - RMPTABLE_CPU_BOOKKEEPING_SZ; - /* * Check if SEV-SNP is already enabled, this can happen in case of * kexec boot. @@ -248,12 +366,20 @@ static int __init snp_rmptable_init(void) /* Zero out the RMP bookkeeping area */ if (!clear_rmptable_bookkeeping()) { - memunmap(rmptable_start); + free_rmp_segment_table(); goto nosnp; } /* Zero out the RMP entries */ - memset(rmptable_start, 0, rmptable_size); + for (i = 0; i < rst_max_index; i++) { + struct rmp_segment_desc *desc; + + desc = rmp_segment_table[i]; + if (!desc) + continue; + + memset(desc->rmp_entry, 0, desc->size); + } /* Flush the caches to ensure that data is written before SNP is enabled. 
*/ wbinvd_on_all_cpus(); @@ -264,9 +390,6 @@ static int __init snp_rmptable_init(void) on_each_cpu(snp_enable, NULL, 1); skip_enable: - rmptable = (struct rmpentry_raw *)rmptable_start; - rmptable_max_pfn = rmptable_size / sizeof(struct rmpentry_raw) - 1; - cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/rmptable_init:online", __snp_enable, NULL); /* @@ -287,6 +410,13 @@ nosnp: */ device_initcall(snp_rmptable_init); +static void set_rmp_segment_info(unsigned int segment_shift) +{ + rmp_segment_shift = segment_shift; + rmp_segment_size = 1ULL << rmp_segment_shift; + rmp_segment_mask = rmp_segment_size - 1; +} + #define RMP_ADDR_MASK GENMASK_ULL(51, 13) bool snp_probe_rmptable_info(void) @@ -308,6 +438,11 @@ bool snp_probe_rmptable_info(void) rmp_sz = rmp_end - rmp_base + 1; + /* Treat the contiguous RMP table as a single segment */ + rst_max_index = 1; + + set_rmp_segment_info(RMPTABLE_NON_SEGMENTED_SHIFT); + probed_rmp_base = rmp_base; probed_rmp_size = rmp_sz; @@ -317,15 +452,41 @@ bool snp_probe_rmptable_info(void) return true; } +/* + * About the array_index_nospec() usage below: + * + * This function can get called by exported functions like + * snp_lookup_rmpentry(), which is used by the KVM #PF handler, among + * others, and since the @pfn passed in cannot always be trusted, + * speculation should be stopped as a protective measure. + */ static struct rmpentry_raw *get_raw_rmpentry(u64 pfn) { - if (!rmptable) + u64 paddr, rst_index, segment_index; + struct rmp_segment_desc *desc; + + if (!rmp_segment_table) return ERR_PTR(-ENODEV); - if (unlikely(pfn > rmptable_max_pfn)) + paddr = pfn << PAGE_SHIFT; + + rst_index = RST_ENTRY_INDEX(paddr); + if (unlikely(rst_index >= rst_max_index)) return ERR_PTR(-EFAULT); - return rmptable + pfn; + rst_index = array_index_nospec(rst_index, rst_max_index); + + desc = rmp_segment_table[rst_index]; + if (unlikely(!desc)) + return ERR_PTR(-EFAULT); + + segment_index = RMP_ENTRY_INDEX(paddr); + if (unlikely(segment_index >= desc->max_index)) + return ERR_PTR(-EFAULT); + + segment_index = array_index_nospec(segment_index, desc->max_index); + + return desc->rmp_entry + segment_index; } static int get_rmpentry(u64 pfn, struct rmpentry *e) From 8ae3291f773befee8fdeae11b0b1b5d380e4dfb6 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Fri, 13 Dec 2024 08:55:42 -0600 Subject: [PATCH 131/224] x86/sev: Add full support for a segmented RMP table A segmented RMP table allows for improved locality of reference between the memory protected by the RMP and the RMP entries themselves. Add support to detect and initialize a segmented RMP table with multiple segments as configured by the system BIOS. While the RMPREAD instruction will be used to read an RMP entry in a segmented RMP, initialization and debugging capabilities will require the mapping of the segments. The RMP_CFG MSR indicates if segmented RMP support is enabled and, if enabled, the amount of memory that an RMP segment covers. When segmented RMP support is enabled, the RMP_BASE MSR points to the start of the RMP bookkeeping area, which is 16K in size. The RMP Segment Table (RST) is located immediately after the bookkeeping area and is 4K in size. The RST contains up to 512 8-byte entries that identify the location of the RMP segment and amount of memory mapped by the segment (which must be less than or equal to the configured segment size). The physical address that is covered by a segment is based on the segment size and the index of the segment in the RST. 
The RMP entry for a physical address is based on the offset within the segment. For example, if the segment size is 64GB (0x1000000000 or 1 << 36), then physical address 0x9000800000 is RST entry 9 (0x9000800000 >> 36) and RST entry 9 covers physical memory 0x9000000000 to 0x9FFFFFFFFF. The RMP entry index within the RMP segment is the physical address AND-ed with the segment mask, 64GB - 1 (0xFFFFFFFFF), and then right-shifted 12 bits or PHYS_PFN(0x9000800000 & 0xFFFFFFFFF), which is 0x800. CPUID 0x80000025_EBX[9:0] describes the number of RMP segments that can be cached by the hardware. Additionally, if CPUID 0x80000025_EBX[10] is set, then the number of actual RMP segments defined cannot exceed the number of RMP segments that can be cached and can be used as a maximum RST index. [ bp: Unify printk hex format specifiers. ] Signed-off-by: Tom Lendacky Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Nikunj A Dadhania Reviewed-by: Neeraj Upadhyay Link: https://lore.kernel.org/r/02afd0ffd097a19cb6e5fb1bb76eb110496c5b11.1734101742.git.thomas.lendacky@amd.com --- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/include/asm/msr-index.h | 8 +- arch/x86/virt/svm/sev.c | 260 ++++++++++++++++++++++++++--- 3 files changed, 245 insertions(+), 24 deletions(-) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 5535edc6e8d7..6a6db7cd97cb 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -452,6 +452,7 @@ #define X86_FEATURE_SME_COHERENT (19*32+10) /* AMD hardware-enforced cache coherency */ #define X86_FEATURE_DEBUG_SWAP (19*32+14) /* "debug_swap" AMD SEV-ES full debug state swap support */ #define X86_FEATURE_RMPREAD (19*32+21) /* RMPREAD instruction */ +#define X86_FEATURE_SEGMENTED_RMP (19*32+23) /* Segmented RMP support */ #define X86_FEATURE_SVSM (19*32+28) /* "svsm" SVSM present */ /* AMD-defined Extended Feature 2 EAX, CPUID level 0x80000021 (EAX), word 20 */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 3ae84c3b8e6d..3f3e2bc99162 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -644,6 +644,7 @@ #define MSR_AMD64_IBS_REG_COUNT_MAX 8 /* includes MSR_AMD64_IBSBRTARGET */ #define MSR_AMD64_SVM_AVIC_DOORBELL 0xc001011b #define MSR_AMD64_VM_PAGE_FLUSH 0xc001011e +#define MSR_AMD64_VIRT_SPEC_CTRL 0xc001011f #define MSR_AMD64_SEV_ES_GHCB 0xc0010130 #define MSR_AMD64_SEV 0xc0010131 #define MSR_AMD64_SEV_ENABLED_BIT 0 @@ -682,11 +683,12 @@ #define MSR_AMD64_SNP_SMT_PROT BIT_ULL(MSR_AMD64_SNP_SMT_PROT_BIT) #define MSR_AMD64_SNP_RESV_BIT 18 #define MSR_AMD64_SNP_RESERVED_MASK GENMASK_ULL(63, MSR_AMD64_SNP_RESV_BIT) - -#define MSR_AMD64_VIRT_SPEC_CTRL 0xc001011f - #define MSR_AMD64_RMP_BASE 0xc0010132 #define MSR_AMD64_RMP_END 0xc0010133 +#define MSR_AMD64_RMP_CFG 0xc0010136 +#define MSR_AMD64_SEG_RMP_ENABLED_BIT 0 +#define MSR_AMD64_SEG_RMP_ENABLED BIT_ULL(MSR_AMD64_SEG_RMP_ENABLED_BIT) +#define MSR_AMD64_RMP_SEGMENT_SHIFT(x) (((x) & GENMASK_ULL(13, 8)) >> 8) #define MSR_SVSM_CAA 0xc001f000 diff --git a/arch/x86/virt/svm/sev.c b/arch/x86/virt/svm/sev.c index e50b71c67fab..1dcc027ec77e 100644 --- a/arch/x86/virt/svm/sev.c +++ b/arch/x86/virt/svm/sev.c @@ -100,6 +100,10 @@ struct rmp_segment_desc { * a specific portion of memory. There can be up to 512 8-byte entries, * one pages worth. 
*/ +#define RST_ENTRY_MAPPED_SIZE(x) ((x) & GENMASK_ULL(19, 0)) +#define RST_ENTRY_SEGMENT_BASE(x) ((x) & GENMASK_ULL(51, 20)) + +#define RST_SIZE SZ_4K static struct rmp_segment_desc **rmp_segment_table __ro_after_init; static unsigned int rst_max_index __ro_after_init = 512; @@ -110,6 +114,8 @@ static u64 rmp_segment_mask; #define RST_ENTRY_INDEX(x) ((x) >> rmp_segment_shift) #define RMP_ENTRY_INDEX(x) ((u64)(PHYS_PFN((x) & rmp_segment_mask))) +static u64 rmp_cfg; + /* Mask to apply to a PFN to get the first PFN of a 2MB page */ #define PFN_PMD_MASK GENMASK_ULL(63, PMD_SHIFT - PAGE_SHIFT) @@ -198,12 +204,62 @@ static void __init __snp_fixup_e820_tables(u64 pa) } } -void __init snp_fixup_e820_tables(void) +static void __init fixup_e820_tables_for_segmented_rmp(void) +{ + u64 pa, *rst, size, mapped_size; + unsigned int i; + + __snp_fixup_e820_tables(probed_rmp_base); + + pa = probed_rmp_base + RMPTABLE_CPU_BOOKKEEPING_SZ; + + __snp_fixup_e820_tables(pa + RST_SIZE); + + rst = early_memremap(pa, RST_SIZE); + if (!rst) + return; + + for (i = 0; i < rst_max_index; i++) { + pa = RST_ENTRY_SEGMENT_BASE(rst[i]); + mapped_size = RST_ENTRY_MAPPED_SIZE(rst[i]); + if (!mapped_size) + continue; + + __snp_fixup_e820_tables(pa); + + /* + * Mapped size in GB. Mapped size is allowed to exceed + * the segment coverage size, but gets reduced to the + * segment coverage size. + */ + mapped_size <<= 30; + if (mapped_size > rmp_segment_size) + mapped_size = rmp_segment_size; + + /* Calculate the RMP segment size (16 bytes/page mapped) */ + size = PHYS_PFN(mapped_size) << 4; + + __snp_fixup_e820_tables(pa + size); + } + + early_memunmap(rst, RST_SIZE); +} + +static void __init fixup_e820_tables_for_contiguous_rmp(void) { __snp_fixup_e820_tables(probed_rmp_base); __snp_fixup_e820_tables(probed_rmp_base + probed_rmp_size); } +void __init snp_fixup_e820_tables(void) +{ + if (rmp_cfg & MSR_AMD64_SEG_RMP_ENABLED) { + fixup_e820_tables_for_segmented_rmp(); + } else { + fixup_e820_tables_for_contiguous_rmp(); + } +} + static bool __init clear_rmptable_bookkeeping(void) { void *bk; @@ -307,29 +363,17 @@ static bool __init alloc_rmp_segment_table(void) return true; } -/* - * Do the necessary preparations which are verified by the firmware as - * described in the SNP_INIT_EX firmware command description in the SNP - * firmware ABI spec. - */ -static int __init snp_rmptable_init(void) +static bool __init setup_contiguous_rmptable(void) { - u64 max_rmp_pfn, calc_rmp_sz, rmptable_segment, rmptable_size, rmp_end, val; - unsigned int i; - - if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) - return 0; - - if (!amd_iommu_snp_en) - goto nosnp; + u64 max_rmp_pfn, calc_rmp_sz, rmptable_segment, rmptable_size, rmp_end; if (!probed_rmp_size) - goto nosnp; + return false; rmp_end = probed_rmp_base + probed_rmp_size - 1; /* - * Calculate the amount the memory that must be reserved by the BIOS to + * Calculate the amount of memory that must be reserved by the BIOS to * address the whole RAM, including the bookkeeping area. The RMP itself * must also be covered. 
*/ @@ -341,11 +385,11 @@ static int __init snp_rmptable_init(void) if (calc_rmp_sz > probed_rmp_size) { pr_err("Memory reserved for the RMP table does not cover full system RAM (expected 0x%llx got 0x%llx)\n", calc_rmp_sz, probed_rmp_size); - goto nosnp; + return false; } if (!alloc_rmp_segment_table()) - goto nosnp; + return false; /* Map only the RMP entries */ rmptable_segment = probed_rmp_base + RMPTABLE_CPU_BOOKKEEPING_SZ; @@ -353,9 +397,128 @@ static int __init snp_rmptable_init(void) if (!alloc_rmp_segment_desc(rmptable_segment, rmptable_size, 0)) { free_rmp_segment_table(); - goto nosnp; + return false; } + return true; +} + +static bool __init setup_segmented_rmptable(void) +{ + u64 rst_pa, *rst, pa, ram_pa_end, ram_pa_max; + unsigned int i, max_index; + + if (!probed_rmp_base) + return false; + + if (!alloc_rmp_segment_table()) + return false; + + rst_pa = probed_rmp_base + RMPTABLE_CPU_BOOKKEEPING_SZ; + rst = memremap(rst_pa, RST_SIZE, MEMREMAP_WB); + if (!rst) { + pr_err("Failed to map RMP segment table addr 0x%llx\n", rst_pa); + goto e_free; + } + + pr_info("Segmented RMP using %lluGB segments\n", rmp_segment_size >> 30); + + ram_pa_max = max_pfn << PAGE_SHIFT; + + max_index = 0; + ram_pa_end = 0; + for (i = 0; i < rst_max_index; i++) { + u64 rmp_segment, rmp_size, mapped_size; + + mapped_size = RST_ENTRY_MAPPED_SIZE(rst[i]); + if (!mapped_size) + continue; + + max_index = i; + + /* + * Mapped size in GB. Mapped size is allowed to exceed the + * segment coverage size, but gets reduced to the segment + * coverage size. + */ + mapped_size <<= 30; + if (mapped_size > rmp_segment_size) { + pr_info("RMP segment %u mapped size (0x%llx) reduced to 0x%llx\n", + i, mapped_size, rmp_segment_size); + mapped_size = rmp_segment_size; + } + + rmp_segment = RST_ENTRY_SEGMENT_BASE(rst[i]); + + /* Calculate the RMP segment size (16 bytes/page mapped) */ + rmp_size = PHYS_PFN(mapped_size) << 4; + + pa = (u64)i << rmp_segment_shift; + + /* + * Some segments may be for MMIO mapped above system RAM. These + * segments are used for Trusted I/O. + */ + if (pa < ram_pa_max) + ram_pa_end = pa + mapped_size; + + if (!alloc_rmp_segment_desc(rmp_segment, rmp_size, pa)) + goto e_unmap; + + pr_info("RMP segment %u physical address [0x%llx - 0x%llx] covering [0x%llx - 0x%llx]\n", + i, rmp_segment, rmp_segment + rmp_size - 1, pa, pa + mapped_size - 1); + } + + if (ram_pa_max > ram_pa_end) { + pr_err("Segmented RMP does not cover full system RAM (expected 0x%llx got 0x%llx)\n", + ram_pa_max, ram_pa_end); + goto e_unmap; + } + + /* Adjust the maximum index based on the found segments */ + rst_max_index = max_index + 1; + + memunmap(rst); + + return true; + +e_unmap: + memunmap(rst); + +e_free: + free_rmp_segment_table(); + + return false; +} + +static bool __init setup_rmptable(void) +{ + if (rmp_cfg & MSR_AMD64_SEG_RMP_ENABLED) { + return setup_segmented_rmptable(); + } else { + return setup_contiguous_rmptable(); + } +} + +/* + * Do the necessary preparations which are verified by the firmware as + * described in the SNP_INIT_EX firmware command description in the SNP + * firmware ABI spec. + */ +static int __init snp_rmptable_init(void) +{ + unsigned int i; + u64 val; + + if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) + return 0; + + if (!amd_iommu_snp_en) + goto nosnp; + + if (!setup_rmptable()) + goto nosnp; + /* * Check if SEV-SNP is already enabled, this can happen in case of * kexec boot. 
@@ -419,7 +582,7 @@ static void set_rmp_segment_info(unsigned int segment_shift) #define RMP_ADDR_MASK GENMASK_ULL(51, 13) -bool snp_probe_rmptable_info(void) +static bool probe_contiguous_rmptable_info(void) { u64 rmp_sz, rmp_base, rmp_end; @@ -452,6 +615,61 @@ bool snp_probe_rmptable_info(void) return true; } +static bool probe_segmented_rmptable_info(void) +{ + unsigned int eax, ebx, segment_shift, segment_shift_min, segment_shift_max; + u64 rmp_base, rmp_end; + + rdmsrl(MSR_AMD64_RMP_BASE, rmp_base); + if (!(rmp_base & RMP_ADDR_MASK)) { + pr_err("Memory for the RMP table has not been reserved by BIOS\n"); + return false; + } + + rdmsrl(MSR_AMD64_RMP_END, rmp_end); + WARN_ONCE(rmp_end & RMP_ADDR_MASK, + "Segmented RMP enabled but RMP_END MSR is non-zero\n"); + + /* Obtain the min and max supported RMP segment size */ + eax = cpuid_eax(0x80000025); + segment_shift_min = eax & GENMASK(5, 0); + segment_shift_max = (eax & GENMASK(11, 6)) >> 6; + + /* Verify the segment size is within the supported limits */ + segment_shift = MSR_AMD64_RMP_SEGMENT_SHIFT(rmp_cfg); + if (segment_shift > segment_shift_max || segment_shift < segment_shift_min) { + pr_err("RMP segment size (%u) is not within advertised bounds (min=%u, max=%u)\n", + segment_shift, segment_shift_min, segment_shift_max); + return false; + } + + /* Override the max supported RST index if a hardware limit exists */ + ebx = cpuid_ebx(0x80000025); + if (ebx & BIT(10)) + rst_max_index = ebx & GENMASK(9, 0); + + set_rmp_segment_info(segment_shift); + + probed_rmp_base = rmp_base; + probed_rmp_size = 0; + + pr_info("Segmented RMP base table physical range [0x%016llx - 0x%016llx]\n", + rmp_base, rmp_base + RMPTABLE_CPU_BOOKKEEPING_SZ + RST_SIZE); + + return true; +} + +bool snp_probe_rmptable_info(void) +{ + if (cpu_feature_enabled(X86_FEATURE_SEGMENTED_RMP)) + rdmsrl(MSR_AMD64_RMP_CFG, rmp_cfg); + + if (rmp_cfg & MSR_AMD64_SEG_RMP_ENABLED) + return probe_segmented_rmptable_info(); + else + return probe_contiguous_rmptable_info(); +} + /* * About the array_index_nospec() usage below: * From 21fc6178e92070523e70fc5db59ac83806d269d6 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 2 Dec 2024 14:50:53 -0600 Subject: [PATCH 132/224] x86/sev/docs: Document the SNP Reverse Map Table (RMP) Update the AMD memory encryption documentation to include information on the Reverse Map Table (RMP) and the two table formats. Signed-off-by: Tom Lendacky Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Nikunj A Dadhania Reviewed-by: Neeraj Upadhyay Link: https://lore.kernel.org/r/d3feea54912ad9ff2fc261223db691ca11fc547f.1733172653.git.thomas.lendacky@amd.com --- .../arch/x86/amd-memory-encryption.rst | 118 ++++++++++++++++++ 1 file changed, 118 insertions(+) diff --git a/Documentation/arch/x86/amd-memory-encryption.rst b/Documentation/arch/x86/amd-memory-encryption.rst index 6df3264f23b9..bd840df708ea 100644 --- a/Documentation/arch/x86/amd-memory-encryption.rst +++ b/Documentation/arch/x86/amd-memory-encryption.rst @@ -130,8 +130,126 @@ SNP feature support. More details in AMD64 APM[1] Vol 2: 15.34.10 SEV_STATUS MSR +Reverse Map Table (RMP) +======================= + +The RMP is a structure in system memory that is used to ensure a one-to-one +mapping between system physical addresses and guest physical addresses. Each +page of memory that is potentially assignable to guests has one entry within +the RMP. + +The RMP table can be either contiguous in memory or a collection of segments +in memory. 
+
+Contiguous RMP
+--------------
+
+Support for this form of the RMP is present when support for SEV-SNP is
+present, which can be determined using the CPUID instruction::
+
+  0x8000001f[eax]:
+      Bit[4] indicates support for SEV-SNP
+
+The location of the RMP is identified to the hardware through two MSRs::
+
+  0xc0010132 (RMP_BASE):
+      System physical address of the first byte of the RMP
+
+  0xc0010133 (RMP_END):
+      System physical address of the last byte of the RMP
+
+Hardware requires that RMP_BASE and (RMP_END + 1) be 8KB aligned, but SEV
+firmware increases the alignment requirement to 1MB.
+
+The RMP consists of a 16KB region used for processor bookkeeping followed
+by the RMP entries, which are 16 bytes in size. The size of the RMP
+determines the range of physical memory that the hypervisor can assign to
+SEV-SNP guests. The RMP covers the system physical address from::
+
+      0 to ((RMP_END + 1 - RMP_BASE - 16KB) / 16B) x 4KB.
+
+The current Linux support relies on BIOS to allocate/reserve the memory for
+the RMP and to set RMP_BASE and RMP_END appropriately. Linux uses the MSR
+values to locate the RMP and determine the size of the RMP. The RMP must
+cover all of system memory in order for Linux to enable SEV-SNP.
+
+Segmented RMP
+-------------
+
+Segmented RMP support is a new way of representing the layout of an RMP.
+Initial RMP support required the RMP table to be contiguous in memory.
+RMP accesses from a NUMA node on which the RMP doesn't reside
+can take longer than accesses from a NUMA node on which the RMP resides.
+Segmented RMP support allows the RMP entries to be located on the same
+node as the memory the RMP is covering, potentially reducing latency
+associated with accessing an RMP entry for that memory. Each
+RMP segment covers a specific range of system physical addresses.
+
+Support for this form of the RMP can be determined using the CPUID
+instruction::
+
+  0x8000001f[eax]:
+      Bit[23] indicates support for segmented RMP
+
+If supported, segmented RMP attributes can be found using the CPUID
+instruction::
+
+  0x80000025[eax]:
+      Bits[5:0]  minimum supported RMP segment size
+      Bits[11:6] maximum supported RMP segment size
+
+  0x80000025[ebx]:
+      Bits[9:0]  number of cacheable RMP segment definitions
+      Bit[10]    indicates if the number of cacheable RMP segments
+                 is a hard limit
+
+To enable a segmented RMP, a new MSR is available::
+
+  0xc0010136 (RMP_CFG):
+      Bit[0]     indicates if segmented RMP is enabled
+      Bits[13:8] contains the size of memory covered by an RMP
+                 segment (expressed as a power of 2)
+
+The RMP segment size defined in the RMP_CFG MSR applies to all segments
+of the RMP. Therefore each RMP segment covers a specific range of system
+physical addresses. For example, if the RMP_CFG MSR value is 0x2401, then
+the RMP segment coverage value is 0x24 => 36, meaning the size of memory
+covered by an RMP segment is 64GB (1 << 36). So the first RMP segment
+covers physical addresses from 0 to 0xF_FFFF_FFFF, the second RMP segment
+covers physical addresses from 0x10_0000_0000 to 0x1F_FFFF_FFFF, etc.
+
+When a segmented RMP is enabled, RMP_BASE points to the RMP bookkeeping
+area as it does today (16K in size). However, instead of RMP entries
+beginning immediately after the bookkeeping area, there is a 4K RMP
+segment table (RST). Each entry in the RST is 8 bytes in size and represents
+an RMP segment::
+
+  Bits[19:0]  mapped size (in GB)
+      The mapped size can be less than the defined segment size.
+      A value of zero indicates that no RMP exists for the range
+      of system physical addresses associated with this segment.
+  Bits[51:20] segment physical address
+      This address is shifted left by 20 bits (or just masked when
+      read) to form the physical address of the segment (1MB
+      alignment).
+
+The RST can hold 512 segment entries but can be limited in size to the number
+of cacheable RMP segments (CPUID 0x80000025_EBX[9:0]) if the number of cacheable
+RMP segments is a hard limit (CPUID 0x80000025_EBX[10]).
+
+The current Linux support relies on BIOS to allocate/reserve the memory for
+the segmented RMP (the bookkeeping area, RST, and all segments), build the RST
+and to set RMP_BASE, RMP_END, and RMP_CFG appropriately. Linux uses the MSR
+values to locate the RMP and determine the size and location of the RMP
+segments. The RMP must cover all of system memory in order for Linux to enable
+SEV-SNP.
+
+More details in the AMD64 APM Vol 2, section "15.36.3 Reverse Map Table",
+docID: 24593.
+
 Secure VM Service Module (SVSM)
 ===============================
+
 SNP provides a feature called Virtual Machine Privilege Levels (VMPL) which
 defines four privilege levels at which guest software can run. The most
 privileged level is 0 and numerically higher numbers have lesser privileges.

From 0d3547df6934b8f9600630322799a2a76b4567d8 Mon Sep 17 00:00:00 2001
From: Thorsten Blum
Date: Wed, 31 Jul 2024 15:58:51 +0200
Subject: [PATCH 133/224] locking/ww_mutex/test: Use swap() macro

Fixes the following Coccinelle/coccicheck warning reported by
swap.cocci:

  WARNING opportunity for swap()

Compile-tested only.

[Boqun: Add the report tags from Jiapeng and Abaci Robot [1].]

Reported-by: Abaci Robot
Reported-by: Jiapeng Chong
Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=11531
Link: https://lore.kernel.org/r/20241025081455.55089-1-jiapeng.chong@linux.alibaba.com [1]
Acked-by: Waiman Long
Signed-off-by: Thorsten Blum
Signed-off-by: Boqun Feng
Link: https://lore.kernel.org/r/20240731135850.81018-2-thorsten.blum@toblux.com
---
 kernel/locking/test-ww_mutex.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c
index 5d58b2c0ef98..bcb1b9fea588 100644
--- a/kernel/locking/test-ww_mutex.c
+++ b/kernel/locking/test-ww_mutex.c
@@ -404,7 +404,7 @@ static inline u32 prandom_u32_below(u32 ceil)
 static int *get_random_order(int count)
 {
 	int *order;
-	int n, r, tmp;
+	int n, r;
 
 	order = kmalloc_array(count, sizeof(*order), GFP_KERNEL);
 	if (!order)
@@ -415,11 +415,8 @@ static int *get_random_order(int count)
 
 	for (n = count - 1; n > 1; n--) {
 		r = prandom_u32_below(n + 1);
-		if (r != n) {
-			tmp = order[n];
-			order[n] = order[r];
-			order[r] = tmp;
-		}
+		if (r != n)
+			swap(order[n], order[r]);
 	}
 
 	return order;

From e638072e61726cae363d48812815197a2a0e097f Mon Sep 17 00:00:00 2001
From: Carlos Llamas
Date: Thu, 24 Oct 2024 18:36:26 +0000
Subject: [PATCH 134/224] lockdep: Fix upper limit for LOCKDEP_*_BITS configs

Lockdep has a set of configs used to determine the size of the static
arrays that it uses. However, the upper limit that was initially set up
for these configs is too high (30 bit shift). This equates to several
GiB of static memory for individual symbols. Using such high values
leads to linker errors:

  $ make defconfig
  $ ./scripts/config -e PROVE_LOCKING --set-val LOCKDEP_BITS 30
  $ make olddefconfig all
  [...]
ld: kernel image bigger than KERNEL_IMAGE_SIZE ld: section .bss VMA wraps around address space Adjust the upper limits to the maximum values that avoid these issues. The need for anything more, likely points to a problem elsewhere. Note that LOCKDEP_CHAINS_BITS was intentionally left out as its upper limit had a different symptom and has already been fixed [1]. Reported-by: J. R. Okajima Closes: https://lore.kernel.org/all/30795.1620913191@jrobl/ [1] Cc: Peter Zijlstra Cc: Boqun Feng Cc: Ingo Molnar Cc: Waiman Long Cc: Will Deacon Acked-by: Waiman Long Signed-off-by: Carlos Llamas Signed-off-by: Boqun Feng Link: https://lore.kernel.org/r/20241024183631.643450-2-cmllamas@google.com --- lib/Kconfig.debug | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 49a3819d4d7c..7635b36ba060 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1504,7 +1504,7 @@ config LOCKDEP_SMALL config LOCKDEP_BITS int "Bitsize for MAX_LOCKDEP_ENTRIES" depends on LOCKDEP && !LOCKDEP_SMALL - range 10 30 + range 10 24 default 15 help Try increasing this value if you hit "BUG: MAX_LOCKDEP_ENTRIES too low!" message. @@ -1520,7 +1520,7 @@ config LOCKDEP_CHAINS_BITS config LOCKDEP_STACK_TRACE_BITS int "Bitsize for MAX_STACK_TRACE_ENTRIES" depends on LOCKDEP && !LOCKDEP_SMALL - range 10 30 + range 10 26 default 19 help Try increasing this value if you hit "BUG: MAX_STACK_TRACE_ENTRIES too low!" message. @@ -1528,7 +1528,7 @@ config LOCKDEP_STACK_TRACE_BITS config LOCKDEP_STACK_TRACE_HASH_BITS int "Bitsize for STACK_TRACE_HASH_SIZE" depends on LOCKDEP && !LOCKDEP_SMALL - range 10 30 + range 10 26 default 14 help Try increasing this value if you need large STACK_TRACE_HASH_SIZE. @@ -1536,7 +1536,7 @@ config LOCKDEP_STACK_TRACE_HASH_BITS config LOCKDEP_CIRCULAR_QUEUE_BITS int "Bitsize for elements in circular_queue struct" depends on LOCKDEP - range 10 30 + range 10 26 default 12 help Try increasing this value if you hit "lockdep bfs error:-1" warning due to __cq_enqueue() failure. From 88a79e88a97cb9309bb48a472be2bf1316d40adc Mon Sep 17 00:00:00 2001 From: Carlos Llamas Date: Thu, 24 Oct 2024 18:36:27 +0000 Subject: [PATCH 135/224] lockdep: Clarify size for LOCKDEP_*_BITS configs The LOCKDEP_*_BITS configs control the size of internal structures used by lockdep. The size is calculated as a power of two of the configured value (e.g. 16 => 64KB). Update these descriptions to more accurately reflect this, as "Bitsize" can be misleading. Suggested-by: Andrew Morton Cc: Peter Zijlstra Cc: Boqun Feng Cc: Ingo Molnar Cc: Waiman Long Cc: Will Deacon Signed-off-by: Carlos Llamas Acked-by: Waiman Long Signed-off-by: Boqun Feng Link: https://lore.kernel.org/r/20241024183631.643450-3-cmllamas@google.com --- lib/Kconfig.debug | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 7635b36ba060..cf2a41dc7682 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1502,7 +1502,7 @@ config LOCKDEP_SMALL bool config LOCKDEP_BITS - int "Bitsize for MAX_LOCKDEP_ENTRIES" + int "Size for MAX_LOCKDEP_ENTRIES (as Nth power of 2)" depends on LOCKDEP && !LOCKDEP_SMALL range 10 24 default 15 @@ -1510,7 +1510,7 @@ config LOCKDEP_BITS Try increasing this value if you hit "BUG: MAX_LOCKDEP_ENTRIES too low!" message. 
config LOCKDEP_CHAINS_BITS - int "Bitsize for MAX_LOCKDEP_CHAINS" + int "Size for MAX_LOCKDEP_CHAINS (as Nth power of 2)" depends on LOCKDEP && !LOCKDEP_SMALL range 10 21 default 16 @@ -1518,7 +1518,7 @@ config LOCKDEP_CHAINS_BITS Try increasing this value if you hit "BUG: MAX_LOCKDEP_CHAINS too low!" message. config LOCKDEP_STACK_TRACE_BITS - int "Bitsize for MAX_STACK_TRACE_ENTRIES" + int "Size for MAX_STACK_TRACE_ENTRIES (as Nth power of 2)" depends on LOCKDEP && !LOCKDEP_SMALL range 10 26 default 19 @@ -1526,7 +1526,7 @@ config LOCKDEP_STACK_TRACE_BITS Try increasing this value if you hit "BUG: MAX_STACK_TRACE_ENTRIES too low!" message. config LOCKDEP_STACK_TRACE_HASH_BITS - int "Bitsize for STACK_TRACE_HASH_SIZE" + int "Size for STACK_TRACE_HASH_SIZE (as Nth power of 2)" depends on LOCKDEP && !LOCKDEP_SMALL range 10 26 default 14 @@ -1534,7 +1534,7 @@ config LOCKDEP_STACK_TRACE_HASH_BITS Try increasing this value if you need large STACK_TRACE_HASH_SIZE. config LOCKDEP_CIRCULAR_QUEUE_BITS - int "Bitsize for elements in circular_queue struct" + int "Size for elements in circular_queue struct (as Nth power of 2)" depends on LOCKDEP range 10 26 default 12 From bd7b5ae26618ad2bd6f6264e2cb6c5815d323e75 Mon Sep 17 00:00:00 2001 From: Carlos Llamas Date: Thu, 24 Oct 2024 18:36:28 +0000 Subject: [PATCH 136/224] lockdep: Document MAX_LOCKDEP_CHAIN_HLOCKS calculation Define a macro AVG_LOCKDEP_CHAIN_DEPTH to document the magic number '5' used in the calculation of MAX_LOCKDEP_CHAIN_HLOCKS. The number represents the estimated average depth (number of locks held) of a lock chain. The calculation of MAX_LOCKDEP_CHAIN_HLOCKS was first added in commit 443cd507ce7f ("lockdep: add lock_class information to lock_chain and output it"). Suggested-by: Waiman Long Cc: Huang Ying Cc: J. R. Okajima Cc: Peter Zijlstra Cc: Boqun Feng Cc: Ingo Molnar Cc: Will Deacon Acked-by: Waiman Long Signed-off-by: Carlos Llamas Acked-by: "Huang, Ying" Signed-off-by: Boqun Feng Link: https://lore.kernel.org/r/20241024183631.643450-4-cmllamas@google.com --- kernel/locking/lockdep_internals.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index bbe9000260d0..20f9ef58d3d0 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h @@ -119,7 +119,8 @@ static const unsigned long LOCKF_USED_IN_IRQ_READ = #define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) -#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5) +#define AVG_LOCKDEP_CHAIN_DEPTH 5 +#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS * AVG_LOCKDEP_CHAIN_DEPTH) extern struct lock_chain lock_chains[]; From 41a1e976623eb430f7b5a8619d3810b44e6235ad Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 16 Dec 2024 11:08:12 +0100 Subject: [PATCH 137/224] x86/mm: Convert unreachable() to BUG() Commit 2190966fbc14 ("x86: Convert unreachable() to BUG()") missed one. 
And after commit 06e24745985c ("objtool: Remove annotate_{,un}reachable()")
the invalid use of unreachable() (rightfully) triggers warnings:

  vmlinux.o: warning: objtool: page_fault_oops() falls through to next function is_prefetch()

Fixes: 2190966fbc14 ("x86: Convert unreachable() to BUG()")
Signed-off-by: Peter Zijlstra (Intel)
Link: https://lkml.kernel.org/r/20241216093215.GD12338@noisy.programming.kicks-ass.net
---
 arch/x86/mm/fault.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index e6c469b323cc..ac52255fab01 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -678,7 +678,7 @@ page_fault_oops(struct pt_regs *regs, unsigned long error_code,
 		      ASM_CALL_ARG3, , [arg1] "r" (regs), [arg2] "r" (address),
 		      [arg3] "r" (&info));
 
-	unreachable();
+	BUG();
 }
 #endif

From af98d8a36a963e758e84266d152b92c7b51d4ecb Mon Sep 17 00:00:00 2001
From: Vishal Chourasia
Date: Thu, 12 Dec 2024 10:01:03 +0530
Subject: [PATCH 138/224] sched/fair: Fix CPU bandwidth limit bypass during CPU hotplug

CPU controller limits are not properly enforced during CPU hotplug
operations, particularly during CPU offline. When a CPU goes offline,
throttled processes are unintentionally being unthrottled across all
CPUs in the system, allowing them to exceed their assigned quota
limits.

Consider the following example: assign a 6.25% bandwidth limit to a
cgroup in an 8-CPU system, where the workload runs 8 threads for 20
seconds at 100% CPU utilization. The expected (user+sys) time is 10
seconds.

  $ cat /sys/fs/cgroup/test/cpu.max
  50000 100000

  $ ./ebizzy -t 8 -S 20        // non-hotplug case
  real 20.00 s
  user 10.81 s                 // intended behaviour
  sys   0.00 s

  $ ./ebizzy -t 8 -S 20        // hotplug case
  real 20.00 s
  user 14.43 s                 // Workload is able to run for 14 secs
  sys   0.00 s                 // when it should have only run for 10 secs

During CPU hotplug, scheduler domains are rebuilt and cpu_attach_domain
is called for every active CPU to update the root domain. That ends up
calling rq_offline_fair which un-throttles any throttled hierarchies.

Unthrottling should only occur for the CPU being hotplugged to allow
its throttled processes to become runnable and get migrated to other
CPUs.

With the current patch applied:

  $ ./ebizzy -t 8 -S 20        // hotplug case
  real 21.00 s
  user 10.16 s                 // intended behaviour
  sys   0.00 s

There is also another symptom: when a CPU goes offline, if the cfs_rq
is not in a throttled state and runtime_remaining still has plenty
remaining, it gets reset to 1 here, causing the runtime_remaining of
the cfs_rq to be quickly depleted.
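The expected time in the example above is straight quota arithmetic,
with quota and period taken from cpu.max:

  expected (user+sys) time = wall time * quota / period
                           = 20 s * 50000 / 100000
                           = 10 s

so the 14.43 s observed in the hotplug case shows the quota being
exceeded by roughly 40%.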
Note: hotplug operation (online, offline) was performed in while(1) loop v3: https://lore.kernel.org/all/20241210102346.228663-2-vishalc@linux.ibm.com v2: https://lore.kernel.org/all/20241207052730.1746380-2-vishalc@linux.ibm.com v1: https://lore.kernel.org/all/20241126064812.809903-2-vishalc@linux.ibm.com Suggested-by: Zhang Qiao Signed-off-by: Vishal Chourasia Signed-off-by: Peter Zijlstra (Intel) Acked-by: Vincent Guittot Tested-by: Madadi Vineeth Reddy Tested-by: Samir Mulani Link: https://lore.kernel.org/r/20241212043102.584863-2-vishalc@linux.ibm.com --- kernel/sched/fair.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2c4ebfc82917..8f641c9e74a8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6696,6 +6696,10 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) lockdep_assert_rq_held(rq); + // Do not unthrottle for an active CPU + if (cpumask_test_cpu(cpu_of(rq), cpu_active_mask)) + return; + /* * The rq clock has already been updated in the * set_rq_offline(), so we should skip updating @@ -6710,19 +6714,21 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) if (!cfs_rq->runtime_enabled) continue; - /* - * clock_task is not advancing so we just need to make sure - * there's some valid quota amount - */ - cfs_rq->runtime_remaining = 1; /* * Offline rq is schedulable till CPU is completely disabled * in take_cpu_down(), so we prevent new cfs throttling here. */ cfs_rq->runtime_enabled = 0; - if (cfs_rq_throttled(cfs_rq)) - unthrottle_cfs_rq(cfs_rq); + if (!cfs_rq_throttled(cfs_rq)) + continue; + + /* + * clock_task is not advancing so we just need to make sure + * there's some valid quota amount + */ + cfs_rq->runtime_remaining = 1; + unthrottle_cfs_rq(cfs_rq); } rcu_read_unlock(); From b8e10c86e674eb19e0e53dcf4fa3e71cba1e0c1c Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 13 Dec 2024 10:51:28 -0800 Subject: [PATCH 139/224] x86/cpu: Introduce new microcode matching helper The 'x86_cpu_id' and 'x86_cpu_desc' structures are very similar and need to be consolidated. There is a microcode version matching function for 'x86_cpu_desc' but not 'x86_cpu_id'. Create one for 'x86_cpu_id'. This essentially just leverages the x86_cpu_id->driver_data field to replace the less generic x86_cpu_desc->x86_microcode_rev field. 
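A minimal usage sketch of the new helper (not part of this patch; the
table entry and the quirk function are made up for illustration): the
minimum revision rides in the ->driver_data slot of a regular
'x86_cpu_id' table and is compared against the boot CPU:

  /* Hypothetical table: ->driver_data holds the minimum microcode rev */
  static const struct x86_cpu_id my_min_ucodes[] = {
          X86_MATCH_VFM(INTEL_SKYLAKE_X, 0x01000136),
          {}
  };

  /*
   * x86_match_min_microcode_rev() returns false when the boot CPU
   * does not match the table, or matches but runs older microcode.
   */
  if (!x86_match_min_microcode_rev(my_min_ucodes))
          apply_workaround();   /* hypothetical */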
Signed-off-by: Dave Hansen Link: https://lore.kernel.org/all/20241213185128.8F24EEFC%40davehans-spike.ostc.intel.com --- arch/x86/include/asm/cpu_device_id.h | 1 + arch/x86/kernel/cpu/match.c | 11 +++++++++++ 2 files changed, 12 insertions(+) diff --git a/arch/x86/include/asm/cpu_device_id.h b/arch/x86/include/asm/cpu_device_id.h index e4121d9aa9e1..9c77dbee96a7 100644 --- a/arch/x86/include/asm/cpu_device_id.h +++ b/arch/x86/include/asm/cpu_device_id.h @@ -278,5 +278,6 @@ struct x86_cpu_desc { extern const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match); extern bool x86_cpu_has_min_microcode_rev(const struct x86_cpu_desc *table); +extern bool x86_match_min_microcode_rev(const struct x86_cpu_id *table); #endif /* _ASM_X86_CPU_DEVICE_ID */ diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c index 8e7de733320a..2de2a83a7e12 100644 --- a/arch/x86/kernel/cpu/match.c +++ b/arch/x86/kernel/cpu/match.c @@ -86,3 +86,14 @@ bool x86_cpu_has_min_microcode_rev(const struct x86_cpu_desc *table) return true; } EXPORT_SYMBOL_GPL(x86_cpu_has_min_microcode_rev); + +bool x86_match_min_microcode_rev(const struct x86_cpu_id *table) +{ + const struct x86_cpu_id *res = x86_match_cpu(table); + + if (!res || res->driver_data > boot_cpu_data.microcode) + return false; + + return true; +} +EXPORT_SYMBOL_GPL(x86_match_min_microcode_rev); From 85b08180df07b9a5984b15ae31d76b904d42a115 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 13 Dec 2024 10:51:29 -0800 Subject: [PATCH 140/224] x86/cpu: Expose only stepping min/max interface The x86_match_cpu() infrastructure can match CPU steppings. Since there are only 16 possible steppings, the matching infrastructure goes all out and stores the stepping match as a bitmap. That means it can match any possible steppings in a single list entry. Fun. But it exposes this bitmap to each of the X86_MATCH_*() helpers when none of them really need a bitmap. It makes up for this by exporting a helper (X86_STEPPINGS()) which converts a contiguous stepping range into the bitmap which every single user leverages. Instead of a bitmap, have the main helper for this sort of thing (X86_MATCH_VFM_STEPS()) just take a stepping range. This ends up actually being even more compact than before. Leave the helper in place (renamed to __X86_STEPPINGS()) to make it more clear what is going on instead of just having a random GENMASK() in the middle of an already complicated macro. One oddity that I hit was this macro: X86_MATCH_VFM_STEPS(vfm, X86_STEPPING_MIN, max_stepping, issues) It *could* have been converted over to take a min/max stepping value for each entry. But that would have been a bit too verbose and would prevent the one oddball in the list (INTEL_COMETLAKE_L stepping 0) from sticking out. Instead, just have it take a *maximum* stepping and imply that the match is from 0=>max_stepping. This is functional for all the cases now and also retains the nice property of having INTEL_COMETLAKE_L stepping 0 stick out like a sore thumb. skx_cpuids[] is goofy. It uses the stepping match but encodes all possible steppings. Just use a normal, non-stepping match helper. 
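For intuition, a sketch based on the macros in the diff below (not
code from the patch): the min/max pair still collapses into the same
per-stepping bitmap internally, so a range match stays a single bit
test. The exact check inside x86_match_cpu() is assumed here:

  /* __X86_STEPPINGS(0x4, 0x7) == GENMASK(7, 4) == 0x00f0 */
  u16 steps = __X86_STEPPINGS(0x4, 0x7);

  /* assumed matching logic: one bit per each of the 16 steppings */
  bool match = steps & BIT(boot_cpu_data.x86_stepping);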
Suggested-by: Ingo Molnar Signed-off-by: Dave Hansen Link: https://lore.kernel.org/all/20241213185129.65527B2A%40davehans-spike.ostc.intel.com --- arch/x86/include/asm/cpu_device_id.h | 15 +++--- arch/x86/kernel/apic/apic.c | 18 +++---- arch/x86/kernel/cpu/common.c | 78 ++++++++++++++-------------- drivers/edac/i10nm_base.c | 21 ++++---- drivers/edac/skx_base.c | 2 +- include/linux/mod_devicetable.h | 2 + 6 files changed, 70 insertions(+), 66 deletions(-) diff --git a/arch/x86/include/asm/cpu_device_id.h b/arch/x86/include/asm/cpu_device_id.h index 9c77dbee96a7..88564bb1845f 100644 --- a/arch/x86/include/asm/cpu_device_id.h +++ b/arch/x86/include/asm/cpu_device_id.h @@ -56,7 +56,6 @@ /* x86_cpu_id::flags */ #define X86_CPU_ID_FLAG_ENTRY_VALID BIT(0) -#define X86_STEPPINGS(mins, maxs) GENMASK(maxs, mins) /** * X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE - Base macro for CPU matching * @_vendor: The vendor name, e.g. INTEL, AMD, HYGON, ..., ANY @@ -208,6 +207,7 @@ VFM_MODEL(vfm), \ X86_STEPPING_ANY, X86_FEATURE_ANY, data) +#define __X86_STEPPINGS(mins, maxs) GENMASK(maxs, mins) /** * X86_MATCH_VFM_STEPPINGS - Match encoded vendor/family/model/stepping * @vfm: Encoded 8-bits each for vendor, family, model @@ -218,12 +218,13 @@ * * feature is set to wildcard */ -#define X86_MATCH_VFM_STEPPINGS(vfm, steppings, data) \ - X86_MATCH_VENDORID_FAM_MODEL_STEPPINGS_FEATURE( \ - VFM_VENDOR(vfm), \ - VFM_FAMILY(vfm), \ - VFM_MODEL(vfm), \ - steppings, X86_FEATURE_ANY, data) +#define X86_MATCH_VFM_STEPS(vfm, min_step, max_step, data) \ + X86_MATCH_VENDORID_FAM_MODEL_STEPPINGS_FEATURE( \ + VFM_VENDOR(vfm), \ + VFM_FAMILY(vfm), \ + VFM_MODEL(vfm), \ + __X86_STEPPINGS(min_step, max_step), \ + X86_FEATURE_ANY, data) /** * X86_MATCH_VFM_FEATURE - Match encoded vendor/family/model/feature diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index c5fb28e6451a..b16bda1ffaab 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -509,19 +509,19 @@ static struct clock_event_device lapic_clockevent = { static DEFINE_PER_CPU(struct clock_event_device, lapic_events); static const struct x86_cpu_id deadline_match[] __initconst = { - X86_MATCH_VFM_STEPPINGS(INTEL_HASWELL_X, X86_STEPPINGS(0x2, 0x2), 0x3a), /* EP */ - X86_MATCH_VFM_STEPPINGS(INTEL_HASWELL_X, X86_STEPPINGS(0x4, 0x4), 0x0f), /* EX */ + X86_MATCH_VFM_STEPS(INTEL_HASWELL_X, 0x2, 0x2, 0x3a), /* EP */ + X86_MATCH_VFM_STEPS(INTEL_HASWELL_X, 0x4, 0x4, 0x0f), /* EX */ X86_MATCH_VFM(INTEL_BROADWELL_X, 0x0b000020), - X86_MATCH_VFM_STEPPINGS(INTEL_BROADWELL_D, X86_STEPPINGS(0x2, 0x2), 0x00000011), - X86_MATCH_VFM_STEPPINGS(INTEL_BROADWELL_D, X86_STEPPINGS(0x3, 0x3), 0x0700000e), - X86_MATCH_VFM_STEPPINGS(INTEL_BROADWELL_D, X86_STEPPINGS(0x4, 0x4), 0x0f00000c), - X86_MATCH_VFM_STEPPINGS(INTEL_BROADWELL_D, X86_STEPPINGS(0x5, 0x5), 0x0e000003), + X86_MATCH_VFM_STEPS(INTEL_BROADWELL_D, 0x2, 0x2, 0x00000011), + X86_MATCH_VFM_STEPS(INTEL_BROADWELL_D, 0x3, 0x3, 0x0700000e), + X86_MATCH_VFM_STEPS(INTEL_BROADWELL_D, 0x4, 0x4, 0x0f00000c), + X86_MATCH_VFM_STEPS(INTEL_BROADWELL_D, 0x5, 0x5, 0x0e000003), - X86_MATCH_VFM_STEPPINGS(INTEL_SKYLAKE_X, X86_STEPPINGS(0x3, 0x3), 0x01000136), - X86_MATCH_VFM_STEPPINGS(INTEL_SKYLAKE_X, X86_STEPPINGS(0x4, 0x4), 0x02000014), - X86_MATCH_VFM_STEPPINGS(INTEL_SKYLAKE_X, X86_STEPPINGS(0x5, 0xf), 0), + X86_MATCH_VFM_STEPS(INTEL_SKYLAKE_X, 0x3, 0x3, 0x01000136), + X86_MATCH_VFM_STEPS(INTEL_SKYLAKE_X, 0x4, 0x4, 0x02000014), + X86_MATCH_VFM_STEPS(INTEL_SKYLAKE_X, 0x5, 0xf, 0), X86_MATCH_VFM(INTEL_HASWELL, 
0x22), X86_MATCH_VFM(INTEL_HASWELL_L, 0x20), diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index a5c28975c608..d21b352bcd72 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1201,8 +1201,8 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { #define VULNBL(vendor, family, model, blacklist) \ X86_MATCH_VENDOR_FAM_MODEL(vendor, family, model, blacklist) -#define VULNBL_INTEL_STEPPINGS(vfm, steppings, issues) \ - X86_MATCH_VFM_STEPPINGS(vfm, steppings, issues) +#define VULNBL_INTEL_STEPS(vfm, max_stepping, issues) \ + X86_MATCH_VFM_STEPS(vfm, X86_STEP_MIN, max_stepping, issues) #define VULNBL_AMD(family, blacklist) \ VULNBL(AMD, family, X86_MODEL_ANY, blacklist) @@ -1227,43 +1227,43 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { #define RFDS BIT(7) static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = { - VULNBL_INTEL_STEPPINGS(INTEL_IVYBRIDGE, X86_STEPPING_ANY, SRBDS), - VULNBL_INTEL_STEPPINGS(INTEL_HASWELL, X86_STEPPING_ANY, SRBDS), - VULNBL_INTEL_STEPPINGS(INTEL_HASWELL_L, X86_STEPPING_ANY, SRBDS), - VULNBL_INTEL_STEPPINGS(INTEL_HASWELL_G, X86_STEPPING_ANY, SRBDS), - VULNBL_INTEL_STEPPINGS(INTEL_HASWELL_X, X86_STEPPING_ANY, MMIO), - VULNBL_INTEL_STEPPINGS(INTEL_BROADWELL_D, X86_STEPPING_ANY, MMIO), - VULNBL_INTEL_STEPPINGS(INTEL_BROADWELL_G, X86_STEPPING_ANY, SRBDS), - VULNBL_INTEL_STEPPINGS(INTEL_BROADWELL_X, X86_STEPPING_ANY, MMIO), - VULNBL_INTEL_STEPPINGS(INTEL_BROADWELL, X86_STEPPING_ANY, SRBDS), - VULNBL_INTEL_STEPPINGS(INTEL_SKYLAKE_X, X86_STEPPING_ANY, MMIO | RETBLEED | GDS), - VULNBL_INTEL_STEPPINGS(INTEL_SKYLAKE_L, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS), - VULNBL_INTEL_STEPPINGS(INTEL_SKYLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS), - VULNBL_INTEL_STEPPINGS(INTEL_KABYLAKE_L, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS), - VULNBL_INTEL_STEPPINGS(INTEL_KABYLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS), - VULNBL_INTEL_STEPPINGS(INTEL_CANNONLAKE_L, X86_STEPPING_ANY, RETBLEED), - VULNBL_INTEL_STEPPINGS(INTEL_ICELAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS), - VULNBL_INTEL_STEPPINGS(INTEL_ICELAKE_D, X86_STEPPING_ANY, MMIO | GDS), - VULNBL_INTEL_STEPPINGS(INTEL_ICELAKE_X, X86_STEPPING_ANY, MMIO | GDS), - VULNBL_INTEL_STEPPINGS(INTEL_COMETLAKE, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS), - VULNBL_INTEL_STEPPINGS(INTEL_COMETLAKE_L, X86_STEPPINGS(0x0, 0x0), MMIO | RETBLEED), - VULNBL_INTEL_STEPPINGS(INTEL_COMETLAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS), - VULNBL_INTEL_STEPPINGS(INTEL_TIGERLAKE_L, X86_STEPPING_ANY, GDS), - VULNBL_INTEL_STEPPINGS(INTEL_TIGERLAKE, X86_STEPPING_ANY, GDS), - VULNBL_INTEL_STEPPINGS(INTEL_LAKEFIELD, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED), - VULNBL_INTEL_STEPPINGS(INTEL_ROCKETLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS), - VULNBL_INTEL_STEPPINGS(INTEL_ALDERLAKE, X86_STEPPING_ANY, RFDS), - VULNBL_INTEL_STEPPINGS(INTEL_ALDERLAKE_L, X86_STEPPING_ANY, RFDS), - VULNBL_INTEL_STEPPINGS(INTEL_RAPTORLAKE, X86_STEPPING_ANY, RFDS), - VULNBL_INTEL_STEPPINGS(INTEL_RAPTORLAKE_P, X86_STEPPING_ANY, RFDS), - VULNBL_INTEL_STEPPINGS(INTEL_RAPTORLAKE_S, X86_STEPPING_ANY, RFDS), - VULNBL_INTEL_STEPPINGS(INTEL_ATOM_GRACEMONT, X86_STEPPING_ANY, RFDS), - VULNBL_INTEL_STEPPINGS(INTEL_ATOM_TREMONT, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RFDS), - VULNBL_INTEL_STEPPINGS(INTEL_ATOM_TREMONT_D, X86_STEPPING_ANY, MMIO | RFDS), - VULNBL_INTEL_STEPPINGS(INTEL_ATOM_TREMONT_L, X86_STEPPING_ANY, MMIO | 
MMIO_SBDS | RFDS), - VULNBL_INTEL_STEPPINGS(INTEL_ATOM_GOLDMONT, X86_STEPPING_ANY, RFDS), - VULNBL_INTEL_STEPPINGS(INTEL_ATOM_GOLDMONT_D, X86_STEPPING_ANY, RFDS), - VULNBL_INTEL_STEPPINGS(INTEL_ATOM_GOLDMONT_PLUS, X86_STEPPING_ANY, RFDS), + VULNBL_INTEL_STEPS(INTEL_IVYBRIDGE, X86_STEP_MAX, SRBDS), + VULNBL_INTEL_STEPS(INTEL_HASWELL, X86_STEP_MAX, SRBDS), + VULNBL_INTEL_STEPS(INTEL_HASWELL_L, X86_STEP_MAX, SRBDS), + VULNBL_INTEL_STEPS(INTEL_HASWELL_G, X86_STEP_MAX, SRBDS), + VULNBL_INTEL_STEPS(INTEL_HASWELL_X, X86_STEP_MAX, MMIO), + VULNBL_INTEL_STEPS(INTEL_BROADWELL_D, X86_STEP_MAX, MMIO), + VULNBL_INTEL_STEPS(INTEL_BROADWELL_G, X86_STEP_MAX, SRBDS), + VULNBL_INTEL_STEPS(INTEL_BROADWELL_X, X86_STEP_MAX, MMIO), + VULNBL_INTEL_STEPS(INTEL_BROADWELL, X86_STEP_MAX, SRBDS), + VULNBL_INTEL_STEPS(INTEL_SKYLAKE_X, X86_STEP_MAX, MMIO | RETBLEED | GDS), + VULNBL_INTEL_STEPS(INTEL_SKYLAKE_L, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS), + VULNBL_INTEL_STEPS(INTEL_SKYLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS), + VULNBL_INTEL_STEPS(INTEL_KABYLAKE_L, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS), + VULNBL_INTEL_STEPS(INTEL_KABYLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS), + VULNBL_INTEL_STEPS(INTEL_CANNONLAKE_L, X86_STEP_MAX, RETBLEED), + VULNBL_INTEL_STEPS(INTEL_ICELAKE_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS), + VULNBL_INTEL_STEPS(INTEL_ICELAKE_D, X86_STEP_MAX, MMIO | GDS), + VULNBL_INTEL_STEPS(INTEL_ICELAKE_X, X86_STEP_MAX, MMIO | GDS), + VULNBL_INTEL_STEPS(INTEL_COMETLAKE, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS), + VULNBL_INTEL_STEPS(INTEL_COMETLAKE_L, 0x0, MMIO | RETBLEED), + VULNBL_INTEL_STEPS(INTEL_COMETLAKE_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS), + VULNBL_INTEL_STEPS(INTEL_TIGERLAKE_L, X86_STEP_MAX, GDS), + VULNBL_INTEL_STEPS(INTEL_TIGERLAKE, X86_STEP_MAX, GDS), + VULNBL_INTEL_STEPS(INTEL_LAKEFIELD, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED), + VULNBL_INTEL_STEPS(INTEL_ROCKETLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS), + VULNBL_INTEL_STEPS(INTEL_ALDERLAKE, X86_STEP_MAX, RFDS), + VULNBL_INTEL_STEPS(INTEL_ALDERLAKE_L, X86_STEP_MAX, RFDS), + VULNBL_INTEL_STEPS(INTEL_RAPTORLAKE, X86_STEP_MAX, RFDS), + VULNBL_INTEL_STEPS(INTEL_RAPTORLAKE_P, X86_STEP_MAX, RFDS), + VULNBL_INTEL_STEPS(INTEL_RAPTORLAKE_S, X86_STEP_MAX, RFDS), + VULNBL_INTEL_STEPS(INTEL_ATOM_GRACEMONT, X86_STEP_MAX, RFDS), + VULNBL_INTEL_STEPS(INTEL_ATOM_TREMONT, X86_STEP_MAX, MMIO | MMIO_SBDS | RFDS), + VULNBL_INTEL_STEPS(INTEL_ATOM_TREMONT_D, X86_STEP_MAX, MMIO | RFDS), + VULNBL_INTEL_STEPS(INTEL_ATOM_TREMONT_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RFDS), + VULNBL_INTEL_STEPS(INTEL_ATOM_GOLDMONT, X86_STEP_MAX, RFDS), + VULNBL_INTEL_STEPS(INTEL_ATOM_GOLDMONT_D, X86_STEP_MAX, RFDS), + VULNBL_INTEL_STEPS(INTEL_ATOM_GOLDMONT_PLUS, X86_STEP_MAX, RFDS), VULNBL_AMD(0x15, RETBLEED), VULNBL_AMD(0x16, RETBLEED), diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c index 51556c72a967..09bf5a3f06bf 100644 --- a/drivers/edac/i10nm_base.c +++ b/drivers/edac/i10nm_base.c @@ -938,16 +938,17 @@ static struct res_config gnr_cfg = { }; static const struct x86_cpu_id i10nm_cpuids[] = { - X86_MATCH_VFM_STEPPINGS(INTEL_ATOM_TREMONT_D, X86_STEPPINGS(0x0, 0x3), &i10nm_cfg0), - X86_MATCH_VFM_STEPPINGS(INTEL_ATOM_TREMONT_D, X86_STEPPINGS(0x4, 0xf), &i10nm_cfg1), - X86_MATCH_VFM_STEPPINGS(INTEL_ICELAKE_X, X86_STEPPINGS(0x0, 0x3), &i10nm_cfg0), - X86_MATCH_VFM_STEPPINGS(INTEL_ICELAKE_X, X86_STEPPINGS(0x4, 0xf), &i10nm_cfg1), - X86_MATCH_VFM_STEPPINGS(INTEL_ICELAKE_D, X86_STEPPINGS(0x0, 0xf), &i10nm_cfg1), - 
X86_MATCH_VFM_STEPPINGS(INTEL_SAPPHIRERAPIDS_X, X86_STEPPINGS(0x0, 0xf), &spr_cfg), - X86_MATCH_VFM_STEPPINGS(INTEL_EMERALDRAPIDS_X, X86_STEPPINGS(0x0, 0xf), &spr_cfg), - X86_MATCH_VFM_STEPPINGS(INTEL_GRANITERAPIDS_X, X86_STEPPINGS(0x0, 0xf), &gnr_cfg), - X86_MATCH_VFM_STEPPINGS(INTEL_ATOM_CRESTMONT_X, X86_STEPPINGS(0x0, 0xf), &gnr_cfg), - X86_MATCH_VFM_STEPPINGS(INTEL_ATOM_CRESTMONT, X86_STEPPINGS(0x0, 0xf), &gnr_cfg), + X86_MATCH_VFM_STEPS(INTEL_ATOM_TREMONT_D, X86_STEP_MIN, 0x3, &i10nm_cfg0), + X86_MATCH_VFM_STEPS(INTEL_ATOM_TREMONT_D, 0x4, X86_STEP_MAX, &i10nm_cfg1), + X86_MATCH_VFM_STEPS(INTEL_ICELAKE_X, X86_STEP_MIN, 0x3, &i10nm_cfg0), + X86_MATCH_VFM_STEPS(INTEL_ICELAKE_X, 0x4, X86_STEP_MAX, &i10nm_cfg1), + X86_MATCH_VFM( INTEL_ICELAKE_D, &i10nm_cfg1), + + X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, &spr_cfg), + X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, &spr_cfg), + X86_MATCH_VFM(INTEL_GRANITERAPIDS_X, &gnr_cfg), + X86_MATCH_VFM(INTEL_ATOM_CRESTMONT_X, &gnr_cfg), + X86_MATCH_VFM(INTEL_ATOM_CRESTMONT, &gnr_cfg), {} }; MODULE_DEVICE_TABLE(x86cpu, i10nm_cpuids); diff --git a/drivers/edac/skx_base.c b/drivers/edac/skx_base.c index 14cfd394b469..fed5ecb4b0b1 100644 --- a/drivers/edac/skx_base.c +++ b/drivers/edac/skx_base.c @@ -164,7 +164,7 @@ static struct res_config skx_cfg = { }; static const struct x86_cpu_id skx_cpuids[] = { - X86_MATCH_VFM_STEPPINGS(INTEL_SKYLAKE_X, X86_STEPPINGS(0x0, 0xf), &skx_cfg), + X86_MATCH_VFM(INTEL_SKYLAKE_X, &skx_cfg), { } }; MODULE_DEVICE_TABLE(x86cpu, skx_cpuids); diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index 4338b1b4ac44..d67614f7b7f1 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -700,6 +700,8 @@ struct x86_cpu_id { #define X86_FAMILY_ANY 0 #define X86_MODEL_ANY 0 #define X86_STEPPING_ANY 0 +#define X86_STEP_MIN 0 +#define X86_STEP_MAX 0xf #define X86_FEATURE_ANY 0 /* Same as FPU, you can't test for that */ /* From 3fa5626720c0948ce067306c4f6558d9ec86020c Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 13 Dec 2024 10:51:31 -0800 Subject: [PATCH 141/224] x86/cpu: Replace PEBS use of 'x86_cpu_desc' use with 'x86_cpu_id' The 'x86_cpu_desc' and 'x86_cpu_id' structures are very similar. Reduce duplicate infrastructure by moving the few users of 'x86_cpu_desc' to the much more common variant. The existing X86_MATCH_VFM_STEPS() helper matches ranges of steppings. Instead of introducing a single-stepping match function which could get confusing when paired with the range, just use the stepping min/max match helper and use min==max. Note that this makes the table more vertically compact because multiple entries like this: INTEL_CPU_DESC(INTEL_SKYLAKE_X, 4, 0x00000000), INTEL_CPU_DESC(INTEL_SKYLAKE_X, 5, 0x00000000), INTEL_CPU_DESC(INTEL_SKYLAKE_X, 6, 0x00000000), INTEL_CPU_DESC(INTEL_SKYLAKE_X, 7, 0x00000000), can be consolidated down to a single stepping range. 
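Concretely (entries taken from the converted table below): a single
stepping is expressed as min == max, and the old one-line-per-stepping
runs fold into one range entry:

  X86_MATCH_VFM_STEPS(INTEL_HASWELL,   3, 3, 0x0000001f),  /* exactly stepping 3 */
  X86_MATCH_VFM_STEPS(INTEL_SKYLAKE_X, 4, 7, 0x00000000),  /* steppings 4 through 7 */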
Signed-off-by: Dave Hansen Link: https://lore.kernel.org/all/20241213185131.8B610039%40davehans-spike.ostc.intel.com --- arch/x86/events/intel/core.c | 62 +++++++++++++++--------------------- 1 file changed, 26 insertions(+), 36 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index bb284aff7bfd..cd96013eea79 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -5371,42 +5371,32 @@ static __init void intel_clovertown_quirk(void) x86_pmu.pebs_constraints = NULL; } -static const struct x86_cpu_desc isolation_ucodes[] = { - INTEL_CPU_DESC(INTEL_HASWELL, 3, 0x0000001f), - INTEL_CPU_DESC(INTEL_HASWELL_L, 1, 0x0000001e), - INTEL_CPU_DESC(INTEL_HASWELL_G, 1, 0x00000015), - INTEL_CPU_DESC(INTEL_HASWELL_X, 2, 0x00000037), - INTEL_CPU_DESC(INTEL_HASWELL_X, 4, 0x0000000a), - INTEL_CPU_DESC(INTEL_BROADWELL, 4, 0x00000023), - INTEL_CPU_DESC(INTEL_BROADWELL_G, 1, 0x00000014), - INTEL_CPU_DESC(INTEL_BROADWELL_D, 2, 0x00000010), - INTEL_CPU_DESC(INTEL_BROADWELL_D, 3, 0x07000009), - INTEL_CPU_DESC(INTEL_BROADWELL_D, 4, 0x0f000009), - INTEL_CPU_DESC(INTEL_BROADWELL_D, 5, 0x0e000002), - INTEL_CPU_DESC(INTEL_BROADWELL_X, 1, 0x0b000014), - INTEL_CPU_DESC(INTEL_SKYLAKE_X, 3, 0x00000021), - INTEL_CPU_DESC(INTEL_SKYLAKE_X, 4, 0x00000000), - INTEL_CPU_DESC(INTEL_SKYLAKE_X, 5, 0x00000000), - INTEL_CPU_DESC(INTEL_SKYLAKE_X, 6, 0x00000000), - INTEL_CPU_DESC(INTEL_SKYLAKE_X, 7, 0x00000000), - INTEL_CPU_DESC(INTEL_SKYLAKE_X, 11, 0x00000000), - INTEL_CPU_DESC(INTEL_SKYLAKE_L, 3, 0x0000007c), - INTEL_CPU_DESC(INTEL_SKYLAKE, 3, 0x0000007c), - INTEL_CPU_DESC(INTEL_KABYLAKE, 9, 0x0000004e), - INTEL_CPU_DESC(INTEL_KABYLAKE_L, 9, 0x0000004e), - INTEL_CPU_DESC(INTEL_KABYLAKE_L, 10, 0x0000004e), - INTEL_CPU_DESC(INTEL_KABYLAKE_L, 11, 0x0000004e), - INTEL_CPU_DESC(INTEL_KABYLAKE_L, 12, 0x0000004e), - INTEL_CPU_DESC(INTEL_KABYLAKE, 10, 0x0000004e), - INTEL_CPU_DESC(INTEL_KABYLAKE, 11, 0x0000004e), - INTEL_CPU_DESC(INTEL_KABYLAKE, 12, 0x0000004e), - INTEL_CPU_DESC(INTEL_KABYLAKE, 13, 0x0000004e), +static const struct x86_cpu_id isolation_ucodes[] = { + X86_MATCH_VFM_STEPS(INTEL_HASWELL, 3, 3, 0x0000001f), + X86_MATCH_VFM_STEPS(INTEL_HASWELL_L, 1, 1, 0x0000001e), + X86_MATCH_VFM_STEPS(INTEL_HASWELL_G, 1, 1, 0x00000015), + X86_MATCH_VFM_STEPS(INTEL_HASWELL_X, 2, 2, 0x00000037), + X86_MATCH_VFM_STEPS(INTEL_HASWELL_X, 4, 4, 0x0000000a), + X86_MATCH_VFM_STEPS(INTEL_BROADWELL, 4, 4, 0x00000023), + X86_MATCH_VFM_STEPS(INTEL_BROADWELL_G, 1, 1, 0x00000014), + X86_MATCH_VFM_STEPS(INTEL_BROADWELL_D, 2, 2, 0x00000010), + X86_MATCH_VFM_STEPS(INTEL_BROADWELL_D, 3, 3, 0x07000009), + X86_MATCH_VFM_STEPS(INTEL_BROADWELL_D, 4, 4, 0x0f000009), + X86_MATCH_VFM_STEPS(INTEL_BROADWELL_D, 5, 5, 0x0e000002), + X86_MATCH_VFM_STEPS(INTEL_BROADWELL_X, 1, 1, 0x0b000014), + X86_MATCH_VFM_STEPS(INTEL_SKYLAKE_X, 3, 3, 0x00000021), + X86_MATCH_VFM_STEPS(INTEL_SKYLAKE_X, 4, 7, 0x00000000), + X86_MATCH_VFM_STEPS(INTEL_SKYLAKE_X, 11, 11, 0x00000000), + X86_MATCH_VFM_STEPS(INTEL_SKYLAKE_L, 3, 3, 0x0000007c), + X86_MATCH_VFM_STEPS(INTEL_SKYLAKE, 3, 3, 0x0000007c), + X86_MATCH_VFM_STEPS(INTEL_KABYLAKE, 9, 13, 0x0000004e), + X86_MATCH_VFM_STEPS(INTEL_KABYLAKE_L, 9, 12, 0x0000004e), {} }; static void intel_check_pebs_isolation(void) { - x86_pmu.pebs_no_isolation = !x86_cpu_has_min_microcode_rev(isolation_ucodes); + x86_pmu.pebs_no_isolation = !x86_match_min_microcode_rev(isolation_ucodes); } static __init void intel_pebs_isolation_quirk(void) @@ -5416,16 +5406,16 @@ static __init void 
intel_pebs_isolation_quirk(void)
 	intel_check_pebs_isolation();
 }
 
-static const struct x86_cpu_desc pebs_ucodes[] = {
-	INTEL_CPU_DESC(INTEL_SANDYBRIDGE,	7, 0x00000028),
-	INTEL_CPU_DESC(INTEL_SANDYBRIDGE_X,	6, 0x00000618),
-	INTEL_CPU_DESC(INTEL_SANDYBRIDGE_X,	7, 0x0000070c),
+static const struct x86_cpu_id pebs_ucodes[] = {
+	X86_MATCH_VFM_STEPS(INTEL_SANDYBRIDGE,	 7, 7, 0x00000028),
+	X86_MATCH_VFM_STEPS(INTEL_SANDYBRIDGE_X, 6, 6, 0x00000618),
+	X86_MATCH_VFM_STEPS(INTEL_SANDYBRIDGE_X, 7, 7, 0x0000070c),
 	{}
 };
 
 static bool intel_snb_pebs_broken(void)
 {
-	return !x86_cpu_has_min_microcode_rev(pebs_ucodes);
+	return !x86_match_min_microcode_rev(pebs_ucodes);
 }
 
 static void intel_snb_check_microcode(void)

From f3f3251526739bb975b97f840c56b3054dba8638 Mon Sep 17 00:00:00 2001
From: Dave Hansen
Date: Fri, 13 Dec 2024 10:51:32 -0800
Subject: [PATCH 142/224] x86/cpu: Move AMD erratum 1386 table over to 'x86_cpu_id'

The AMD erratum 1386 detection code uses an old-style 'x86_cpu_desc'
table. Replace it with 'x86_cpu_id' so the old style can be removed.

I did not create a new helper macro here. The new table is certainly
more noisy than the old and it can be improved on. But I was hesitant
to create a new macro just for a single site that is only two ugly
lines in the end.

Signed-off-by: Dave Hansen
Link: https://lore.kernel.org/all/20241213185132.07555E1D%40davehans-spike.ostc.intel.com
---
 arch/x86/kernel/cpu/amd.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index d8408aafeed9..7bb5b1ad18c2 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -795,10 +795,9 @@ static void init_amd_bd(struct cpuinfo_x86 *c)
 	clear_rdrand_cpuid_bit(c);
 }
 
-static const struct x86_cpu_desc erratum_1386_microcode[] = {
-	AMD_CPU_DESC(0x17, 0x1, 0x2, 0x0800126e),
-	AMD_CPU_DESC(0x17, 0x31, 0x0, 0x08301052),
-	{},
+static const struct x86_cpu_id erratum_1386_microcode[] = {
+	X86_MATCH_VFM_STEPS(VFM_MAKE(X86_VENDOR_AMD, 0x17, 0x01), 0x2, 0x2, 0x0800126e),
+	X86_MATCH_VFM_STEPS(VFM_MAKE(X86_VENDOR_AMD, 0x17, 0x31), 0x0, 0x0, 0x08301052),
 };
 
 static void fix_erratum_1386(struct cpuinfo_x86 *c)
@@ -814,7 +813,7 @@ static void fix_erratum_1386(struct cpuinfo_x86 *c)
 	 * Clear the feature flag only on microcode revisions which
 	 * don't have the fix.
 	 */
-	if (x86_cpu_has_min_microcode_rev(erratum_1386_microcode))
+	if (x86_match_min_microcode_rev(erratum_1386_microcode))
 		return;
 
 	clear_cpu_cap(c, X86_FEATURE_XSAVES);

From 5366d8965d35f0ea266c80e8970aa9527a9fee52 Mon Sep 17 00:00:00 2001
From: Dave Hansen
Date: Fri, 13 Dec 2024 10:51:33 -0800
Subject: [PATCH 143/224] x86/cpu: Remove 'x86_cpu_desc' infrastructure

All the users of 'x86_cpu_desc' are gone. Zap it from the tree.

Signed-off-by: Dave Hansen
Link: https://lore.kernel.org/all/20241213185133.AF0BF2BC%40davehans-spike.ostc.intel.com
---
 arch/x86/include/asm/cpu_device_id.h | 35 ----------------------------
 arch/x86/kernel/cpu/match.c          | 31 ------------------------
 2 files changed, 66 deletions(-)

diff --git a/arch/x86/include/asm/cpu_device_id.h b/arch/x86/include/asm/cpu_device_id.h
index 88564bb1845f..ba32e0f44cba 100644
--- a/arch/x86/include/asm/cpu_device_id.h
+++ b/arch/x86/include/asm/cpu_device_id.h
@@ -243,42 +243,7 @@
 			VFM_MODEL(vfm),					\
 			X86_STEPPING_ANY, feature, data)
 
-/*
- * Match specific microcode revisions.
- *
- * vendor/family/model/stepping must be all set.
- *
- * Only checks against the boot CPU.
When mixed-stepping configs are - * valid for a CPU model, add a quirk for every valid stepping and - * do the fine-tuning in the quirk handler. - */ - -struct x86_cpu_desc { - u8 x86_family; - u8 x86_vendor; - u8 x86_model; - u8 x86_stepping; - u32 x86_microcode_rev; -}; - -#define INTEL_CPU_DESC(vfm, stepping, revision) { \ - .x86_family = VFM_FAMILY(vfm), \ - .x86_vendor = VFM_VENDOR(vfm), \ - .x86_model = VFM_MODEL(vfm), \ - .x86_stepping = (stepping), \ - .x86_microcode_rev = (revision), \ -} - -#define AMD_CPU_DESC(fam, model, stepping, revision) { \ - .x86_family = (fam), \ - .x86_vendor = X86_VENDOR_AMD, \ - .x86_model = (model), \ - .x86_stepping = (stepping), \ - .x86_microcode_rev = (revision), \ -} - extern const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match); -extern bool x86_cpu_has_min_microcode_rev(const struct x86_cpu_desc *table); extern bool x86_match_min_microcode_rev(const struct x86_cpu_id *table); #endif /* _ASM_X86_CPU_DEVICE_ID */ diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c index 2de2a83a7e12..1a714f70be84 100644 --- a/arch/x86/kernel/cpu/match.c +++ b/arch/x86/kernel/cpu/match.c @@ -56,37 +56,6 @@ const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match) } EXPORT_SYMBOL(x86_match_cpu); -static const struct x86_cpu_desc * -x86_match_cpu_with_stepping(const struct x86_cpu_desc *match) -{ - struct cpuinfo_x86 *c = &boot_cpu_data; - const struct x86_cpu_desc *m; - - for (m = match; m->x86_family | m->x86_model; m++) { - if (c->x86_vendor != m->x86_vendor) - continue; - if (c->x86 != m->x86_family) - continue; - if (c->x86_model != m->x86_model) - continue; - if (c->x86_stepping != m->x86_stepping) - continue; - return m; - } - return NULL; -} - -bool x86_cpu_has_min_microcode_rev(const struct x86_cpu_desc *table) -{ - const struct x86_cpu_desc *res = x86_match_cpu_with_stepping(table); - - if (!res || res->x86_microcode_rev > boot_cpu_data.microcode) - return false; - - return true; -} -EXPORT_SYMBOL_GPL(x86_cpu_has_min_microcode_rev); - bool x86_match_min_microcode_rev(const struct x86_cpu_id *table) { const struct x86_cpu_id *res = x86_match_cpu(table); From 497f70284695bbb9b875e182554ef3f18b4a56e2 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 13 Dec 2024 12:50:28 -0800 Subject: [PATCH 144/224] x86/cpu: Move MWAIT leaf definition to common header Begin constructing a common place to keep all CPUID leaf definitions. Move CPUID_MWAIT_LEAF to the CPUID header and include it where needed. 
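For illustration, a minimal sketch of the consumer pattern this enables (mwait_leaf_present() is a made-up name; cpuid(), boot_cpu_data and CPUID5_ECX_EXTENSIONS_SUPPORTED are the existing kernel primitives used elsewhere in this series):

    #include <asm/cpuid.h>	/* now the home of CPUID_MWAIT_LEAF (0x5) */

    static bool mwait_leaf_present(void)
    {
    	unsigned int eax, ebx, ecx, edx;

    	if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
    		return false;
    	cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);
    	return ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED;
    }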
Signed-off-by: Dave Hansen Reviewed-by: Zhao Liu Link: https://lore.kernel.org/all/20241213205028.EE94D02A%40davehans-spike.ostc.intel.com --- arch/x86/include/asm/cpuid.h | 2 ++ arch/x86/include/asm/mwait.h | 1 - arch/x86/kernel/acpi/cstate.c | 1 + arch/x86/kernel/hpet.c | 1 + arch/x86/kernel/process.c | 1 + arch/x86/kernel/smpboot.c | 1 + arch/x86/xen/enlighten_pv.c | 1 + drivers/acpi/acpi_pad.c | 1 + drivers/idle/intel_idle.c | 1 + 9 files changed, 9 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/cpuid.h b/arch/x86/include/asm/cpuid.h index 239b9ba5c398..13ecab94cc23 100644 --- a/arch/x86/include/asm/cpuid.h +++ b/arch/x86/include/asm/cpuid.h @@ -21,6 +21,8 @@ enum cpuid_regs_idx { CPUID_EDX, }; +#define CPUID_MWAIT_LEAF 5 + #ifdef CONFIG_X86_32 bool have_cpuid_p(void); #else diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h index 920426d691ce..ce857ef54cf1 100644 --- a/arch/x86/include/asm/mwait.h +++ b/arch/x86/include/asm/mwait.h @@ -15,7 +15,6 @@ #define MWAIT_HINT2SUBSTATE(hint) ((hint) & MWAIT_CSTATE_MASK) #define MWAIT_C1_SUBSTATE_MASK 0xf0 -#define CPUID_MWAIT_LEAF 5 #define CPUID5_ECX_EXTENSIONS_SUPPORTED 0x1 #define CPUID5_ECX_INTERRUPT_BREAK 0x2 diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index f3ffd0a3a012..2779a93769e8 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -13,6 +13,7 @@ #include #include +#include #include #include diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index c96ae8fee95e..25935041fee1 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -7,6 +7,7 @@ #include #include +#include #include #include #include diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 58ead05a1c29..d40fc4965c14 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index b5a8f0891135..52b0d308eed9 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -64,6 +64,7 @@ #include #include +#include #include #include #include diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index fd2169063480..b355070e92fa 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -49,6 +49,7 @@ #include #include +#include #include #include #include diff --git a/drivers/acpi/acpi_pad.c b/drivers/acpi/acpi_pad.c index 4ec20fd56985..b56197467352 100644 --- a/drivers/acpi/acpi_pad.c +++ b/drivers/acpi/acpi_pad.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index ac4d8faa3886..5d8ed1a68447 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -51,6 +51,7 @@ #include #include #include +#include #include #include #include From 8bd6821c9cf3b81d3c07a94fa4e3f97a3cc7b724 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 13 Dec 2024 12:50:29 -0800 Subject: [PATCH 145/224] x86/cpu: Use MWAIT leaf definition The leaf-to-feature dependency array uses hard-coded leaf numbers. Use the new common header definition for the MWAIT leaf. 
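For background on why this table entry deserves a symbolic name, a simplified sketch of how the dependency table is consumed at boot (an approximation only; the real filter in common.c also copes with extended leaves and warns when it clears a feature):

    const struct cpuid_dependent_feature *df;

    for (df = cpuid_dependent_features; df->feature; df++) {
    	if (!cpu_has(c, df->feature))
    		continue;
    	/* The CPU does not report the leaf this feature depends on: clear it */
    	if (boot_cpu_data.cpuid_level < df->level)
    		clear_cpu_cap(c, df->feature);
    }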
Signed-off-by: Dave Hansen Reviewed-by: Zhao Liu Link: https://lore.kernel.org/all/20241213205029.5B055D6E%40davehans-spike.ostc.intel.com --- arch/x86/kernel/cpu/common.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index d21b352bcd72..853e373d2829 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -29,6 +29,7 @@ #include #include +#include #include #include #include @@ -636,7 +637,7 @@ struct cpuid_dependent_feature { static const struct cpuid_dependent_feature cpuid_dependent_features[] = { - { X86_FEATURE_MWAIT, 0x00000005 }, + { X86_FEATURE_MWAIT, CPUID_MWAIT_LEAF }, { X86_FEATURE_DCA, 0x00000009 }, { X86_FEATURE_XSAVE, 0x0000000d }, { 0, 0 } From 262fba55708b60a063b30d103963477dc5026f8c Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 13 Dec 2024 12:50:30 -0800 Subject: [PATCH 146/224] x86/cpu: Remove unnecessary MWAIT leaf checks The CPUID leaf dependency checker will remove X86_FEATURE_MWAIT if the CPUID level is below the required level (CPUID_MWAIT_LEAF). Thus, if you check X86_FEATURE_MWAIT you do not need to also check the CPUID level. Signed-off-by: Dave Hansen Link: https://lore.kernel.org/all/20241213205030.9B42B458%40davehans-spike.ostc.intel.com --- arch/x86/kernel/hpet.c | 3 --- arch/x86/kernel/smpboot.c | 2 -- drivers/acpi/acpi_pad.c | 2 -- drivers/idle/intel_idle.c | 3 --- 4 files changed, 10 deletions(-) diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 25935041fee1..953de5b64669 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -928,9 +928,6 @@ static bool __init mwait_pc10_supported(void) if (!cpu_feature_enabled(X86_FEATURE_MWAIT)) return false; - if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF) - return false; - cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &mwait_substates); return (ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) && diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 52b0d308eed9..116c46f9ecbb 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1292,8 +1292,6 @@ static inline void mwait_play_dead(void) return; if (!this_cpu_has(X86_FEATURE_CLFLUSH)) return; - if (__this_cpu_read(cpu_info.cpuid_level) < CPUID_MWAIT_LEAF) - return; eax = CPUID_MWAIT_LEAF; ecx = 0; diff --git a/drivers/acpi/acpi_pad.c b/drivers/acpi/acpi_pad.c index b56197467352..f3cffae0c14e 100644 --- a/drivers/acpi/acpi_pad.c +++ b/drivers/acpi/acpi_pad.c @@ -47,8 +47,6 @@ static void power_saving_mwait_init(void) if (!boot_cpu_has(X86_FEATURE_MWAIT)) return; - if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF) - return; cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx); diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 5d8ed1a68447..efa32d28ed93 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -2317,9 +2317,6 @@ static int __init intel_idle_init(void) return -ENODEV; } - if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF) - return -ENODEV; - cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &mwait_substates); if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) || From 5d82d8e0a9ac06bfc6ac59407b96bc357eff441a Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 13 Dec 2024 12:50:32 -0800 Subject: [PATCH 147/224] x86/cpu: Refresh DCA leaf reading code The DCA leaf number is also hard-coded in the CPUID level dependency table. Move its definition to common code and use it. While at it, fix up the naming and types in the probe code.
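The reworked probe boils down to the following (condensed from the dca.c hunk further below):

    u32 eax = cpuid_eax(CPUID_DCA_LEAF);

    if (!(eax & BIT(0)))
    	dev_dbg(&pdev->dev, "DCA is disabled in BIOS\n");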
All CPUID data is provided in 32-bit registers, not 'unsigned long'. Also stop referring to "level_9". Move away from test_bit() because the type is no longer an 'unsigned long'. Signed-off-by: Dave Hansen Reviewed-by: Zhao Liu Link: https://lore.kernel.org/all/20241213205032.476A30FE%40davehans-spike.ostc.intel.com --- arch/x86/include/asm/cpuid.h | 3 ++- arch/x86/kernel/cpu/common.c | 2 +- drivers/dma/ioat/dca.c | 8 +++++--- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/cpuid.h b/arch/x86/include/asm/cpuid.h index 13ecab94cc23..8ba4d9fdc9e7 100644 --- a/arch/x86/include/asm/cpuid.h +++ b/arch/x86/include/asm/cpuid.h @@ -21,7 +21,8 @@ enum cpuid_regs_idx { CPUID_EDX, }; -#define CPUID_MWAIT_LEAF 5 +#define CPUID_MWAIT_LEAF 0x5 +#define CPUID_DCA_LEAF 0x9 #ifdef CONFIG_X86_32 bool have_cpuid_p(void); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 853e373d2829..5ffa1f4eac38 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -638,7 +638,7 @@ struct cpuid_dependent_feature { static const struct cpuid_dependent_feature cpuid_dependent_features[] = { { X86_FEATURE_MWAIT, CPUID_MWAIT_LEAF }, - { X86_FEATURE_DCA, 0x00000009 }, + { X86_FEATURE_DCA, CPUID_DCA_LEAF }, { X86_FEATURE_XSAVE, 0x0000000d }, { 0, 0 } }; diff --git a/drivers/dma/ioat/dca.c b/drivers/dma/ioat/dca.c index 17f6b6367113..658ea2ec36f7 100644 --- a/drivers/dma/ioat/dca.c +++ b/drivers/dma/ioat/dca.c @@ -10,6 +10,8 @@ #include #include +#include + /* either a kernel change is needed, or we need something like this in kernel */ #ifndef CONFIG_SMP #include @@ -58,11 +60,11 @@ static int dca_enabled_in_bios(struct pci_dev *pdev) { /* CPUID level 9 returns DCA configuration */ /* Bit 0 indicates DCA enabled by the BIOS */ - unsigned long cpuid_level_9; + u32 eax; int res; - cpuid_level_9 = cpuid_eax(9); - res = test_bit(0, &cpuid_level_9); + eax = cpuid_eax(CPUID_DCA_LEAF); + res = eax & BIT(0); if (!res) dev_dbg(&pdev->dev, "DCA is disabled in BIOS\n"); From a86740a77bf0942e618cc5f022336cdd99530d10 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 13 Dec 2024 12:50:33 -0800 Subject: [PATCH 148/224] x86/cpu: Move TSC CPUID leaf definition Prepare to use the TSC CPUID leaf definition more widely by moving it to the common header. 
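As a reminder of what this leaf carries (per the SDM, EAX is the denominator and EBX the numerator of the TSC/core-crystal-clock ratio, and ECX is the crystal frequency in Hz when enumerated), the standard calculation is sketched below; a zero in any field means "not enumerated":

    unsigned int den, num, crystal_hz, edx;
    u64 tsc_hz;	/* a local, for illustration only */

    cpuid(CPUID_TSC_LEAF, &den, &num, &crystal_hz, &edx);
    if (den && num && crystal_hz)
    	tsc_hz = div64_u64((u64)crystal_hz * num, den);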
Signed-off-by: Dave Hansen Reviewed-by: Zhao Liu Link: https://lore.kernel.org/all/20241213205033.68799E53%40davehans-spike.ostc.intel.com --- arch/x86/events/intel/pt.c | 1 + arch/x86/events/intel/pt.h | 3 --- arch/x86/include/asm/cpuid.h | 1 + 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index 4b0373bc8ab4..608145566fca 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -18,6 +18,7 @@ #include #include +#include #include #include #include diff --git a/arch/x86/events/intel/pt.h b/arch/x86/events/intel/pt.h index 7ee94fc6d7cb..2ac36250b656 100644 --- a/arch/x86/events/intel/pt.h +++ b/arch/x86/events/intel/pt.h @@ -37,9 +37,6 @@ struct topa_entry { u64 rsvd4 : 12; }; -/* TSC to Core Crystal Clock Ratio */ -#define CPUID_TSC_LEAF 0x15 - struct pt_pmu { struct pmu pmu; u32 caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES]; diff --git a/arch/x86/include/asm/cpuid.h b/arch/x86/include/asm/cpuid.h index 8ba4d9fdc9e7..9b0d14bfd2f2 100644 --- a/arch/x86/include/asm/cpuid.h +++ b/arch/x86/include/asm/cpuid.h @@ -23,6 +23,7 @@ enum cpuid_regs_idx { #define CPUID_MWAIT_LEAF 0x5 #define CPUID_DCA_LEAF 0x9 +#define CPUID_TSC_LEAF 0x15 #ifdef CONFIG_X86_32 bool have_cpuid_p(void); From 030c15b5610cedf7eb428dab5382f73d492a7967 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 13 Dec 2024 12:50:34 -0800 Subject: [PATCH 149/224] x86/tsc: Move away from TSC leaf magic numbers The TSC code has a bunch of hard-coded references to leaf 0x15. Change them over to the symbolic name. Also zap the 'ART_CPUID_LEAF' definition. It was a duplicate of 'CPUID_TSC_LEAF'. Signed-off-by: Dave Hansen Link: https://lore.kernel.org/all/20241213205034.B79D6224%40davehans-spike.ostc.intel.com --- arch/x86/kernel/tsc.c | 11 +++++------ drivers/platform/x86/intel/pmc/core.c | 7 ++++--- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 67aeaba4ba9c..8091b0ea7de2 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -665,13 +666,13 @@ unsigned long native_calibrate_tsc(void) if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) return 0; - if (boot_cpu_data.cpuid_level < 0x15) + if (boot_cpu_data.cpuid_level < CPUID_TSC_LEAF) return 0; eax_denominator = ebx_numerator = ecx_hz = edx = 0; /* CPUID 15H TSC/Crystal ratio, plus optionally Crystal Hz */ - cpuid(0x15, &eax_denominator, &ebx_numerator, &ecx_hz, &edx); + cpuid(CPUID_TSC_LEAF, &eax_denominator, &ebx_numerator, &ecx_hz, &edx); if (ebx_numerator == 0 || eax_denominator == 0) return 0; @@ -1067,10 +1068,8 @@ core_initcall(cpufreq_register_tsc_scaling); #endif /* CONFIG_CPU_FREQ */ -#define ART_CPUID_LEAF (0x15) #define ART_MIN_DENOMINATOR (1) - /* * If ART is present detect the numerator:denominator to convert to TSC */ @@ -1078,7 +1077,7 @@ static void __init detect_art(void) { unsigned int unused; - if (boot_cpu_data.cpuid_level < ART_CPUID_LEAF) + if (boot_cpu_data.cpuid_level < CPUID_TSC_LEAF) return; /* @@ -1091,7 +1090,7 @@ static void __init detect_art(void) tsc_async_resets) return; - cpuid(ART_CPUID_LEAF, &art_base_clk.denominator, + cpuid(CPUID_TSC_LEAF, &art_base_clk.denominator, &art_base_clk.numerator, &art_base_clk.freq_khz, &unused); art_base_clk.freq_khz /= KHZ; diff --git a/drivers/platform/x86/intel/pmc/core.c b/drivers/platform/x86/intel/pmc/core.c index 3e7f99ac8c94..ac8231e2f0c6 100644 --- 
a/drivers/platform/x86/intel/pmc/core.c +++ b/drivers/platform/x86/intel/pmc/core.c @@ -22,6 +22,7 @@ #include #include +#include #include #include #include @@ -935,13 +936,13 @@ static unsigned int pmc_core_get_crystal_freq(void) { unsigned int eax_denominator, ebx_numerator, ecx_hz, edx; - if (boot_cpu_data.cpuid_level < 0x15) + if (boot_cpu_data.cpuid_level < CPUID_TSC_LEAF) return 0; eax_denominator = ebx_numerator = ecx_hz = edx = 0; - /* CPUID 15H TSC/Crystal ratio, plus optionally Crystal Hz */ - cpuid(0x15, &eax_denominator, &ebx_numerator, &ecx_hz, &edx); + /* TSC/Crystal ratio, plus optionally Crystal Hz */ + cpuid(CPUID_TSC_LEAF, &eax_denominator, &ebx_numerator, &ecx_hz, &edx); if (ebx_numerator == 0 || eax_denominator == 0) return 0; From e558eadf6bd6199a3f454299de3c6338931d4e46 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 13 Dec 2024 12:50:36 -0800 Subject: [PATCH 150/224] x86/tsc: Remove CPUID "frequency" leaf magic numbers. All the code that reads the CPUID frequency information leaf hard-codes a magic number. Give it a symbolic name and use it. Signed-off-by: Dave Hansen Reviewed-by: Zhao Liu Link: https://lore.kernel.org/all/20241213205036.4397658F%40davehans-spike.ostc.intel.com --- arch/x86/include/asm/cpuid.h | 1 + arch/x86/kernel/tsc.c | 12 ++++++------ 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/cpuid.h b/arch/x86/include/asm/cpuid.h index 9b0d14bfd2f2..e7803c21a3fd 100644 --- a/arch/x86/include/asm/cpuid.h +++ b/arch/x86/include/asm/cpuid.h @@ -24,6 +24,7 @@ enum cpuid_regs_idx { #define CPUID_MWAIT_LEAF 0x5 #define CPUID_DCA_LEAF 0x9 #define CPUID_TSC_LEAF 0x15 +#define CPUID_FREQ_LEAF 0x16 #ifdef CONFIG_X86_32 bool have_cpuid_p(void); diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 8091b0ea7de2..678c36f5cd4a 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -681,8 +681,8 @@ unsigned long native_calibrate_tsc(void) /* * Denverton SoCs don't report crystal clock, and also don't support - * CPUID.0x16 for the calculation below, so hardcode the 25MHz crystal - * clock. + * CPUID_FREQ_LEAF for the calculation below, so hardcode the 25MHz + * crystal clock. */ if (crystal_khz == 0 && boot_cpu_data.x86_vfm == INTEL_ATOM_GOLDMONT_D) @@ -701,10 +701,10 @@ unsigned long native_calibrate_tsc(void) * clock, but we can easily calculate it to a high degree of accuracy * by considering the crystal ratio and the CPU speed. */ - if (crystal_khz == 0 && boot_cpu_data.cpuid_level >= 0x16) { + if (crystal_khz == 0 && boot_cpu_data.cpuid_level >= CPUID_FREQ_LEAF) { unsigned int eax_base_mhz, ebx, ecx, edx; - cpuid(0x16, &eax_base_mhz, &ebx, &ecx, &edx); + cpuid(CPUID_FREQ_LEAF, &eax_base_mhz, &ebx, &ecx, &edx); crystal_khz = eax_base_mhz * 1000 * eax_denominator / ebx_numerator; } @@ -739,12 +739,12 @@ static unsigned long cpu_khz_from_cpuid(void) if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) return 0; - if (boot_cpu_data.cpuid_level < 0x16) + if (boot_cpu_data.cpuid_level < CPUID_FREQ_LEAF) return 0; eax_base_mhz = ebx_max_mhz = ecx_bus_mhz = edx = 0; - cpuid(0x16, &eax_base_mhz, &ebx_max_mhz, &ecx_bus_mhz, &edx); + cpuid(CPUID_FREQ_LEAF, &eax_base_mhz, &ebx_max_mhz, &ecx_bus_mhz, &edx); return eax_base_mhz * 1000; } From 754aaac3bbf13bdbbed9da94b56f371e90fd9c96 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 13 Dec 2024 12:50:37 -0800 Subject: [PATCH 151/224] x86/fpu: Move CPUID leaf definitions to common code Move the XSAVE-related CPUID leaf definitions to common code. 
Then, use the new definition to remove the last magic number from the CPUID level dependency table. Signed-off-by: Dave Hansen Reviewed-by: Zhao Liu Link: https://lore.kernel.org/all/20241213205037.43C57CDE%40davehans-spike.ostc.intel.com --- arch/x86/include/asm/cpuid.h | 2 ++ arch/x86/include/asm/fpu/xstate.h | 4 ---- arch/x86/kernel/cpu/common.c | 2 +- arch/x86/kernel/fpu/xstate.c | 1 + 4 files changed, 4 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/cpuid.h b/arch/x86/include/asm/cpuid.h index e7803c21a3fd..a86097eb26c9 100644 --- a/arch/x86/include/asm/cpuid.h +++ b/arch/x86/include/asm/cpuid.h @@ -23,8 +23,10 @@ enum cpuid_regs_idx { #define CPUID_MWAIT_LEAF 0x5 #define CPUID_DCA_LEAF 0x9 +#define XSTATE_CPUID 0x0d #define CPUID_TSC_LEAF 0x15 #define CPUID_FREQ_LEAF 0x16 +#define TILE_CPUID 0x1d #ifdef CONFIG_X86_32 bool have_cpuid_p(void); diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index d4427b88ee12..7f39fe7980c5 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -12,10 +12,6 @@ /* Bit 63 of XCR0 is reserved for future expansion */ #define XFEATURE_MASK_EXTEND (~(XFEATURE_MASK_FPSSE | (1ULL << 63))) -#define XSTATE_CPUID 0x0000000d - -#define TILE_CPUID 0x0000001d - #define FXSAVE_SIZE 512 #define XSAVE_HDR_SIZE 64 diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 5ffa1f4eac38..f5c33e155f98 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -639,7 +639,7 @@ static const struct cpuid_dependent_feature cpuid_dependent_features[] = { { X86_FEATURE_MWAIT, CPUID_MWAIT_LEAF }, { X86_FEATURE_DCA, CPUID_DCA_LEAF }, - { X86_FEATURE_XSAVE, 0x0000000d }, + { X86_FEATURE_XSAVE, XSTATE_CPUID }, { 0, 0 } }; diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 22abb5ee0cf2..bf38b3e75425 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -20,6 +20,7 @@ #include #include +#include #include #include #include From 588e148d8babeb2fd863fb152b80548e18971caf Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 13 Dec 2024 12:50:38 -0800 Subject: [PATCH 152/224] x86/fpu: Remove unnecessary CPUID level check The CPUID level dependency table will entirely zap X86_FEATURE_XSAVE if the CPUID level is too low. This code is unreachable. Kill it. Signed-off-by: Dave Hansen Reviewed-by: Chang S. Bae Link: https://lore.kernel.org/all/20241213205038.6E71F9A4%40davehans-spike.ostc.intel.com --- arch/x86/kernel/fpu/xstate.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index bf38b3e75425..edacd34c1a22 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -764,11 +764,6 @@ void __init fpu__init_system_xstate(unsigned int legacy_size) return; } - if (boot_cpu_data.cpuid_level < XSTATE_CPUID) { - WARN_ON_FPU(1); - return; - } - /* * Find user xstates supported by the processor. */ From e5d3a57891ba500503df075b99b78d6e61f2694e Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 13 Dec 2024 12:50:40 -0800 Subject: [PATCH 153/224] x86/cpu: Make all CPUID leaf names consistent The leaf names are not consistent. Give them all a CPUID_LEAF_ prefix for consistency and vertical alignment.
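For quick reference, the renames performed below are: CPUID_MWAIT_LEAF -> CPUID_LEAF_MWAIT, CPUID_DCA_LEAF -> CPUID_LEAF_DCA, XSTATE_CPUID -> CPUID_LEAF_XSTATE, CPUID_TSC_LEAF -> CPUID_LEAF_TSC, CPUID_FREQ_LEAF -> CPUID_LEAF_FREQ and TILE_CPUID -> CPUID_LEAF_TILE.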
Signed-off-by: Dave Hansen Acked-by: Dave Jiang # for ioatdma bits Link: https://lore.kernel.org/all/20241213205040.7B0C3241%40davehans-spike.ostc.intel.com --- arch/x86/events/intel/pt.c | 4 ++-- arch/x86/include/asm/cpuid.h | 12 ++++++------ arch/x86/kernel/acpi/cstate.c | 4 ++-- arch/x86/kernel/cpu/common.c | 6 +++--- arch/x86/kernel/fpu/xstate.c | 20 ++++++++++---------- arch/x86/kernel/hpet.c | 2 +- arch/x86/kernel/process.c | 2 +- arch/x86/kernel/smpboot.c | 2 +- arch/x86/kernel/tsc.c | 18 +++++++++--------- arch/x86/xen/enlighten_pv.c | 4 ++-- drivers/acpi/acpi_pad.c | 2 +- drivers/dma/ioat/dca.c | 2 +- drivers/idle/intel_idle.c | 2 +- drivers/platform/x86/intel/pmc/core.c | 4 ++-- 14 files changed, 42 insertions(+), 42 deletions(-) diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index 608145566fca..fa37565f6418 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -202,10 +202,10 @@ static int __init pt_pmu_hw_init(void) * otherwise, zero for numerator stands for "not enumerated" * as per SDM */ - if (boot_cpu_data.cpuid_level >= CPUID_TSC_LEAF) { + if (boot_cpu_data.cpuid_level >= CPUID_LEAF_TSC) { u32 eax, ebx, ecx, edx; - cpuid(CPUID_TSC_LEAF, &eax, &ebx, &ecx, &edx); + cpuid(CPUID_LEAF_TSC, &eax, &ebx, &ecx, &edx); pt_pmu.tsc_art_num = ebx; pt_pmu.tsc_art_den = eax; diff --git a/arch/x86/include/asm/cpuid.h b/arch/x86/include/asm/cpuid.h index a86097eb26c9..b2b9b4ef3dae 100644 --- a/arch/x86/include/asm/cpuid.h +++ b/arch/x86/include/asm/cpuid.h @@ -21,12 +21,12 @@ enum cpuid_regs_idx { CPUID_EDX, }; -#define CPUID_MWAIT_LEAF 0x5 -#define CPUID_DCA_LEAF 0x9 -#define XSTATE_CPUID 0x0d -#define CPUID_TSC_LEAF 0x15 -#define CPUID_FREQ_LEAF 0x16 -#define TILE_CPUID 0x1d +#define CPUID_LEAF_MWAIT 0x5 +#define CPUID_LEAF_DCA 0x9 +#define CPUID_LEAF_XSTATE 0x0d +#define CPUID_LEAF_TSC 0x15 +#define CPUID_LEAF_FREQ 0x16 +#define CPUID_LEAF_TILE 0x1d #ifdef CONFIG_X86_32 bool have_cpuid_p(void); diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index 2779a93769e8..5854f0b8f0f1 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -129,7 +129,7 @@ static long acpi_processor_ffh_cstate_probe_cpu(void *_cx) unsigned int cstate_type; /* C-state type and not ACPI C-state type */ unsigned int num_cstate_subtype; - cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx); + cpuid(CPUID_LEAF_MWAIT, &eax, &ebx, &ecx, &edx); /* Check whether this particular cx_type (in CST) is supported or not */ cstate_type = (((cx->address >> MWAIT_SUBSTATE_SIZE) & @@ -173,7 +173,7 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu, struct cpuinfo_x86 *c = &cpu_data(cpu); long retval; - if (!cpu_cstate_entry || c->cpuid_level < CPUID_MWAIT_LEAF) + if (!cpu_cstate_entry || c->cpuid_level < CPUID_LEAF_MWAIT) return -1; if (reg->bit_offset != NATIVE_CSTATE_BEYOND_HALT) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index f5c33e155f98..2bdb9e032892 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -637,9 +637,9 @@ struct cpuid_dependent_feature { static const struct cpuid_dependent_feature cpuid_dependent_features[] = { - { X86_FEATURE_MWAIT, CPUID_MWAIT_LEAF }, - { X86_FEATURE_DCA, CPUID_DCA_LEAF }, - { X86_FEATURE_XSAVE, XSTATE_CPUID }, + { X86_FEATURE_MWAIT, CPUID_LEAF_MWAIT }, + { X86_FEATURE_DCA, CPUID_LEAF_DCA }, + { X86_FEATURE_XSAVE, CPUID_LEAF_XSTATE }, { 0, 0 } }; diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 
edacd34c1a22..27417b685c1d 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -233,7 +233,7 @@ static void __init setup_xstate_cache(void) xmm_space); for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) { - cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx); + cpuid_count(CPUID_LEAF_XSTATE, i, &eax, &ebx, &ecx, &edx); xstate_sizes[i] = eax; xstate_flags[i] = ecx; @@ -399,7 +399,7 @@ int xfeature_size(int xfeature_nr) u32 eax, ebx, ecx, edx; CHECK_XFEATURE(xfeature_nr); - cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx); + cpuid_count(CPUID_LEAF_XSTATE, xfeature_nr, &eax, &ebx, &ecx, &edx); return eax; } @@ -442,9 +442,9 @@ static void __init __xstate_dump_leaves(void) * just in case there are some goodies up there */ for (i = 0; i < XFEATURE_MAX + 10; i++) { - cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx); + cpuid_count(CPUID_LEAF_XSTATE, i, &eax, &ebx, &ecx, &edx); pr_warn("CPUID[%02x, %02x]: eax=%08x ebx=%08x ecx=%08x edx=%08x\n", - XSTATE_CPUID, i, eax, ebx, ecx, edx); + CPUID_LEAF_XSTATE, i, eax, ebx, ecx, edx); } } @@ -485,7 +485,7 @@ static int __init check_xtile_data_against_struct(int size) * Check the maximum palette id: * eax: the highest numbered palette subleaf. */ - cpuid_count(TILE_CPUID, 0, &max_palid, &ebx, &ecx, &edx); + cpuid_count(CPUID_LEAF_TILE, 0, &max_palid, &ebx, &ecx, &edx); /* * Cross-check each tile size and find the maximum number of @@ -499,7 +499,7 @@ static int __init check_xtile_data_against_struct(int size) * eax[31:16]: bytes per title * ebx[31:16]: the max names (or max number of tiles) */ - cpuid_count(TILE_CPUID, palid, &eax, &ebx, &edx, &edx); + cpuid_count(CPUID_LEAF_TILE, palid, &eax, &ebx, &edx, &edx); tile_size = eax >> 16; max = ebx >> 16; @@ -634,7 +634,7 @@ static unsigned int __init get_compacted_size(void) * are no supervisor states, but XSAVEC still uses compacted * format. */ - cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx); + cpuid_count(CPUID_LEAF_XSTATE, 1, &eax, &ebx, &ecx, &edx); return ebx; } @@ -675,7 +675,7 @@ static unsigned int __init get_xsave_size_user(void) * containing all the *user* state components * corresponding to bits currently set in XCR0. */ - cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); + cpuid_count(CPUID_LEAF_XSTATE, 0, &eax, &ebx, &ecx, &edx); return ebx; } @@ -767,13 +767,13 @@ void __init fpu__init_system_xstate(unsigned int legacy_size) /* * Find user xstates supported by the processor. */ - cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); + cpuid_count(CPUID_LEAF_XSTATE, 0, &eax, &ebx, &ecx, &edx); fpu_kernel_cfg.max_features = eax + ((u64)edx << 32); /* * Find supervisor xstates supported by the processor. 
*/ - cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx); + cpuid_count(CPUID_LEAF_XSTATE, 1, &eax, &ebx, &ecx, &edx); fpu_kernel_cfg.max_features |= ecx + ((u64)edx << 32); if ((fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) { diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 953de5b64669..2b1a62baca55 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -928,7 +928,7 @@ static bool __init mwait_pc10_supported(void) if (!cpu_feature_enabled(X86_FEATURE_MWAIT)) return false; - cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &mwait_substates); + cpuid(CPUID_LEAF_MWAIT, &eax, &ebx, &ecx, &mwait_substates); return (ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) && (ecx & CPUID5_ECX_INTERRUPT_BREAK) && diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index d40fc4965c14..69f786791f1a 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -878,7 +878,7 @@ static __init bool prefer_mwait_c1_over_halt(void) if (boot_cpu_has_bug(X86_BUG_MONITOR) || boot_cpu_has_bug(X86_BUG_AMD_APIC_C1E)) return false; - cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx); + cpuid(CPUID_LEAF_MWAIT, &eax, &ebx, &ecx, &edx); /* * If MWAIT extensions are not available, it is safe to use MWAIT diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 116c46f9ecbb..0e3f9bad0395 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1293,7 +1293,7 @@ static inline void mwait_play_dead(void) if (!this_cpu_has(X86_FEATURE_CLFLUSH)) return; - eax = CPUID_MWAIT_LEAF; + eax = CPUID_LEAF_MWAIT; ecx = 0; native_cpuid(&eax, &ebx, &ecx, &edx); diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 678c36f5cd4a..a85594644e13 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -666,13 +666,13 @@ unsigned long native_calibrate_tsc(void) if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) return 0; - if (boot_cpu_data.cpuid_level < CPUID_TSC_LEAF) + if (boot_cpu_data.cpuid_level < CPUID_LEAF_TSC) return 0; eax_denominator = ebx_numerator = ecx_hz = edx = 0; /* CPUID 15H TSC/Crystal ratio, plus optionally Crystal Hz */ - cpuid(CPUID_TSC_LEAF, &eax_denominator, &ebx_numerator, &ecx_hz, &edx); + cpuid(CPUID_LEAF_TSC, &eax_denominator, &ebx_numerator, &ecx_hz, &edx); if (ebx_numerator == 0 || eax_denominator == 0) return 0; @@ -681,7 +681,7 @@ unsigned long native_calibrate_tsc(void) /* * Denverton SoCs don't report crystal clock, and also don't support - * CPUID_FREQ_LEAF for the calculation below, so hardcode the 25MHz + * CPUID_LEAF_FREQ for the calculation below, so hardcode the 25MHz * crystal clock. */ if (crystal_khz == 0 && @@ -701,10 +701,10 @@ unsigned long native_calibrate_tsc(void) * clock, but we can easily calculate it to a high degree of accuracy * by considering the crystal ratio and the CPU speed. 
*/ - if (crystal_khz == 0 && boot_cpu_data.cpuid_level >= CPUID_FREQ_LEAF) { + if (crystal_khz == 0 && boot_cpu_data.cpuid_level >= CPUID_LEAF_FREQ) { unsigned int eax_base_mhz, ebx, ecx, edx; - cpuid(CPUID_FREQ_LEAF, &eax_base_mhz, &ebx, &ecx, &edx); + cpuid(CPUID_LEAF_FREQ, &eax_base_mhz, &ebx, &ecx, &edx); crystal_khz = eax_base_mhz * 1000 * eax_denominator / ebx_numerator; } @@ -739,12 +739,12 @@ static unsigned long cpu_khz_from_cpuid(void) if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) return 0; - if (boot_cpu_data.cpuid_level < CPUID_FREQ_LEAF) + if (boot_cpu_data.cpuid_level < CPUID_LEAF_FREQ) return 0; eax_base_mhz = ebx_max_mhz = ecx_bus_mhz = edx = 0; - cpuid(CPUID_FREQ_LEAF, &eax_base_mhz, &ebx_max_mhz, &ecx_bus_mhz, &edx); + cpuid(CPUID_LEAF_FREQ, &eax_base_mhz, &ebx_max_mhz, &ecx_bus_mhz, &edx); return eax_base_mhz * 1000; } @@ -1077,7 +1077,7 @@ static void __init detect_art(void) { unsigned int unused; - if (boot_cpu_data.cpuid_level < CPUID_TSC_LEAF) + if (boot_cpu_data.cpuid_level < CPUID_LEAF_TSC) return; /* @@ -1090,7 +1090,7 @@ static void __init detect_art(void) tsc_async_resets) return; - cpuid(CPUID_TSC_LEAF, &art_base_clk.denominator, + cpuid(CPUID_LEAF_TSC, &art_base_clk.denominator, &art_base_clk.numerator, &art_base_clk.freq_khz, &unused); art_base_clk.freq_khz /= KHZ; diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index b355070e92fa..55727d5c4da9 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -231,7 +231,7 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx, or_ebx = smp_processor_id() << 24; break; - case CPUID_MWAIT_LEAF: + case CPUID_LEAF_MWAIT: /* Synthesize the values.. */ *ax = 0; *bx = 0; @@ -301,7 +301,7 @@ static bool __init xen_check_mwait(void) * ecx and edx. The hypercall provides only partial information. 
*/ - ax = CPUID_MWAIT_LEAF; + ax = CPUID_LEAF_MWAIT; bx = 0; cx = 0; dx = 0; diff --git a/drivers/acpi/acpi_pad.c b/drivers/acpi/acpi_pad.c index f3cffae0c14e..3fde4496f8a2 100644 --- a/drivers/acpi/acpi_pad.c +++ b/drivers/acpi/acpi_pad.c @@ -48,7 +48,7 @@ static void power_saving_mwait_init(void) if (!boot_cpu_has(X86_FEATURE_MWAIT)) return; - cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx); + cpuid(CPUID_LEAF_MWAIT, &eax, &ebx, &ecx, &edx); if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) || !(ecx & CPUID5_ECX_INTERRUPT_BREAK)) diff --git a/drivers/dma/ioat/dca.c b/drivers/dma/ioat/dca.c index 658ea2ec36f7..c9aba2304de7 100644 --- a/drivers/dma/ioat/dca.c +++ b/drivers/dma/ioat/dca.c @@ -63,7 +63,7 @@ static int dca_enabled_in_bios(struct pci_dev *pdev) u32 eax; int res; - eax = cpuid_eax(CPUID_DCA_LEAF); + eax = cpuid_eax(CPUID_LEAF_DCA); res = eax & BIT(0); if (!res) dev_dbg(&pdev->dev, "DCA is disabled in BIOS\n"); diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index efa32d28ed93..239ce0d046f3 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -2317,7 +2317,7 @@ static int __init intel_idle_init(void) return -ENODEV; } - cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &mwait_substates); + cpuid(CPUID_LEAF_MWAIT, &eax, &ebx, &ecx, &mwait_substates); if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) || !(ecx & CPUID5_ECX_INTERRUPT_BREAK) || diff --git a/drivers/platform/x86/intel/pmc/core.c b/drivers/platform/x86/intel/pmc/core.c index ac8231e2f0c6..10f04b944117 100644 --- a/drivers/platform/x86/intel/pmc/core.c +++ b/drivers/platform/x86/intel/pmc/core.c @@ -936,13 +936,13 @@ static unsigned int pmc_core_get_crystal_freq(void) { unsigned int eax_denominator, ebx_numerator, ecx_hz, edx; - if (boot_cpu_data.cpuid_level < CPUID_TSC_LEAF) + if (boot_cpu_data.cpuid_level < CPUID_LEAF_TSC) return 0; eax_denominator = ebx_numerator = ecx_hz = edx = 0; /* TSC/Crystal ratio, plus optionally Crystal Hz */ - cpuid(CPUID_TSC_LEAF, &eax_denominator, &ebx_numerator, &ecx_hz, &edx); + cpuid(CPUID_LEAF_TSC, &eax_denominator, &ebx_numerator, &ecx_hz, &edx); if (ebx_numerator == 0 || eax_denominator == 0) return 0; From e8aa393b0ada3b5ce1b3e8475b02e90e5dce6841 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Sun, 15 Dec 2024 04:15:42 +1300 Subject: [PATCH 154/224] x86/virt/tdx: Rename 'struct tdx_tdmr_sysinfo' to reflect the spec better The TDX module provides a set of "Global Metadata Fields". They report things like TDX module version, supported features, and fields related to create/run TDX guests and so on. TDX organizes those metadata fields by "Classes" based on the meaning of those fields. E.g., for now the kernel only reads "TD Memory Region" (TDMR) related fields for module initialization. Those fields are defined under class "TDMR Info". Today the kernel reads some of the global metadata to initialize the TDX module. KVM will need to read additional metadata fields to run TDX guests. Move towards having the TDX host core-kernel provide a centralized, canonical, and immutable structure for the global metadata that comes out from the TDX module for all kernel components to use. More specifically, prepare the code to end up with an organization like: struct tdx_sys_info { struct tdx_sys_info_classA a; struct tdx_sys_info_classB b; ... }; Currently the kernel organizes all fields under "TDMR Info" class in 'struct tdx_tdmr_sysinfo'. Prepare for the above by renaming the structure to 'struct tdx_sys_info_tdmr' to follow the class name better. 
No functional change intended. Signed-off-by: Kai Huang Signed-off-by: Dave Hansen Reviewed-by: Adrian Hunter Reviewed-by: Dan Williams Link: https://lore.kernel.org/all/de165d09e0b571cfeb119a368f4be6e2888ebb93.1734188033.git.kai.huang%40intel.com --- arch/x86/virt/vmx/tdx/tdx.c | 36 ++++++++++++++++++------------------ arch/x86/virt/vmx/tdx/tdx.h | 2 +- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c index 4e2b2e2ac9f9..e979bf442929 100644 --- a/arch/x86/virt/vmx/tdx/tdx.c +++ b/arch/x86/virt/vmx/tdx/tdx.c @@ -272,7 +272,7 @@ static int read_sys_metadata_field(u64 field_id, u64 *data) static int read_sys_metadata_field16(u64 field_id, int offset, - struct tdx_tdmr_sysinfo *ts) + struct tdx_sys_info_tdmr *ts) { u16 *ts_member = ((void *)ts) + offset; u64 tmp; @@ -298,9 +298,9 @@ struct field_mapping { #define TD_SYSINFO_MAP(_field_id, _offset) \ { .field_id = MD_FIELD_ID_##_field_id, \ - .offset = offsetof(struct tdx_tdmr_sysinfo, _offset) } + .offset = offsetof(struct tdx_sys_info_tdmr, _offset) } -/* Map TD_SYSINFO fields into 'struct tdx_tdmr_sysinfo': */ +/* Map TD_SYSINFO fields into 'struct tdx_sys_info_tdmr': */ static const struct field_mapping fields[] = { TD_SYSINFO_MAP(MAX_TDMRS, max_tdmrs), TD_SYSINFO_MAP(MAX_RESERVED_PER_TDMR, max_reserved_per_tdmr), @@ -309,16 +309,16 @@ static const struct field_mapping fields[] = { TD_SYSINFO_MAP(PAMT_1G_ENTRY_SIZE, pamt_entry_size[TDX_PS_1G]), }; -static int get_tdx_tdmr_sysinfo(struct tdx_tdmr_sysinfo *tdmr_sysinfo) +static int get_tdx_sys_info_tdmr(struct tdx_sys_info_tdmr *sysinfo_tdmr) { int ret; int i; - /* Populate 'tdmr_sysinfo' fields using the mapping structure above: */ + /* Populate 'sysinfo_tdmr' fields using the mapping structure above: */ for (i = 0; i < ARRAY_SIZE(fields); i++) { ret = read_sys_metadata_field16(fields[i].field_id, fields[i].offset, - tdmr_sysinfo); + sysinfo_tdmr); if (ret) return ret; } @@ -342,13 +342,13 @@ static int tdmr_size_single(u16 max_reserved_per_tdmr) } static int alloc_tdmr_list(struct tdmr_info_list *tdmr_list, - struct tdx_tdmr_sysinfo *tdmr_sysinfo) + struct tdx_sys_info_tdmr *sysinfo_tdmr) { size_t tdmr_sz, tdmr_array_sz; void *tdmr_array; - tdmr_sz = tdmr_size_single(tdmr_sysinfo->max_reserved_per_tdmr); - tdmr_array_sz = tdmr_sz * tdmr_sysinfo->max_tdmrs; + tdmr_sz = tdmr_size_single(sysinfo_tdmr->max_reserved_per_tdmr); + tdmr_array_sz = tdmr_sz * sysinfo_tdmr->max_tdmrs; /* * To keep things simple, allocate all TDMRs together. @@ -367,7 +367,7 @@ static int alloc_tdmr_list(struct tdmr_info_list *tdmr_list, * at a given index in the TDMR list. */ tdmr_list->tdmr_sz = tdmr_sz; - tdmr_list->max_tdmrs = tdmr_sysinfo->max_tdmrs; + tdmr_list->max_tdmrs = sysinfo_tdmr->max_tdmrs; tdmr_list->nr_consumed_tdmrs = 0; return 0; @@ -921,11 +921,11 @@ static int tdmrs_populate_rsvd_areas_all(struct tdmr_info_list *tdmr_list, /* * Construct a list of TDMRs on the preallocated space in @tdmr_list * to cover all TDX memory regions in @tmb_list based on the TDX module - * TDMR global information in @tdmr_sysinfo. + * TDMR global information in @sysinfo_tdmr. 
*/ static int construct_tdmrs(struct list_head *tmb_list, struct tdmr_info_list *tdmr_list, - struct tdx_tdmr_sysinfo *tdmr_sysinfo) + struct tdx_sys_info_tdmr *sysinfo_tdmr) { int ret; @@ -934,12 +934,12 @@ static int construct_tdmrs(struct list_head *tmb_list, return ret; ret = tdmrs_set_up_pamt_all(tdmr_list, tmb_list, - tdmr_sysinfo->pamt_entry_size); + sysinfo_tdmr->pamt_entry_size); if (ret) return ret; ret = tdmrs_populate_rsvd_areas_all(tdmr_list, tmb_list, - tdmr_sysinfo->max_reserved_per_tdmr); + sysinfo_tdmr->max_reserved_per_tdmr); if (ret) tdmrs_free_pamt_all(tdmr_list); @@ -1098,7 +1098,7 @@ static int init_tdmrs(struct tdmr_info_list *tdmr_list) static int init_tdx_module(void) { - struct tdx_tdmr_sysinfo tdmr_sysinfo; + struct tdx_sys_info_tdmr sysinfo_tdmr; int ret; /* @@ -1117,17 +1117,17 @@ static int init_tdx_module(void) if (ret) goto out_put_tdxmem; - ret = get_tdx_tdmr_sysinfo(&tdmr_sysinfo); + ret = get_tdx_sys_info_tdmr(&sysinfo_tdmr); if (ret) goto err_free_tdxmem; /* Allocate enough space for constructing TDMRs */ - ret = alloc_tdmr_list(&tdx_tdmr_list, &tdmr_sysinfo); + ret = alloc_tdmr_list(&tdx_tdmr_list, &sysinfo_tdmr); if (ret) goto err_free_tdxmem; /* Cover all TDX-usable memory regions in TDMRs */ - ret = construct_tdmrs(&tdx_memlist, &tdx_tdmr_list, &tdmr_sysinfo); + ret = construct_tdmrs(&tdx_memlist, &tdx_tdmr_list, &sysinfo_tdmr); if (ret) goto err_free_tdmrs; diff --git a/arch/x86/virt/vmx/tdx/tdx.h b/arch/x86/virt/vmx/tdx/tdx.h index b701f69485d3..148f9b4d1140 100644 --- a/arch/x86/virt/vmx/tdx/tdx.h +++ b/arch/x86/virt/vmx/tdx/tdx.h @@ -100,7 +100,7 @@ struct tdx_memblock { }; /* "TDMR info" part of "Global Scope Metadata" for constructing TDMRs */ -struct tdx_tdmr_sysinfo { +struct tdx_sys_info_tdmr { u16 max_tdmrs; u16 max_reserved_per_tdmr; u16 pamt_entry_size[TDX_PS_NR]; From c4e0862a62c059498000914305ae60f9cbd0818a Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Sun, 15 Dec 2024 04:15:43 +1300 Subject: [PATCH 155/224] x86/virt/tdx: Start to track all global metadata in one structure The TDX module provides a set of "Global Metadata Fields". They report things like TDX module version, supported features, and fields related to create/run TDX guests and so on. Today the kernel only reads "TD Memory Region" (TDMR) related fields for module initialization. KVM will need to read additional metadata fields to run TDX guests. Move towards having the TDX host core-kernel provide a centralized, canonical, and immutable structure for the global metadata that comes out from the TDX module for all kernel components to use. As the first step, introduce a new 'struct tdx_sys_info' to track all global metadata fields. TDX categorizes global metadata fields into different "Classes". E.g., the TDMR related fields are under class "TDMR Info". Instead of making 'struct tdx_sys_info' a plain structure to contain all metadata fields, organize them in smaller structures based on the "Class". This allows those metadata fields to be used in finer granularity thus makes the code clearer. E.g., construct_tdmrs() can just take the structure which contains "TDMR Info" metadata fields. Add get_tdx_sys_info() as the placeholder to read all metadata fields. Have it only call get_tdx_sys_info_tdmr() to read TDMR related fields for now. Place get_tdx_sys_info() as the first step of init_tdx_module() to enable early prerequisite checks on the metadata to support early module initialization abort. 
This results in moving get_tdx_sys_info_tdmr() to be before build_tdx_memlist(), but this is fine because there are no dependencies between these two functions. Signed-off-by: Kai Huang Signed-off-by: Dave Hansen Reviewed-by: Adrian Hunter Reviewed-by: Dan Williams Link: https://lore.kernel.org/all/bfacb4e90527cf79d4be0d1753e6f318eea21118.1734188033.git.kai.huang%40intel.com --- arch/x86/virt/vmx/tdx/tdx.c | 19 ++++++++++++------- arch/x86/virt/vmx/tdx/tdx.h | 19 ++++++++++++------- 2 files changed, 24 insertions(+), 14 deletions(-) diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c index e979bf442929..7a2f979092e7 100644 --- a/arch/x86/virt/vmx/tdx/tdx.c +++ b/arch/x86/virt/vmx/tdx/tdx.c @@ -326,6 +326,11 @@ static int get_tdx_sys_info_tdmr(struct tdx_sys_info_tdmr *sysinfo_tdmr) return 0; } +static int get_tdx_sys_info(struct tdx_sys_info *sysinfo) +{ + return get_tdx_sys_info_tdmr(&sysinfo->tdmr); +} + /* Calculate the actual TDMR size */ static int tdmr_size_single(u16 max_reserved_per_tdmr) { @@ -1098,9 +1103,13 @@ static int init_tdmrs(struct tdmr_info_list *tdmr_list) static int init_tdx_module(void) { - struct tdx_sys_info_tdmr sysinfo_tdmr; + struct tdx_sys_info sysinfo; int ret; + ret = get_tdx_sys_info(&sysinfo); + if (ret) + return ret; + /* * To keep things simple, assume that all TDX-protected memory * will come from the page allocator. Make sure all pages in the @@ -1117,17 +1126,13 @@ static int init_tdx_module(void) if (ret) goto out_put_tdxmem; - ret = get_tdx_sys_info_tdmr(&sysinfo_tdmr); - if (ret) - goto err_free_tdxmem; - /* Allocate enough space for constructing TDMRs */ - ret = alloc_tdmr_list(&tdx_tdmr_list, &sysinfo_tdmr); + ret = alloc_tdmr_list(&tdx_tdmr_list, &sysinfo.tdmr); if (ret) goto err_free_tdxmem; /* Cover all TDX-usable memory regions in TDMRs */ - ret = construct_tdmrs(&tdx_memlist, &tdx_tdmr_list, &sysinfo_tdmr); + ret = construct_tdmrs(&tdx_memlist, &tdx_tdmr_list, &sysinfo.tdmr); if (ret) goto err_free_tdmrs; diff --git a/arch/x86/virt/vmx/tdx/tdx.h b/arch/x86/virt/vmx/tdx/tdx.h index 148f9b4d1140..2600ec3752f5 100644 --- a/arch/x86/virt/vmx/tdx/tdx.h +++ b/arch/x86/virt/vmx/tdx/tdx.h @@ -80,6 +80,18 @@ struct tdmr_info { DECLARE_FLEX_ARRAY(struct tdmr_reserved_area, reserved_areas); } __packed __aligned(TDMR_INFO_ALIGNMENT); +/* Class "TDMR info" */ +struct tdx_sys_info_tdmr { + u16 max_tdmrs; + u16 max_reserved_per_tdmr; + u16 pamt_entry_size[TDX_PS_NR]; +}; + +/* Kernel used global metadata fields */ +struct tdx_sys_info { + struct tdx_sys_info_tdmr tdmr; +}; + /* * Do not put any hardware-defined TDX structure representations below * this comment! @@ -99,13 +111,6 @@ struct tdx_memblock { int nid; }; -/* "TDMR info" part of "Global Scope Metadata" for constructing TDMRs */ -struct tdx_sys_info_tdmr { - u16 max_tdmrs; - u16 max_reserved_per_tdmr; - u16 pamt_entry_size[TDX_PS_NR]; -}; - /* Warn if kernel has less than TDMR_NR_WARN TDMRs after allocation */ #define TDMR_NR_WARN 4 From 04a7bc7316b8b9ea5564ea66eb65155203f1541f Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Sun, 15 Dec 2024 04:15:44 +1300 Subject: [PATCH 156/224] x86/virt/tdx: Use auto-generated code to read global metadata The TDX module provides a set of "Global Metadata Fields". They report things like TDX module version, supported features, and fields related to create/run TDX guests and so on. Currently the kernel only reads "TD Memory Region" (TDMR) related fields for module initialization. 
There is a need to read more global metadata fields for future use: - Supported features ("TDX_FEATURES0") to fail module initialization when the module doesn't support the "not clobbering host RBP when exiting from TDX guest" feature [1]. - KVM TDX baseline support and other features like TDX Connect will need to read more. The current global metadata reading code has limitations (e.g., it only has a primitive helper to read a metadata field with a 16-bit element size, while TDX supports 8/16/32/64-bit metadata element sizes). It needs tweaks in order to read more metadata fields. But even with the tweaks, when new code is added to read a new field, the reviewers will still need to review against the spec to make sure the new code doesn't screw up things like using the wrong metadata field ID (each metadata field is associated with a unique field ID, which is a TDX-defined u64 constant) etc. TDX documents all global metadata fields in a 'global_metadata.json' file as part of the TDX spec [2]. JSON format is machine readable. Instead of tweaking the metadata reading code, use a script to generate the code so that: 1) Using the generated C is simple. 2) Adding a field is simple, e.g., the script just pulls the field ID out of the JSON for a given field, thus no manual review is needed. Specifically, to match the layout of the 'struct tdx_sys_info' and its sub-structures, the script uses a table with each entry containing the name of a sub-structure (which reflects the "Class") and the "Field Name" of all its fields, and auto-generates: 1) The 'struct tdx_sys_info' and all 'struct tdx_sys_info_xx' sub-structures in 'tdx_global_metadata.h'. 2) The main function 'get_tdx_sys_info()' which reads all metadata to 'struct tdx_sys_info' and the 'get_tdx_sys_info_xx()' functions which read each 'struct tdx_sys_info_xx' in 'tdx_global_metadata.c'. Using the generated C is simple: 1) include "tdx_global_metadata.h" in the local "tdx.h"; 2) explicitly include "tdx_global_metadata.c" in the local "tdx.c" after the read_sys_metadata_field() primitive (which is a wrapper of the TDH.SYS.RD SEAMCALL to read global metadata). Adding a field is also simple: 1) just add the new field to an existing structure, or add it with a new structure; 2) re-run the script to generate the new code; 3) update the existing tdx_global_metadata.{hc} with the new ones. For now, use the auto-generated code to read the TDMR related fields and the aforesaid metadata field "TDX_FEATURES0". The tdx_global_metadata.{hc} can be generated by running the command below: #python tdx_global_metadata.py global_metadata.json \ tdx_global_metadata.h tdx_global_metadata.c ... where the 'global_metadata.json' can be fetched from [2] and the 'tdx_global_metadata.py' can be found at [3].
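As background for the raw hex field IDs visible in the generated code below: the ID itself encodes the element size in bits 33:32 (codes 0/1/2/3 for 8/16/32/64-bit elements, per the "MD_FIELD_ID" definition in the TDX module 1.5 ABI spec), which is why a single u64-returning read primitive covers every field:

    #define MD_FIELD_ID_ELE_SIZE_CODE(_field_id) \
    	(((_field_id) & GENMASK_ULL(33, 32)) >> 32)

    /* e.g. MAX_TDMRS, 0x9100000100000008: size code 1, i.e. a 16-bit element */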
Co-developed-by: Kai Huang Signed-off-by: Paolo Bonzini Signed-off-by: Kai Huang Signed-off-by: Dave Hansen Reviewed-by: Dan Williams Link: https://lore.kernel.org/fc0e8ab7-86d4-4428-be31-82e1ece6dd21@intel.com/ [1] Link: https://cdrdv2.intel.com/v1/dl/getContent/795381 [2] Link: https://lore.kernel.org/762a50133300710771337398284567b299a86f67.camel@intel.com/ [3] Link: https://lore.kernel.org/all/cbe3f12b1e5479399b53f4873f2ff783d9fc669b.1734188033.git.kai.huang%40intel.com --- arch/x86/virt/vmx/tdx/tdx_global_metadata.c | 48 +++++++++++++++++++++ arch/x86/virt/vmx/tdx/tdx_global_metadata.h | 25 +++++++++++ 2 files changed, 73 insertions(+) create mode 100644 arch/x86/virt/vmx/tdx/tdx_global_metadata.c create mode 100644 arch/x86/virt/vmx/tdx/tdx_global_metadata.h diff --git a/arch/x86/virt/vmx/tdx/tdx_global_metadata.c b/arch/x86/virt/vmx/tdx/tdx_global_metadata.c new file mode 100644 index 000000000000..8027a24d1c6e --- /dev/null +++ b/arch/x86/virt/vmx/tdx/tdx_global_metadata.c @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Automatically generated functions to read TDX global metadata. + * + * This file doesn't compile on its own as it lacks of inclusion + * of SEAMCALL wrapper primitive which reads global metadata. + * Include this file to other C file instead. + */ + +static int get_tdx_sys_info_features(struct tdx_sys_info_features *sysinfo_features) +{ + int ret = 0; + u64 val; + + if (!ret && !(ret = read_sys_metadata_field(0x0A00000300000008, &val))) + sysinfo_features->tdx_features0 = val; + + return ret; +} + +static int get_tdx_sys_info_tdmr(struct tdx_sys_info_tdmr *sysinfo_tdmr) +{ + int ret = 0; + u64 val; + + if (!ret && !(ret = read_sys_metadata_field(0x9100000100000008, &val))) + sysinfo_tdmr->max_tdmrs = val; + if (!ret && !(ret = read_sys_metadata_field(0x9100000100000009, &val))) + sysinfo_tdmr->max_reserved_per_tdmr = val; + if (!ret && !(ret = read_sys_metadata_field(0x9100000100000010, &val))) + sysinfo_tdmr->pamt_4k_entry_size = val; + if (!ret && !(ret = read_sys_metadata_field(0x9100000100000011, &val))) + sysinfo_tdmr->pamt_2m_entry_size = val; + if (!ret && !(ret = read_sys_metadata_field(0x9100000100000012, &val))) + sysinfo_tdmr->pamt_1g_entry_size = val; + + return ret; +} + +static int get_tdx_sys_info(struct tdx_sys_info *sysinfo) +{ + int ret = 0; + + ret = ret ?: get_tdx_sys_info_features(&sysinfo->features); + ret = ret ?: get_tdx_sys_info_tdmr(&sysinfo->tdmr); + + return ret; +} diff --git a/arch/x86/virt/vmx/tdx/tdx_global_metadata.h b/arch/x86/virt/vmx/tdx/tdx_global_metadata.h new file mode 100644 index 000000000000..6dd3c9695f59 --- /dev/null +++ b/arch/x86/virt/vmx/tdx/tdx_global_metadata.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Automatically generated TDX global metadata structures. 
*/ +#ifndef _X86_VIRT_TDX_AUTO_GENERATED_TDX_GLOBAL_METADATA_H +#define _X86_VIRT_TDX_AUTO_GENERATED_TDX_GLOBAL_METADATA_H + +#include + +struct tdx_sys_info_features { + u64 tdx_features0; +}; + +struct tdx_sys_info_tdmr { + u16 max_tdmrs; + u16 max_reserved_per_tdmr; + u16 pamt_4k_entry_size; + u16 pamt_2m_entry_size; + u16 pamt_1g_entry_size; +}; + +struct tdx_sys_info { + struct tdx_sys_info_features features; + struct tdx_sys_info_tdmr tdmr; +}; + +#endif From 6bfb77f4893f9809fd1dc3591c8b343534c87a65 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Sun, 15 Dec 2024 04:15:45 +1300 Subject: [PATCH 157/224] x86/virt/tdx: Use dedicated struct members for PAMT entry sizes Currently, the 'struct tdx_sys_info_tdmr' which includes TDMR related fields defines the PAMT entry sizes for TDX supported page sizes (4KB, 2MB and 1GB) as an array: struct tdx_sys_info_tdmr { ... u16 pamt_entry_size[TDX_PS_NR]; }; PAMT entry sizes are needed when allocating PAMTs for each TDMR. Using the array to contain PAMT entry sizes reduces the number of arguments that need to be passed when calling tdmr_set_up_pamt(). It also makes code patterns like the one below clearer: for (pgsz = TDX_PS_4K; pgsz < TDX_PS_NR; pgsz++) { pamt_size[pgsz] = tdmr_get_pamt_sz(tdmr, pgsz, pamt_entry_size[pgsz]); tdmr_pamt_size += pamt_size[pgsz]; } However, the auto-generated metadata reading code generates a structure member for each field. The 'global_metadata.json' has a dedicated field for each PAMT entry size, and the new 'struct tdx_sys_info_tdmr' looks like: struct tdx_sys_info_tdmr { ... u16 pamt_4k_entry_size; u16 pamt_2m_entry_size; u16 pamt_1g_entry_size; }; Prepare to use the auto-generated code by making the existing 'struct tdx_sys_info_tdmr' look like the generated one. When passing to tdmrs_set_up_pamt_all(), build a local array of PAMT entry sizes from the structure so the code to allocate PAMTs can stay the same.
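This keeps the churn contained: only construct_tdmrs() rebuilds the pgsz-indexed array locally (see the hunk below), so tdmrs_set_up_pamt_all() and the allocation loop above stay untouched.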
Signed-off-by: Kai Huang Signed-off-by: Dave Hansen Reviewed-by: Nikolay Borisov Reviewed-by: Dan Williams Link: https://lore.kernel.org/all/ccf46f3dacb01be1fb8309592616d443ac17caba.1734188033.git.kai.huang%40intel.com --- arch/x86/virt/vmx/tdx/tdx.c | 14 +++++++++----- arch/x86/virt/vmx/tdx/tdx.h | 4 +++- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c index 7a2f979092e7..28537a6c47fc 100644 --- a/arch/x86/virt/vmx/tdx/tdx.c +++ b/arch/x86/virt/vmx/tdx/tdx.c @@ -304,9 +304,9 @@ struct field_mapping { static const struct field_mapping fields[] = { TD_SYSINFO_MAP(MAX_TDMRS, max_tdmrs), TD_SYSINFO_MAP(MAX_RESERVED_PER_TDMR, max_reserved_per_tdmr), - TD_SYSINFO_MAP(PAMT_4K_ENTRY_SIZE, pamt_entry_size[TDX_PS_4K]), - TD_SYSINFO_MAP(PAMT_2M_ENTRY_SIZE, pamt_entry_size[TDX_PS_2M]), - TD_SYSINFO_MAP(PAMT_1G_ENTRY_SIZE, pamt_entry_size[TDX_PS_1G]), + TD_SYSINFO_MAP(PAMT_4K_ENTRY_SIZE, pamt_4k_entry_size), + TD_SYSINFO_MAP(PAMT_2M_ENTRY_SIZE, pamt_2m_entry_size), + TD_SYSINFO_MAP(PAMT_1G_ENTRY_SIZE, pamt_1g_entry_size), }; static int get_tdx_sys_info_tdmr(struct tdx_sys_info_tdmr *sysinfo_tdmr) @@ -932,14 +932,18 @@ static int construct_tdmrs(struct list_head *tmb_list, struct tdmr_info_list *tdmr_list, struct tdx_sys_info_tdmr *sysinfo_tdmr) { + u16 pamt_entry_size[TDX_PS_NR] = { + sysinfo_tdmr->pamt_4k_entry_size, + sysinfo_tdmr->pamt_2m_entry_size, + sysinfo_tdmr->pamt_1g_entry_size, + }; int ret; ret = fill_out_tdmrs(tmb_list, tdmr_list); if (ret) return ret; - ret = tdmrs_set_up_pamt_all(tdmr_list, tmb_list, - sysinfo_tdmr->pamt_entry_size); + ret = tdmrs_set_up_pamt_all(tdmr_list, tmb_list, pamt_entry_size); if (ret) return ret; diff --git a/arch/x86/virt/vmx/tdx/tdx.h b/arch/x86/virt/vmx/tdx/tdx.h index 2600ec3752f5..ec879d54eb5c 100644 --- a/arch/x86/virt/vmx/tdx/tdx.h +++ b/arch/x86/virt/vmx/tdx/tdx.h @@ -84,7 +84,9 @@ struct tdmr_info { struct tdx_sys_info_tdmr { u16 max_tdmrs; u16 max_reserved_per_tdmr; - u16 pamt_entry_size[TDX_PS_NR]; + u16 pamt_4k_entry_size; + u16 pamt_2m_entry_size; + u16 pamt_1g_entry_size; }; /* Kernel used global metadata fields */ From fae43b24a6ba8f3def312af371ed86d8ce85e11b Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Sun, 15 Dec 2024 04:15:46 +1300 Subject: [PATCH 158/224] x86/virt/tdx: Switch to use auto-generated global metadata reading code Continue the process to have a centralized solution for TDX global metadata reading. Now that the new autogenerated solution is ready for use, switch to it and remove the old one. 
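Note that the resulting '#include "tdx_global_metadata.c"' in tdx.c (visible in the hunk below) is deliberate: the generated readers call read_sys_metadata_field(), which is static to tdx.c, so the generated file is textually included after that primitive rather than compiled on its own.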
Signed-off-by: Kai Huang Signed-off-by: Dave Hansen Reviewed-by: Nikolay Borisov Reviewed-by: Dan Williams Link: https://lore.kernel.org/all/fc025d1e13b92900323f47cfe9aac3157bf08ee7.1734188033.git.kai.huang%40intel.com --- arch/x86/virt/vmx/tdx/tdx.c | 61 +------------------------------------ arch/x86/virt/vmx/tdx/tdx.h | 45 +-------------------------- 2 files changed, 2 insertions(+), 104 deletions(-) diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c index 28537a6c47fc..43ec56db5084 100644 --- a/arch/x86/virt/vmx/tdx/tdx.c +++ b/arch/x86/virt/vmx/tdx/tdx.c @@ -270,66 +270,7 @@ static int read_sys_metadata_field(u64 field_id, u64 *data) return 0; } -static int read_sys_metadata_field16(u64 field_id, - int offset, - struct tdx_sys_info_tdmr *ts) -{ - u16 *ts_member = ((void *)ts) + offset; - u64 tmp; - int ret; - - if (WARN_ON_ONCE(MD_FIELD_ID_ELE_SIZE_CODE(field_id) != - MD_FIELD_ID_ELE_SIZE_16BIT)) - return -EINVAL; - - ret = read_sys_metadata_field(field_id, &tmp); - if (ret) - return ret; - - *ts_member = tmp; - - return 0; -} - -struct field_mapping { - u64 field_id; - int offset; -}; - -#define TD_SYSINFO_MAP(_field_id, _offset) \ - { .field_id = MD_FIELD_ID_##_field_id, \ - .offset = offsetof(struct tdx_sys_info_tdmr, _offset) } - -/* Map TD_SYSINFO fields into 'struct tdx_sys_info_tdmr': */ -static const struct field_mapping fields[] = { - TD_SYSINFO_MAP(MAX_TDMRS, max_tdmrs), - TD_SYSINFO_MAP(MAX_RESERVED_PER_TDMR, max_reserved_per_tdmr), - TD_SYSINFO_MAP(PAMT_4K_ENTRY_SIZE, pamt_4k_entry_size), - TD_SYSINFO_MAP(PAMT_2M_ENTRY_SIZE, pamt_2m_entry_size), - TD_SYSINFO_MAP(PAMT_1G_ENTRY_SIZE, pamt_1g_entry_size), -}; - -static int get_tdx_sys_info_tdmr(struct tdx_sys_info_tdmr *sysinfo_tdmr) - { - int ret; - int i; - - /* Populate 'sysinfo_tdmr' fields using the mapping structure above: */ - for (i = 0; i < ARRAY_SIZE(fields); i++) { - ret = read_sys_metadata_field16(fields[i].field_id, - fields[i].offset, - sysinfo_tdmr); - if (ret) - return ret; - } - - return 0; -} - -static int get_tdx_sys_info(struct tdx_sys_info *sysinfo) -{ - return get_tdx_sys_info_tdmr(&sysinfo->tdmr); -} +#include "tdx_global_metadata.c" /* Calculate the actual TDMR size */ static int tdmr_size_single(u16 max_reserved_per_tdmr) diff --git a/arch/x86/virt/vmx/tdx/tdx.h b/arch/x86/virt/vmx/tdx/tdx.h index ec879d54eb5c..641beec86e73 100644 --- a/arch/x86/virt/vmx/tdx/tdx.h +++ b/arch/x86/virt/vmx/tdx/tdx.h @@ -2,7 +2,7 @@ #ifndef _X86_VIRT_TDX_H #define _X86_VIRT_TDX_H -#include <linux/bits.h> +#include "tdx_global_metadata.h" /* * This file contains both macros and data structures defined by the TDX @@ -26,35 +26,6 @@ #define PT_NDA 0x0 #define PT_RSVD 0x1 -/* - * Global scope metadata field ID. - * - * See Table "Global Scope Metadata", TDX module 1.5 ABI spec. - */ -#define MD_FIELD_ID_MAX_TDMRS 0x9100000100000008ULL -#define MD_FIELD_ID_MAX_RESERVED_PER_TDMR 0x9100000100000009ULL -#define MD_FIELD_ID_PAMT_4K_ENTRY_SIZE 0x9100000100000010ULL -#define MD_FIELD_ID_PAMT_2M_ENTRY_SIZE 0x9100000100000011ULL -#define MD_FIELD_ID_PAMT_1G_ENTRY_SIZE 0x9100000100000012ULL -
-/* - * Sub-field definition of metadata field ID. - * - * See Table "MD_FIELD_ID (Metadata Field Identifier / Sequence Header) - * Definition", TDX module 1.5 ABI spec.
- * - * - Bit 33:32: ELEMENT_SIZE_CODE -- size of a single element of metadata - * - * 0: 8 bits - * 1: 16 bits - * 2: 32 bits - * 3: 64 bits - */ -#define MD_FIELD_ID_ELE_SIZE_CODE(_field_id) \ - (((_field_id) & GENMASK_ULL(33, 32)) >> 32) - -#define MD_FIELD_ID_ELE_SIZE_16BIT 1 - struct tdmr_reserved_area { u64 offset; u64 size; @@ -80,20 +51,6 @@ struct tdmr_info { DECLARE_FLEX_ARRAY(struct tdmr_reserved_area, reserved_areas); } __packed __aligned(TDMR_INFO_ALIGNMENT); -/* Class "TDMR info" */ -struct tdx_sys_info_tdmr { - u16 max_tdmrs; - u16 max_reserved_per_tdmr; - u16 pamt_4k_entry_size; - u16 pamt_2m_entry_size; - u16 pamt_1g_entry_size; -}; - -/* Kernel used global metadata fields */ -struct tdx_sys_info { - struct tdx_sys_info_tdmr tdmr; -}; - /* * Do not put any hardware-defined TDX structure representations below * this comment! From 6f5c71cc42d49203771bceed91a023d4dbec54f4 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Sun, 15 Dec 2024 04:15:47 +1300 Subject: [PATCH 159/224] x86/virt/tdx: Require the module to assert it has the NO_RBP_MOD mitigation Old TDX modules can clobber RBP in the TDH.VP.ENTER SEAMCALL. However, RBP is used as the frame pointer in the x86_64 calling convention, and clobbering RBP could result in bad things like being unable to unwind the stack if any non-maskable exceptions (NMI, #MC, etc.) happen in that gap. A new "NO_RBP_MOD" feature was introduced in more recent TDX modules to not clobber RBP. KVM will need to use the TDH.VP.ENTER SEAMCALL to run TDX guests. It won't be safe to run TDX guests without this feature. To prevent that, just don't initialize the TDX module if this feature is not supported [1]. Note the bit definitions of TDX_FEATURES0 are not auto-generated in tdx_global_metadata.h. Manually define a macro for it in "tdx.h". Signed-off-by: Kai Huang Signed-off-by: Dave Hansen Reviewed-by: Nikolay Borisov Reviewed-by: Adrian Hunter Reviewed-by: Dan Williams Link: https://lore.kernel.org/fc0e8ab7-86d4-4428-be31-82e1ece6dd21@intel.com/ [1] Link: https://lore.kernel.org/all/76ae5025502c84d799e3a56a6fc4f69a82da8f93.1734188033.git.kai.huang%40intel.com --- arch/x86/virt/vmx/tdx/tdx.c | 17 +++++++++++++++++ arch/x86/virt/vmx/tdx/tdx.h | 4 ++++ 2 files changed, 21 insertions(+) diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c index 43ec56db5084..7fdb37387886 100644 --- a/arch/x86/virt/vmx/tdx/tdx.c +++ b/arch/x86/virt/vmx/tdx/tdx.c @@ -272,6 +272,18 @@ static int read_sys_metadata_field(u64 field_id, u64 *data) #include "tdx_global_metadata.c" +static int check_features(struct tdx_sys_info *sysinfo) +{ + u64 tdx_features0 = sysinfo->features.tdx_features0; + + if (!(tdx_features0 & TDX_FEATURES0_NO_RBP_MOD)) { + pr_err("frame pointer (RBP) clobber bug present, upgrade TDX module\n"); + return -EINVAL; + } + + return 0; +} + /* Calculate the actual TDMR size */ static int tdmr_size_single(u16 max_reserved_per_tdmr) { @@ -1055,6 +1067,11 @@ static int init_tdx_module(void) if (ret) return ret; + /* Check whether the kernel can support this module */ + ret = check_features(&sysinfo); + if (ret) + return ret; + /* * To keep things simple, assume that all TDX-protected memory * will come from the page allocator. Make sure all pages in the diff --git a/arch/x86/virt/vmx/tdx/tdx.h b/arch/x86/virt/vmx/tdx/tdx.h index 641beec86e73..4e3d533cdd61 100644 --- a/arch/x86/virt/vmx/tdx/tdx.h +++ b/arch/x86/virt/vmx/tdx/tdx.h @@ -2,6 +2,7 @@ #ifndef _X86_VIRT_TDX_H #define _X86_VIRT_TDX_H +#include <linux/bits.h> #include "tdx_global_metadata.h" /* @@ -51,6 +52,9 @@ struct tdmr_info { DECLARE_FLEX_ARRAY(struct tdmr_reserved_area, reserved_areas); } __packed __aligned(TDMR_INFO_ALIGNMENT); +/* Bit definitions of TDX_FEATURES0 metadata field */ +#define TDX_FEATURES0_NO_RBP_MOD BIT(18) + /* * Do not put any hardware-defined TDX structure representations below * this comment! From 8148fa2e022bae29f21bb9a2c4cc796334fd372b Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 9 Dec 2024 19:08:10 +0200 Subject: [PATCH 160/224] lockdep: Mark chain_hlock_class_idx() with __maybe_unused When chain_hlock_class_idx() is unused, it prevents kernel builds with clang, `make W=1` and CONFIG_WERROR=y, CONFIG_LOCKDEP=y and CONFIG_PROVE_LOCKING=n: kernel/locking/lockdep.c:435:28: error: unused function 'chain_hlock_class_idx' [-Werror,-Wunused-function] Fix this by marking it with __maybe_unused. See also commit 6863f5643dd7 ("kbuild: allow Clang to find unused static inline functions for W=1 build"). [Boqun: add more config information of the error] Signed-off-by: Andy Shevchenko Signed-off-by: Boqun Feng Link: https://lore.kernel.org/r/20241209170810.1485183-1-andriy.shevchenko@linux.intel.com --- kernel/locking/lockdep.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 2d8ec0351ef9..fe04a2145ca7 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -430,7 +430,7 @@ static inline u16 hlock_id(struct held_lock *hlock) return (hlock->class_idx | (hlock->read << MAX_LOCKDEP_KEYS_BITS)); } -static inline unsigned int chain_hlock_class_idx(u16 hlock_id) +static inline __maybe_unused unsigned int chain_hlock_class_idx(u16 hlock_id) { return hlock_id & (MAX_LOCKDEP_KEYS - 1); } From 3430600925859be3c8588b8220173758c7860e8c Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 2 Dec 2024 21:34:45 +0200 Subject: [PATCH 161/224] lockdep: Move lockdep_assert_locked() under #ifdef CONFIG_PROVE_LOCKING When lockdep_assert_locked() is unused, it prevents kernel builds with clang, `make W=1` and CONFIG_WERROR=y, CONFIG_LOCKDEP=y and CONFIG_PROVE_LOCKING=n: kernel/locking/lockdep.c:160:20: error: unused function 'lockdep_assert_locked' [-Werror,-Wunused-function] Fix this by moving it under the respective ifdeffery. See also commit 6863f5643dd7 ("kbuild: allow Clang to find unused static inline functions for W=1 build").
[Boqun: add more config information of the error] Signed-off-by: Andy Shevchenko Signed-off-by: Boqun Feng Link: https://lore.kernel.org/r/20241202193445.769567-1-andriy.shevchenko@linux.intel.com --- kernel/locking/lockdep.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index fe04a2145ca7..29acd238dad7 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -157,10 +157,12 @@ static inline void lockdep_unlock(void) __this_cpu_dec(lockdep_recursion); } +#ifdef CONFIG_PROVE_LOCKING static inline bool lockdep_assert_locked(void) { return DEBUG_LOCKS_WARN_ON(__owner != current); } +#endif static struct task_struct *lockdep_selftest_task_struct; From 9793c9bb91f1b05473bb6d4a2323a259ef00ff2e Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Wed, 27 Nov 2024 10:30:24 -0800 Subject: [PATCH 162/224] locking: MAINTAINERS: Start watching Rust locking primitives It makes sense to add Rust locking primitives under the watch of the general locking primitives maintainers. This will encourage more reviews and find potential issues earlier. Hence add the related Rust files into the LOCKING PRIMITIVES entry in MAINTAINERS. While we are at it, change my role to maintainer of LOCKDEP and RUST to reflect my responsibility for the corresponding code. Acked-by: Miguel Ojeda Acked-by: Peter Zijlstra (Intel) Acked-by: Ingo Molnar Signed-off-by: Boqun Feng Link: https://lore.kernel.org/lkml/20241128054022.19586-2-boqun.feng@gmail.com/ --- MAINTAINERS | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 1e930c7a58b1..e0495700914d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13423,8 +13423,8 @@ LOCKING PRIMITIVES M: Peter Zijlstra M: Ingo Molnar M: Will Deacon +M: Boqun Feng (LOCKDEP & RUST) R: Waiman Long -R: Boqun Feng (LOCKDEP) L: linux-kernel@vger.kernel.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git locking/core @@ -13438,6 +13438,11 @@ F: include/linux/seqlock.h F: include/linux/spinlock*.h F: kernel/locking/ F: lib/locking*.[ch] +F: rust/helpers/mutex.c +F: rust/helpers/spinlock.c +F: rust/kernel/sync/lock.rs +F: rust/kernel/sync/lock/ +F: rust/kernel/sync/locked_by.rs X: kernel/locking/locktorture.c LOGICAL DISK MANAGER SUPPORT (LDM, Windows 2000/XP/Vista Dynamic Disks) From 15abc88057eeec052aefde897df277eca2340ac6 Mon Sep 17 00:00:00 2001 From: Lyude Paul Date: Tue, 19 Nov 2024 18:11:03 -0500 Subject: [PATCH 163/224] rust: sync: Add Lock::from_raw() for Lock<(), B> The KMS bindings [1] have a few bindings that require manually acquiring specific locks before calling certain functions. At the moment though, the only way of acquiring these locks in bindings is to simply call the C locking functions directly, since said locks are not initialized on the Rust side of things. However, if we add `#[repr(C)]` to `Lock<(), B>`, then given that `()` is a ZST, `Lock<(), B>` becomes equivalent in data layout to its inner `B::State` type. Since locks in C don't have data explicitly associated with them anyway, we can take advantage of this to add a `Lock::from_raw()` function that can translate a raw pointer to `B::State` into its proper `Lock<(), B>` equivalent. This lets us simply acquire a reference to the lock in question and work with it as if it were initialized on the Rust side of things, allowing us to use less unsafe code to implement bindings with lock requirements.
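As a rough usage sketch (not part of this patch; `c_mutex_ptr` is a hypothetical raw `struct mutex` pointer handed over from the C side):

    // SAFETY: the C side guarantees that `c_mutex_ptr` points to a valid,
    // initialised `struct mutex` for the duration of this borrow.
    let lock = unsafe { Lock::<(), MutexBackend>::from_raw(c_mutex_ptr) };
    let _guard = lock.lock(); // used as if it had been initialised in Rust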
[Boqun: Use "Link:" instead of a URL and format the commit log] Signed-off-by: Lyude Paul Reviewed-by: Alice Ryhl Link: https://patchwork.freedesktop.org/series/131522/ [1] Signed-off-by: Boqun Feng Link: https://lore.kernel.org/r/20241119231146.2298971-2-lyude@redhat.com --- rust/kernel/sync/lock.rs | 23 +++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/rust/kernel/sync/lock.rs b/rust/kernel/sync/lock.rs index 41dcddac69e2..57dc2e90e504 100644 --- a/rust/kernel/sync/lock.rs +++ b/rust/kernel/sync/lock.rs @@ -96,6 +96,7 @@ pub unsafe trait Backend { /// /// Exposes one of the kernel locking primitives. Which one is exposed depends on the lock /// [`Backend`] specified as the generic parameter `B`. +#[repr(C)] #[pin_data] pub struct Lock<T: ?Sized, B: Backend> { /// The kernel lock object. @@ -134,6 +135,28 @@ impl<T, B: Backend> Lock<T, B> { } } +impl<B: Backend> Lock<(), B> { + /// Constructs a [`Lock`] from a raw pointer. + /// + /// This can be useful for interacting with a lock which was initialised outside of Rust. + /// + /// # Safety + /// + /// The caller promises that `ptr` points to a valid initialised instance of [`State`] during + /// the whole lifetime of `'a`. + /// + /// [`State`]: Backend::State + pub unsafe fn from_raw<'a>(ptr: *mut B::State) -> &'a Self { + // SAFETY: + // - By the safety contract `ptr` must point to a valid initialised instance of `B::State` + // - Since the lock data type is `()` which is a ZST, `state` is the only non-ZST member of + // the struct + // - Combined with `#[repr(C)]`, this guarantees `Self` has an equivalent data layout to + // `B::State`. + unsafe { &*ptr.cast() } + } +} + impl<T: ?Sized, B: Backend> Lock<T, B> { /// Acquires the lock and gives the caller access to the data protected by it. pub fn lock(&self) -> Guard<'_, T, B> { From daa03fe50ec376aeadd63a264c471c56af194e83 Mon Sep 17 00:00:00 2001 From: Lyude Paul Date: Tue, 19 Nov 2024 18:11:04 -0500 Subject: [PATCH 164/224] rust: sync: Make Guard::new() public Since we added a `Lock::from_raw()` function previously, it makes sense to also introduce an interface for creating a `Guard` from a reference to a `Lock` for instances where we've derived the `Lock` from a raw pointer and know that the lock is already acquired; there are such usages in the KMS API. [Boqun: Add backquotes to type names, reformat the commit log, reword a bit on the usage of KMS API] Signed-off-by: Lyude Paul Reviewed-by: Filipe Xavier Reviewed-by: Alice Ryhl Signed-off-by: Boqun Feng Link: https://lore.kernel.org/r/20241119231146.2298971-3-lyude@redhat.com --- rust/kernel/sync/lock.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/kernel/sync/lock.rs b/rust/kernel/sync/lock.rs index 57dc2e90e504..72dbf3fbb259 100644 --- a/rust/kernel/sync/lock.rs +++ b/rust/kernel/sync/lock.rs @@ -234,7 +234,7 @@ impl<'a, T: ?Sized, B: Backend> Guard<'a, T, B> { /// # Safety /// /// The caller must ensure that it owns the lock. - pub(crate) unsafe fn new(lock: &'a Lock<T, B>, state: B::GuardState) -> Self { + pub unsafe fn new(lock: &'a Lock<T, B>, state: B::GuardState) -> Self { Self { lock, state, From 37624dde4768ec25d2f9798aa75bf32e18c0eae2 Mon Sep 17 00:00:00 2001 From: Lyude Paul Date: Wed, 20 Nov 2024 17:26:28 -0500 Subject: [PATCH 165/224] rust: sync: Add MutexGuard type alias A simple helper alias for code that needs to deal with Guard types returned from Mutexes.
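For illustration, the alias lets callers name the guard type in signatures without spelling out the backend (a minimal sketch assuming some `Config` data type):

    fn lock_config(cfg: &Mutex<Config>) -> MutexGuard<'_, Config> {
        cfg.lock()
    }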
Signed-off-by: Lyude Paul Reviewed-by: Alice Ryhl Signed-off-by: Boqun Feng Link: https://lore.kernel.org/r/20241120222742.2490495-2-lyude@redhat.com --- rust/kernel/sync.rs | 2 +- rust/kernel/sync/lock/mutex.rs | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/rust/kernel/sync.rs b/rust/kernel/sync.rs index 1eab7ebf25fd..2721b5c8deda 100644 --- a/rust/kernel/sync.rs +++ b/rust/kernel/sync.rs @@ -16,7 +16,7 @@ pub mod poll; pub use arc::{Arc, ArcBorrow, UniqueArc}; pub use condvar::{new_condvar, CondVar, CondVarTimeoutResult}; pub use lock::global::{global_lock, GlobalGuard, GlobalLock, GlobalLockBackend, GlobalLockedBy}; -pub use lock::mutex::{new_mutex, Mutex}; +pub use lock::mutex::{new_mutex, Mutex, MutexGuard}; pub use lock::spinlock::{new_spinlock, SpinLock}; pub use locked_by::LockedBy; diff --git a/rust/kernel/sync/lock/mutex.rs b/rust/kernel/sync/lock/mutex.rs index 0e946ebefce1..10a70c07268d 100644 --- a/rust/kernel/sync/lock/mutex.rs +++ b/rust/kernel/sync/lock/mutex.rs @@ -86,6 +86,14 @@ pub use new_mutex; /// [`struct mutex`]: srctree/include/linux/mutex.h pub type Mutex<T> = super::Lock<T, MutexBackend>; +/// A [`Guard`] acquired from locking a [`Mutex`]. +/// +/// This is simply a type alias for a [`Guard`] returned from locking a [`Mutex`]. It will unlock +/// the [`Mutex`] upon being dropped. +/// +/// [`Guard`]: super::Guard +pub type MutexGuard<'a, T> = super::Guard<'a, T, MutexBackend>; + /// A kernel `struct mutex` lock backend. pub struct MutexBackend; From eb5ccb038284dc0e69822d71aafcbf7b57394aad Mon Sep 17 00:00:00 2001 From: Lyude Paul Date: Wed, 20 Nov 2024 17:26:29 -0500 Subject: [PATCH 166/224] rust: sync: Add SpinLockGuard type alias A simple helper alias for code that needs to deal with Guard types returned from SpinLocks. Signed-off-by: Lyude Paul Reviewed-by: Alice Ryhl Signed-off-by: Boqun Feng Link: https://lore.kernel.org/r/20241120222742.2490495-3-lyude@redhat.com --- rust/kernel/sync.rs | 2 +- rust/kernel/sync/lock/spinlock.rs | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/rust/kernel/sync.rs b/rust/kernel/sync.rs index 2721b5c8deda..dffdaad972ce 100644 --- a/rust/kernel/sync.rs +++ b/rust/kernel/sync.rs @@ -17,7 +17,7 @@ pub use arc::{Arc, ArcBorrow, UniqueArc}; pub use condvar::{new_condvar, CondVar, CondVarTimeoutResult}; pub use lock::global::{global_lock, GlobalGuard, GlobalLock, GlobalLockBackend, GlobalLockedBy}; pub use lock::mutex::{new_mutex, Mutex, MutexGuard}; -pub use lock::spinlock::{new_spinlock, SpinLock}; +pub use lock::spinlock::{new_spinlock, SpinLock, SpinLockGuard}; pub use locked_by::LockedBy; diff --git a/rust/kernel/sync/lock/spinlock.rs b/rust/kernel/sync/lock/spinlock.rs index 9f4d128bed98..081c0220013b 100644 --- a/rust/kernel/sync/lock/spinlock.rs +++ b/rust/kernel/sync/lock/spinlock.rs @@ -87,6 +87,14 @@ pub type SpinLock<T> = super::Lock<T, SpinLockBackend>; /// A kernel `spinlock_t` lock backend. pub struct SpinLockBackend; +/// A [`Guard`] acquired from locking a [`SpinLock`]. +/// +/// This is simply a type alias for a [`Guard`] returned from locking a [`SpinLock`]. It will unlock +/// the [`SpinLock`] upon being dropped. +/// +/// [`Guard`]: super::Guard +pub type SpinLockGuard<'a, T> = super::Guard<'a, T, SpinLockBackend>; + // SAFETY: The underlying kernel `spinlock_t` object ensures mutual exclusion. `relock` uses the // default implementation that always calls the same locking method.
unsafe impl super::Backend for SpinLockBackend { From fbd7a5a0359bc770e898d918d84977ea61163aad Mon Sep 17 00:00:00 2001 From: Lyude Paul Date: Mon, 25 Nov 2024 15:40:58 -0500 Subject: [PATCH 167/224] rust: sync: Add lock::Backend::assert_is_held() Since we've exposed Lock::from_raw() and Guard::new() publicly, we want to be able to assert that a lock is actually held when constructing a Guard for it, in order to catch unsafe Guard::new() calls made outside of our lock module. Hence add a new method assert_is_held() to Backend, which uses lockdep to check whether or not a lock has been acquired. When lockdep is disabled, this has no overhead. [Boqun: Resolve the conflicts with exposing Guard::new(), reword the commit log a bit and format "unsafe { ; }" into "unsafe { }" for consistency.] Signed-off-by: Lyude Paul Signed-off-by: Boqun Feng Link: https://lore.kernel.org/r/20241125204139.656801-1-lyude@redhat.com --- rust/helpers/mutex.c | 5 +++++ rust/helpers/spinlock.c | 5 +++++ rust/kernel/sync/lock.rs | 10 ++++++++++ rust/kernel/sync/lock/mutex.rs | 5 +++++ rust/kernel/sync/lock/spinlock.rs | 5 +++++ 5 files changed, 30 insertions(+) diff --git a/rust/helpers/mutex.c b/rust/helpers/mutex.c index 7e00680958ef..06575553eda5 100644 --- a/rust/helpers/mutex.c +++ b/rust/helpers/mutex.c @@ -12,3 +12,8 @@ void rust_helper___mutex_init(struct mutex *mutex, const char *name, { __mutex_init(mutex, name, key); } + +void rust_helper_mutex_assert_is_held(struct mutex *mutex) +{ + lockdep_assert_held(mutex); +} diff --git a/rust/helpers/spinlock.c b/rust/helpers/spinlock.c index 5971fdf6f755..42c4bf01a23e 100644 --- a/rust/helpers/spinlock.c +++ b/rust/helpers/spinlock.c @@ -30,3 +30,8 @@ int rust_helper_spin_trylock(spinlock_t *lock) { return spin_trylock(lock); } + +void rust_helper_spin_assert_is_held(spinlock_t *lock) +{ + lockdep_assert_held(lock); +} diff --git a/rust/kernel/sync/lock.rs b/rust/kernel/sync/lock.rs index 72dbf3fbb259..eb80048e0110 100644 --- a/rust/kernel/sync/lock.rs +++ b/rust/kernel/sync/lock.rs @@ -90,6 +90,13 @@ pub unsafe trait Backend { // SAFETY: The safety requirements ensure that the lock is initialised. *guard_state = unsafe { Self::lock(ptr) }; } + + /// Asserts that the lock is held using lockdep. + /// + /// # Safety + /// + /// Callers must ensure that [`Backend::init`] has been previously called. + unsafe fn assert_is_held(ptr: *mut Self::State); } /// A mutual exclusion primitive. @@ -235,6 +242,9 @@ impl<'a, T: ?Sized, B: Backend> Guard<'a, T, B> { /// /// The caller must ensure that it owns the lock. pub unsafe fn new(lock: &'a Lock<T, B>, state: B::GuardState) -> Self { + // SAFETY: The caller can only hold the lock if `Backend::init` has already been called. + unsafe { B::assert_is_held(lock.state.get()) }; + Self { lock, state, diff --git a/rust/kernel/sync/lock/mutex.rs b/rust/kernel/sync/lock/mutex.rs index 10a70c07268d..70cadbc2e8e2 100644 --- a/rust/kernel/sync/lock/mutex.rs +++ b/rust/kernel/sync/lock/mutex.rs @@ -134,4 +134,9 @@ unsafe impl super::Backend for MutexBackend { None } } + + unsafe fn assert_is_held(ptr: *mut Self::State) { + // SAFETY: The `ptr` pointer is guaranteed to be valid and initialized before use.
+ unsafe { bindings::mutex_assert_is_held(ptr) } + } } diff --git a/rust/kernel/sync/lock/spinlock.rs b/rust/kernel/sync/lock/spinlock.rs index 081c0220013b..ab2f8d075311 100644 --- a/rust/kernel/sync/lock/spinlock.rs +++ b/rust/kernel/sync/lock/spinlock.rs @@ -133,4 +133,9 @@ unsafe impl super::Backend for SpinLockBackend { None } } + + unsafe fn assert_is_held(ptr: *mut Self::State) { + // SAFETY: The `ptr` pointer is guaranteed to be valid and initialized before use. + unsafe { bindings::spin_assert_is_held(ptr) } + } } From aa135d1d0902c49ed45bec98c61c1b4022652b7e Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 20 Dec 2024 09:40:29 +0100 Subject: [PATCH 168/224] x86/mm: Remove unnecessary include of <linux/extable.h> The header file <linux/extable.h> is included for search_exception_tables(). That function is no longer used since commit: c2508ec5a58db ("mm: introduce new 'lock_mm_and_find_vma()' page fault helper") Remove it. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20241220084029.473617-1-bigeasy@linutronix.de --- arch/x86/mm/fault.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index e6c469b323cc..ef12ff3db903 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -7,7 +7,6 @@ #include <linux/sched.h> /* test_thread_flag(), ... */ #include <linux/sched/task_stack.h> /* task_stack_*(), ... */ #include <linux/kdebug.h> /* oops_begin/end, ... */ -#include <linux/extable.h> /* search_exception_tables */ #include <linux/memblock.h> /* max_low_pfn */ #include <linux/kfence.h> /* kfence_handle_page_fault */ #include <linux/kprobes.h> /* NOKPROBE_SYMBOL, ... */ From ee8118c1f1864eab709fb660d3af8545cf11ae96 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 19 Dec 2024 09:58:39 +0100 Subject: [PATCH 169/224] sched/fair: Update comments after sched_tick() rename. scheduler_tick() was renamed to sched_tick() in 86dd6c04ef9f2 ("sched/balancing: Rename scheduler_tick() => sched_tick()"). Update comments still referring to scheduler_tick(). Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20241219085839.302378-1-bigeasy@linutronix.de --- kernel/sched/fair.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8f641c9e74a8..ae8095aa4585 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -12868,9 +12868,9 @@ out: /* * This softirq handler is triggered via SCHED_SOFTIRQ from two places: * - * - directly from the local scheduler_tick() for periodic load balancing + * - directly from the local sched_tick() for periodic load balancing * - * - indirectly from a remote scheduler_tick() for NOHZ idle balancing + * - indirectly from a remote sched_tick() for NOHZ idle balancing * through the SMP cross-call nohz_csd_func() */ static __latent_entropy void sched_balance_softirq(void) From a430d99e349026d53e2557b7b22bd2ebd61fe12a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 20 Dec 2024 06:32:19 +0000 Subject: [PATCH 170/224] sched/fair: Fix value reported by hot tasks pulled in /proc/schedstat In /proc/schedstat, lb_hot_gained reports the number of hot tasks pulled during load balance. This value is incremented in can_migrate_task() if the task is migratable and hot. After incrementing the value, the load balancer can still decide not to migrate this task, leading to wrong accounting. Fix this by incrementing stats when hot tasks are detached. This issue only exists in detach_tasks() where we can decide to not migrate a hot task even if it is migratable.
However, in detach_one_task(), we migrate it unconditionally. [Swapnil: Handled the case where nr_failed_migrations_hot was not accounted properly and wrote commit log] Fixes: d31980846f96 ("sched: Move up affinity check to mitigate useless redoing overhead") Signed-off-by: Peter Zijlstra (Intel) Reported-by: "Gautham R. Shenoy" Not-yet-signed-off-by: Peter Zijlstra Signed-off-by: Swapnil Sapkal Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20241220063224.17767-2-swapnil.sapkal@amd.com --- include/linux/sched.h | 1 + kernel/sched/fair.c | 17 +++++++++++++---- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index b5916be49f62..8c6a2ed9f80e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -937,6 +937,7 @@ struct task_struct { unsigned sched_reset_on_fork:1; unsigned sched_contributes_to_load:1; unsigned sched_migrated:1; + unsigned sched_task_hot:1; /* Force alignment to the next boundary: */ unsigned :0; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ae8095aa4585..8fc6648a0aa8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9396,6 +9396,8 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) int tsk_cache_hot; lockdep_assert_rq_held(env->src_rq); + if (p->sched_task_hot) + p->sched_task_hot = 0; /* * We do not migrate tasks that are: @@ -9472,10 +9474,8 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) if (tsk_cache_hot <= 0 || env->sd->nr_balance_failed > env->sd->cache_nice_tries) { - if (tsk_cache_hot == 1) { - schedstat_inc(env->sd->lb_hot_gained[env->idle]); - schedstat_inc(p->stats.nr_forced_migrations); - } + if (tsk_cache_hot == 1) + p->sched_task_hot = 1; return 1; } @@ -9490,6 +9490,12 @@ static void detach_task(struct task_struct *p, struct lb_env *env) { lockdep_assert_rq_held(env->src_rq); + if (p->sched_task_hot) { + p->sched_task_hot = 0; + schedstat_inc(env->sd->lb_hot_gained[env->idle]); + schedstat_inc(p->stats.nr_forced_migrations); + } + deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK); set_task_cpu(p, env->dst_cpu); } @@ -9650,6 +9656,9 @@ static int detach_tasks(struct lb_env *env) continue; next: + if (p->sched_task_hot) + schedstat_inc(p->stats.nr_failed_migrations_hot); + list_move(&p->se.group_node, tasks); } From c3856c9ce6b8903909b61e8d2985a3c7ec7a78e8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 20 Dec 2024 06:32:20 +0000 Subject: [PATCH 171/224] sched/fair: Cleanup in migrate_degrades_locality() to improve readability migrate_degrades_locality() would return {1, 0, -1} to indicate, respectively, that migration would degrade locality, would improve locality, or would be ambivalent to locality improvements.
This patch improves readability by changing the return value to mean: * Any positive value degrades locality * 0 migration doesn't affect locality * Any negative value improves locality [Swapnil: Fixed comments around code and wrote commit log] Signed-off-by: Peter Zijlstra (Intel) Not-yet-signed-off-by: Peter Zijlstra Signed-off-by: Swapnil Sapkal Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20241220063224.17767-3-swapnil.sapkal@amd.com --- kernel/sched/fair.c | 41 +++++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8fc6648a0aa8..e5c0c61909b7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9329,43 +9329,43 @@ static int task_hot(struct task_struct *p, struct lb_env *env) #ifdef CONFIG_NUMA_BALANCING /* - * Returns 1, if task migration degrades locality - * Returns 0, if task migration improves locality i.e migration preferred. - * Returns -1, if task migration is not affected by locality. + * Returns a positive value, if task migration degrades locality. + * Returns 0, if task migration is not affected by locality. + * Returns a negative value, if task migration improves locality i.e migration preferred. */ -static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env) +static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env) { struct numa_group *numa_group = rcu_dereference(p->numa_group); unsigned long src_weight, dst_weight; int src_nid, dst_nid, dist; if (!static_branch_likely(&sched_numa_balancing)) - return -1; + return 0; if (!p->numa_faults || !(env->sd->flags & SD_NUMA)) - return -1; + return 0; src_nid = cpu_to_node(env->src_cpu); dst_nid = cpu_to_node(env->dst_cpu); if (src_nid == dst_nid) - return -1; + return 0; /* Migrating away from the preferred node is always bad. */ if (src_nid == p->numa_preferred_nid) { if (env->src_rq->nr_running > env->src_rq->nr_preferred_running) return 1; else - return -1; + return 0; } /* Encourage migration to the preferred node. */ if (dst_nid == p->numa_preferred_nid) - return 0; + return -1; /* Leaving a core idle is often worse than degrading locality. 
*/ if (env->idle == CPU_IDLE) - return -1; + return 0; dist = node_distance(src_nid, dst_nid); if (numa_group) { @@ -9376,14 +9376,14 @@ static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env) dst_weight = task_weight(p, dst_nid, dist); } - return dst_weight < src_weight; + return src_weight - dst_weight; } #else -static inline int migrate_degrades_locality(struct task_struct *p, +static inline long migrate_degrades_locality(struct task_struct *p, struct lb_env *env) { - return -1; + return 0; } #endif @@ -9393,7 +9393,7 @@ static inline int migrate_degrades_locality(struct task_struct *p, static int can_migrate_task(struct task_struct *p, struct lb_env *env) { - int tsk_cache_hot; + long degrades, hot; lockdep_assert_rq_held(env->src_rq); if (p->sched_task_hot) @@ -9468,13 +9468,14 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) if (env->flags & LBF_ACTIVE_LB) return 1; - tsk_cache_hot = migrate_degrades_locality(p, env); - if (tsk_cache_hot == -1) - tsk_cache_hot = task_hot(p, env); + degrades = migrate_degrades_locality(p, env); + if (!degrades) + hot = task_hot(p, env); + else + hot = degrades > 0; - if (tsk_cache_hot <= 0 || - env->sd->nr_balance_failed > env->sd->cache_nice_tries) { - if (tsk_cache_hot == 1) + if (!hot || env->sd->nr_balance_failed > env->sd->cache_nice_tries) { + if (hot) p->sched_task_hot = 1; return 1; } From 3b2a793ea70fd14136b442df31e53935e8095034 Mon Sep 17 00:00:00 2001 From: Swapnil Sapkal Date: Fri, 20 Dec 2024 06:32:21 +0000 Subject: [PATCH 172/224] sched: Report the different kinds of imbalances in /proc/schedstat In /proc/schedstat, lb_imbalance reports the sum of imbalances discovered in sched domains with each call to sched_balance_rq(), which is not very useful because lb_imbalance does not mention whether the imbalance is due to load, utilization, nr_tasks or misfit_tasks. Remove this field from /proc/schedstat. Currently there is no field in /proc/schedstat to report different types of imbalances. Introduce new fields in /proc/schedstat to report the total imbalances in load, utilization, nr_tasks or misfit_tasks. Added fields to /proc/schedstat: - lb_imbalance_load: Total imbalance due to load. - lb_imbalance_util: Total imbalance due to utilization. - lb_imbalance_task: Total imbalance due to number of tasks. - lb_imbalance_misfit: Total imbalance due to misfit tasks. 
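To illustrate the new layout (hypothetical values, not from a real machine): each per-domain line in /proc/schedstat would now carry, for every idle type, the count/balanced/failed counters followed by the four imbalance totals and the remaining per-type counters, e.g.:

    domain0 00000000,000000ff 120 60 10 4200 310 12 0 35 2 18 9 ...

where 4200, 310, 12 and 0 would be lb_imbalance_load, lb_imbalance_util, lb_imbalance_task and lb_imbalance_misfit for the first idle type.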
Signed-off-by: Swapnil Sapkal Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Shrikanth Hegde Link: https://lore.kernel.org/r/20241220063224.17767-4-swapnil.sapkal@amd.com --- include/linux/sched/topology.h | 5 ++++- kernel/sched/fair.c | 24 +++++++++++++++++++++++- kernel/sched/stats.c | 7 +++++-- 3 files changed, 32 insertions(+), 4 deletions(-) diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 4237daa5ac7a..76a662e1ec24 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -114,7 +114,10 @@ struct sched_domain { unsigned int lb_count[CPU_MAX_IDLE_TYPES]; unsigned int lb_failed[CPU_MAX_IDLE_TYPES]; unsigned int lb_balanced[CPU_MAX_IDLE_TYPES]; - unsigned int lb_imbalance[CPU_MAX_IDLE_TYPES]; + unsigned int lb_imbalance_load[CPU_MAX_IDLE_TYPES]; + unsigned int lb_imbalance_util[CPU_MAX_IDLE_TYPES]; + unsigned int lb_imbalance_task[CPU_MAX_IDLE_TYPES]; + unsigned int lb_imbalance_misfit[CPU_MAX_IDLE_TYPES]; unsigned int lb_gained[CPU_MAX_IDLE_TYPES]; unsigned int lb_hot_gained[CPU_MAX_IDLE_TYPES]; unsigned int lb_nobusyg[CPU_MAX_IDLE_TYPES]; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e5c0c61909b7..b3418b5d484f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -11705,6 +11705,28 @@ static int should_we_balance(struct lb_env *env) return group_balance_cpu(sg) == env->dst_cpu; } +static void update_lb_imbalance_stat(struct lb_env *env, struct sched_domain *sd, + enum cpu_idle_type idle) +{ + if (!schedstat_enabled()) + return; + + switch (env->migration_type) { + case migrate_load: + __schedstat_add(sd->lb_imbalance_load[idle], env->imbalance); + break; + case migrate_util: + __schedstat_add(sd->lb_imbalance_util[idle], env->imbalance); + break; + case migrate_task: + __schedstat_add(sd->lb_imbalance_task[idle], env->imbalance); + break; + case migrate_misfit: + __schedstat_add(sd->lb_imbalance_misfit[idle], env->imbalance); + break; + } +} + /* * Check this_cpu to ensure it is balanced within domain. Attempt to move * tasks if there is an imbalance. @@ -11755,7 +11777,7 @@ redo: WARN_ON_ONCE(busiest == env.dst_rq); - schedstat_add(sd->lb_imbalance[idle], env.imbalance); + update_lb_imbalance_stat(&env, sd, idle); env.src_cpu = busiest->cpu; env.src_rq = busiest; diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index eb0cdcd4d921..802bd9398a2e 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -141,11 +141,14 @@ static int show_schedstat(struct seq_file *seq, void *v) seq_printf(seq, "domain%d %*pb", dcount++, cpumask_pr_args(sched_domain_span(sd))); for (itype = 0; itype < CPU_MAX_IDLE_TYPES; itype++) { - seq_printf(seq, " %u %u %u %u %u %u %u %u", + seq_printf(seq, " %u %u %u %u %u %u %u %u %u %u %u", sd->lb_count[itype], sd->lb_balanced[itype], sd->lb_failed[itype], - sd->lb_imbalance[itype], + sd->lb_imbalance_load[itype], + sd->lb_imbalance_util[itype], + sd->lb_imbalance_task[itype], + sd->lb_imbalance_misfit[itype], sd->lb_gained[itype], sd->lb_hot_gained[itype], sd->lb_nobusyq[itype], From 1c055a0f5d3bafaca5d218bbb3e4e63d6307be45 Mon Sep 17 00:00:00 2001 From: Swapnil Sapkal Date: Fri, 20 Dec 2024 06:32:22 +0000 Subject: [PATCH 173/224] sched: Move sched domain name out of CONFIG_SCHED_DEBUG The /proc/schedstat file shows cpu and sched domain level scheduler statistics. It does not show the domain name; instead it shows the domain level. It will be very useful for tools like `perf sched stats`[1] to aggregate domain level stats if domain names are shown in /proc/schedstat.
But the sched domain name is guarded by CONFIG_SCHED_DEBUG. As per the discussion[2], move the sched domain name out of CONFIG_SCHED_DEBUG. [1] https://lore.kernel.org/lkml/20241122084452.1064968-1-swapnil.sapkal@amd.com/ [2] https://lore.kernel.org/lkml/fcefeb4d-3acb-462d-9c9b-3df8d927e522@amd.com/ Suggested-by: "Gautham R. Shenoy" Signed-off-by: Swapnil Sapkal Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20241220063224.17767-5-swapnil.sapkal@amd.com --- include/linux/sched/topology.h | 8 -------- kernel/sched/topology.c | 4 ---- 2 files changed, 12 deletions(-) diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 76a662e1ec24..7f3dbafe1817 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -143,9 +143,7 @@ struct sched_domain { unsigned int ttwu_move_affine; unsigned int ttwu_move_balance; #endif -#ifdef CONFIG_SCHED_DEBUG char *name; -#endif union { void *private; /* used during construction */ struct rcu_head rcu; /* used during destruction */ @@ -201,18 +199,12 @@ struct sched_domain_topology_level { int flags; int numa_level; struct sd_data data; -#ifdef CONFIG_SCHED_DEBUG char *name; -#endif }; extern void __init set_sched_topology(struct sched_domain_topology_level *tl); -#ifdef CONFIG_SCHED_DEBUG # define SD_INIT_NAME(type) .name = #type -#else -# define SD_INIT_NAME(type) -#endif #else /* CONFIG_SMP */ diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 9c405f0e7b26..da33ec9e94ab 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1635,9 +1635,7 @@ sd_init(struct sched_domain_topology_level *tl, .max_newidle_lb_cost = 0, .last_decay_max_lb_cost = jiffies, .child = child, -#ifdef CONFIG_SCHED_DEBUG .name = tl->name, -#endif }; sd_span = sched_domain_span(sd); @@ -2338,10 +2336,8 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve if (!cpumask_subset(sched_domain_span(child), sched_domain_span(sd))) { pr_err("BUG: arch topology borken\n"); -#ifdef CONFIG_SCHED_DEBUG pr_err(" the %s domain not a subset of the %s domain\n", child->name, sd->name); -#endif /* Fixup, ensure @sd has at least @child CPUs. */ cpumask_or(sched_domain_span(sd), sched_domain_span(sd), From 011b3a14dc66c40066d08d60a768e14ede7ef351 Mon Sep 17 00:00:00 2001 From: K Prateek Nayak Date: Fri, 20 Dec 2024 06:32:23 +0000 Subject: [PATCH 174/224] sched/stats: Print domain name in /proc/schedstat Currently, there does not exist a straightforward way to extract the names of the sched domains and match them to the per-cpu domain entry in /proc/schedstat other than looking at the debugfs files, which are only visible after enabling "verbose" debug following commit 34320745dfc9 ("sched/debug: Put sched/domains files under the verbose flag"). Since tools like `perf sched stats`[1] require displaying per-domain information in a user-friendly manner, display the names of the sched domains alongside their level in /proc/schedstat. Domain names also make the /proc/schedstat data unambiguous when some of the cpus are offline. For example, on a 128-CPU AMD Zen3 machine where CPU0 and CPU64 are SMT siblings and CPU64 is offline: Before: cpu0 ... domain0 ... domain1 ... cpu1 ... domain0 ... domain1 ... domain2 ... After: cpu0 ... domain0 MC ... domain1 PKG ... cpu1 ... domain0 SMT ... domain1 MC ... domain2 PKG ...
[1] https://lore.kernel.org/lkml/20241122084452.1064968-1-swapnil.sapkal@amd.com/ Signed-off-by: K Prateek Nayak Signed-off-by: Ravi Bangoria Signed-off-by: Swapnil Sapkal Signed-off-by: Peter Zijlstra (Intel) Tested-by: James Clark Link: https://lore.kernel.org/r/20241220063224.17767-6-swapnil.sapkal@amd.com --- kernel/sched/stats.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index 802bd9398a2e..5f563965976c 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -138,7 +138,7 @@ static int show_schedstat(struct seq_file *seq, void *v) for_each_domain(cpu, sd) { enum cpu_idle_type itype; - seq_printf(seq, "domain%d %*pb", dcount++, + seq_printf(seq, "domain%d %s %*pb", dcount++, sd->name, cpumask_pr_args(sched_domain_span(sd))); for (itype = 0; itype < CPU_MAX_IDLE_TYPES; itype++) { seq_printf(seq, " %u %u %u %u %u %u %u %u %u %u %u", From 7c8cd569ff66755f17b0c0c03a9d8df1b6f3e9ed Mon Sep 17 00:00:00 2001 From: Swapnil Sapkal Date: Fri, 20 Dec 2024 06:32:24 +0000 Subject: [PATCH 175/224] docs: Update Schedstat version to 17 Update the Schedstat version to 17 as more fields are added to report different kinds of imbalances in the sched domain. Also, the domain field now prints the corresponding sched domain name. Signed-off-by: Swapnil Sapkal Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20241220063224.17767-7-swapnil.sapkal@amd.com --- Documentation/scheduler/sched-stats.rst | 128 ++++++++++++++---------- kernel/sched/stats.c | 2 +- 2 files changed, 77 insertions(+), 53 deletions(-) diff --git a/Documentation/scheduler/sched-stats.rst b/Documentation/scheduler/sched-stats.rst index 7c2b16c4729d..caea83d91c67 100644 --- a/Documentation/scheduler/sched-stats.rst +++ b/Documentation/scheduler/sched-stats.rst @@ -2,6 +2,12 @@ Scheduler Statistics ==================== +Version 17 of schedstats removed 'lb_imbalance' field as it has no +significance anymore and instead added more relevant fields namely +'lb_imbalance_load', 'lb_imbalance_util', 'lb_imbalance_task' and +'lb_imbalance_misfit'. The domain field prints the name of the +corresponding sched domain from this version onwards. + Version 16 of schedstats changed the order of definitions within 'enum cpu_idle_type', which changed the order of [CPU_MAX_IDLE_TYPES] columns in show_schedstat(). In particular the position of CPU_IDLE and __CPU_NOT_IDLE changed places. The size of the array is unchanged. Version 15 of schedstats dropped counters for some sched_yield: yld_exp_empty, yld_act_empty and yld_both_empty. Otherwise, it is -identical to version 14. +identical to version 14. Details are available at + + https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/scheduler/sched-stats.txt?id=1e1dbb259c79b Version 14 of schedstats includes support for sched_domains, which hit the mainline kernel in 2.6.20 although it is identical to the stats from version @@ -26,7 +34,14 @@ cpus on the machine, while domain0 is the most tightly focused domain, sometimes balancing only between pairs of cpus. At this time, there are no architectures which need more than three domain levels. The first field in the domain stats is a bit map indicating which cpus are affected -by that domain.
+by that domain. Details are available at + + https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/sched-stats.txt?id=b762f3ffb797c + +The schedstat documentation is maintained from version 10 onwards and is not +updated for versions 11 and 12. The details for version 10 are available at + + https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/sched-stats.txt?id=1da177e4c3f4 These fields are counters, and only increment. Programs which make use of these will need to start with a baseline observation and then calculate @@ -71,88 +86,97 @@ Domain statistics ----------------- One of these is produced per domain for each cpu described. (Note that if CONFIG_SMP is not defined, *no* domains are utilized and these lines -will not appear in the output.) +will not appear in the output. <name> is an extension to the domain field +that prints the name of the corresponding sched domain. It can appear in +schedstat version 17 and above, and requires CONFIG_SCHED_DEBUG.) -domain<N> <cpumask> 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 +domain<N> <name> <cpumask> 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 The first field is a bit mask indicating what cpus this domain operates over. -The next 24 are a variety of sched_balance_rq() statistics in grouped into types -of idleness (idle, busy, and newly idle): +The next 33 are a variety of sched_balance_rq() statistics grouped into types +of idleness (busy, idle and newly idle): 1) # of times in this domain sched_balance_rq() was called when the - cpu was idle - 2) # of times in this domain sched_balance_rq() checked but found - the load did not require balancing when the cpu was idle - 3) # of times in this domain sched_balance_rq() tried to move one or - more tasks and failed, when the cpu was idle - 4) sum of imbalances discovered (if any) with each call to - sched_balance_rq() in this domain when the cpu was idle - 5) # of times in this domain pull_task() was called when the cpu - was idle - 6) # of times in this domain pull_task() was called even though - the target task was cache-hot when idle - 7) # of times in this domain sched_balance_rq() was called but did - not find a busier queue while the cpu was idle - 8) # of times in this domain a busier queue was found while the - cpu was idle but no busier group was found - 9) # of times in this domain sched_balance_rq() was called when the cpu was busy - 10) # of times in this domain sched_balance_rq() checked but found the + 2) # of times in this domain sched_balance_rq() checked but found the load did not require balancing when busy - 11) # of times in this domain sched_balance_rq() tried to move one or + 3) # of times in this domain sched_balance_rq() tried to move one or more tasks and failed, when the cpu was busy - 12) sum of imbalances discovered (if any) with each call to - sched_balance_rq() in this domain when the cpu was busy - 13) # of times in this domain pull_task() was called when busy - 14) # of times in this domain pull_task() was called even though the + 4) Total imbalance in load when the cpu was busy + 5) Total imbalance in utilization when the cpu was busy + 6) Total imbalance in number of tasks when the cpu was busy + 7) Total imbalance due to misfit tasks when the cpu was busy + 8) # of times in this domain pull_task() was called when busy + 9) # of times in this domain pull_task() was called even though the target task was cache-hot when busy
- 15) # of times in this domain sched_balance_rq() was called but did not + 10) # of times in this domain sched_balance_rq() was called but did not find a busier queue while the cpu was busy - 16) # of times in this domain a busier queue was found while the cpu + 11) # of times in this domain a busier queue was found while the cpu was busy but no busier group was found - 17) # of times in this domain sched_balance_rq() was called when the - cpu was just becoming idle - 18) # of times in this domain sched_balance_rq() checked but found the + 12) # of times in this domain sched_balance_rq() was called when the + cpu was idle + 13) # of times in this domain sched_balance_rq() checked but found + the load did not require balancing when the cpu was idle + 14) # of times in this domain sched_balance_rq() tried to move one or + more tasks and failed, when the cpu was idle + 15) Total imbalance in load when the cpu was idle + 16) Total imbalance in utilization when the cpu was idle + 17) Total imbalance in number of tasks when the cpu was idle + 18) Total imbalance due to misfit tasks when the cpu was idle + 19) # of times in this domain pull_task() was called when the cpu + was idle + 20) # of times in this domain pull_task() was called even though + the target task was cache-hot when idle + 21) # of times in this domain sched_balance_rq() was called but did + not find a busier queue while the cpu was idle + 22) # of times in this domain a busier queue was found while the + cpu was idle but no busier group was found + + 23) # of times in this domain sched_balance_rq() was called when the + cpu was just becoming idle + 24) # of times in this domain sched_balance_rq() checked but found the load did not require balancing when the cpu was just becoming idle - 19) # of times in this domain sched_balance_rq() tried to move one or more + 25) # of times in this domain sched_balance_rq() tried to move one or more tasks and failed, when the cpu was just becoming idle - 20) sum of imbalances discovered (if any) with each call to - sched_balance_rq() in this domain when the cpu was just becoming idle - 21) # of times in this domain pull_task() was called when newly idle - 22) # of times in this domain pull_task() was called even though the + 26) Total imbalance in load when the cpu was just becoming idle + 27) Total imbalance in utilization when the cpu was just becoming idle + 28) Total imbalance in number of tasks when the cpu was just becoming idle + 29) Total imbalance due to misfit tasks when the cpu was just becoming idle + 30) # of times in this domain pull_task() was called when newly idle + 31) # of times in this domain pull_task() was called even though the target task was cache-hot when just becoming idle - 23) # of times in this domain sched_balance_rq() was called but did not + 32) # of times in this domain sched_balance_rq() was called but did not find a busier queue while the cpu was just becoming idle - 24) # of times in this domain a busier queue was found while the cpu + 33) # of times in this domain a busier queue was found while the cpu was just becoming idle but no busier group was found Next three are active_load_balance() statistics: - 25) # of times active_load_balance() was called - 26) # of times active_load_balance() tried to move a task and failed - 27) # of times active_load_balance() successfully moved a task + 34) # of times active_load_balance() was called + 35) # of times active_load_balance() tried to move a task and failed + 36) # of times active_load_balance() successfully moved a task
Next three are sched_balance_exec() statistics: - 28) sbe_cnt is not used - 29) sbe_balanced is not used - 30) sbe_pushed is not used + 37) sbe_cnt is not used + 38) sbe_balanced is not used + 39) sbe_pushed is not used Next three are sched_balance_fork() statistics: - 31) sbf_cnt is not used - 32) sbf_balanced is not used - 33) sbf_pushed is not used + 40) sbf_cnt is not used + 41) sbf_balanced is not used + 42) sbf_pushed is not used Next three are try_to_wake_up() statistics: - 34) # of times in this domain try_to_wake_up() awoke a task that + 43) # of times in this domain try_to_wake_up() awoke a task that last ran on a different cpu in this domain - 35) # of times in this domain try_to_wake_up() moved a task to the + 44) # of times in this domain try_to_wake_up() moved a task to the waking cpu because it was cache-cold on its own cpu anyway - 36) # of times in this domain try_to_wake_up() started passive balancing + 45) # of times in this domain try_to_wake_up() started passive balancing /proc/<pid>/schedstat --------------------- diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index 5f563965976c..4346fd81c31f 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -103,7 +103,7 @@ void __update_stats_enqueue_sleeper(struct rq *rq, struct task_struct *p, * Bump this up when changing the output format or the meaning of an existing * format, so that tools can adapt (or abort) */ -#define SCHEDSTAT_VERSION 16 +#define SCHEDSTAT_VERSION 17 static int show_schedstat(struct seq_file *seq, void *v) { From abfdccd6af2b071951633e57d6322c46a1ea791f Mon Sep 17 00:00:00 2001 From: John Stultz Date: Mon, 16 Dec 2024 20:07:35 -0800 Subject: [PATCH 176/224] sched/wake_q: Add helper to call wake_up_q after unlock with preemption disabled A common pattern seen when wake_qs are used to defer a wakeup until after a lock is released is something like: preempt_disable(); raw_spin_unlock(lock); wake_up_q(wake_q); preempt_enable(); So create some raw_spin_unlock*_wake() helper functions to clean this up. Applies on top of the fix I submitted here: https://lore.kernel.org/lkml/20241212222138.2400498-1-jstultz@google.com/ NOTE: I recognise the unlock()/unlock_irq()/unlock_irqrestore() variants create their own duplication, for which we could use a macro to generate similar functions, but I often dislike how those generation macros make finding the actual implementation harder, so I left the three functions as is. If folks would prefer otherwise, let me know and I'll switch it.
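As a minimal before/after sketch of a call site (illustrative; `lock`, `flags` and `wake_q` stand in for a caller's real state):

    /* Before: open-coded unlock + wakeup with preemption disabled */
    preempt_disable();
    raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
    wake_up_q(&wake_q);
    preempt_enable();

    /* After: one helper does the unlock, the wakeups and the re-init */
    raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q);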
Suggested-by: Peter Zijlstra Signed-off-by: John Stultz Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20241217040803.243420-1-jstultz@google.com --- include/linux/sched/wake_q.h | 34 ++++++++++++++++++++++++++++++++++ kernel/futex/pi.c | 5 +---- kernel/locking/mutex.c | 16 ++++------------ kernel/locking/rtmutex.c | 32 +++++--------------------------- 4 files changed, 44 insertions(+), 43 deletions(-) diff --git a/include/linux/sched/wake_q.h b/include/linux/sched/wake_q.h index 06cd8fb2f409..0f28b4623ad4 100644 --- a/include/linux/sched/wake_q.h +++ b/include/linux/sched/wake_q.h @@ -63,4 +63,38 @@ extern void wake_q_add(struct wake_q_head *head, struct task_struct *task); extern void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task); extern void wake_up_q(struct wake_q_head *head); +/* Spin unlock helpers to unlock and call wake_up_q with preempt disabled */ +static inline +void raw_spin_unlock_wake(raw_spinlock_t *lock, struct wake_q_head *wake_q) +{ + guard(preempt)(); + raw_spin_unlock(lock); + if (wake_q) { + wake_up_q(wake_q); + wake_q_init(wake_q); + } +} + +static inline +void raw_spin_unlock_irq_wake(raw_spinlock_t *lock, struct wake_q_head *wake_q) +{ + guard(preempt)(); + raw_spin_unlock_irq(lock); + if (wake_q) { + wake_up_q(wake_q); + wake_q_init(wake_q); + } +} + +static inline +void raw_spin_unlock_irqrestore_wake(raw_spinlock_t *lock, unsigned long flags, + struct wake_q_head *wake_q) +{ + guard(preempt)(); + raw_spin_unlock_irqrestore(lock, flags); + if (wake_q) { + wake_up_q(wake_q); + wake_q_init(wake_q); + } +} #endif /* _LINUX_SCHED_WAKE_Q_H */ diff --git a/kernel/futex/pi.c b/kernel/futex/pi.c index d62cca5ed8f4..daea650b16f5 100644 --- a/kernel/futex/pi.c +++ b/kernel/futex/pi.c @@ -1020,10 +1020,7 @@ retry_private: * it sees the futex_q::pi_state. 
*/ ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current, &wake_q); - preempt_disable(); - raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock); - wake_up_q(&wake_q); - preempt_enable(); + raw_spin_unlock_irq_wake(&q.pi_state->pi_mutex.wait_lock, &wake_q); if (ret) { if (ret == 1) diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 3302e52f0c96..b36f23de48f1 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -657,10 +657,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas goto err; } - raw_spin_unlock_irqrestore(&lock->wait_lock, flags); - /* Make sure we do wakeups before calling schedule */ - wake_up_q(&wake_q); - wake_q_init(&wake_q); + raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q); schedule_preempt_disabled(); @@ -710,8 +707,7 @@ skip_wait: if (ww_ctx) ww_mutex_lock_acquired(ww, ww_ctx); - raw_spin_unlock_irqrestore(&lock->wait_lock, flags); - wake_up_q(&wake_q); + raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q); preempt_enable(); return 0; @@ -720,10 +716,9 @@ err: __mutex_remove_waiter(lock, &waiter); err_early_kill: trace_contention_end(lock, ret); - raw_spin_unlock_irqrestore(&lock->wait_lock, flags); + raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q); debug_mutex_free_waiter(&waiter); mutex_release(&lock->dep_map, ip); - wake_up_q(&wake_q); preempt_enable(); return ret; } @@ -935,10 +930,7 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne if (owner & MUTEX_FLAG_HANDOFF) __mutex_handoff(lock, next); - preempt_disable(); - raw_spin_unlock_irqrestore(&lock->wait_lock, flags); - wake_up_q(&wake_q); - preempt_enable(); + raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q); } #ifndef CONFIG_DEBUG_LOCK_ALLOC diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 697a56d3d949..4a8df1800cbb 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1292,13 +1292,7 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock, */ get_task_struct(owner); - preempt_disable(); - raw_spin_unlock_irq(&lock->wait_lock); - /* wake up any tasks on the wake_q before calling rt_mutex_adjust_prio_chain */ - wake_up_q(wake_q); - wake_q_init(wake_q); - preempt_enable(); - + raw_spin_unlock_irq_wake(&lock->wait_lock, wake_q); res = rt_mutex_adjust_prio_chain(owner, chwalk, lock, next_lock, waiter, task); @@ -1642,13 +1636,7 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock, owner = rt_mutex_owner(lock); else owner = NULL; - preempt_disable(); - raw_spin_unlock_irq(&lock->wait_lock); - if (wake_q) { - wake_up_q(wake_q); - wake_q_init(wake_q); - } - preempt_enable(); + raw_spin_unlock_irq_wake(&lock->wait_lock, wake_q); if (!owner || !rtmutex_spin_on_owner(lock, waiter, owner)) rt_mutex_schedule(); @@ -1799,10 +1787,7 @@ static int __sched rt_mutex_slowlock(struct rt_mutex_base *lock, */ raw_spin_lock_irqsave(&lock->wait_lock, flags); ret = __rt_mutex_slowlock_locked(lock, ww_ctx, state, &wake_q); - preempt_disable(); - raw_spin_unlock_irqrestore(&lock->wait_lock, flags); - wake_up_q(&wake_q); - preempt_enable(); + raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q); rt_mutex_post_schedule(); return ret; @@ -1860,11 +1845,7 @@ static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock, owner = rt_mutex_owner(lock); else owner = NULL; - preempt_disable(); - raw_spin_unlock_irq(&lock->wait_lock); - wake_up_q(wake_q); - 
wake_q_init(wake_q); - preempt_enable(); + raw_spin_unlock_irq_wake(&lock->wait_lock, wake_q); if (!owner || !rtmutex_spin_on_owner(lock, &waiter, owner)) schedule_rtlock(); @@ -1893,10 +1874,7 @@ static __always_inline void __sched rtlock_slowlock(struct rt_mutex_base *lock) raw_spin_lock_irqsave(&lock->wait_lock, flags); rtlock_slowlock_locked(lock, &wake_q); - preempt_disable(); - raw_spin_unlock_irqrestore(&lock->wait_lock, flags); - wake_up_q(&wake_q); - preempt_enable(); + raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q); } #endif /* RT_MUTEX_BUILD_SPINLOCKS */ From 0e45818ec1896c2b4aee0ec6721022ad625ea531 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 11 Dec 2024 08:03:17 -0800 Subject: [PATCH 177/224] perf/x86/intel: Support RDPMC metrics clear mode The new RDPMC enhancement, metrics clear mode, is to clear the PERF_METRICS-related resources as well as the fixed-function performance monitoring counter 3 after the read is performed. It is available for ring 3. The feature is enumerated by the IA32_PERF_CAPABILITIES.RDPMC_CLEAR_METRICS[bit 19]. To enable the feature, the IA32_FIXED_CTR_CTRL.METRICS_CLEAR_EN[bit 14] must be set. Two ways were considered to enable the feature. - Expose a knob in the sysfs globally. One user may affect the measurement of other users when changing the knob. The solution is dropped. - Introduce a new event format, metrics_clear, for the slots event to disable/enable the feature only for the current process. Users can utilize the feature as needed. The latter solution is implemented in the patch. The current KVM doesn't support the perf metrics yet. For virtualization, the feature can be enabled later separately. Suggested-by: Andi Kleen Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andi Kleen Reviewed-by: Ian Rogers Link: https://lkml.kernel.org/r/20241211160318.235056-1-kan.liang@linux.intel.com --- arch/x86/events/intel/core.c | 20 +++++++++++++++++++- arch/x86/events/perf_event.h | 1 + arch/x86/include/asm/perf_event.h | 4 ++++ 3 files changed, 24 insertions(+), 1 deletion(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 2e1e26846050..e76e892f44cd 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2816,6 +2816,9 @@ static void intel_pmu_enable_fixed(struct perf_event *event) return; idx = INTEL_PMC_IDX_FIXED_SLOTS; + + if (event->attr.config1 & INTEL_TD_CFG_METRIC_CLEAR) + bits |= INTEL_FIXED_3_METRICS_CLEAR; } intel_set_masks(event, idx); @@ -4071,7 +4074,12 @@ static int intel_pmu_hw_config(struct perf_event *event) * is used in a metrics group, it too cannot support sampling. 
*/ if (intel_pmu_has_cap(event, PERF_CAP_METRICS_IDX) && is_topdown_event(event)) { - if (event->attr.config1 || event->attr.config2) + /* The metrics_clear can only be set for the slots event */ + if (event->attr.config1 && + (!is_slots_event(event) || (event->attr.config1 & ~INTEL_TD_CFG_METRIC_CLEAR))) + return -EINVAL; + + if (event->attr.config2) return -EINVAL; /* @@ -4680,6 +4688,8 @@ PMU_FORMAT_ATTR(in_tx, "config:32" ); PMU_FORMAT_ATTR(in_tx_cp, "config:33" ); PMU_FORMAT_ATTR(eq, "config:36" ); /* v6 + */ +PMU_FORMAT_ATTR(metrics_clear, "config1:0"); /* PERF_CAPABILITIES.RDPMC_METRICS_CLEAR */ + static ssize_t umask2_show(struct device *dev, struct device_attribute *attr, char *page) @@ -4699,6 +4709,7 @@ static struct device_attribute format_attr_umask2 = static struct attribute *format_evtsel_ext_attrs[] = { &format_attr_umask2.attr, &format_attr_eq.attr, + &format_attr_metrics_clear.attr, NULL }; @@ -4723,6 +4734,13 @@ evtsel_ext_is_visible(struct kobject *kobj, struct attribute *attr, int i) if (i == 1) return (mask & ARCH_PERFMON_EVENTSEL_EQ) ? attr->mode : 0; + /* PERF_CAPABILITIES.RDPMC_METRICS_CLEAR */ + if (i == 2) { + union perf_capabilities intel_cap = hybrid(dev_get_drvdata(dev), intel_cap); + + return intel_cap.rdpmc_metrics_clear ? attr->mode : 0; + } + return 0; } diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 82c6f45ce975..31c2771545a6 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -624,6 +624,7 @@ union perf_capabilities { u64 pebs_output_pt_available:1; u64 pebs_timing_info:1; u64 anythread_deprecated:1; + u64 rdpmc_metrics_clear:1; }; u64 capabilities; }; diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index cb9c4679f45c..1ac79f361645 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -41,6 +41,7 @@ #define INTEL_FIXED_0_USER (1ULL << 1) #define INTEL_FIXED_0_ANYTHREAD (1ULL << 2) #define INTEL_FIXED_0_ENABLE_PMI (1ULL << 3) +#define INTEL_FIXED_3_METRICS_CLEAR (1ULL << 2) #define HSW_IN_TX (1ULL << 32) #define HSW_IN_TX_CHECKPOINTED (1ULL << 33) @@ -372,6 +373,9 @@ static inline bool use_fixed_pseudo_encoding(u64 code) #define INTEL_TD_METRIC_MAX INTEL_TD_METRIC_MEM_BOUND #define INTEL_TD_METRIC_NUM 8 +#define INTEL_TD_CFG_METRIC_CLEAR_BIT 0 +#define INTEL_TD_CFG_METRIC_CLEAR BIT_ULL(INTEL_TD_CFG_METRIC_CLEAR_BIT) + static inline bool is_metric_idx(int idx) { return (unsigned)(idx - INTEL_PMC_IDX_METRIC_BASE) < INTEL_TD_METRIC_NUM; From 877818802c3e970f67ccb53012facc78bef5f97a Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Mon, 11 Nov 2024 17:22:08 +0100 Subject: [PATCH 178/224] x86/bugs: Add SRSO_USER_KERNEL_NO support If the machine has: CPUID Fn8000_0021_EAX[30] (SRSO_USER_KERNEL_NO) -- If this bit is 1, it indicates the CPU is not subject to the SRSO vulnerability across user/kernel boundaries. 
have it fall back to IBPB on VMEXIT only, in the case it is going to run VMs: Speculative Return Stack Overflow: Mitigation: IBPB on VMEXIT only Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Nikolay Borisov Link: https://lore.kernel.org/r/20241202120416.6054-2-bp@kernel.org --- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/kernel/cpu/bugs.c | 4 ++++ arch/x86/kernel/cpu/common.c | 1 + 3 files changed, 6 insertions(+) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 645aa360628d..0e2d81763615 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -465,6 +465,7 @@ #define X86_FEATURE_SBPB (20*32+27) /* Selective Branch Prediction Barrier */ #define X86_FEATURE_IBPB_BRTYPE (20*32+28) /* MSR_PRED_CMD[IBPB] flushes all branch type predictions */ #define X86_FEATURE_SRSO_NO (20*32+29) /* CPU is not affected by SRSO */ +#define X86_FEATURE_SRSO_USER_KERNEL_NO (20*32+30) /* CPU is not affected by SRSO across user/kernel boundaries */ /* * Extended auxiliary flags: Linux defined - for features scattered in various diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 47a01d4028f6..5a505aa65489 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -2615,6 +2615,9 @@ static void __init srso_select_mitigation(void) break; case SRSO_CMD_SAFE_RET: + if (boot_cpu_has(X86_FEATURE_SRSO_USER_KERNEL_NO)) + goto ibpb_on_vmexit; + if (IS_ENABLED(CONFIG_MITIGATION_SRSO)) { /* * Enable the return thunk for generated code @@ -2658,6 +2661,7 @@ static void __init srso_select_mitigation(void) } break; +ibpb_on_vmexit: case SRSO_CMD_IBPB_ON_VMEXIT: if (IS_ENABLED(CONFIG_MITIGATION_SRSO)) { if (!boot_cpu_has(X86_FEATURE_ENTRY_IBPB) && has_microcode) { diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 3e9037690814..7e8d811b51c8 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1270,6 +1270,7 @@ static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = { VULNBL_AMD(0x17, RETBLEED | SMT_RSB | SRSO), VULNBL_HYGON(0x18, RETBLEED | SMT_RSB | SRSO), VULNBL_AMD(0x19, SRSO), + VULNBL_AMD(0x1a, SRSO), {} }; From 716f86b523d8ec3c17015ee0b03135c7aa6f2f08 Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Wed, 13 Nov 2024 13:28:33 +0100 Subject: [PATCH 179/224] KVM: x86: Advertise SRSO_USER_KERNEL_NO to userspace SRSO_USER_KERNEL_NO denotes whether the CPU is affected by SRSO across user/kernel boundaries. Advertise it to guest userspace. 
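For illustration, guest userspace can probe the newly advertised bit directly
via CPUID. A minimal sketch (illustrative only, not part of this patch;
assumes an x86-64 guest and a GCC/Clang toolchain providing <cpuid.h>):

    /* Check CPUID Fn8000_0021_EAX[30] (SRSO_USER_KERNEL_NO). */
    #include <cpuid.h>
    #include <stdio.h>

    int main(void)
    {
            unsigned int eax, ebx, ecx, edx;

            /* Make sure the extended leaf 0x80000021 exists at all. */
            if (__get_cpuid_max(0x80000000, NULL) < 0x80000021) {
                    puts("CPUID leaf 0x80000021 not available");
                    return 0;
            }

            __cpuid(0x80000021, eax, ebx, ecx, edx);
            printf("SRSO_USER_KERNEL_NO: %s\n",
                   (eax & (1u << 30)) ? "yes" : "no");
            return 0;
    }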
Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Nikolay Borisov Link: https://lore.kernel.org/r/20241202120416.6054-3-bp@kernel.org --- arch/x86/kvm/cpuid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index ae0b438a2c99..f7e222953cab 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -821,7 +821,7 @@ void kvm_set_cpu_caps(void) kvm_cpu_cap_mask(CPUID_8000_0021_EAX, F(NO_NESTED_DATA_BP) | F(LFENCE_RDTSC) | 0 /* SmmPgCfgLock */ | F(NULL_SEL_CLR_BASE) | F(AUTOIBRS) | 0 /* PrefetchCtlMsr */ | - F(WRMSR_XX_BASE_NS) + F(WRMSR_XX_BASE_NS) | F(SRSO_USER_KERNEL_NO) ); kvm_cpu_cap_check_and_set(X86_FEATURE_SBPB); From 1146f7429f610d51b886402f1f7a43faa08d814a Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Mon, 2 Dec 2024 13:04:16 +0100 Subject: [PATCH 180/224] Documentation/kernel-parameters: Fix a typo in kvm.enable_virt_at_load text s/lode/load/ Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20241202120416.6054-5-bp@kernel.org --- Documentation/admin-guide/kernel-parameters.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 316cccf5a244..262a94621446 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2769,7 +2769,7 @@ VMs, i.e. on the 0=>1 and 1=>0 transitions of the number of VMs. - Enabling virtualization at module lode avoids potential + Enabling virtualization at module load avoids potential latency for creation of the 0=>1 VM, as KVM serializes virtualization enabling across all online CPUs. The "cost" of enabling virtualization when KVM is loaded, From 288bba2f4c8be1e1b9c8bc2e087ce677faf9918a Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Fri, 22 Nov 2024 22:07:07 +0100 Subject: [PATCH 181/224] x86/cpufeatures: Remove "AMD" from the comments to the AMD-specific leaf 0x8000001f.EAX is an AMD-specific leaf so there's no need to have "AMD" in almost every feature's comment. Zap it and make the text more readable this way. No functional changes. 
Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20241122210707.12742-1-bp@kernel.org --- arch/x86/include/asm/cpufeatures.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 17b6590748c0..09e1e54676f4 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -443,14 +443,14 @@ #define X86_FEATURE_SPEC_CTRL_SSBD (18*32+31) /* Speculative Store Bypass Disable */ /* AMD-defined memory encryption features, CPUID level 0x8000001f (EAX), word 19 */ -#define X86_FEATURE_SME (19*32+ 0) /* "sme" AMD Secure Memory Encryption */ -#define X86_FEATURE_SEV (19*32+ 1) /* "sev" AMD Secure Encrypted Virtualization */ +#define X86_FEATURE_SME (19*32+ 0) /* "sme" Secure Memory Encryption */ +#define X86_FEATURE_SEV (19*32+ 1) /* "sev" Secure Encrypted Virtualization */ #define X86_FEATURE_VM_PAGE_FLUSH (19*32+ 2) /* VM Page Flush MSR is supported */ -#define X86_FEATURE_SEV_ES (19*32+ 3) /* "sev_es" AMD Secure Encrypted Virtualization - Encrypted State */ -#define X86_FEATURE_SEV_SNP (19*32+ 4) /* "sev_snp" AMD Secure Encrypted Virtualization - Secure Nested Paging */ +#define X86_FEATURE_SEV_ES (19*32+ 3) /* "sev_es" Secure Encrypted Virtualization - Encrypted State */ +#define X86_FEATURE_SEV_SNP (19*32+ 4) /* "sev_snp" Secure Encrypted Virtualization - Secure Nested Paging */ #define X86_FEATURE_V_TSC_AUX (19*32+ 9) /* Virtual TSC_AUX */ -#define X86_FEATURE_SME_COHERENT (19*32+10) /* AMD hardware-enforced cache coherency */ -#define X86_FEATURE_DEBUG_SWAP (19*32+14) /* "debug_swap" AMD SEV-ES full debug state swap support */ +#define X86_FEATURE_SME_COHERENT (19*32+10) /* hardware-enforced cache coherency */ +#define X86_FEATURE_DEBUG_SWAP (19*32+14) /* "debug_swap" SEV-ES full debug state swap support */ #define X86_FEATURE_SVSM (19*32+28) /* "svsm" SVSM present */ /* AMD-defined Extended Feature 2 EAX, CPUID level 0x80000021 (EAX), word 20 */ From c845cb8dbd2e1a804babfd13648026c3a7cfbc0b Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Thu, 12 Dec 2024 22:00:57 +0800 Subject: [PATCH 182/224] x86/mce: Make several functions return bool Make several functions that return 0 or 1 return a boolean value for better readability. No functional changes are intended. 
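As a side benefit, returning bool makes the usual !! normalization
unnecessary, since conversion to _Bool already yields true/false. A small
illustrative sketch (hypothetical helper, not from this patch):

    /* The int version needs explicit normalization to get a clean 0/1: */
    static int bank_valid_int(u64 cap, u64 mask)
    {
            return !!(cap & mask);
    }

    /* The bool version normalizes implicitly on conversion to _Bool: */
    static bool bank_valid(u64 cap, u64 mask)
    {
            return cap & mask;
    }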
Signed-off-by: Qiuxu Zhuo Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Tony Luck Reviewed-by: Nikolay Borisov Reviewed-by: Sohil Mehta Reviewed-by: Yazen Ghannam Link: https://lore.kernel.org/r/20241212140103.66964-2-qiuxu.zhuo@intel.com --- arch/x86/include/asm/mce.h | 4 ++-- arch/x86/kernel/cpu/mce/amd.c | 10 +++++----- arch/x86/kernel/cpu/mce/core.c | 22 +++++++++++----------- arch/x86/kernel/cpu/mce/intel.c | 9 +++++---- 4 files changed, 23 insertions(+), 22 deletions(-) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 4543cf2eb5e8..ea9ca7689f6b 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -276,7 +276,7 @@ static inline void cmci_rediscover(void) {} static inline void cmci_recheck(void) {} #endif -int mce_available(struct cpuinfo_x86 *c); +bool mce_available(struct cpuinfo_x86 *c); bool mce_is_memory_error(struct mce *m); bool mce_is_correctable(struct mce *m); bool mce_usable_address(struct mce *m); @@ -296,7 +296,7 @@ enum mcp_flags { void machine_check_poll(enum mcp_flags flags, mce_banks_t *b); -int mce_notify_irq(void); +bool mce_notify_irq(void); DECLARE_PER_CPU(struct mce, injectm); diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index 6ca80fff1fea..018874b554cb 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -381,7 +381,7 @@ static bool lvt_interrupt_supported(unsigned int bank, u32 msr_high_bits) return msr_high_bits & BIT(28); } -static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi) +static bool lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi) { int msr = (hi & MASK_LVTOFF_HI) >> 20; @@ -389,7 +389,7 @@ static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi) pr_err(FW_BUG "cpu %d, failed to setup threshold interrupt " "for bank %d, block %d (MSR%08X=0x%x%08x)\n", b->cpu, b->bank, b->block, b->address, hi, lo); - return 0; + return false; } if (apic != msr) { @@ -399,15 +399,15 @@ static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi) * was set is reserved. Return early here: */ if (mce_flags.smca) - return 0; + return false; pr_err(FW_BUG "cpu %d, invalid threshold interrupt offset %d " "for bank %d, block %d (MSR%08X=0x%x%08x)\n", b->cpu, apic, b->bank, b->block, b->address, hi, lo); - return 0; + return false; } - return 1; + return true; }; /* Reprogram MCx_MISC MSR behind this threshold bank. */ diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 7fb5556a0b53..167965bd2ac0 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -492,10 +492,10 @@ static noinstr void mce_gather_info(struct mce_hw_err *err, struct pt_regs *regs } } -int mce_available(struct cpuinfo_x86 *c) +bool mce_available(struct cpuinfo_x86 *c) { if (mca_cfg.disabled) - return 0; + return false; return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); } @@ -1778,7 +1778,7 @@ static void mce_timer_delete_all(void) * Can be called from interrupt context, but not from machine check/NMI * context. 
*/ -int mce_notify_irq(void) +bool mce_notify_irq(void) { /* Not more than two messages every minute */ static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); @@ -1789,9 +1789,9 @@ int mce_notify_irq(void) if (__ratelimit(&ratelimit)) pr_info(HW_ERR "Machine check events logged\n"); - return 1; + return true; } - return 0; + return false; } EXPORT_SYMBOL_GPL(mce_notify_irq); @@ -2015,25 +2015,25 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) return 0; } -static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) +static bool __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) { if (c->x86 != 5) - return 0; + return false; switch (c->x86_vendor) { case X86_VENDOR_INTEL: intel_p5_mcheck_init(c); mce_flags.p5 = 1; - return 1; + return true; case X86_VENDOR_CENTAUR: winchip_mcheck_init(c); mce_flags.winchip = 1; - return 1; + return true; default: - return 0; + return false; } - return 0; + return false; } /* diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c index b3cd2c61b11d..f863df0ff42c 100644 --- a/arch/x86/kernel/cpu/mce/intel.c +++ b/arch/x86/kernel/cpu/mce/intel.c @@ -75,12 +75,12 @@ static u16 cmci_threshold[MAX_NR_BANKS]; */ #define CMCI_STORM_THRESHOLD 32749 -static int cmci_supported(int *banks) +static bool cmci_supported(int *banks) { u64 cap; if (mca_cfg.cmci_disabled || mca_cfg.ignore_ce) - return 0; + return false; /* * Vendor check is not strictly needed, but the initial @@ -89,10 +89,11 @@ static int cmci_supported(int *banks) */ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL && boot_cpu_data.x86_vendor != X86_VENDOR_ZHAOXIN) - return 0; + return false; if (!boot_cpu_has(X86_FEATURE_APIC) || lapic_get_maxlvt() < 6) - return 0; + return false; + rdmsrl(MSR_IA32_MCG_CAP, cap); *banks = min_t(unsigned, MAX_NR_BANKS, cap & MCG_BANKCNT_MASK); return !!(cap & MCG_CMCI_P); From 64a668fbea1b6ec06ddca66d09cc49352f063342 Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Thu, 12 Dec 2024 22:00:58 +0800 Subject: [PATCH 183/224] x86/mce/threshold: Remove the redundant this_cpu_dec_return() The 'storm' variable points to this_cpu_ptr(&storm_desc). Access the 'stormy_bank_count' field through the 'storm' to avoid calling this_cpu_*() on the same per-CPU variable twice. This minor optimization reduces the text size by 16 bytes. $ size threshold.o.* text data bss dec hex filename 1395 1664 0 3059 bf3 threshold.o.old 1379 1664 0 3043 be3 threshold.o.new No functional changes intended. Signed-off-by: Qiuxu Zhuo Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Tony Luck Reviewed-by: Nikolay Borisov Reviewed-by: Sohil Mehta Reviewed-by: Yazen Ghannam Link: https://lore.kernel.org/r/20241212140103.66964-3-qiuxu.zhuo@intel.com --- arch/x86/kernel/cpu/mce/threshold.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/mce/threshold.c b/arch/x86/kernel/cpu/mce/threshold.c index 89e31e1e5c9c..f4a007616468 100644 --- a/arch/x86/kernel/cpu/mce/threshold.c +++ b/arch/x86/kernel/cpu/mce/threshold.c @@ -90,7 +90,7 @@ void cmci_storm_end(unsigned int bank) storm->banks[bank].in_storm_mode = false; /* If no banks left in storm mode, stop polling. 
*/ - if (!this_cpu_dec_return(storm_desc.stormy_bank_count)) + if (!--storm->stormy_bank_count) mce_timer_kick(false); } From c46945c9cac8437a674edb9d8fbe71511fb4acee Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Thu, 12 Dec 2024 22:00:59 +0800 Subject: [PATCH 184/224] x86/mce: Make four functions return bool Make those functions whose callers only care about success or failure return a boolean value for better readability. Also, update the call sites accordingly as the polarities of all the return values have been flipped. No functional changes. Suggested-by: Thomas Gleixner Signed-off-by: Qiuxu Zhuo Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Sohil Mehta Reviewed-by: Yazen Ghannam Link: https://lore.kernel.org/r/20241212140103.66964-4-qiuxu.zhuo@intel.com --- arch/x86/kernel/cpu/mce/core.c | 12 ++++++------ arch/x86/kernel/cpu/mce/genpool.c | 29 ++++++++++++++--------------- arch/x86/kernel/cpu/mce/internal.h | 4 ++-- 3 files changed, 22 insertions(+), 23 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 167965bd2ac0..ce6fe5e20805 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -151,7 +151,7 @@ EXPORT_PER_CPU_SYMBOL_GPL(injectm); void mce_log(struct mce_hw_err *err) { - if (!mce_gen_pool_add(err)) + if (mce_gen_pool_add(err)) irq_work_queue(&mce_irq_work); } EXPORT_SYMBOL_GPL(mce_log); @@ -1911,14 +1911,14 @@ static void __mcheck_cpu_check_banks(void) } /* Add per CPU specific workarounds here */ -static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) +static bool __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) { struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); struct mca_config *cfg = &mca_cfg; if (c->x86_vendor == X86_VENDOR_UNKNOWN) { pr_info("unknown CPU type - not enabling MCE support\n"); - return -EOPNOTSUPP; + return false; } /* This should be disabled by the BIOS, but isn't always */ @@ -2012,7 +2012,7 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) if (cfg->bootlog != 0) cfg->panic_timeout = 30; - return 0; + return true; } static bool __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) @@ -2279,12 +2279,12 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c) __mcheck_cpu_cap_init(); - if (__mcheck_cpu_apply_quirks(c) < 0) { + if (!__mcheck_cpu_apply_quirks(c)) { mca_cfg.disabled = 1; return; } - if (mce_gen_pool_init()) { + if (!mce_gen_pool_init()) { mca_cfg.disabled = 1; pr_emerg("Couldn't allocate MCE records pool!\n"); return; diff --git a/arch/x86/kernel/cpu/mce/genpool.c b/arch/x86/kernel/cpu/mce/genpool.c index d0be6dda0c14..3ca9c007a666 100644 --- a/arch/x86/kernel/cpu/mce/genpool.c +++ b/arch/x86/kernel/cpu/mce/genpool.c @@ -94,64 +94,63 @@ bool mce_gen_pool_empty(void) return llist_empty(&mce_event_llist); } -int mce_gen_pool_add(struct mce_hw_err *err) +bool mce_gen_pool_add(struct mce_hw_err *err) { struct mce_evt_llist *node; if (filter_mce(&err->m)) - return -EINVAL; + return false; if (!mce_evt_pool) - return -EINVAL; + return false; node = (void *)gen_pool_alloc(mce_evt_pool, sizeof(*node)); if (!node) { pr_warn_ratelimited("MCE records pool full!\n"); - return -ENOMEM; + return false; } memcpy(&node->err, err, sizeof(*err)); llist_add(&node->llnode, &mce_event_llist); - return 0; + return true; } -static int mce_gen_pool_create(void) +static bool mce_gen_pool_create(void) { int mce_numrecords, mce_poolsz, order; struct gen_pool *gpool; - int ret = -ENOMEM; void *mce_pool; order = order_base_2(sizeof(struct mce_evt_llist)); gpool = 
gen_pool_create(order, -1); if (!gpool) - return ret; + return false; mce_numrecords = max(MCE_MIN_ENTRIES, num_possible_cpus() * MCE_PER_CPU); mce_poolsz = mce_numrecords * (1 << order); mce_pool = kmalloc(mce_poolsz, GFP_KERNEL); if (!mce_pool) { gen_pool_destroy(gpool); - return ret; + return false; } - ret = gen_pool_add(gpool, (unsigned long)mce_pool, mce_poolsz, -1); - if (ret) { + + if (gen_pool_add(gpool, (unsigned long)mce_pool, mce_poolsz, -1)) { gen_pool_destroy(gpool); kfree(mce_pool); - return ret; + return false; } mce_evt_pool = gpool; - return ret; + return true; } -int mce_gen_pool_init(void) +bool mce_gen_pool_init(void) { /* Just init mce_gen_pool once. */ if (mce_evt_pool) - return 0; + return true; return mce_gen_pool_create(); } diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index 84f810598231..95a504ece43e 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -31,8 +31,8 @@ struct mce_evt_llist { void mce_gen_pool_process(struct work_struct *__unused); bool mce_gen_pool_empty(void); -int mce_gen_pool_add(struct mce_hw_err *err); -int mce_gen_pool_init(void); +bool mce_gen_pool_add(struct mce_hw_err *err); +bool mce_gen_pool_init(void); struct llist_node *mce_gen_pool_prepare_records(void); int mce_severity(struct mce *a, struct pt_regs *regs, char **msg, bool is_excp); From 51a12c28bb9a043e9444db5bd214b00ec161a639 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Thu, 12 Dec 2024 22:01:00 +0800 Subject: [PATCH 185/224] x86/mce: Break up __mcheck_cpu_apply_quirks() Split each vendor specific part into its own helper function. Signed-off-by: Tony Luck Signed-off-by: Qiuxu Zhuo Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Sohil Mehta Reviewed-by: Yazen Ghannam Tested-by: Qiuxu Zhuo Link: https://lore.kernel.org/r/20241212140103.66964-5-qiuxu.zhuo@intel.com --- arch/x86/kernel/cpu/mce/core.c | 192 ++++++++++++++++++--------------- 1 file changed, 104 insertions(+), 88 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index ce6fe5e20805..3855ec2ed0e0 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1910,101 +1910,117 @@ static void __mcheck_cpu_check_banks(void) } } +static void apply_quirks_amd(struct cpuinfo_x86 *c) +{ + struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); + + /* This should be disabled by the BIOS, but isn't always */ + if (c->x86 == 15 && this_cpu_read(mce_num_banks) > 4) { + /* + * disable GART TBL walk error reporting, which + * trips off incorrectly with the IOMMU & 3ware + * & Cerberus: + */ + clear_bit(10, (unsigned long *)&mce_banks[4].ctl); + } + + if (c->x86 < 0x11 && mca_cfg.bootlog < 0) { + /* + * Lots of broken BIOS around that don't clear them + * by default and leave crap in there. Don't log: + */ + mca_cfg.bootlog = 0; + } + + /* + * Various K7s with broken bank 0 around. Always disable + * by default. + */ + if (c->x86 == 6 && this_cpu_read(mce_num_banks) > 0) + mce_banks[0].ctl = 0; + + /* + * overflow_recov is supported for F15h Models 00h-0fh + * even though we don't have a CPUID bit for it. 
+ */ + if (c->x86 == 0x15 && c->x86_model <= 0xf) + mce_flags.overflow_recov = 1; + + if (c->x86 >= 0x17 && c->x86 <= 0x1A) + mce_flags.zen_ifu_quirk = 1; +} + +static void apply_quirks_intel(struct cpuinfo_x86 *c) +{ + struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); + + /* + * SDM documents that on family 6 bank 0 should not be written + * because it aliases to another special BIOS controlled + * register. + * But it's not aliased anymore on model 0x1a+ + * Don't ignore bank 0 completely because there could be a + * valid event later, merely don't write CTL0. + */ + if (c->x86 == 6 && c->x86_model < 0x1A && this_cpu_read(mce_num_banks) > 0) + mce_banks[0].init = false; + + /* + * All newer Intel systems support MCE broadcasting. Enable + * synchronization with a one second timeout. + */ + if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) && + mca_cfg.monarch_timeout < 0) + mca_cfg.monarch_timeout = USEC_PER_SEC; + + /* + * There are also broken BIOSes on some Pentium M and + * earlier systems: + */ + if (c->x86 == 6 && c->x86_model <= 13 && mca_cfg.bootlog < 0) + mca_cfg.bootlog = 0; + + if (c->x86_vfm == INTEL_SANDYBRIDGE_X) + mce_flags.snb_ifu_quirk = 1; + + /* + * Skylake, Cascacde Lake and Cooper Lake require a quirk on + * rep movs. + */ + if (c->x86_vfm == INTEL_SKYLAKE_X) + mce_flags.skx_repmov_quirk = 1; +} + +static void apply_quirks_zhaoxin(struct cpuinfo_x86 *c) +{ + /* + * All newer Zhaoxin CPUs support MCE broadcasting. Enable + * synchronization with a one second timeout. + */ + if (c->x86 > 6 || (c->x86_model == 0x19 || c->x86_model == 0x1f)) { + if (mca_cfg.monarch_timeout < 0) + mca_cfg.monarch_timeout = USEC_PER_SEC; + } +} + /* Add per CPU specific workarounds here */ static bool __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) { - struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); struct mca_config *cfg = &mca_cfg; - if (c->x86_vendor == X86_VENDOR_UNKNOWN) { + switch (c->x86_vendor) { + case X86_VENDOR_UNKNOWN: pr_info("unknown CPU type - not enabling MCE support\n"); return false; - } - - /* This should be disabled by the BIOS, but isn't always */ - if (c->x86_vendor == X86_VENDOR_AMD) { - if (c->x86 == 15 && this_cpu_read(mce_num_banks) > 4) { - /* - * disable GART TBL walk error reporting, which - * trips off incorrectly with the IOMMU & 3ware - * & Cerberus: - */ - clear_bit(10, (unsigned long *)&mce_banks[4].ctl); - } - if (c->x86 < 0x11 && cfg->bootlog < 0) { - /* - * Lots of broken BIOS around that don't clear them - * by default and leave crap in there. Don't log: - */ - cfg->bootlog = 0; - } - /* - * Various K7s with broken bank 0 around. Always disable - * by default. - */ - if (c->x86 == 6 && this_cpu_read(mce_num_banks) > 0) - mce_banks[0].ctl = 0; - - /* - * overflow_recov is supported for F15h Models 00h-0fh - * even though we don't have a CPUID bit for it. - */ - if (c->x86 == 0x15 && c->x86_model <= 0xf) - mce_flags.overflow_recov = 1; - - if (c->x86 >= 0x17 && c->x86 <= 0x1A) - mce_flags.zen_ifu_quirk = 1; - - } - - if (c->x86_vendor == X86_VENDOR_INTEL) { - /* - * SDM documents that on family 6 bank 0 should not be written - * because it aliases to another special BIOS controlled - * register. - * But it's not aliased anymore on model 0x1a+ - * Don't ignore bank 0 completely because there could be a - * valid event later, merely don't write CTL0. - */ - - if (c->x86 == 6 && c->x86_model < 0x1A && this_cpu_read(mce_num_banks) > 0) - mce_banks[0].init = false; - - /* - * All newer Intel systems support MCE broadcasting. 
Enable - * synchronization with a one second timeout. - */ - if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) && - cfg->monarch_timeout < 0) - cfg->monarch_timeout = USEC_PER_SEC; - - /* - * There are also broken BIOSes on some Pentium M and - * earlier systems: - */ - if (c->x86 == 6 && c->x86_model <= 13 && cfg->bootlog < 0) - cfg->bootlog = 0; - - if (c->x86_vfm == INTEL_SANDYBRIDGE_X) - mce_flags.snb_ifu_quirk = 1; - - /* - * Skylake, Cascacde Lake and Cooper Lake require a quirk on - * rep movs. - */ - if (c->x86_vfm == INTEL_SKYLAKE_X) - mce_flags.skx_repmov_quirk = 1; - } - - if (c->x86_vendor == X86_VENDOR_ZHAOXIN) { - /* - * All newer Zhaoxin CPUs support MCE broadcasting. Enable - * synchronization with a one second timeout. - */ - if (c->x86 > 6 || (c->x86_model == 0x19 || c->x86_model == 0x1f)) { - if (cfg->monarch_timeout < 0) - cfg->monarch_timeout = USEC_PER_SEC; - } + case X86_VENDOR_AMD: + apply_quirks_amd(c); + break; + case X86_VENDOR_INTEL: + apply_quirks_intel(c); + break; + case X86_VENDOR_ZHAOXIN: + apply_quirks_zhaoxin(c); + break; } if (cfg->monarch_timeout < 0) From 359d7a98e3e3f88dbf45411427b284bb3bbbaea5 Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Thu, 12 Dec 2024 22:01:01 +0800 Subject: [PATCH 186/224] x86/mce: Convert family/model mixed checks to VFM-based checks Convert family/model mixed checks to VFM-based checks to make the code more compact. Simplify. [ bp: Drop the "what" from the commit message - it should be visible from the diff alone. ] Suggested-by: Sohil Mehta Suggested-by: Dave Hansen Signed-off-by: Qiuxu Zhuo Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Tony Luck Reviewed-by: Sohil Mehta Reviewed-by: Yazen Ghannam Link: https://lore.kernel.org/r/20241212140103.66964-6-qiuxu.zhuo@intel.com --- arch/x86/kernel/cpu/mce/core.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 3855ec2ed0e0..f90cbcb31a62 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1936,7 +1936,7 @@ static void apply_quirks_amd(struct cpuinfo_x86 *c) * Various K7s with broken bank 0 around. Always disable * by default. */ - if (c->x86 == 6 && this_cpu_read(mce_num_banks) > 0) + if (c->x86 == 6 && this_cpu_read(mce_num_banks)) mce_banks[0].ctl = 0; /* @@ -1954,6 +1954,10 @@ static void apply_quirks_intel(struct cpuinfo_x86 *c) { struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); + /* Older CPUs (prior to family 6) don't need quirks. */ + if (c->x86_vfm < INTEL_PENTIUM_PRO) + return; + /* * SDM documents that on family 6 bank 0 should not be written * because it aliases to another special BIOS controlled @@ -1962,22 +1966,21 @@ static void apply_quirks_intel(struct cpuinfo_x86 *c) * Don't ignore bank 0 completely because there could be a * valid event later, merely don't write CTL0. */ - if (c->x86 == 6 && c->x86_model < 0x1A && this_cpu_read(mce_num_banks) > 0) + if (c->x86_vfm < INTEL_NEHALEM_EP && this_cpu_read(mce_num_banks)) mce_banks[0].init = false; /* * All newer Intel systems support MCE broadcasting. Enable * synchronization with a one second timeout. 
*/ - if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) && - mca_cfg.monarch_timeout < 0) + if (c->x86_vfm >= INTEL_CORE_YONAH && mca_cfg.monarch_timeout < 0) mca_cfg.monarch_timeout = USEC_PER_SEC; /* * There are also broken BIOSes on some Pentium M and * earlier systems: */ - if (c->x86 == 6 && c->x86_model <= 13 && mca_cfg.bootlog < 0) + if (c->x86_vfm < INTEL_CORE_YONAH && mca_cfg.bootlog < 0) mca_cfg.bootlog = 0; if (c->x86_vfm == INTEL_SANDYBRIDGE_X) From 053d18057e6292462f1b3f9460dd0c1e34609f67 Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Thu, 12 Dec 2024 22:01:02 +0800 Subject: [PATCH 187/224] x86/mce: Remove the redundant mce_hygon_feature_init() Get HYGON to directly call mce_amd_feature_init() and remove the redundant mce_hygon_feature_init(). Suggested-by: Yazen Ghannam Signed-off-by: Qiuxu Zhuo Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Sohil Mehta Reviewed-by: Yazen Ghannam Link: https://lore.kernel.org/r/20241212140103.66964-7-qiuxu.zhuo@intel.com --- arch/x86/include/asm/mce.h | 2 -- arch/x86/kernel/cpu/mce/core.c | 8 ++------ 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index ea9ca7689f6b..eb2db07ef39c 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -386,8 +386,6 @@ static inline bool amd_mce_is_memory_error(struct mce *m) { return false; }; static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { } #endif -static inline void mce_hygon_feature_init(struct cpuinfo_x86 *c) { return mce_amd_feature_init(c); } - unsigned long copy_mc_fragile_handle_tail(char *to, char *from, unsigned len); #endif /* _ASM_X86_MCE_H */ diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index f90cbcb31a62..0dc00c9894c7 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -2118,13 +2118,9 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) mce_intel_feature_init(c); break; - case X86_VENDOR_AMD: { - mce_amd_feature_init(c); - break; - } - + case X86_VENDOR_AMD: case X86_VENDOR_HYGON: - mce_hygon_feature_init(c); + mce_amd_feature_init(c); break; case X86_VENDOR_CENTAUR: From a85c08aaa665b5436d325f6d7138732a0e1315ce Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 18 Oct 2024 18:51:49 +0300 Subject: [PATCH 188/224] x86/microcode/AMD: Return bool from find_blobs_in_containers() Instead of open-coding the check for size/data move it inside the function and make it return a boolean indicating whether data was found or not. No functional changes. [ bp: Write @ret in find_blobs_in_containers() only on success. 
] Signed-off-by: Nikolay Borisov Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20241018155151.702350-2-nik.borisov@suse.com --- arch/x86/kernel/cpu/microcode/amd.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index fb5d0c67fbab..d395665d9691 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -569,14 +569,19 @@ static bool get_builtin_microcode(struct cpio_data *cp) return false; } -static void __init find_blobs_in_containers(struct cpio_data *ret) +static bool __init find_blobs_in_containers(struct cpio_data *ret) { struct cpio_data cp; + bool found; if (!get_builtin_microcode(&cp)) cp = find_microcode_in_initrd(ucode_path); - *ret = cp; + found = cp.data && cp.size; + if (found) + *ret = cp; + + return found; } void __init load_ucode_amd_bsp(struct early_load_data *ed, unsigned int cpuid_1_eax) @@ -591,8 +596,7 @@ void __init load_ucode_amd_bsp(struct early_load_data *ed, unsigned int cpuid_1_ /* Needed in load_microcode_amd() */ ucode_cpu_info[0].cpu_sig.sig = cpuid_1_eax; - find_blobs_in_containers(&cp); - if (!(cp.data && cp.size)) + if (!find_blobs_in_containers(&cp)) return; if (early_apply_microcode(ed->old_rev, cp.data, cp.size)) @@ -612,8 +616,7 @@ static int __init save_microcode_in_initrd(void) if (dis_ucode_ldr || c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) return 0; - find_blobs_in_containers(&cp); - if (!(cp.data && cp.size)) + if (!find_blobs_in_containers(&cp)) return -EINVAL; scan_containers(cp.data, cp.size, &desc); From db80b2efa0377bf6e7d422fd7e6605481b3a0ee4 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 18 Oct 2024 18:51:51 +0300 Subject: [PATCH 189/224] x86/microcode/AMD: Remove bogus comment from parse_container() The function doesn't return an equivalence ID, remove the false comment. Signed-off-by: Nikolay Borisov Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20241018155151.702350-4-nik.borisov@suse.com --- arch/x86/kernel/cpu/microcode/amd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index d395665d9691..95431e4d9fae 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -381,8 +381,8 @@ static bool mc_patch_matches(struct microcode_amd *mc, u16 eq_id) /* * This scans the ucode blob for the proper container as we can have multiple - * containers glued together. Returns the equivalence ID from the equivalence - * table or 0 if none found. + * containers glued together. + * * Returns the amount of bytes consumed while scanning. @desc contains all the * data we're going to use in later stages of the application. */ From d8317f3d8e6b412ff51ea66f1de2b2f89835f811 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 18 Oct 2024 18:51:50 +0300 Subject: [PATCH 190/224] x86/microcode/AMD: Make __verify_patch_size() return bool The result of that function is in essence boolean, so simplify to return the result of the relevant expression. It also makes it follow the convention used by __verify_patch_section(). No functional changes. 
Signed-off-by: Nikolay Borisov Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20241018155151.702350-3-nik.borisov@suse.com --- arch/x86/kernel/cpu/microcode/amd.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 95431e4d9fae..9a5ebbbc6542 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -283,13 +283,13 @@ __verify_patch_section(const u8 *buf, size_t buf_size, u32 *sh_psize) * exceed the per-family maximum). @sh_psize is the size read from the section * header. */ -static unsigned int __verify_patch_size(u32 sh_psize, size_t buf_size) +static bool __verify_patch_size(u32 sh_psize, size_t buf_size) { u8 family = x86_family(bsp_cpuid_1_eax); u32 max_size; if (family >= 0x15) - return min_t(u32, sh_psize, buf_size); + goto ret; #define F1XH_MPB_MAX_SIZE 2048 #define F14H_MPB_MAX_SIZE 1824 @@ -303,13 +303,15 @@ static unsigned int __verify_patch_size(u32 sh_psize, size_t buf_size) break; default: WARN(1, "%s: WTF family: 0x%x\n", __func__, family); - return 0; + return false; } - if (sh_psize > min_t(u32, buf_size, max_size)) - return 0; + if (sh_psize > max_size) + return false; - return sh_psize; +ret: + /* Working with the whole buffer so < is ok. */ + return sh_psize <= buf_size; } /* @@ -324,7 +326,6 @@ static int verify_patch(const u8 *buf, size_t buf_size, u32 *patch_size) { u8 family = x86_family(bsp_cpuid_1_eax); struct microcode_header_amd *mc_hdr; - unsigned int ret; u32 sh_psize; u16 proc_id; u8 patch_fam; @@ -348,8 +349,7 @@ static int verify_patch(const u8 *buf, size_t buf_size, u32 *patch_size) return -1; } - ret = __verify_patch_size(sh_psize, buf_size); - if (!ret) { + if (!__verify_patch_size(sh_psize, buf_size)) { pr_debug("Per-family patch size mismatch.\n"); return -1; } From 78e0aadbd4c6807a06a9d25bc190fe515d3f3c42 Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Mon, 18 Nov 2024 17:17:24 +0100 Subject: [PATCH 191/224] x86/microcode/AMD: Have __apply_microcode_amd() return bool This is the natural thing to do anyway. No functional changes. 
Signed-off-by: Borislav Petkov (AMD) --- arch/x86/kernel/cpu/microcode/amd.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 9a5ebbbc6542..ac3fd07e36ac 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -484,7 +484,7 @@ static void scan_containers(u8 *ucode, size_t size, struct cont_desc *desc) } } -static int __apply_microcode_amd(struct microcode_amd *mc, unsigned int psize) +static bool __apply_microcode_amd(struct microcode_amd *mc, unsigned int psize) { unsigned long p_addr = (unsigned long)&mc->hdr.data_code; u32 rev, dummy; @@ -508,9 +508,9 @@ static int __apply_microcode_amd(struct microcode_amd *mc, unsigned int psize) native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); if (rev != mc->hdr.patch_id) - return -1; + return false; - return 0; + return true; } /* @@ -544,7 +544,7 @@ static bool early_apply_microcode(u32 old_rev, void *ucode, size_t size) if (old_rev > mc->hdr.patch_id) return ret; - return !__apply_microcode_amd(mc, desc.psize); + return __apply_microcode_amd(mc, desc.psize); } static bool get_builtin_microcode(struct cpio_data *cp) @@ -763,7 +763,7 @@ void reload_ucode_amd(unsigned int cpu) rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); if (rev < mc->hdr.patch_id) { - if (!__apply_microcode_amd(mc, p->size)) + if (__apply_microcode_amd(mc, p->size)) pr_info_once("reload revision: 0x%08x\n", mc->hdr.patch_id); } } @@ -816,7 +816,7 @@ static enum ucode_state apply_microcode_amd(int cpu) goto out; } - if (__apply_microcode_amd(mc_amd, p->size)) { + if (!__apply_microcode_amd(mc_amd, p->size)) { pr_err("CPU%d: update failed for patch_level=0x%08x\n", cpu, mc_amd->hdr.patch_id); return UCODE_ERROR; From ead0db14c7266c34b1f8a6db6e15e2f4100a9e9e Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Tue, 31 Dec 2024 13:58:56 +0100 Subject: [PATCH 192/224] x86/microcode/AMD: Remove ret local var in early_apply_microcode() No functional changes. Signed-off-by: Borislav Petkov (AMD) --- arch/x86/kernel/cpu/microcode/amd.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index ac3fd07e36ac..a5dac7f3c0a0 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -528,13 +528,12 @@ static bool early_apply_microcode(u32 old_rev, void *ucode, size_t size) { struct cont_desc desc = { 0 }; struct microcode_amd *mc; - bool ret = false; scan_containers(ucode, size, &desc); mc = desc.mc; if (!mc) - return ret; + return false; /* * Allow application of the same revision to pick up SMT-specific @@ -542,7 +541,7 @@ static bool early_apply_microcode(u32 old_rev, void *ucode, size_t size) * up-to-date. */ if (old_rev > mc->hdr.patch_id) - return ret; + return false; return __apply_microcode_amd(mc, desc.psize); } From 99b863d2e87210c70354a1c75cc5bcc7a3afdc01 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 1 Jan 2025 12:51:20 +0100 Subject: [PATCH 193/224] x86/sev: Disable UBSAN on SEV code that may execute very early MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Clang 14 and older may emit UBSAN instrumentation into code that is inlined into functions marked with __no_sanitize_undefined¹. This may result in faults when the code is executed very early, which may be the case for functions annotated as __head. 
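The shape of the problem, as an illustrative sketch (hypothetical code, not
from this patch): on the affected Clang versions, the UBSAN checks generated
for an instrumented callee can survive inlining into a caller that opted out
of instrumentation:

    /* Built with -fsanitize=undefined: helper() gets UBSAN checks. */
    static int helper(int *p)
    {
            return *p + 1;	/* overflow/alignment checks emitted here */
    }

    /* The attribute covers early()'s own code, but the inlined body of
     * helper() may still carry instrumentation whose handler data lives
     * in .data -- fatal when executed from .head.text. */
    static int __attribute__((no_sanitize("undefined"))) early(int *p)
    {
            return helper(p);
    }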
Now that this requirement is strictly enforced, the build will fail in this case with the following message Absolute reference to symbol '.data' not permitted in .head.text Work around this by disabling UBSAN instrumentation on all SEV core code. ¹ https://lore.kernel.org/r/20250101024348.GA1828419@ax162 [ bp: Add a footnote with Nathan's detailed explanation and a Fixes tag ] Fixes: 3b6f99a94b04 ("x86/boot: Disable UBSAN in early boot code") Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Nathan Chancellor Link: https://lore.kernel.org/r/20250101115119.114584-2-ardb@kernel.org --- arch/x86/coco/sev/Makefile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/coco/sev/Makefile b/arch/x86/coco/sev/Makefile index 4e375e7305ac..08de37559307 100644 --- a/arch/x86/coco/sev/Makefile +++ b/arch/x86/coco/sev/Makefile @@ -13,3 +13,6 @@ KCOV_INSTRUMENT_core.o := n # With some compiler versions the generated code results in boot hangs, caused # by several compilation units. To be safe, disable all instrumentation. KCSAN_SANITIZE := n + +# Clang 14 and older may fail to respect __no_sanitize_undefined when inlining +UBSAN_SANITIZE := n From 0094014be0cd75273ef7f2934c17fb8cffd4db6e Mon Sep 17 00:00:00 2001 From: Alan Song Date: Mon, 30 Dec 2024 14:57:06 +0800 Subject: [PATCH 194/224] x86/ioapic: Remove a stray tab in the IO-APIC type string The type "physic al" should be "physical". [ bp: Massage commit message. ] Fixes: 54cd3795b471 ("x86/ioapic: Cleanup guarded debug printk()s") Signed-off-by: Alan Song Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20241230065706.16789-1-syfmark114@163.com --- arch/x86/kernel/apic/io_apic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 1029ea4ac8ba..03062469e1af 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1165,7 +1165,7 @@ static void io_apic_print_entries(unsigned int apic, unsigned int nr_entries) (entry.ir_index_15 << 15) | entry.ir_index_0_14, entry.ir_zero); } else { apic_dbg("%s, %s, D(%02X%02X), M(%1d)\n", buf, - entry.dest_mode_logical ? "logical " : "physic al", + entry.dest_mode_logical ? "logical " : "physical", entry.virt_destid_8_14, entry.destid_0_7, entry.delivery_mode); } } From d35fb3121a36170bba951c529847a630440e4174 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Fri, 6 Dec 2024 16:11:54 +0000 Subject: [PATCH 195/224] x86/mce/amd: Remove shared threshold bank plumbing Legacy AMD systems include an integrated Northbridge that is represented by MCA bank 4. This is the only non-core MCA bank in legacy systems. The Northbridge is physically shared by all the CPUs within an AMD "Node". However, in practice the "shared" MCA bank can only by managed by a single CPU within that AMD Node. This is known as the "Node Base Core" (NBC). For example, only the NBC will be able to read the MCA bank 4 registers; they will be Read-as-Zero for other CPUs. Also, the MCA Thresholding interrupt will only signal the NBC; the other CPUs will not receive it. This is enforced by hardware, and it should not be managed by software. The current AMD Thresholding code attempts to deal with the "shared" MCA bank by micromanaging the bank's sysfs kobjects. However, this does not follow the intended kobject use cases. It is also fragile, and it has caused bugs in the past. Modern AMD systems do not need this shared MCA bank support, and it should not be needed on legacy systems either. 
Remove the shared threshold bank code. Also, move the threshold struct definitions to mce/amd.c, since they are no longer needed in amd_nb.c. Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20241206161210.163701-2-yazen.ghannam@amd.com --- arch/x86/Kconfig | 2 +- arch/x86/include/asm/amd_nb.h | 31 --------- arch/x86/kernel/cpu/mce/amd.c | 127 +++++++--------------------------- 3 files changed, 27 insertions(+), 133 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 9d7bd0ae48c4..e4e27d44dc2b 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1189,7 +1189,7 @@ config X86_MCE_INTEL config X86_MCE_AMD def_bool y prompt "AMD MCE features" - depends on X86_MCE && X86_LOCAL_APIC && AMD_NB + depends on X86_MCE && X86_LOCAL_APIC help Additional support for AMD specific MCE features such as the DRAM Error Threshold. diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h index d0caac26533f..4f586fc699fd 100644 --- a/arch/x86/include/asm/amd_nb.h +++ b/arch/x86/include/asm/amd_nb.h @@ -4,7 +4,6 @@ #include #include -#include struct amd_nb_bus_dev_range { u8 bus; @@ -29,41 +28,11 @@ struct amd_l3_cache { u8 subcaches[4]; }; -struct threshold_block { - unsigned int block; /* Number within bank */ - unsigned int bank; /* MCA bank the block belongs to */ - unsigned int cpu; /* CPU which controls MCA bank */ - u32 address; /* MSR address for the block */ - u16 interrupt_enable; /* Enable/Disable APIC interrupt */ - bool interrupt_capable; /* Bank can generate an interrupt. */ - - u16 threshold_limit; /* - * Value upon which threshold - * interrupt is generated. - */ - - struct kobject kobj; /* sysfs object */ - struct list_head miscj; /* - * List of threshold blocks - * within a bank. - */ -}; - -struct threshold_bank { - struct kobject *kobj; - struct threshold_block *blocks; - - /* initialized to the number of CPUs on the node sharing this bank */ - refcount_t cpus; - unsigned int shared; -}; - struct amd_northbridge { struct pci_dev *root; struct pci_dev *misc; struct pci_dev *link; struct amd_l3_cache l3_cache; - struct threshold_bank *bank4; }; struct amd_northbridge_info { diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index 018874b554cb..1075a90141da 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -4,8 +4,6 @@ * * Written by Jacob Shin - AMD, Inc. * Maintained by: Borislav Petkov - * - * All MC4_MISCi registers are shared between cores on a node. */ #include #include @@ -20,7 +18,6 @@ #include #include -#include #include #include #include @@ -221,6 +218,32 @@ static const struct smca_hwid smca_hwid_mcatypes[] = { #define MAX_MCATYPE_NAME_LEN 30 static char buf_mcatype[MAX_MCATYPE_NAME_LEN]; +struct threshold_block { + /* This block's number within its bank. */ + unsigned int block; + /* MCA bank number that contains this block. */ + unsigned int bank; + /* CPU which controls this block's MCA bank. */ + unsigned int cpu; + /* MCA_MISC MSR address for this block. */ + u32 address; + /* Enable/Disable APIC interrupt. */ + bool interrupt_enable; + /* Bank can generate an interrupt. */ + bool interrupt_capable; + /* Value upon which threshold interrupt is generated. */ + u16 threshold_limit; + /* sysfs object */ + struct kobject kobj; + /* List of threshold blocks within this block's MCA bank. 
*/ + struct list_head miscj; +}; + +struct threshold_bank { + struct kobject *kobj; + struct threshold_block *blocks; +}; + static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks); /* @@ -333,19 +356,6 @@ struct thresh_restart { u16 old_limit; }; -static inline bool is_shared_bank(int bank) -{ - /* - * Scalable MCA provides for only one core to have access to the MSRs of - * a shared bank. - */ - if (mce_flags.smca) - return false; - - /* Bank 4 is for northbridge reporting and is thus shared */ - return (bank == 4); -} - static const char *bank4_names(const struct threshold_block *b) { switch (b->address) { @@ -1198,35 +1208,10 @@ out_free: return err; } -static int __threshold_add_blocks(struct threshold_bank *b) -{ - struct list_head *head = &b->blocks->miscj; - struct threshold_block *pos = NULL; - struct threshold_block *tmp = NULL; - int err = 0; - - err = kobject_add(&b->blocks->kobj, b->kobj, b->blocks->kobj.name); - if (err) - return err; - - list_for_each_entry_safe(pos, tmp, head, miscj) { - - err = kobject_add(&pos->kobj, b->kobj, pos->kobj.name); - if (err) { - list_for_each_entry_safe_reverse(pos, tmp, head, miscj) - kobject_del(&pos->kobj); - - return err; - } - } - return err; -} - static int threshold_create_bank(struct threshold_bank **bp, unsigned int cpu, unsigned int bank) { struct device *dev = this_cpu_read(mce_device); - struct amd_northbridge *nb = NULL; struct threshold_bank *b = NULL; const char *name = get_name(cpu, bank, NULL); int err = 0; @@ -1234,26 +1219,6 @@ static int threshold_create_bank(struct threshold_bank **bp, unsigned int cpu, if (!dev) return -ENODEV; - if (is_shared_bank(bank)) { - nb = node_to_amd_nb(topology_amd_node_id(cpu)); - - /* threshold descriptor already initialized on this node? */ - if (nb && nb->bank4) { - /* yes, use it */ - b = nb->bank4; - err = kobject_add(b->kobj, &dev->kobj, name); - if (err) - goto out; - - bp[bank] = b; - refcount_inc(&b->cpus); - - err = __threshold_add_blocks(b); - - goto out; - } - } - b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL); if (!b) { err = -ENOMEM; @@ -1267,17 +1232,6 @@ static int threshold_create_bank(struct threshold_bank **bp, unsigned int cpu, goto out_free; } - if (is_shared_bank(bank)) { - b->shared = 1; - refcount_set(&b->cpus, 1); - - /* nb is already initialized, see above */ - if (nb) { - WARN_ON(nb->bank4); - nb->bank4 = b; - } - } - err = allocate_threshold_blocks(cpu, b, bank, 0, mca_msr_reg(bank, MCA_MISC)); if (err) goto out_kobj; @@ -1310,40 +1264,11 @@ static void deallocate_threshold_blocks(struct threshold_bank *bank) kobject_put(&bank->blocks->kobj); } -static void __threshold_remove_blocks(struct threshold_bank *b) -{ - struct threshold_block *pos = NULL; - struct threshold_block *tmp = NULL; - - kobject_put(b->kobj); - - list_for_each_entry_safe(pos, tmp, &b->blocks->miscj, miscj) - kobject_put(b->kobj); -} - static void threshold_remove_bank(struct threshold_bank *bank) { - struct amd_northbridge *nb; - if (!bank->blocks) goto out_free; - if (!bank->shared) - goto out_dealloc; - - if (!refcount_dec_and_test(&bank->cpus)) { - __threshold_remove_blocks(bank); - return; - } else { - /* - * The last CPU on this node using the shared bank is going - * away, remove that bank now. 
- */ - nb = node_to_amd_nb(topology_amd_node_id(smp_processor_id())); - nb->bank4 = NULL; - } - -out_dealloc: deallocate_threshold_blocks(bank); out_free: From 8234177d2027e52126e40472fe5807f4e94b19a3 Mon Sep 17 00:00:00 2001 From: Nikunj A Dadhania Date: Mon, 6 Jan 2025 18:16:21 +0530 Subject: [PATCH 196/224] virt: sev-guest: Remove is_vmpck_empty() helper Remove is_vmpck_empty() which uses a local array allocation to check if the VMPCK is empty and replace it with memchr_inv() to directly determine if the VMPCK is empty without additional memory allocation. [ bp: Massage commit message. ] Suggested-by: Borislav Petkov Signed-off-by: Nikunj A Dadhania Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20250106124633.1418972-2-nikunj@amd.com --- drivers/virt/coco/sev-guest/sev-guest.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/drivers/virt/coco/sev-guest/sev-guest.c b/drivers/virt/coco/sev-guest/sev-guest.c index b699771be029..62328d0b2cb6 100644 --- a/drivers/virt/coco/sev-guest/sev-guest.c +++ b/drivers/virt/coco/sev-guest/sev-guest.c @@ -63,16 +63,6 @@ MODULE_PARM_DESC(vmpck_id, "The VMPCK ID to use when communicating with the PSP. /* Mutex to serialize the shared buffer access and command handling. */ static DEFINE_MUTEX(snp_cmd_mutex); -static bool is_vmpck_empty(struct snp_msg_desc *mdesc) -{ - char zero_key[VMPCK_KEY_LEN] = {0}; - - if (mdesc->vmpck) - return !memcmp(mdesc->vmpck, zero_key, VMPCK_KEY_LEN); - - return true; -} - /* * If an error is received from the host or AMD Secure Processor (ASP) there * are two options. Either retry the exact same encrypted request or discontinue @@ -335,7 +325,7 @@ static int snp_send_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_r guard(mutex)(&snp_cmd_mutex); /* Check if the VMPCK is not empty */ - if (is_vmpck_empty(mdesc)) { + if (!mdesc->vmpck || !memchr_inv(mdesc->vmpck, 0, VMPCK_KEY_LEN)) { pr_err_ratelimited("VMPCK is disabled\n"); return -ENOTTY; } @@ -1024,7 +1014,7 @@ static int __init sev_guest_probe(struct platform_device *pdev) } /* Verify that VMPCK is not zero. */ - if (is_vmpck_empty(mdesc)) { + if (!memchr_inv(mdesc->vmpck, 0, VMPCK_KEY_LEN)) { dev_err(dev, "Empty VMPCK%d communication key\n", vmpck_id); goto e_unmap; } From 864884a0c29cc610a859b5210158112fd8675fe1 Mon Sep 17 00:00:00 2001 From: Nikunj A Dadhania Date: Mon, 6 Jan 2025 18:16:22 +0530 Subject: [PATCH 197/224] virt: sev-guest: Replace GFP_KERNEL_ACCOUNT with GFP_KERNEL Replace GFP_KERNEL_ACCOUNT with GFP_KERNEL in the sev-guest driver code. GFP_KERNEL_ACCOUNT is typically used for accounting untrusted userspace allocations. After auditing the sev-guest code, the following changes are necessary: * snp_init_crypto(): Use GFP_KERNEL as this is a trusted device probe path. Retain GFP_KERNEL_ACCOUNT in the following cases for robustness and specific path requirements: * alloc_shared_pages(): Although all allocations are limited, retain GFP_KERNEL_ACCOUNT for future robustness. * get_report() and get_ext_report(): These functions are on the unlocked ioctl path and should continue using GFP_KERNEL_ACCOUNT. 
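The rule of thumb, sketched (illustrative only; the variable names below are
hypothetical, not from this patch):

    /* ioctl path: userspace can trigger these allocations at will,
     * so charge them to the caller's memory cgroup: */
    resp = kzalloc(resp_len, GFP_KERNEL_ACCOUNT);

    /* probe path: a single, fixed-size, kernel-initiated allocation
     * needs no memcg accounting: */
    ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);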
Suggested-by: Borislav Petkov Signed-off-by: Nikunj A Dadhania Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20250106124633.1418972-3-nikunj@amd.com --- drivers/virt/coco/sev-guest/sev-guest.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/virt/coco/sev-guest/sev-guest.c b/drivers/virt/coco/sev-guest/sev-guest.c index 62328d0b2cb6..250ce92d816b 100644 --- a/drivers/virt/coco/sev-guest/sev-guest.c +++ b/drivers/virt/coco/sev-guest/sev-guest.c @@ -141,7 +141,7 @@ static struct aesgcm_ctx *snp_init_crypto(u8 *key, size_t keylen) { struct aesgcm_ctx *ctx; - ctx = kzalloc(sizeof(*ctx), GFP_KERNEL_ACCOUNT); + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) return NULL; From c5529418d05079384af4dbbb6f6156344c2ffce2 Mon Sep 17 00:00:00 2001 From: Nikunj A Dadhania Date: Mon, 6 Jan 2025 18:16:23 +0530 Subject: [PATCH 198/224] x86/sev: Carve out and export SNP guest messaging init routines Currently, the sev-guest driver is the only user of SNP guest messaging. All routines for initializing SNP guest messaging are implemented within the sev-guest driver and are not available during early boot. In preparation for adding Secure TSC guest support, carve out APIs to allocate and initialize the guest messaging descriptor context and make it part of coco/sev/core.c. As there is no user of sev_guest_platform_data anymore, remove the structure. Signed-off-by: Nikunj A Dadhania Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Tom Lendacky Link: https://lore.kernel.org/r/20250106124633.1418972-4-nikunj@amd.com --- arch/x86/Kconfig | 1 + arch/x86/coco/sev/core.c | 183 ++++++++++++++++++++++- arch/x86/include/asm/sev.h | 13 +- drivers/virt/coco/sev-guest/Kconfig | 1 - drivers/virt/coco/sev-guest/sev-guest.c | 185 +++--------------------- 5 files changed, 208 insertions(+), 175 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 9d7bd0ae48c4..0f7e3acf37e3 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1559,6 +1559,7 @@ config AMD_MEM_ENCRYPT select ARCH_HAS_CC_PLATFORM select X86_MEM_ENCRYPT select UNACCEPTED_MEMORY + select CRYPTO_LIB_AESGCM help Say yes to enable support for the encryption of system memory. 
This requires an AMD processor that supports Secure Memory diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c index c5b0148b8c0a..30ce563bf38a 100644 --- a/arch/x86/coco/sev/core.c +++ b/arch/x86/coco/sev/core.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -2580,15 +2581,9 @@ static struct platform_device sev_guest_device = { static int __init snp_init_platform_device(void) { - struct sev_guest_platform_data data; - if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) return -ENODEV; - data.secrets_gpa = secrets_pa; - if (platform_device_add_data(&sev_guest_device, &data, sizeof(data))) - return -ENODEV; - if (platform_device_register(&sev_guest_device)) return -ENODEV; @@ -2667,3 +2662,179 @@ static int __init sev_sysfs_init(void) } arch_initcall(sev_sysfs_init); #endif // CONFIG_SYSFS + +static void free_shared_pages(void *buf, size_t sz) +{ + unsigned int npages = PAGE_ALIGN(sz) >> PAGE_SHIFT; + int ret; + + if (!buf) + return; + + ret = set_memory_encrypted((unsigned long)buf, npages); + if (ret) { + WARN_ONCE(ret, "failed to restore encryption mask (leak it)\n"); + return; + } + + __free_pages(virt_to_page(buf), get_order(sz)); +} + +static void *alloc_shared_pages(size_t sz) +{ + unsigned int npages = PAGE_ALIGN(sz) >> PAGE_SHIFT; + struct page *page; + int ret; + + page = alloc_pages(GFP_KERNEL_ACCOUNT, get_order(sz)); + if (!page) + return NULL; + + ret = set_memory_decrypted((unsigned long)page_address(page), npages); + if (ret) { + pr_err("failed to mark page shared, ret=%d\n", ret); + __free_pages(page, get_order(sz)); + return NULL; + } + + return page_address(page); +} + +static u8 *get_vmpck(int id, struct snp_secrets_page *secrets, u32 **seqno) +{ + u8 *key = NULL; + + switch (id) { + case 0: + *seqno = &secrets->os_area.msg_seqno_0; + key = secrets->vmpck0; + break; + case 1: + *seqno = &secrets->os_area.msg_seqno_1; + key = secrets->vmpck1; + break; + case 2: + *seqno = &secrets->os_area.msg_seqno_2; + key = secrets->vmpck2; + break; + case 3: + *seqno = &secrets->os_area.msg_seqno_3; + key = secrets->vmpck3; + break; + default: + break; + } + + return key; +} + +static struct aesgcm_ctx *snp_init_crypto(u8 *key, size_t keylen) +{ + struct aesgcm_ctx *ctx; + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return NULL; + + if (aesgcm_expandkey(ctx, key, keylen, AUTHTAG_LEN)) { + pr_err("Crypto context initialization failed\n"); + kfree(ctx); + return NULL; + } + + return ctx; +} + +int snp_msg_init(struct snp_msg_desc *mdesc, int vmpck_id) +{ + /* Adjust the default VMPCK key based on the executing VMPL level */ + if (vmpck_id == -1) + vmpck_id = snp_vmpl; + + mdesc->vmpck = get_vmpck(vmpck_id, mdesc->secrets, &mdesc->os_area_msg_seqno); + if (!mdesc->vmpck) { + pr_err("Invalid VMPCK%d communication key\n", vmpck_id); + return -EINVAL; + } + + /* Verify that VMPCK is not zero. 
*/ + if (!memchr_inv(mdesc->vmpck, 0, VMPCK_KEY_LEN)) { + pr_err("Empty VMPCK%d communication key\n", vmpck_id); + return -EINVAL; + } + + mdesc->vmpck_id = vmpck_id; + + mdesc->ctx = snp_init_crypto(mdesc->vmpck, VMPCK_KEY_LEN); + if (!mdesc->ctx) + return -ENOMEM; + + return 0; +} +EXPORT_SYMBOL_GPL(snp_msg_init); + +struct snp_msg_desc *snp_msg_alloc(void) +{ + struct snp_msg_desc *mdesc; + void __iomem *mem; + + BUILD_BUG_ON(sizeof(struct snp_guest_msg) > PAGE_SIZE); + + mdesc = kzalloc(sizeof(struct snp_msg_desc), GFP_KERNEL); + if (!mdesc) + return ERR_PTR(-ENOMEM); + + mem = ioremap_encrypted(secrets_pa, PAGE_SIZE); + if (!mem) + goto e_free_mdesc; + + mdesc->secrets = (__force struct snp_secrets_page *)mem; + + /* Allocate the shared page used for the request and response message. */ + mdesc->request = alloc_shared_pages(sizeof(struct snp_guest_msg)); + if (!mdesc->request) + goto e_unmap; + + mdesc->response = alloc_shared_pages(sizeof(struct snp_guest_msg)); + if (!mdesc->response) + goto e_free_request; + + mdesc->certs_data = alloc_shared_pages(SEV_FW_BLOB_MAX_SIZE); + if (!mdesc->certs_data) + goto e_free_response; + + /* Initialize the input addresses for guest request */ + mdesc->input.req_gpa = __pa(mdesc->request); + mdesc->input.resp_gpa = __pa(mdesc->response); + mdesc->input.data_gpa = __pa(mdesc->certs_data); + + return mdesc; + +e_free_response: + free_shared_pages(mdesc->response, sizeof(struct snp_guest_msg)); +e_free_request: + free_shared_pages(mdesc->request, sizeof(struct snp_guest_msg)); +e_unmap: + iounmap(mem); +e_free_mdesc: + kfree(mdesc); + + return ERR_PTR(-ENOMEM); +} +EXPORT_SYMBOL_GPL(snp_msg_alloc); + +void snp_msg_free(struct snp_msg_desc *mdesc) +{ + if (!mdesc) + return; + + kfree(mdesc->ctx); + free_shared_pages(mdesc->response, sizeof(struct snp_guest_msg)); + free_shared_pages(mdesc->request, sizeof(struct snp_guest_msg)); + free_shared_pages(mdesc->certs_data, SEV_FW_BLOB_MAX_SIZE); + iounmap((__force void __iomem *)mdesc->secrets); + + memset(mdesc, 0, sizeof(*mdesc)); + kfree(mdesc); +} +EXPORT_SYMBOL_GPL(snp_msg_free); diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index 91f08af31078..db08d0ac90be 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -14,6 +14,7 @@ #include #include #include +#include #define GHCB_PROTOCOL_MIN 1ULL #define GHCB_PROTOCOL_MAX 2ULL @@ -170,10 +171,6 @@ struct snp_guest_msg { u8 payload[PAGE_SIZE - sizeof(struct snp_guest_msg_hdr)]; } __packed; -struct sev_guest_platform_data { - u64 secrets_gpa; -}; - struct snp_guest_req { void *req_buf; size_t req_sz; @@ -253,6 +250,7 @@ struct snp_msg_desc { u32 *os_area_msg_seqno; u8 *vmpck; + int vmpck_id; }; /* @@ -458,6 +456,10 @@ void set_pte_enc_mask(pte_t *kpte, unsigned long pfn, pgprot_t new_prot); void snp_kexec_finish(void); void snp_kexec_begin(void); +int snp_msg_init(struct snp_msg_desc *mdesc, int vmpck_id); +struct snp_msg_desc *snp_msg_alloc(void); +void snp_msg_free(struct snp_msg_desc *mdesc); + #else /* !CONFIG_AMD_MEM_ENCRYPT */ #define snp_vmpl 0 @@ -498,6 +500,9 @@ static inline int prepare_pte_enc(struct pte_enc_desc *d) { return 0; } static inline void set_pte_enc_mask(pte_t *kpte, unsigned long pfn, pgprot_t new_prot) { } static inline void snp_kexec_finish(void) { } static inline void snp_kexec_begin(void) { } +static inline int snp_msg_init(struct snp_msg_desc *mdesc, int vmpck_id) { return -1; } +static inline struct snp_msg_desc *snp_msg_alloc(void) { return NULL; } +static inline void snp_msg_free(struct
snp_msg_desc *mdesc) { } #endif /* CONFIG_AMD_MEM_ENCRYPT */ diff --git a/drivers/virt/coco/sev-guest/Kconfig b/drivers/virt/coco/sev-guest/Kconfig index 0b772bd921d8..a6405ab6c2c3 100644 --- a/drivers/virt/coco/sev-guest/Kconfig +++ b/drivers/virt/coco/sev-guest/Kconfig @@ -2,7 +2,6 @@ config SEV_GUEST tristate "AMD SEV Guest driver" default m depends on AMD_MEM_ENCRYPT - select CRYPTO_LIB_AESGCM select TSM_REPORTS help SEV-SNP firmware provides the guest a mechanism to communicate with diff --git a/drivers/virt/coco/sev-guest/sev-guest.c b/drivers/virt/coco/sev-guest/sev-guest.c index 250ce92d816b..d0f7233b1430 100644 --- a/drivers/virt/coco/sev-guest/sev-guest.c +++ b/drivers/virt/coco/sev-guest/sev-guest.c @@ -83,7 +83,7 @@ static DEFINE_MUTEX(snp_cmd_mutex); static void snp_disable_vmpck(struct snp_msg_desc *mdesc) { pr_alert("Disabling VMPCK%d communication key to prevent IV reuse.\n", - vmpck_id); + mdesc->vmpck_id); memzero_explicit(mdesc->vmpck, VMPCK_KEY_LEN); mdesc->vmpck = NULL; } @@ -137,23 +137,6 @@ static inline struct snp_guest_dev *to_snp_dev(struct file *file) return container_of(dev, struct snp_guest_dev, misc); } -static struct aesgcm_ctx *snp_init_crypto(u8 *key, size_t keylen) -{ - struct aesgcm_ctx *ctx; - - ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); - if (!ctx) - return NULL; - - if (aesgcm_expandkey(ctx, key, keylen, AUTHTAG_LEN)) { - pr_err("Crypto context initialization failed\n"); - kfree(ctx); - return NULL; - } - - return ctx; -} - static int verify_and_dec_payload(struct snp_msg_desc *mdesc, struct snp_guest_req *req) { struct snp_guest_msg *resp_msg = &mdesc->secret_response; @@ -404,7 +387,7 @@ static int get_report(struct snp_guest_dev *snp_dev, struct snp_guest_request_io req.msg_version = arg->msg_version; req.msg_type = SNP_MSG_REPORT_REQ; - req.vmpck_id = vmpck_id; + req.vmpck_id = mdesc->vmpck_id; req.req_buf = report_req; req.req_sz = sizeof(*report_req); req.resp_buf = report_resp->data; @@ -451,7 +434,7 @@ static int get_derived_key(struct snp_guest_dev *snp_dev, struct snp_guest_reque req.msg_version = arg->msg_version; req.msg_type = SNP_MSG_KEY_REQ; - req.vmpck_id = vmpck_id; + req.vmpck_id = mdesc->vmpck_id; req.req_buf = derived_key_req; req.req_sz = sizeof(*derived_key_req); req.resp_buf = buf; @@ -529,7 +512,7 @@ cmd: req.msg_version = arg->msg_version; req.msg_type = SNP_MSG_REPORT_REQ; - req.vmpck_id = vmpck_id; + req.vmpck_id = mdesc->vmpck_id; req.req_buf = &report_req->data; req.req_sz = sizeof(report_req->data); req.resp_buf = report_resp->data; @@ -606,76 +589,11 @@ static long snp_guest_ioctl(struct file *file, unsigned int ioctl, unsigned long return ret; } -static void free_shared_pages(void *buf, size_t sz) -{ - unsigned int npages = PAGE_ALIGN(sz) >> PAGE_SHIFT; - int ret; - - if (!buf) - return; - - ret = set_memory_encrypted((unsigned long)buf, npages); - if (ret) { - WARN_ONCE(ret, "failed to restore encryption mask (leak it)\n"); - return; - } - - __free_pages(virt_to_page(buf), get_order(sz)); -} - -static void *alloc_shared_pages(struct device *dev, size_t sz) -{ - unsigned int npages = PAGE_ALIGN(sz) >> PAGE_SHIFT; - struct page *page; - int ret; - - page = alloc_pages(GFP_KERNEL_ACCOUNT, get_order(sz)); - if (!page) - return NULL; - - ret = set_memory_decrypted((unsigned long)page_address(page), npages); - if (ret) { - dev_err(dev, "failed to mark page shared, ret=%d\n", ret); - __free_pages(page, get_order(sz)); - return NULL; - } - - return page_address(page); -} - static const struct file_operations snp_guest_fops = { 
.owner = THIS_MODULE, .unlocked_ioctl = snp_guest_ioctl, }; -static u8 *get_vmpck(int id, struct snp_secrets_page *secrets, u32 **seqno) -{ - u8 *key = NULL; - - switch (id) { - case 0: - *seqno = &secrets->os_area.msg_seqno_0; - key = secrets->vmpck0; - break; - case 1: - *seqno = &secrets->os_area.msg_seqno_1; - key = secrets->vmpck1; - break; - case 2: - *seqno = &secrets->os_area.msg_seqno_2; - key = secrets->vmpck2; - break; - case 3: - *seqno = &secrets->os_area.msg_seqno_3; - key = secrets->vmpck3; - break; - default: - break; - } - - return key; -} - struct snp_msg_report_resp_hdr { u32 status; u32 report_size; @@ -969,13 +887,10 @@ static void unregister_sev_tsm(void *data) static int __init sev_guest_probe(struct platform_device *pdev) { - struct sev_guest_platform_data *data; - struct snp_secrets_page *secrets; struct device *dev = &pdev->dev; struct snp_guest_dev *snp_dev; struct snp_msg_desc *mdesc; struct miscdevice *misc; - void __iomem *mapping; int ret; BUILD_BUG_ON(sizeof(struct snp_guest_msg) > PAGE_SIZE); @@ -983,115 +898,57 @@ static int __init sev_guest_probe(struct platform_device *pdev) if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) return -ENODEV; - if (!dev->platform_data) - return -ENODEV; - - data = (struct sev_guest_platform_data *)dev->platform_data; - mapping = ioremap_encrypted(data->secrets_gpa, PAGE_SIZE); - if (!mapping) - return -ENODEV; - - secrets = (__force void *)mapping; - - ret = -ENOMEM; snp_dev = devm_kzalloc(&pdev->dev, sizeof(struct snp_guest_dev), GFP_KERNEL); if (!snp_dev) - goto e_unmap; + return -ENOMEM; - mdesc = devm_kzalloc(&pdev->dev, sizeof(struct snp_msg_desc), GFP_KERNEL); - if (!mdesc) - goto e_unmap; + mdesc = snp_msg_alloc(); + if (IS_ERR_OR_NULL(mdesc)) + return -ENOMEM; - /* Adjust the default VMPCK key based on the executing VMPL level */ - if (vmpck_id == -1) - vmpck_id = snp_vmpl; - - ret = -EINVAL; - mdesc->vmpck = get_vmpck(vmpck_id, secrets, &mdesc->os_area_msg_seqno); - if (!mdesc->vmpck) { - dev_err(dev, "Invalid VMPCK%d communication key\n", vmpck_id); - goto e_unmap; - } - - /* Verify that VMPCK is not zero. */ - if (!memchr_inv(mdesc->vmpck, 0, VMPCK_KEY_LEN)) { - dev_err(dev, "Empty VMPCK%d communication key\n", vmpck_id); - goto e_unmap; - } + ret = snp_msg_init(mdesc, vmpck_id); + if (ret) + goto e_msg_init; platform_set_drvdata(pdev, snp_dev); snp_dev->dev = dev; - mdesc->secrets = secrets; - - /* Allocate the shared page used for the request and response message. 
*/ - mdesc->request = alloc_shared_pages(dev, sizeof(struct snp_guest_msg)); - if (!mdesc->request) - goto e_unmap; - - mdesc->response = alloc_shared_pages(dev, sizeof(struct snp_guest_msg)); - if (!mdesc->response) - goto e_free_request; - - mdesc->certs_data = alloc_shared_pages(dev, SEV_FW_BLOB_MAX_SIZE); - if (!mdesc->certs_data) - goto e_free_response; - - ret = -EIO; - mdesc->ctx = snp_init_crypto(mdesc->vmpck, VMPCK_KEY_LEN); - if (!mdesc->ctx) - goto e_free_cert_data; misc = &snp_dev->misc; misc->minor = MISC_DYNAMIC_MINOR; misc->name = DEVICE_NAME; misc->fops = &snp_guest_fops; - /* Initialize the input addresses for guest request */ - mdesc->input.req_gpa = __pa(mdesc->request); - mdesc->input.resp_gpa = __pa(mdesc->response); - mdesc->input.data_gpa = __pa(mdesc->certs_data); - /* Set the privlevel_floor attribute based on the vmpck_id */ - sev_tsm_ops.privlevel_floor = vmpck_id; + sev_tsm_ops.privlevel_floor = mdesc->vmpck_id; ret = tsm_register(&sev_tsm_ops, snp_dev); if (ret) - goto e_free_cert_data; + goto e_msg_init; ret = devm_add_action_or_reset(&pdev->dev, unregister_sev_tsm, NULL); if (ret) - goto e_free_cert_data; + goto e_msg_init; ret = misc_register(misc); if (ret) - goto e_free_ctx; + goto e_msg_init; snp_dev->msg_desc = mdesc; - dev_info(dev, "Initialized SEV guest driver (using VMPCK%d communication key)\n", vmpck_id); + dev_info(dev, "Initialized SEV guest driver (using VMPCK%d communication key)\n", + mdesc->vmpck_id); return 0; -e_free_ctx: - kfree(mdesc->ctx); -e_free_cert_data: - free_shared_pages(mdesc->certs_data, SEV_FW_BLOB_MAX_SIZE); -e_free_response: - free_shared_pages(mdesc->response, sizeof(struct snp_guest_msg)); -e_free_request: - free_shared_pages(mdesc->request, sizeof(struct snp_guest_msg)); -e_unmap: - iounmap(mapping); +e_msg_init: + snp_msg_free(mdesc); + return ret; } static void __exit sev_guest_remove(struct platform_device *pdev) { struct snp_guest_dev *snp_dev = platform_get_drvdata(pdev); - struct snp_msg_desc *mdesc = snp_dev->msg_desc; - free_shared_pages(mdesc->certs_data, SEV_FW_BLOB_MAX_SIZE); - free_shared_pages(mdesc->response, sizeof(struct snp_guest_msg)); - free_shared_pages(mdesc->request, sizeof(struct snp_guest_msg)); - kfree(mdesc->ctx); + snp_msg_free(snp_dev->msg_desc); misc_deregister(&snp_dev->misc); } From 1e0b23b5d2d18b2bd2c66d8214072d700a8c350d Mon Sep 17 00:00:00 2001 From: Nikunj A Dadhania Date: Mon, 6 Jan 2025 18:16:24 +0530 Subject: [PATCH 199/224] x86/sev: Relocate SNP guest messaging routines to common code At present, the SEV guest driver exclusively handles SNP guest messaging. All routines for sending guest messages are embedded within it. To support Secure TSC, SEV-SNP guests must communicate with the AMD Security Processor during early boot. However, these guest messaging functions are not accessible during early boot since they are currently part of the guest driver. Hence, relocate the core SNP guest messaging functions to SEV common code and provide an API for sending SNP guest messages. No functional change, but just an export symbol added for snp_send_guest_request() and dropped the export symbol on snp_issue_guest_request() and made it static. 
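With the export in place, an in-kernel user can drive the messaging layer end to end. The following is a hedged sketch built only from the interfaces introduced in this series (snp_msg_alloc(), snp_msg_init(), snp_send_guest_request(), snp_msg_free()); the function name and the request/response buffers are placeholders for whatever message the caller actually sends:

/* Illustrative caller of the relocated messaging API; error paths trimmed. */
static int example_send_report(void *report_req, size_t req_len,
			       void *report_resp, size_t resp_len)
{
	struct snp_guest_request_ioctl rio = {};
	struct snp_guest_req req = {};
	struct snp_msg_desc *mdesc;
	int rc;

	mdesc = snp_msg_alloc();		/* secrets page + shared buffers */
	if (IS_ERR_OR_NULL(mdesc))
		return -ENOMEM;

	rc = snp_msg_init(mdesc, snp_vmpl);	/* pick and validate the VMPCK */
	if (rc)
		goto out;

	req.msg_version	= MSG_HDR_VER;
	req.msg_type	= SNP_MSG_REPORT_REQ;
	req.vmpck_id	= mdesc->vmpck_id;
	req.req_buf	= report_req;
	req.req_sz	= req_len;
	req.resp_buf	= report_resp;
	req.resp_sz	= resp_len;
	req.exit_code	= SVM_VMGEXIT_GUEST_REQUEST;

	rc = snp_send_guest_request(mdesc, &req, &rio);
out:
	snp_msg_free(mdesc);
	return rc;
}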
Signed-off-by: Nikunj A Dadhania Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Tom Lendacky Link: https://lore.kernel.org/r/20250106124633.1418972-5-nikunj@amd.com --- arch/x86/coco/sev/core.c | 294 +++++++++++++++++++++++- arch/x86/include/asm/sev.h | 14 +- drivers/virt/coco/sev-guest/sev-guest.c | 292 ----------------------- 3 files changed, 298 insertions(+), 302 deletions(-) diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c index 30ce563bf38a..ad3a28845817 100644 --- a/arch/x86/coco/sev/core.c +++ b/arch/x86/coco/sev/core.c @@ -2509,8 +2509,8 @@ int snp_issue_svsm_attest_req(u64 call_id, struct svsm_call *call, } EXPORT_SYMBOL_GPL(snp_issue_svsm_attest_req); -int snp_issue_guest_request(struct snp_guest_req *req, struct snp_req_data *input, - struct snp_guest_request_ioctl *rio) +static int snp_issue_guest_request(struct snp_guest_req *req, struct snp_req_data *input, + struct snp_guest_request_ioctl *rio) { struct ghcb_state state; struct es_em_ctxt ctxt; @@ -2572,7 +2572,6 @@ e_restore_irq: return ret; } -EXPORT_SYMBOL_GPL(snp_issue_guest_request); static struct platform_device sev_guest_device = { .name = "sev-guest", @@ -2838,3 +2837,292 @@ void snp_msg_free(struct snp_msg_desc *mdesc) kfree(mdesc); } EXPORT_SYMBOL_GPL(snp_msg_free); + +/* Mutex to serialize the shared buffer access and command handling. */ +static DEFINE_MUTEX(snp_cmd_mutex); + +/* + * If an error is received from the host or AMD Secure Processor (ASP) there + * are two options. Either retry the exact same encrypted request or discontinue + * using the VMPCK. + * + * This is because in the current encryption scheme GHCB v2 uses AES-GCM to + * encrypt the requests. The IV for this scheme is the sequence number. GCM + * cannot tolerate IV reuse. + * + * The ASP FW v1.51 only increments the sequence numbers on a successful + * guest<->ASP back and forth and only accepts messages at its exact sequence + * number. + * + * So if the sequence number were to be reused the encryption scheme is + * vulnerable. If the sequence number were incremented for a fresh IV the ASP + * will reject the request. + */ +static void snp_disable_vmpck(struct snp_msg_desc *mdesc) +{ + pr_alert("Disabling VMPCK%d communication key to prevent IV reuse.\n", + mdesc->vmpck_id); + memzero_explicit(mdesc->vmpck, VMPCK_KEY_LEN); + mdesc->vmpck = NULL; +} + +static inline u64 __snp_get_msg_seqno(struct snp_msg_desc *mdesc) +{ + u64 count; + + lockdep_assert_held(&snp_cmd_mutex); + + /* Read the current message sequence counter from secrets pages */ + count = *mdesc->os_area_msg_seqno; + + return count + 1; +} + +/* Return a non-zero on success */ +static u64 snp_get_msg_seqno(struct snp_msg_desc *mdesc) +{ + u64 count = __snp_get_msg_seqno(mdesc); + + /* + * The message sequence counter for the SNP guest request is a 64-bit + * value but the version 2 of GHCB specification defines a 32-bit storage + * for it. If the counter exceeds the 32-bit value then return zero. + * The caller should check the return value, but if the caller happens to + * not check the value and use it, then the firmware treats zero as an + * invalid number and will fail the message request. + */ + if (count >= UINT_MAX) { + pr_err("request message sequence counter overflow\n"); + return 0; + } + + return count; +} + +static void snp_inc_msg_seqno(struct snp_msg_desc *mdesc) +{ + /* + * The counter is also incremented by the PSP, so increment it by 2 + * and save in secrets page. 
+ */ + *mdesc->os_area_msg_seqno += 2; +} + +static int verify_and_dec_payload(struct snp_msg_desc *mdesc, struct snp_guest_req *req) +{ + struct snp_guest_msg *resp_msg = &mdesc->secret_response; + struct snp_guest_msg *req_msg = &mdesc->secret_request; + struct snp_guest_msg_hdr *req_msg_hdr = &req_msg->hdr; + struct snp_guest_msg_hdr *resp_msg_hdr = &resp_msg->hdr; + struct aesgcm_ctx *ctx = mdesc->ctx; + u8 iv[GCM_AES_IV_SIZE] = {}; + + pr_debug("response [seqno %lld type %d version %d sz %d]\n", + resp_msg_hdr->msg_seqno, resp_msg_hdr->msg_type, resp_msg_hdr->msg_version, + resp_msg_hdr->msg_sz); + + /* Copy response from shared memory to encrypted memory. */ + memcpy(resp_msg, mdesc->response, sizeof(*resp_msg)); + + /* Verify that the sequence counter is incremented by 1 */ + if (unlikely(resp_msg_hdr->msg_seqno != (req_msg_hdr->msg_seqno + 1))) + return -EBADMSG; + + /* Verify response message type and version number. */ + if (resp_msg_hdr->msg_type != (req_msg_hdr->msg_type + 1) || + resp_msg_hdr->msg_version != req_msg_hdr->msg_version) + return -EBADMSG; + + /* + * If the message size is greater than our buffer length then return + * an error. + */ + if (unlikely((resp_msg_hdr->msg_sz + ctx->authsize) > req->resp_sz)) + return -EBADMSG; + + /* Decrypt the payload */ + memcpy(iv, &resp_msg_hdr->msg_seqno, min(sizeof(iv), sizeof(resp_msg_hdr->msg_seqno))); + if (!aesgcm_decrypt(ctx, req->resp_buf, resp_msg->payload, resp_msg_hdr->msg_sz, + &resp_msg_hdr->algo, AAD_LEN, iv, resp_msg_hdr->authtag)) + return -EBADMSG; + + return 0; +} + +static int enc_payload(struct snp_msg_desc *mdesc, u64 seqno, struct snp_guest_req *req) +{ + struct snp_guest_msg *msg = &mdesc->secret_request; + struct snp_guest_msg_hdr *hdr = &msg->hdr; + struct aesgcm_ctx *ctx = mdesc->ctx; + u8 iv[GCM_AES_IV_SIZE] = {}; + + memset(msg, 0, sizeof(*msg)); + + hdr->algo = SNP_AEAD_AES_256_GCM; + hdr->hdr_version = MSG_HDR_VER; + hdr->hdr_sz = sizeof(*hdr); + hdr->msg_type = req->msg_type; + hdr->msg_version = req->msg_version; + hdr->msg_seqno = seqno; + hdr->msg_vmpck = req->vmpck_id; + hdr->msg_sz = req->req_sz; + + /* Verify the sequence number is non-zero */ + if (!hdr->msg_seqno) + return -ENOSR; + + pr_debug("request [seqno %lld type %d version %d sz %d]\n", + hdr->msg_seqno, hdr->msg_type, hdr->msg_version, hdr->msg_sz); + + if (WARN_ON((req->req_sz + ctx->authsize) > sizeof(msg->payload))) + return -EBADMSG; + + memcpy(iv, &hdr->msg_seqno, min(sizeof(iv), sizeof(hdr->msg_seqno))); + aesgcm_encrypt(ctx, msg->payload, req->req_buf, req->req_sz, &hdr->algo, + AAD_LEN, iv, hdr->authtag); + + return 0; +} + +static int __handle_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_req *req, + struct snp_guest_request_ioctl *rio) +{ + unsigned long req_start = jiffies; + unsigned int override_npages = 0; + u64 override_err = 0; + int rc; + +retry_request: + /* + * Call firmware to process the request. In this function the encrypted + * message enters shared memory with the host. So after this call the + * sequence number must be incremented or the VMPCK must be deleted to + * prevent reuse of the IV. + */ + rc = snp_issue_guest_request(req, &mdesc->input, rio); + switch (rc) { + case -ENOSPC: + /* + * If the extended guest request fails due to having too + * small of a certificate data buffer, retry the same + * guest request without the extended data request in + * order to increment the sequence number and thus avoid + * IV reuse. 
+ */ + override_npages = mdesc->input.data_npages; + req->exit_code = SVM_VMGEXIT_GUEST_REQUEST; + + /* + * Override the error to inform callers the given extended + * request buffer size was too small and give the caller the + * required buffer size. + */ + override_err = SNP_GUEST_VMM_ERR(SNP_GUEST_VMM_ERR_INVALID_LEN); + + /* + * If this call to the firmware succeeds, the sequence number can + * be incremented allowing for continued use of the VMPCK. If + * there is an error reflected in the return value, this value + * is checked further down and the result will be the deletion + * of the VMPCK and the error code being propagated back to the + * user as an ioctl() return code. + */ + goto retry_request; + + /* + * The host may return SNP_GUEST_VMM_ERR_BUSY if the request has been + * throttled. Retry in the driver to avoid returning and reusing the + * message sequence number on a different message. + */ + case -EAGAIN: + if (jiffies - req_start > SNP_REQ_MAX_RETRY_DURATION) { + rc = -ETIMEDOUT; + break; + } + schedule_timeout_killable(SNP_REQ_RETRY_DELAY); + goto retry_request; + } + + /* + * Increment the message sequence number. There is no harm in doing + * this now because decryption uses the value stored in the response + * structure and any failure will wipe the VMPCK, preventing further + * use anyway. + */ + snp_inc_msg_seqno(mdesc); + + if (override_err) { + rio->exitinfo2 = override_err; + + /* + * If an extended guest request was issued and the supplied certificate + * buffer was not large enough, a standard guest request was issued to + * prevent IV reuse. If the standard request was successful, return -EIO + * back to the caller as would have originally been returned. + */ + if (!rc && override_err == SNP_GUEST_VMM_ERR(SNP_GUEST_VMM_ERR_INVALID_LEN)) + rc = -EIO; + } + + if (override_npages) + mdesc->input.data_npages = override_npages; + + return rc; +} + +int snp_send_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_req *req, + struct snp_guest_request_ioctl *rio) +{ + u64 seqno; + int rc; + + guard(mutex)(&snp_cmd_mutex); + + /* Check if the VMPCK is not empty */ + if (!mdesc->vmpck || !memchr_inv(mdesc->vmpck, 0, VMPCK_KEY_LEN)) { + pr_err_ratelimited("VMPCK is disabled\n"); + return -ENOTTY; + } + + /* Get message sequence and verify that its a non-zero */ + seqno = snp_get_msg_seqno(mdesc); + if (!seqno) + return -EIO; + + /* Clear shared memory's response for the host to populate. */ + memset(mdesc->response, 0, sizeof(struct snp_guest_msg)); + + /* Encrypt the userspace provided payload in mdesc->secret_request. */ + rc = enc_payload(mdesc, seqno, req); + if (rc) + return rc; + + /* + * Write the fully encrypted request to the shared unencrypted + * request page. + */ + memcpy(mdesc->request, &mdesc->secret_request, sizeof(mdesc->secret_request)); + + rc = __handle_guest_request(mdesc, req, rio); + if (rc) { + if (rc == -EIO && + rio->exitinfo2 == SNP_GUEST_VMM_ERR(SNP_GUEST_VMM_ERR_INVALID_LEN)) + return rc; + + pr_alert("Detected error from ASP request. rc: %d, exitinfo2: 0x%llx\n", + rc, rio->exitinfo2); + + snp_disable_vmpck(mdesc); + return rc; + } + + rc = verify_and_dec_payload(mdesc, req); + if (rc) { + pr_alert("Detected unexpected decode failure from ASP. 
rc: %d\n", rc); + snp_disable_vmpck(mdesc); + return rc; + } + + return 0; +} +EXPORT_SYMBOL_GPL(snp_send_guest_request); diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index db08d0ac90be..0937ac7a96db 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -125,6 +125,9 @@ struct snp_req_data { #define AAD_LEN 48 #define MSG_HDR_VER 1 +#define SNP_REQ_MAX_RETRY_DURATION (60*HZ) +#define SNP_REQ_RETRY_DELAY (2*HZ) + /* See SNP spec SNP_GUEST_REQUEST section for the structure */ enum msg_type { SNP_MSG_TYPE_INVALID = 0, @@ -443,8 +446,6 @@ void snp_set_wakeup_secondary_cpu(void); bool snp_init(struct boot_params *bp); void __noreturn snp_abort(void); void snp_dmi_setup(void); -int snp_issue_guest_request(struct snp_guest_req *req, struct snp_req_data *input, - struct snp_guest_request_ioctl *rio); int snp_issue_svsm_attest_req(u64 call_id, struct svsm_call *call, struct svsm_attest_call *input); void snp_accept_memory(phys_addr_t start, phys_addr_t end); u64 snp_get_unsupported_features(u64 status); @@ -459,6 +460,8 @@ void snp_kexec_begin(void); int snp_msg_init(struct snp_msg_desc *mdesc, int vmpck_id); struct snp_msg_desc *snp_msg_alloc(void); void snp_msg_free(struct snp_msg_desc *mdesc); +int snp_send_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_req *req, + struct snp_guest_request_ioctl *rio); #else /* !CONFIG_AMD_MEM_ENCRYPT */ @@ -482,11 +485,6 @@ static inline void snp_set_wakeup_secondary_cpu(void) { } static inline bool snp_init(struct boot_params *bp) { return false; } static inline void snp_abort(void) { } static inline void snp_dmi_setup(void) { } -static inline int snp_issue_guest_request(struct snp_guest_req *req, struct snp_req_data *input, - struct snp_guest_request_ioctl *rio) -{ - return -ENOTTY; -} static inline int snp_issue_svsm_attest_req(u64 call_id, struct svsm_call *call, struct svsm_attest_call *input) { return -ENOTTY; @@ -503,6 +501,8 @@ static inline void snp_kexec_begin(void) { } static inline int snp_msg_init(struct snp_msg_desc *mdesc, int vmpck_id) { return -1; } static inline struct snp_msg_desc *snp_msg_alloc(void) { return NULL; } static inline void snp_msg_free(struct snp_msg_desc *mdesc) { } +static inline int snp_send_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_req *req, + struct snp_guest_request_ioctl *rio) { return -ENODEV; } #endif /* CONFIG_AMD_MEM_ENCRYPT */ diff --git a/drivers/virt/coco/sev-guest/sev-guest.c b/drivers/virt/coco/sev-guest/sev-guest.c index d0f7233b1430..264b6523fe52 100644 --- a/drivers/virt/coco/sev-guest/sev-guest.c +++ b/drivers/virt/coco/sev-guest/sev-guest.c @@ -31,9 +31,6 @@ #define DEVICE_NAME "sev-guest" -#define SNP_REQ_MAX_RETRY_DURATION (60*HZ) -#define SNP_REQ_RETRY_DELAY (2*HZ) - #define SVSM_MAX_RETRIES 3 struct snp_guest_dev { @@ -60,76 +57,6 @@ static int vmpck_id = -1; module_param(vmpck_id, int, 0444); MODULE_PARM_DESC(vmpck_id, "The VMPCK ID to use when communicating with the PSP."); -/* Mutex to serialize the shared buffer access and command handling. */ -static DEFINE_MUTEX(snp_cmd_mutex); - -/* - * If an error is received from the host or AMD Secure Processor (ASP) there - * are two options. Either retry the exact same encrypted request or discontinue - * using the VMPCK. - * - * This is because in the current encryption scheme GHCB v2 uses AES-GCM to - * encrypt the requests. The IV for this scheme is the sequence number. GCM - * cannot tolerate IV reuse. 
- * - * The ASP FW v1.51 only increments the sequence numbers on a successful - * guest<->ASP back and forth and only accepts messages at its exact sequence - * number. - * - * So if the sequence number were to be reused the encryption scheme is - * vulnerable. If the sequence number were incremented for a fresh IV the ASP - * will reject the request. - */ -static void snp_disable_vmpck(struct snp_msg_desc *mdesc) -{ - pr_alert("Disabling VMPCK%d communication key to prevent IV reuse.\n", - mdesc->vmpck_id); - memzero_explicit(mdesc->vmpck, VMPCK_KEY_LEN); - mdesc->vmpck = NULL; -} - -static inline u64 __snp_get_msg_seqno(struct snp_msg_desc *mdesc) -{ - u64 count; - - lockdep_assert_held(&snp_cmd_mutex); - - /* Read the current message sequence counter from secrets pages */ - count = *mdesc->os_area_msg_seqno; - - return count + 1; -} - -/* Return a non-zero on success */ -static u64 snp_get_msg_seqno(struct snp_msg_desc *mdesc) -{ - u64 count = __snp_get_msg_seqno(mdesc); - - /* - * The message sequence counter for the SNP guest request is a 64-bit - * value but the version 2 of GHCB specification defines a 32-bit storage - * for it. If the counter exceeds the 32-bit value then return zero. - * The caller should check the return value, but if the caller happens to - * not check the value and use it, then the firmware treats zero as an - * invalid number and will fail the message request. - */ - if (count >= UINT_MAX) { - pr_err("request message sequence counter overflow\n"); - return 0; - } - - return count; -} - -static void snp_inc_msg_seqno(struct snp_msg_desc *mdesc) -{ - /* - * The counter is also incremented by the PSP, so increment it by 2 - * and save in secrets page. - */ - *mdesc->os_area_msg_seqno += 2; -} - static inline struct snp_guest_dev *to_snp_dev(struct file *file) { struct miscdevice *dev = file->private_data; @@ -137,225 +64,6 @@ static inline struct snp_guest_dev *to_snp_dev(struct file *file) return container_of(dev, struct snp_guest_dev, misc); } -static int verify_and_dec_payload(struct snp_msg_desc *mdesc, struct snp_guest_req *req) -{ - struct snp_guest_msg *resp_msg = &mdesc->secret_response; - struct snp_guest_msg *req_msg = &mdesc->secret_request; - struct snp_guest_msg_hdr *req_msg_hdr = &req_msg->hdr; - struct snp_guest_msg_hdr *resp_msg_hdr = &resp_msg->hdr; - struct aesgcm_ctx *ctx = mdesc->ctx; - u8 iv[GCM_AES_IV_SIZE] = {}; - - pr_debug("response [seqno %lld type %d version %d sz %d]\n", - resp_msg_hdr->msg_seqno, resp_msg_hdr->msg_type, resp_msg_hdr->msg_version, - resp_msg_hdr->msg_sz); - - /* Copy response from shared memory to encrypted memory. */ - memcpy(resp_msg, mdesc->response, sizeof(*resp_msg)); - - /* Verify that the sequence counter is incremented by 1 */ - if (unlikely(resp_msg_hdr->msg_seqno != (req_msg_hdr->msg_seqno + 1))) - return -EBADMSG; - - /* Verify response message type and version number. */ - if (resp_msg_hdr->msg_type != (req_msg_hdr->msg_type + 1) || - resp_msg_hdr->msg_version != req_msg_hdr->msg_version) - return -EBADMSG; - - /* - * If the message size is greater than our buffer length then return - * an error. 
- */ - if (unlikely((resp_msg_hdr->msg_sz + ctx->authsize) > req->resp_sz)) - return -EBADMSG; - - /* Decrypt the payload */ - memcpy(iv, &resp_msg_hdr->msg_seqno, min(sizeof(iv), sizeof(resp_msg_hdr->msg_seqno))); - if (!aesgcm_decrypt(ctx, req->resp_buf, resp_msg->payload, resp_msg_hdr->msg_sz, - &resp_msg_hdr->algo, AAD_LEN, iv, resp_msg_hdr->authtag)) - return -EBADMSG; - - return 0; -} - -static int enc_payload(struct snp_msg_desc *mdesc, u64 seqno, struct snp_guest_req *req) -{ - struct snp_guest_msg *msg = &mdesc->secret_request; - struct snp_guest_msg_hdr *hdr = &msg->hdr; - struct aesgcm_ctx *ctx = mdesc->ctx; - u8 iv[GCM_AES_IV_SIZE] = {}; - - memset(msg, 0, sizeof(*msg)); - - hdr->algo = SNP_AEAD_AES_256_GCM; - hdr->hdr_version = MSG_HDR_VER; - hdr->hdr_sz = sizeof(*hdr); - hdr->msg_type = req->msg_type; - hdr->msg_version = req->msg_version; - hdr->msg_seqno = seqno; - hdr->msg_vmpck = req->vmpck_id; - hdr->msg_sz = req->req_sz; - - /* Verify the sequence number is non-zero */ - if (!hdr->msg_seqno) - return -ENOSR; - - pr_debug("request [seqno %lld type %d version %d sz %d]\n", - hdr->msg_seqno, hdr->msg_type, hdr->msg_version, hdr->msg_sz); - - if (WARN_ON((req->req_sz + ctx->authsize) > sizeof(msg->payload))) - return -EBADMSG; - - memcpy(iv, &hdr->msg_seqno, min(sizeof(iv), sizeof(hdr->msg_seqno))); - aesgcm_encrypt(ctx, msg->payload, req->req_buf, req->req_sz, &hdr->algo, - AAD_LEN, iv, hdr->authtag); - - return 0; -} - -static int __handle_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_req *req, - struct snp_guest_request_ioctl *rio) -{ - unsigned long req_start = jiffies; - unsigned int override_npages = 0; - u64 override_err = 0; - int rc; - -retry_request: - /* - * Call firmware to process the request. In this function the encrypted - * message enters shared memory with the host. So after this call the - * sequence number must be incremented or the VMPCK must be deleted to - * prevent reuse of the IV. - */ - rc = snp_issue_guest_request(req, &mdesc->input, rio); - switch (rc) { - case -ENOSPC: - /* - * If the extended guest request fails due to having too - * small of a certificate data buffer, retry the same - * guest request without the extended data request in - * order to increment the sequence number and thus avoid - * IV reuse. - */ - override_npages = mdesc->input.data_npages; - req->exit_code = SVM_VMGEXIT_GUEST_REQUEST; - - /* - * Override the error to inform callers the given extended - * request buffer size was too small and give the caller the - * required buffer size. - */ - override_err = SNP_GUEST_VMM_ERR(SNP_GUEST_VMM_ERR_INVALID_LEN); - - /* - * If this call to the firmware succeeds, the sequence number can - * be incremented allowing for continued use of the VMPCK. If - * there is an error reflected in the return value, this value - * is checked further down and the result will be the deletion - * of the VMPCK and the error code being propagated back to the - * user as an ioctl() return code. - */ - goto retry_request; - - /* - * The host may return SNP_GUEST_VMM_ERR_BUSY if the request has been - * throttled. Retry in the driver to avoid returning and reusing the - * message sequence number on a different message. - */ - case -EAGAIN: - if (jiffies - req_start > SNP_REQ_MAX_RETRY_DURATION) { - rc = -ETIMEDOUT; - break; - } - schedule_timeout_killable(SNP_REQ_RETRY_DELAY); - goto retry_request; - } - - /* - * Increment the message sequence number. 
There is no harm in doing - * this now because decryption uses the value stored in the response - * structure and any failure will wipe the VMPCK, preventing further - * use anyway. - */ - snp_inc_msg_seqno(mdesc); - - if (override_err) { - rio->exitinfo2 = override_err; - - /* - * If an extended guest request was issued and the supplied certificate - * buffer was not large enough, a standard guest request was issued to - * prevent IV reuse. If the standard request was successful, return -EIO - * back to the caller as would have originally been returned. - */ - if (!rc && override_err == SNP_GUEST_VMM_ERR(SNP_GUEST_VMM_ERR_INVALID_LEN)) - rc = -EIO; - } - - if (override_npages) - mdesc->input.data_npages = override_npages; - - return rc; -} - -static int snp_send_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_req *req, - struct snp_guest_request_ioctl *rio) -{ - u64 seqno; - int rc; - - guard(mutex)(&snp_cmd_mutex); - - /* Check if the VMPCK is not empty */ - if (!mdesc->vmpck || !memchr_inv(mdesc->vmpck, 0, VMPCK_KEY_LEN)) { - pr_err_ratelimited("VMPCK is disabled\n"); - return -ENOTTY; - } - - /* Get message sequence and verify that its a non-zero */ - seqno = snp_get_msg_seqno(mdesc); - if (!seqno) - return -EIO; - - /* Clear shared memory's response for the host to populate. */ - memset(mdesc->response, 0, sizeof(struct snp_guest_msg)); - - /* Encrypt the userspace provided payload in mdesc->secret_request. */ - rc = enc_payload(mdesc, seqno, req); - if (rc) - return rc; - - /* - * Write the fully encrypted request to the shared unencrypted - * request page. - */ - memcpy(mdesc->request, &mdesc->secret_request, - sizeof(mdesc->secret_request)); - - rc = __handle_guest_request(mdesc, req, rio); - if (rc) { - if (rc == -EIO && - rio->exitinfo2 == SNP_GUEST_VMM_ERR(SNP_GUEST_VMM_ERR_INVALID_LEN)) - return rc; - - pr_alert("Detected error from ASP request. rc: %d, exitinfo2: 0x%llx\n", - rc, rio->exitinfo2); - - snp_disable_vmpck(mdesc); - return rc; - } - - rc = verify_and_dec_payload(mdesc, req); - if (rc) { - pr_alert("Detected unexpected decode failure from ASP. rc: %d\n", rc); - snp_disable_vmpck(mdesc); - return rc; - } - - return 0; -} - struct snp_req_resp { sockptr_t req_data; sockptr_t resp_data; From 893930143440eb5e3ea8f69cb51ab2e61e15c4e1 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 6 Jan 2025 16:57:46 +0100 Subject: [PATCH 200/224] x86/sev: Don't hang but terminate on failure to remap SVSM CA Commit 09d35045cd0f ("x86/sev: Avoid WARN()s and panic()s in early boot code") replaced a panic() that could potentially hit before the kernel is even mapped with a deadloop, to ensure that execution does not proceed when the condition in question hits. As Tom suggests, it is better to terminate and return to the hypervisor in this case, using a newly invented failure code to describe the failure condition. 
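For context, terminating here means asking the hypervisor to end the guest via the GHCB MSR protocol. The existing helper has roughly the following shape (a paraphrase of the SEV support code, not a verbatim copy):

/* Rough shape of the termination request helper. */
static void sev_es_terminate(unsigned int set, unsigned int reason)
{
	u64 val = GHCB_MSR_TERM_REQ;

	/* Encode the reason set and reason code for the hypervisor. */
	val |= GHCB_SEV_TERM_REASON(set, reason);

	sev_es_wr_ghcb_msr(val);
	VMGEXIT();

	/* The hypervisor should not return; halt forever if it does. */
	while (true)
		asm volatile("hlt\n" : : : "memory");
}

The new GHCB_TERM_SVSM_CA_REMAP_FAIL reason code thus reaches the hypervisor, which is considerably easier to diagnose than a guest spinning silently in cpu_relax().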
Suggested-by: Tom Lendacky Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Tom Lendacky Link: https://lore.kernel.org/all/9ce88603-20ca-e644-2d8a-aeeaf79cde69@amd.com --- arch/x86/coco/sev/core.c | 4 ++-- arch/x86/include/asm/sev-common.h | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c index 499b41953e3c..86898547056e 100644 --- a/arch/x86/coco/sev/core.c +++ b/arch/x86/coco/sev/core.c @@ -2356,8 +2356,8 @@ static __head void svsm_setup(struct cc_blob_sev_info *cc_info) call.rax = SVSM_CORE_CALL(SVSM_CORE_REMAP_CA); call.rcx = pa; ret = svsm_perform_call_protocol(&call); - while (ret) - cpu_relax(); /* too early to panic */ + if (ret) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SVSM_CA_REMAP_FAIL); RIP_REL_REF(boot_svsm_caa) = (struct svsm_ca *)pa; RIP_REL_REF(boot_svsm_caa_pa) = pa; diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h index 50f5666938c0..577b64dda8b4 100644 --- a/arch/x86/include/asm/sev-common.h +++ b/arch/x86/include/asm/sev-common.h @@ -206,6 +206,7 @@ struct snp_psc_desc { #define GHCB_TERM_NO_SVSM 7 /* SVSM is not advertised in the secrets page */ #define GHCB_TERM_SVSM_VMPL0 8 /* SVSM is present but has set VMPL to 0 */ #define GHCB_TERM_SVSM_CAA 9 /* SVSM is present but CAA is not page aligned */ +#define GHCB_TERM_SVSM_CA_REMAP_FAIL 10 /* SVSM is present but CA could not be remapped */ #define GHCB_RESP_CODE(v) ((v) & GHCB_MSR_INFO_MASK) From 85b60ca9ad2c94661acf86a0c11278246cc5ea86 Mon Sep 17 00:00:00 2001 From: Nikunj A Dadhania Date: Mon, 6 Jan 2025 18:16:25 +0530 Subject: [PATCH 201/224] x86/sev: Add Secure TSC support for SNP guests Add support for Secure TSC in SNP-enabled guests. Secure TSC allows guests to securely use RDTSC/RDTSCP instructions, ensuring that the parameters used cannot be altered by the hypervisor once the guest is launched. Secure TSC-enabled guests need to query TSC information from the AMD Security Processor. This communication channel is encrypted between the AMD Security Processor and the guest, with the hypervisor acting merely as a conduit to deliver the guest messages to the AMD Security Processor. Each message is protected with AEAD (AES-256 GCM). [ bp: Zap a stray newline over amd_cc_platform_has() while at it, simplify CC_ATTR_GUEST_SNP_SECURE_TSC check ] Signed-off-by: Nikunj A Dadhania Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20250106124633.1418972-6-nikunj@amd.com --- arch/x86/coco/core.c | 4 +- arch/x86/coco/sev/core.c | 107 ++++++++++++++++++++++++++++++ arch/x86/include/asm/sev-common.h | 1 + arch/x86/include/asm/sev.h | 21 ++++++ arch/x86/include/asm/svm.h | 6 +- arch/x86/mm/mem_encrypt.c | 2 + include/linux/cc_platform.h | 8 +++ 7 files changed, 146 insertions(+), 3 deletions(-) diff --git a/arch/x86/coco/core.c b/arch/x86/coco/core.c index 0f81f70aca82..9a0ddda3aa69 100644 --- a/arch/x86/coco/core.c +++ b/arch/x86/coco/core.c @@ -65,7 +65,6 @@ static __maybe_unused __always_inline bool amd_cc_platform_vtom(enum cc_attr att * up under SME the trampoline area cannot be encrypted, whereas under SEV * the trampoline area must be encrypted. 
*/ - static bool noinstr amd_cc_platform_has(enum cc_attr attr) { #ifdef CONFIG_AMD_MEM_ENCRYPT @@ -97,6 +96,9 @@ static bool noinstr amd_cc_platform_has(enum cc_attr attr) case CC_ATTR_GUEST_SEV_SNP: return sev_status & MSR_AMD64_SEV_SNP_ENABLED; + case CC_ATTR_GUEST_SNP_SECURE_TSC: + return sev_status & MSR_AMD64_SNP_SECURE_TSC; + case CC_ATTR_HOST_SEV_SNP: return cc_flags.host_sev_snp; diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c index ad3a28845817..7458805b8f0e 100644 --- a/arch/x86/coco/sev/core.c +++ b/arch/x86/coco/sev/core.c @@ -96,6 +96,14 @@ static u64 sev_hv_features __ro_after_init; /* Secrets page physical address from the CC blob */ static u64 secrets_pa __ro_after_init; +/* + * For Secure TSC guests, the BSP fetches TSC_INFO using SNP guest messaging and + * initializes snp_tsc_scale and snp_tsc_offset. These values are replicated + * across the APs VMSA fields (TSC_SCALE and TSC_OFFSET). + */ +static u64 snp_tsc_scale __ro_after_init; +static u64 snp_tsc_offset __ro_after_init; + /* #VC handler runtime per-CPU data */ struct sev_es_runtime_data { struct ghcb ghcb_page; @@ -1277,6 +1285,12 @@ static int wakeup_cpu_via_vmgexit(u32 apic_id, unsigned long start_ip) vmsa->vmpl = snp_vmpl; vmsa->sev_features = sev_status >> 2; + /* Populate AP's TSC scale/offset to get accurate TSC values. */ + if (cc_platform_has(CC_ATTR_GUEST_SNP_SECURE_TSC)) { + vmsa->tsc_scale = snp_tsc_scale; + vmsa->tsc_offset = snp_tsc_offset; + } + /* Switch the page over to a VMSA page now that it is initialized */ ret = snp_set_vmsa(vmsa, caa, apic_id, true); if (ret) { @@ -3126,3 +3140,96 @@ int snp_send_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_req *req return 0; } EXPORT_SYMBOL_GPL(snp_send_guest_request); + +static int __init snp_get_tsc_info(void) +{ + struct snp_guest_request_ioctl *rio; + struct snp_tsc_info_resp *tsc_resp; + struct snp_tsc_info_req *tsc_req; + struct snp_msg_desc *mdesc; + struct snp_guest_req *req; + int rc = -ENOMEM; + + tsc_req = kzalloc(sizeof(*tsc_req), GFP_KERNEL); + if (!tsc_req) + return rc; + + /* + * The intermediate response buffer is used while decrypting the + * response payload. Make sure that it has enough space to cover + * the authtag. + */ + tsc_resp = kzalloc(sizeof(*tsc_resp) + AUTHTAG_LEN, GFP_KERNEL); + if (!tsc_resp) + goto e_free_tsc_req; + + req = kzalloc(sizeof(*req), GFP_KERNEL); + if (!req) + goto e_free_tsc_resp; + + rio = kzalloc(sizeof(*rio), GFP_KERNEL); + if (!rio) + goto e_free_req; + + mdesc = snp_msg_alloc(); + if (IS_ERR_OR_NULL(mdesc)) + goto e_free_rio; + + rc = snp_msg_init(mdesc, snp_vmpl); + if (rc) + goto e_free_mdesc; + + req->msg_version = MSG_HDR_VER; + req->msg_type = SNP_MSG_TSC_INFO_REQ; + req->vmpck_id = snp_vmpl; + req->req_buf = tsc_req; + req->req_sz = sizeof(*tsc_req); + req->resp_buf = (void *)tsc_resp; + req->resp_sz = sizeof(*tsc_resp) + AUTHTAG_LEN; + req->exit_code = SVM_VMGEXIT_GUEST_REQUEST; + + rc = snp_send_guest_request(mdesc, req, rio); + if (rc) + goto e_request; + + pr_debug("%s: response status 0x%x scale 0x%llx offset 0x%llx factor 0x%x\n", + __func__, tsc_resp->status, tsc_resp->tsc_scale, tsc_resp->tsc_offset, + tsc_resp->tsc_factor); + + if (!tsc_resp->status) { + snp_tsc_scale = tsc_resp->tsc_scale; + snp_tsc_offset = tsc_resp->tsc_offset; + } else { + pr_err("Failed to get TSC info, response status 0x%x\n", tsc_resp->status); + rc = -EIO; + } + +e_request: + /* The response buffer contains sensitive data, explicitly clear it. 
*/ + memzero_explicit(tsc_resp, sizeof(*tsc_resp) + AUTHTAG_LEN); +e_free_mdesc: + snp_msg_free(mdesc); +e_free_rio: + kfree(rio); +e_free_req: + kfree(req); + e_free_tsc_resp: + kfree(tsc_resp); +e_free_tsc_req: + kfree(tsc_req); + + return rc; +} + +void __init snp_secure_tsc_prepare(void) +{ + if (!cc_platform_has(CC_ATTR_GUEST_SNP_SECURE_TSC)) + return; + + if (snp_get_tsc_info()) { + pr_alert("Unable to retrieve Secure TSC info from ASP\n"); + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SECURE_TSC); + } + + pr_debug("SecureTSC enabled"); +} diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h index 50f5666938c0..6ef92432a5ce 100644 --- a/arch/x86/include/asm/sev-common.h +++ b/arch/x86/include/asm/sev-common.h @@ -206,6 +206,7 @@ struct snp_psc_desc { #define GHCB_TERM_NO_SVSM 7 /* SVSM is not advertised in the secrets page */ #define GHCB_TERM_SVSM_VMPL0 8 /* SVSM is present but has set VMPL to 0 */ #define GHCB_TERM_SVSM_CAA 9 /* SVSM is present but CAA is not page aligned */ +#define GHCB_TERM_SECURE_TSC 10 /* Secure TSC initialization failed */ #define GHCB_RESP_CODE(v) ((v) & GHCB_MSR_INFO_MASK) diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index 0937ac7a96db..bdcdaac4df1c 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -146,6 +146,9 @@ enum msg_type { SNP_MSG_VMRK_REQ, SNP_MSG_VMRK_RSP, + SNP_MSG_TSC_INFO_REQ = 17, + SNP_MSG_TSC_INFO_RSP, + SNP_MSG_TYPE_MAX }; @@ -174,6 +177,21 @@ struct snp_guest_msg { u8 payload[PAGE_SIZE - sizeof(struct snp_guest_msg_hdr)]; } __packed; +#define SNP_TSC_INFO_REQ_SZ 128 + +struct snp_tsc_info_req { + u8 rsvd[SNP_TSC_INFO_REQ_SZ]; +} __packed; + +struct snp_tsc_info_resp { + u32 status; + u32 rsvd1; + u64 tsc_scale; + u64 tsc_offset; + u32 tsc_factor; + u8 rsvd2[100]; +} __packed; + struct snp_guest_req { void *req_buf; size_t req_sz; @@ -463,6 +481,8 @@ void snp_msg_free(struct snp_msg_desc *mdesc); int snp_send_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_req *req, struct snp_guest_request_ioctl *rio); +void __init snp_secure_tsc_prepare(void); + #else /* !CONFIG_AMD_MEM_ENCRYPT */ #define snp_vmpl 0 @@ -503,6 +523,7 @@ static inline struct snp_msg_desc *snp_msg_alloc(void) { return NULL; } static inline void snp_msg_free(struct snp_msg_desc *mdesc) { } static inline int snp_send_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_req *req, struct snp_guest_request_ioctl *rio) { return -ENODEV; } +static inline void __init snp_secure_tsc_prepare(void) { } #endif /* CONFIG_AMD_MEM_ENCRYPT */ diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 2b59b9951c90..92e18798f197 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -417,7 +417,9 @@ struct sev_es_save_area { u8 reserved_0x298[80]; u32 pkru; u32 tsc_aux; - u8 reserved_0x2f0[24]; + u64 tsc_scale; + u64 tsc_offset; + u8 reserved_0x300[8]; u64 rcx; u64 rdx; u64 rbx; @@ -564,7 +566,7 @@ static inline void __unused_size_checks(void) BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x1c0); BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x248); BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x298); - BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x2f0); + BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x300); BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x320); BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x380); BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x3f0); diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index 0a120d85d7bb..95bae74fdab2 
100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -94,6 +94,8 @@ void __init mem_encrypt_init(void) /* Call into SWIOTLB to update the SWIOTLB DMA buffers */ swiotlb_update_mem_attributes(); + snp_secure_tsc_prepare(); + print_mem_encrypt_feature_info(); } diff --git a/include/linux/cc_platform.h b/include/linux/cc_platform.h index caa4b4430634..0bf7d33a1048 100644 --- a/include/linux/cc_platform.h +++ b/include/linux/cc_platform.h @@ -81,6 +81,14 @@ enum cc_attr { */ CC_ATTR_GUEST_SEV_SNP, + /** + * @CC_ATTR_GUEST_SNP_SECURE_TSC: SNP Secure TSC is active. + * + * The platform/OS is running as a guest/virtual machine and actively + * using AMD SEV-SNP Secure TSC feature. + */ + CC_ATTR_GUEST_SNP_SECURE_TSC, + /** * @CC_ATTR_HOST_SEV_SNP: AMD SNP enabled on the host. * From 0f0502b8865c0a4c402e73aeb0fb406acc19d0d2 Mon Sep 17 00:00:00 2001 From: Nikunj A Dadhania Date: Mon, 6 Jan 2025 18:16:26 +0530 Subject: [PATCH 202/224] x86/sev: Change TSC MSR behavior for Secure TSC enabled guests Secure TSC enabled guests should not write to the MSR_IA32_TSC (0x10) register as the subsequent TSC value reads are undefined. On AMD, MSR_IA32_TSC is intercepted by the hypervisor by default. MSR_IA32_TSC read/write accesses should not exit to the hypervisor for such guests. Accesses to MSR_IA32_TSC need special handling in the #VC handler for the guests with Secure TSC enabled. Writes to MSR_IA32_TSC should be ignored and flagged once with a warning, and reads of MSR_IA32_TSC should return the result of the RDTSC instruction. [ bp: Massage commit message. ] Suggested-by: Borislav Petkov (AMD) Signed-off-by: Nikunj A Dadhania Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Tom Lendacky Link: https://lore.kernel.org/r/20250106124633.1418972-7-nikunj@amd.com --- arch/x86/coco/sev/core.c | 39 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c index 7458805b8f0e..cd5b9b723755 100644 --- a/arch/x86/coco/sev/core.c +++ b/arch/x86/coco/sev/core.c @@ -1433,6 +1433,34 @@ static enum es_result __vc_handle_msr_caa(struct pt_regs *regs, bool write) return ES_OK; } +/* + * TSC related accesses should not exit to the hypervisor when a guest is + * executing with Secure TSC enabled, so special handling is required for + * accesses of MSR_IA32_TSC. + */ +static enum es_result __vc_handle_secure_tsc_msrs(struct pt_regs *regs, bool write) +{ + u64 tsc; + + /* + * Writes: Writing to MSR_IA32_TSC can cause subsequent reads of the TSC + * to return undefined values, so ignore all writes. + * + * Reads: Reads of MSR_IA32_TSC should return the current TSC value, use + * the value returned by rdtsc_ordered(). + */ + if (write) { + WARN_ONCE(1, "TSC MSR writes are verboten!\n"); + return ES_OK; + } + + tsc = rdtsc_ordered(); + regs->ax = lower_32_bits(tsc); + regs->dx = upper_32_bits(tsc); + + return ES_OK; +} + static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt) { struct pt_regs *regs = ctxt->regs; @@ -1442,8 +1470,17 @@ static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt) /* Is it a WRMSR? 
*/ write = ctxt->insn.opcode.bytes[1] == 0x30; - if (regs->cx == MSR_SVSM_CAA) + switch (regs->cx) { + case MSR_SVSM_CAA: return __vc_handle_msr_caa(regs, write); + case MSR_IA32_TSC: + if (sev_status & MSR_AMD64_SNP_SECURE_TSC) + return __vc_handle_secure_tsc_msrs(regs, write); + else + break; + default: + break; + } ghcb_set_rcx(ghcb, regs->cx); if (write) { From 38cc6495cdec18a448b9e1de45fedce4118833a2 Mon Sep 17 00:00:00 2001 From: Nikunj A Dadhania Date: Mon, 6 Jan 2025 18:16:27 +0530 Subject: [PATCH 203/224] x86/sev: Prevent GUEST_TSC_FREQ MSR interception for Secure TSC enabled guests The hypervisor should not be intercepting the GUEST_TSC_FREQ MSR (0xc0010134) when Secure TSC is enabled. A #VC exception will be generated otherwise. If this should occur and Secure TSC is enabled, terminate guest execution. Signed-off-by: Nikunj A Dadhania Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Tom Lendacky Link: https://lore.kernel.org/r/20250106124633.1418972-8-nikunj@amd.com --- arch/x86/coco/sev/core.c | 10 +++++++++- arch/x86/include/asm/msr-index.h | 1 + 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c index cd5b9b723755..106bdeda58c5 100644 --- a/arch/x86/coco/sev/core.c +++ b/arch/x86/coco/sev/core.c @@ -1436,12 +1436,19 @@ static enum es_result __vc_handle_msr_caa(struct pt_regs *regs, bool write) /* * TSC related accesses should not exit to the hypervisor when a guest is * executing with Secure TSC enabled, so special handling is required for - * accesses of MSR_IA32_TSC. + * accesses of MSR_IA32_TSC and MSR_AMD64_GUEST_TSC_FREQ. */ static enum es_result __vc_handle_secure_tsc_msrs(struct pt_regs *regs, bool write) { u64 tsc; + /* + * GUEST_TSC_FREQ should not be intercepted when Secure TSC is enabled. + * Terminate the SNP guest when the interception is enabled. + */ + if (regs->cx == MSR_AMD64_GUEST_TSC_FREQ) + return ES_VMM_ERROR; + /* * Writes: Writing to MSR_IA32_TSC can cause subsequent reads of the TSC * to return undefined values, so ignore all writes. @@ -1474,6 +1481,7 @@ static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt) case MSR_SVSM_CAA: return __vc_handle_msr_caa(regs, write); case MSR_IA32_TSC: + case MSR_AMD64_GUEST_TSC_FREQ: if (sev_status & MSR_AMD64_SNP_SECURE_TSC) return __vc_handle_secure_tsc_msrs(regs, write); else diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 3f3e2bc99162..9a71880eec07 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -608,6 +608,7 @@ #define MSR_AMD_PERF_CTL 0xc0010062 #define MSR_AMD_PERF_STATUS 0xc0010063 #define MSR_AMD_PSTATE_DEF_BASE 0xc0010064 +#define MSR_AMD64_GUEST_TSC_FREQ 0xc0010134 #define MSR_AMD64_OSVW_ID_LENGTH 0xc0010140 #define MSR_AMD64_OSVW_STATUS 0xc0010141 #define MSR_AMD_PPIN_CTL 0xc00102f0 From eef679a4b52e35be3b4a982a7f42bcc16054ec62 Mon Sep 17 00:00:00 2001 From: Nikunj A Dadhania Date: Mon, 6 Jan 2025 18:16:28 +0530 Subject: [PATCH 204/224] x86/sev: Prevent RDTSC/RDTSCP interception for Secure TSC enabled guests The hypervisor should not be intercepting RDTSC/RDTSCP when Secure TSC is enabled. A #VC exception will be generated if the RDTSC/RDTSCP instructions are being intercepted. If this should occur and Secure TSC is enabled, guest execution should be terminated as the guest cannot rely on the TSC value provided by the hypervisor.
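Taken together with the two preceding patches, the guest-side Secure TSC policy condenses to the sketch below (an illustrative merge of the hunks in this series, with a hypothetical helper name; not verbatim kernel code):

/* Illustrative condensation of the Secure TSC #VC policy, patches 202-204. */
static enum es_result secure_tsc_vc_policy(struct pt_regs *regs, bool write,
					   bool rdtsc_exit)
{
	u64 tsc;

	/* RDTSC/RDTSCP and GUEST_TSC_FREQ must never be intercepted. */
	if (rdtsc_exit || regs->cx == MSR_AMD64_GUEST_TSC_FREQ)
		return ES_VMM_ERROR;

	/* Writes to MSR_IA32_TSC are ignored (and flagged once)... */
	if (write) {
		WARN_ONCE(1, "TSC MSR writes are verboten!\n");
		return ES_OK;
	}

	/* ...and reads are served directly from RDTSC. */
	tsc = rdtsc_ordered();
	regs->ax = lower_32_bits(tsc);
	regs->dx = upper_32_bits(tsc);

	return ES_OK;
}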
Signed-off-by: Nikunj A Dadhania Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Tom Lendacky Tested-by: Peter Gonda Link: https://lore.kernel.org/r/20250106124633.1418972-9-nikunj@amd.com --- arch/x86/coco/sev/shared.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/arch/x86/coco/sev/shared.c b/arch/x86/coco/sev/shared.c index 71de53194089..4386f37bd31d 100644 --- a/arch/x86/coco/sev/shared.c +++ b/arch/x86/coco/sev/shared.c @@ -1140,6 +1140,16 @@ static enum es_result vc_handle_rdtsc(struct ghcb *ghcb, bool rdtscp = (exit_code == SVM_EXIT_RDTSCP); enum es_result ret; + /* + * The hypervisor should not be intercepting RDTSC/RDTSCP when Secure + * TSC is enabled. A #VC exception will be generated if the RDTSC/RDTSCP + * instructions are being intercepted. If this should occur and Secure + * TSC is enabled, guest execution should be terminated as the guest + * cannot rely on the TSC value provided by the hypervisor. + */ + if (sev_status & MSR_AMD64_SNP_SECURE_TSC) + return ES_VMM_ERROR; + ret = sev_es_ghcb_hv_call(ghcb, ctxt, exit_code, 0, 0); if (ret != ES_OK) return ret; From 0a2a98f691f2c57db5bb321e68787cb1de29c7dd Mon Sep 17 00:00:00 2001 From: Nikunj A Dadhania Date: Mon, 6 Jan 2025 18:16:29 +0530 Subject: [PATCH 205/224] x86/sev: Mark the TSC in a secure TSC guest as reliable In SNP guest environment with Secure TSC enabled, unlike other clock sources (such as HPET, ACPI timer, APIC, etc), the RDTSC instruction is handled without causing a VM exit, resulting in minimal overhead and jitters. Even when the host CPU's TSC is tampered with, the Secure TSC enabled guest keeps on ticking forward. Hence, mark Secure TSC as the only reliable clock source, bypassing unstable calibration. [ bp: Massage. ] Signed-off-by: Nikunj A Dadhania Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Tom Lendacky Tested-by: Peter Gonda Link: https://lore.kernel.org/r/20250106124633.1418972-10-nikunj@amd.com --- arch/x86/mm/mem_encrypt_amd.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/mm/mem_encrypt_amd.c b/arch/x86/mm/mem_encrypt_amd.c index 774f9677458f..b56c5c073003 100644 --- a/arch/x86/mm/mem_encrypt_amd.c +++ b/arch/x86/mm/mem_encrypt_amd.c @@ -541,6 +541,9 @@ void __init sme_early_init(void) * kernel mapped. */ snp_update_svsm_ca(); + + if (sev_status & MSR_AMD64_SNP_SECURE_TSC) + setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); } void __init mem_encrypt_free_decrypted_mem(void) From bee9e840609cc67d0a7d82f22a2130fb7a0a766d Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Fri, 6 Dec 2024 16:11:55 +0000 Subject: [PATCH 206/224] x86/amd_nb: Restrict init function to AMD-based systems The code implicitly operates on AMD-based systems by matching on PCI IDs. However, the use of these IDs is going away. Add an explicit CPU vendor check instead of relying on PCI IDs. 
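Note that the guard added below returns 0 rather than an error code, so on other vendors the initcall succeeds as a no-op; there is nothing to set up and nothing to report. The idiom in isolation, with a hypothetical function name and initcall level:

/* Vendor-guard idiom for an x86 initcall; sketch only. */
static __init int my_amd_only_init(void)
{
	if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD &&
	    boot_cpu_data.x86_vendor != X86_VENDOR_HYGON)
		return 0;	/* nothing to do on other vendors */

	/* ... AMD/Hygon-specific setup ... */
	return 0;
}
arch_initcall(my_amd_only_init);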
Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20241206161210.163701-3-yazen.ghannam@amd.com --- arch/x86/kernel/amd_nb.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 9fe9972d2071..37b8244899d8 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -582,6 +582,10 @@ static __init void fix_erratum_688(void) static __init int init_amd_nbs(void) { + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD && + boot_cpu_data.x86_vendor != X86_VENDOR_HYGON) + return 0; + amd_cache_northbridges(); amd_cache_gart(); From e13f51b51814e2527c51998d2dae594ef9cb633a Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Fri, 6 Dec 2024 16:11:56 +0000 Subject: [PATCH 207/224] x86/amd_nb: Clean up early_is_amd_nb() The check for early_is_amd_nb() is only useful for systems with GART or the NB_CFG register. Zen-based systems (both AMD and Hygon) have neither, so return early for them. Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20241206161210.163701-4-yazen.ghannam@amd.com --- arch/x86/kernel/amd_nb.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 37b8244899d8..ee20071ced99 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -385,7 +385,6 @@ static int amd_cache_northbridges(void) */ bool __init early_is_amd_nb(u32 device) { - const struct pci_device_id *misc_ids = amd_nb_misc_ids; const struct pci_device_id *id; u32 vendor = device & 0xffff; @@ -393,11 +392,11 @@ bool __init early_is_amd_nb(u32 device) boot_cpu_data.x86_vendor != X86_VENDOR_HYGON) return false; - if (boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) - misc_ids = hygon_nb_misc_ids; + if (cpu_feature_enabled(X86_FEATURE_ZEN)) + return false; device >>= 16; - for (id = misc_ids; id->vendor; id++) + for (id = amd_nb_misc_ids; id->vendor; id++) if (vendor == id->vendor && device == id->device) return true; return false; From e6e6e5e85116b8587ab2dff7cd6ab3e082859ce7 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Fri, 6 Dec 2024 16:11:57 +0000 Subject: [PATCH 208/224] x86: Start moving AMD node functionality out of AMD_NB The "AMD Node" concept spans many families of systems and applies to a number of subsystems and drivers. Currently, the AMD Northbridge code is overloaded with AMD node functionality. However, the node concept is broader than just northbridges. Start files to host common AMD node functions and definitions. Include a helper to find an AMD node device function based on the convention described in AMD documentation. Anything that needs node functionality should include this rather than amd_nb.h. The AMD_NB code will be reduced to only northbridge-specific code needed for legacy systems. 
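The helper's intended usage looks like this (a hedged sketch of a consumer, not code from the patch; note that amd_node_get_func() returns a referenced pci_dev from pci_get_domain_bus_and_slot(), so callers must drop the reference when done):

	#include <asm/amd_node.h>

	static int __init walk_node_func3(void)
	{
		u16 node;

		for (node = 0; node < MAX_AMD_NUM_NODES; node++) {
			/* Function 3 of each node device at slot 0x18 + node. */
			struct pci_dev *f3 = amd_node_get_func(node, 3);

			if (!f3)
				continue;	/* node not populated */

			pci_info(f3, "AMD node %u function 3 present\n", node);
			pci_dev_put(f3);
		}
		return 0;
	}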
Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20241206161210.163701-5-yazen.ghannam@amd.com --- MAINTAINERS | 7 +++++++ arch/x86/Kconfig | 4 ++++ arch/x86/include/asm/amd_node.h | 27 ++++++++++++++++++++++++++ arch/x86/kernel/Makefile | 1 + arch/x86/kernel/amd_node.c | 34 +++++++++++++++++++++++++++++++++ 5 files changed, 73 insertions(+) create mode 100644 arch/x86/include/asm/amd_node.h create mode 100644 arch/x86/kernel/amd_node.c diff --git a/MAINTAINERS b/MAINTAINERS index 1e930c7a58b1..290989ab9f72 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1121,6 +1121,13 @@ L: linux-i2c@vger.kernel.org S: Supported F: drivers/i2c/busses/i2c-amd-asf-plat.c +AMD NODE DRIVER +M: Yazen Ghannam +L: linux-kernel@vger.kernel.org +S: Supported +F: arch/x86/include/asm/amd_node.h +F: arch/x86/kernel/amd_node.c + AMD PDS CORE DRIVER M: Shannon Nelson M: Brett Creeley diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 9d7bd0ae48c4..01a91b22c05f 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -3128,6 +3128,10 @@ config TS5500 endif # X86_32 config AMD_NB + def_bool y + depends on AMD_NODE + +config AMD_NODE def_bool y depends on CPU_SUP_AMD && PCI diff --git a/arch/x86/include/asm/amd_node.h b/arch/x86/include/asm/amd_node.h new file mode 100644 index 000000000000..622bd3038eeb --- /dev/null +++ b/arch/x86/include/asm/amd_node.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * AMD Node helper functions and common defines + * + * Copyright (c) 2024, Advanced Micro Devices, Inc. + * All Rights Reserved. + * + * Author: Yazen Ghannam + * + * Note: + * Items in this file may only be used in a single place. + * However, it's prudent to keep all AMD Node functionality + * in a unified place rather than spreading throughout the + * kernel. + */ + +#ifndef _ASM_X86_AMD_NODE_H_ +#define _ASM_X86_AMD_NODE_H_ + +#include + +#define MAX_AMD_NUM_NODES 8 +#define AMD_NODE0_PCI_SLOT 0x18 + +struct pci_dev *amd_node_get_func(u16 node, u8 func); + +#endif /*_ASM_X86_AMD_NODE_H_*/ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index f7918980667a..b43eb7e384eb 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -119,6 +119,7 @@ obj-$(CONFIG_EARLY_PRINTK) += early_printk.o obj-$(CONFIG_HPET_TIMER) += hpet.o obj-$(CONFIG_AMD_NB) += amd_nb.o +obj-$(CONFIG_AMD_NODE) += amd_node.o obj-$(CONFIG_DEBUG_NMI_SELFTEST) += nmi_selftest.o obj-$(CONFIG_KVM_GUEST) += kvm.o kvmclock.o diff --git a/arch/x86/kernel/amd_node.c b/arch/x86/kernel/amd_node.c new file mode 100644 index 000000000000..e825cd4426b9 --- /dev/null +++ b/arch/x86/kernel/amd_node.c @@ -0,0 +1,34 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * AMD Node helper functions and common defines + * + * Copyright (c) 2024, Advanced Micro Devices, Inc. + * All Rights Reserved. + * + * Author: Yazen Ghannam + */ + +#include + +/* + * AMD Nodes are a physical collection of I/O devices within an SoC. There can be one + * or more nodes per package. + * + * The nodes are software-visible through PCI config space. All nodes are enumerated + * on segment 0 bus 0. The device (slot) numbers range from 0x18 to 0x1F (maximum 8 + * nodes) with 0x18 corresponding to node 0, 0x19 to node 1, etc. Each node can be a + * multi-function device. + * + * On legacy systems, these node devices represent integrated Northbridge functionality. + * On Zen-based systems, these node devices represent Data Fabric functionality. 
+ * + * See "Configuration Space Accesses" section in BKDGs or + * "Processor x86 Core" -> "Configuration Space" section in PPRs. + */ +struct pci_dev *amd_node_get_func(u16 node, u8 func) +{ + if (node >= MAX_AMD_NUM_NODES) + return NULL; + + return pci_get_domain_bus_and_slot(0, 0, PCI_DEVFN(AMD_NODE0_PCI_SLOT + node, func)); +} From 962f1970a32430ce6c75ea23cbc59d68346481fd Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Fri, 6 Dec 2024 16:11:58 +0000 Subject: [PATCH 209/224] x86/amd_nb: Simplify function 4 search Use the newly added helper function to look up a CPU/Node function to find "function 4" devices. Thus, avoid the need to regularly add new PCI IDs for basic discovery. The unique PCI IDs are still useful in case of quirks or functional changes. And they should be used only in such a manner. Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20241206161210.163701-6-yazen.ghannam@amd.com --- arch/x86/include/asm/amd_nb.h | 2 +- arch/x86/kernel/amd_nb.c | 66 ++--------------------------------- 2 files changed, 4 insertions(+), 64 deletions(-) diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h index d0caac26533f..b48dc6975da2 100644 --- a/arch/x86/include/asm/amd_nb.h +++ b/arch/x86/include/asm/amd_nb.h @@ -4,7 +4,7 @@ #include #include -#include +#include struct amd_nb_bus_dev_range { u8 bus; diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index ee20071ced99..7a62c5af2531 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -30,26 +30,6 @@ #define PCI_DEVICE_ID_AMD_MI200_ROOT 0x14bb #define PCI_DEVICE_ID_AMD_MI300_ROOT 0x14f8 -#define PCI_DEVICE_ID_AMD_17H_DF_F4 0x1464 -#define PCI_DEVICE_ID_AMD_17H_M10H_DF_F4 0x15ec -#define PCI_DEVICE_ID_AMD_17H_M30H_DF_F4 0x1494 -#define PCI_DEVICE_ID_AMD_17H_M60H_DF_F4 0x144c -#define PCI_DEVICE_ID_AMD_17H_M70H_DF_F4 0x1444 -#define PCI_DEVICE_ID_AMD_17H_MA0H_DF_F4 0x1728 -#define PCI_DEVICE_ID_AMD_19H_DF_F4 0x1654 -#define PCI_DEVICE_ID_AMD_19H_M10H_DF_F4 0x14b1 -#define PCI_DEVICE_ID_AMD_19H_M40H_DF_F4 0x167d -#define PCI_DEVICE_ID_AMD_19H_M50H_DF_F4 0x166e -#define PCI_DEVICE_ID_AMD_19H_M60H_DF_F4 0x14e4 -#define PCI_DEVICE_ID_AMD_19H_M70H_DF_F4 0x14f4 -#define PCI_DEVICE_ID_AMD_19H_M78H_DF_F4 0x12fc -#define PCI_DEVICE_ID_AMD_1AH_M00H_DF_F4 0x12c4 -#define PCI_DEVICE_ID_AMD_1AH_M20H_DF_F4 0x16fc -#define PCI_DEVICE_ID_AMD_1AH_M60H_DF_F4 0x124c -#define PCI_DEVICE_ID_AMD_1AH_M70H_DF_F4 0x12bc -#define PCI_DEVICE_ID_AMD_MI200_DF_F4 0x14d4 -#define PCI_DEVICE_ID_AMD_MI300_DF_F4 0x152c - /* Protect the PCI config register pairs used for SMN. 
*/ static DEFINE_MUTEX(smn_mutex); @@ -73,8 +53,6 @@ static const struct pci_device_id amd_root_ids[] = { {} }; -#define PCI_DEVICE_ID_AMD_CNB17H_F4 0x1704 - static const struct pci_device_id amd_nb_misc_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) }, @@ -107,35 +85,6 @@ static const struct pci_device_id amd_nb_misc_ids[] = { {} }; -static const struct pci_device_id amd_nb_link_ids[] = { - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F4) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M30H_NB_F4) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M60H_NB_F4) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F4) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F4) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_DF_F4) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_DF_F4) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M30H_DF_F4) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M60H_DF_F4) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M70H_DF_F4) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_MA0H_DF_F4) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_DF_F4) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M10H_DF_F4) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M40H_DF_F4) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M50H_DF_F4) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M60H_DF_F4) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M70H_DF_F4) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M78H_DF_F4) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F4) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M00H_DF_F4) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M20H_DF_F4) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M60H_DF_F4) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M70H_DF_F4) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI200_DF_F4) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI300_DF_F4) }, - {} -}; - static const struct pci_device_id hygon_root_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_HYGON, PCI_DEVICE_ID_AMD_17H_ROOT) }, {} @@ -146,11 +95,6 @@ static const struct pci_device_id hygon_nb_misc_ids[] = { {} }; -static const struct pci_device_id hygon_nb_link_ids[] = { - { PCI_DEVICE(PCI_VENDOR_ID_HYGON, PCI_DEVICE_ID_AMD_17H_DF_F4) }, - {} -}; - const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[] __initconst = { { 0x00, 0x18, 0x20 }, { 0xff, 0x00, 0x20 }, @@ -275,13 +219,11 @@ int __must_check amd_smn_write(u16 node, u32 address, u32 value) } EXPORT_SYMBOL_GPL(amd_smn_write); - static int amd_cache_northbridges(void) { const struct pci_device_id *misc_ids = amd_nb_misc_ids; - const struct pci_device_id *link_ids = amd_nb_link_ids; const struct pci_device_id *root_ids = amd_root_ids; - struct pci_dev *root, *misc, *link; + struct pci_dev *root, *misc; struct amd_northbridge *nb; u16 roots_per_misc = 0; u16 misc_count = 0; @@ -294,7 +236,6 @@ static int amd_cache_northbridges(void) if (boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) { root_ids = hygon_root_ids; misc_ids = hygon_nb_misc_ids; - link_ids = hygon_nb_link_ids; } misc = NULL; @@ -328,14 +269,13 @@ static int amd_cache_northbridges(void) amd_northbridges.nb = nb; amd_northbridges.num = misc_count; - link = misc = root = NULL; + misc = 
root = NULL; for (i = 0; i < amd_northbridges.num; i++) { node_to_amd_nb(i)->root = root = next_northbridge(root, root_ids); node_to_amd_nb(i)->misc = misc = next_northbridge(misc, misc_ids); - node_to_amd_nb(i)->link = link = - next_northbridge(link, link_ids); + node_to_amd_nb(i)->link = amd_node_get_func(i, 4); /* * If there are more PCI root devices than data fabric/ From 40a5f6ffdfc8f8ed0d8c535dfa3733b31c66a88c Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Fri, 6 Dec 2024 16:11:59 +0000 Subject: [PATCH 210/224] x86/amd_nb: Simplify root device search The "root" device search was introduced to support SMN access for Zen systems. This device represents a PCIe root complex. It is not the same as the "CPU/node" devices found at slots 0x18-0x1F. There may be multiple PCIe root complexes within an AMD node. Such is the case with server or High-end Desktop (HEDT) systems, etc. Therefore it is not enough to assume "root <-> AMD node" is a 1-to-1 association. Currently, this is handled by skipping "extra" root complexes during the search. However, the hardware provides the PCI bus number of an AMD node's root device. Use the hardware info to get the root device's bus and drop the extra search code and PCI IDs. Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20241206161210.163701-7-yazen.ghannam@amd.com --- arch/x86/include/asm/amd_node.h | 1 + arch/x86/kernel/amd_nb.c | 80 ++------------------------------- arch/x86/kernel/amd_node.c | 56 +++++++++++++++++++++++ 3 files changed, 61 insertions(+), 76 deletions(-) diff --git a/arch/x86/include/asm/amd_node.h b/arch/x86/include/asm/amd_node.h index 622bd3038eeb..3f097dd479f8 100644 --- a/arch/x86/include/asm/amd_node.h +++ b/arch/x86/include/asm/amd_node.h @@ -23,5 +23,6 @@ #define AMD_NODE0_PCI_SLOT 0x18 struct pci_dev *amd_node_get_func(u16 node, u8 func); +struct pci_dev *amd_node_get_root(u16 node); #endif /*_ASM_X86_AMD_NODE_H_*/ diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 7a62c5af2531..6218a0428c77 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -15,44 +15,11 @@ #include #include -#define PCI_DEVICE_ID_AMD_17H_ROOT 0x1450 -#define PCI_DEVICE_ID_AMD_17H_M10H_ROOT 0x15d0 -#define PCI_DEVICE_ID_AMD_17H_M30H_ROOT 0x1480 -#define PCI_DEVICE_ID_AMD_17H_M60H_ROOT 0x1630 -#define PCI_DEVICE_ID_AMD_17H_MA0H_ROOT 0x14b5 -#define PCI_DEVICE_ID_AMD_19H_M10H_ROOT 0x14a4 -#define PCI_DEVICE_ID_AMD_19H_M40H_ROOT 0x14b5 -#define PCI_DEVICE_ID_AMD_19H_M60H_ROOT 0x14d8 -#define PCI_DEVICE_ID_AMD_19H_M70H_ROOT 0x14e8 -#define PCI_DEVICE_ID_AMD_1AH_M00H_ROOT 0x153a -#define PCI_DEVICE_ID_AMD_1AH_M20H_ROOT 0x1507 -#define PCI_DEVICE_ID_AMD_1AH_M60H_ROOT 0x1122 -#define PCI_DEVICE_ID_AMD_MI200_ROOT 0x14bb -#define PCI_DEVICE_ID_AMD_MI300_ROOT 0x14f8 - /* Protect the PCI config register pairs used for SMN. 
*/ static DEFINE_MUTEX(smn_mutex); static u32 *flush_words; -static const struct pci_device_id amd_root_ids[] = { - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_ROOT) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_ROOT) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M30H_ROOT) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M60H_ROOT) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_MA0H_ROOT) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M10H_ROOT) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M40H_ROOT) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M60H_ROOT) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M70H_ROOT) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M00H_ROOT) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M20H_ROOT) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M60H_ROOT) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI200_ROOT) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI300_ROOT) }, - {} -}; - static const struct pci_device_id amd_nb_misc_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) }, @@ -85,11 +52,6 @@ static const struct pci_device_id amd_nb_misc_ids[] = { {} }; -static const struct pci_device_id hygon_root_ids[] = { - { PCI_DEVICE(PCI_VENDOR_ID_HYGON, PCI_DEVICE_ID_AMD_17H_ROOT) }, - {} -}; - static const struct pci_device_id hygon_nb_misc_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_HYGON, PCI_DEVICE_ID_AMD_17H_DF_F3) }, {} @@ -222,19 +184,15 @@ EXPORT_SYMBOL_GPL(amd_smn_write); static int amd_cache_northbridges(void) { const struct pci_device_id *misc_ids = amd_nb_misc_ids; - const struct pci_device_id *root_ids = amd_root_ids; - struct pci_dev *root, *misc; + struct pci_dev *misc; struct amd_northbridge *nb; - u16 roots_per_misc = 0; u16 misc_count = 0; - u16 root_count = 0; - u16 i, j; + u16 i; if (amd_northbridges.num) return 0; if (boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) { - root_ids = hygon_root_ids; misc_ids = hygon_nb_misc_ids; } @@ -245,23 +203,6 @@ static int amd_cache_northbridges(void) if (!misc_count) return -ENODEV; - root = NULL; - while ((root = next_northbridge(root, root_ids))) - root_count++; - - if (root_count) { - roots_per_misc = root_count / misc_count; - - /* - * There should be _exactly_ N roots for each DF/SMN - * interface. - */ - if (!roots_per_misc || (root_count % roots_per_misc)) { - pr_info("Unsupported AMD DF/PCI configuration found\n"); - return -ENODEV; - } - } - nb = kcalloc(misc_count, sizeof(struct amd_northbridge), GFP_KERNEL); if (!nb) return -ENOMEM; @@ -269,25 +210,12 @@ static int amd_cache_northbridges(void) amd_northbridges.nb = nb; amd_northbridges.num = misc_count; - misc = root = NULL; + misc = NULL; for (i = 0; i < amd_northbridges.num; i++) { - node_to_amd_nb(i)->root = root = - next_northbridge(root, root_ids); + node_to_amd_nb(i)->root = amd_node_get_root(i); node_to_amd_nb(i)->misc = misc = next_northbridge(misc, misc_ids); node_to_amd_nb(i)->link = amd_node_get_func(i, 4); - - /* - * If there are more PCI root devices than data fabric/ - * system management network interfaces, then the (N) - * PCI roots per DF/SMN interface are functionally the - * same (for DF/SMN access) and N-1 are redundant. N-1 - * PCI roots should be skipped per DF/SMN interface so - * the following DF/SMN interfaces get mapped to - * correct PCI roots. 
- */ - for (j = 1; j < roots_per_misc; j++) - root = next_northbridge(root, root_ids); } if (amd_gart_present()) diff --git a/arch/x86/kernel/amd_node.c b/arch/x86/kernel/amd_node.c index e825cd4426b9..4eea8c7d8090 100644 --- a/arch/x86/kernel/amd_node.c +++ b/arch/x86/kernel/amd_node.c @@ -32,3 +32,59 @@ struct pci_dev *amd_node_get_func(u16 node, u8 func) return pci_get_domain_bus_and_slot(0, 0, PCI_DEVFN(AMD_NODE0_PCI_SLOT + node, func)); } + +#define DF_BLK_INST_CNT 0x040 +#define DF_CFG_ADDR_CNTL_LEGACY 0x084 +#define DF_CFG_ADDR_CNTL_DF4 0xC04 + +#define DF_MAJOR_REVISION GENMASK(27, 24) + +static u16 get_cfg_addr_cntl_offset(struct pci_dev *df_f0) +{ + u32 reg; + + /* + * Revision fields added for DF4 and later. + * + * Major revision of '0' is found pre-DF4. Field is Read-as-Zero. + */ + if (pci_read_config_dword(df_f0, DF_BLK_INST_CNT, ®)) + return 0; + + if (reg & DF_MAJOR_REVISION) + return DF_CFG_ADDR_CNTL_DF4; + + return DF_CFG_ADDR_CNTL_LEGACY; +} + +struct pci_dev *amd_node_get_root(u16 node) +{ + struct pci_dev *root; + u16 cntl_off; + u8 bus; + + if (!cpu_feature_enabled(X86_FEATURE_ZEN)) + return NULL; + + /* + * D18F0xXXX [Config Address Control] (DF::CfgAddressCntl) + * Bits [7:0] (SecBusNum) holds the bus number of the root device for + * this Data Fabric instance. The segment, device, and function will be 0. + */ + struct pci_dev *df_f0 __free(pci_dev_put) = amd_node_get_func(node, 0); + if (!df_f0) + return NULL; + + cntl_off = get_cfg_addr_cntl_offset(df_f0); + if (!cntl_off) + return NULL; + + if (pci_read_config_byte(df_f0, cntl_off, &bus)) + return NULL; + + /* Grab the pointer for the actual root device instance. */ + root = pci_get_domain_bus_and_slot(0, bus, 0); + + pci_dbg(root, "is root for AMD node %u\n", node); + return root; +} From bc7b2e629e0c9251ba96d864a30d34d1497b1b1b Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Tue, 7 Jan 2025 22:28:41 +0000 Subject: [PATCH 211/224] x86/amd_nb: Use topology info to get AMD node count Currently, the total AMD node count is determined by searching and counting CPU/node devices using PCI IDs. However, AMD node information is already available through topology CPUID/MSRs. The recent topology rework has made this info easier to access. Replace the node counting code with a simple product of topology info. Every node/northbridge is expected to have a 'misc' device. Clear everything out if a 'misc' device isn't found on a node. 
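The replacement is a straight product, e.g. a two-socket system with four AMD nodes per package yields 2 * 4 = 8 nodes, with no PCI scanning involved. Sizing per-node state then follows directly from the count, as in this minimal sketch of the pattern the diff below applies to amd_cache_northbridges():

	u16 num = amd_num_nodes();	/* topology_amd_nodes_per_pkg() * topology_max_packages() */
	struct amd_northbridge *nb = kcalloc(num, sizeof(*nb), GFP_KERNEL);

	if (!nb)
		return -ENOMEM;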
Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20250107222847.3300430-7-yazen.ghannam@amd.com --- arch/x86/include/asm/amd_node.h | 5 +++++ arch/x86/kernel/amd_nb.c | 22 +++++++++++++--------- 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/amd_node.h b/arch/x86/include/asm/amd_node.h index 3f097dd479f8..419a0ad13ef2 100644 --- a/arch/x86/include/asm/amd_node.h +++ b/arch/x86/include/asm/amd_node.h @@ -25,4 +25,9 @@ struct pci_dev *amd_node_get_func(u16 node, u8 func); struct pci_dev *amd_node_get_root(u16 node); +static inline u16 amd_num_nodes(void) +{ + return topology_amd_nodes_per_pkg() * topology_max_packages(); +} + #endif /*_ASM_X86_AMD_NODE_H_*/ diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 6218a0428c77..6371fe96b988 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -186,7 +186,6 @@ static int amd_cache_northbridges(void) const struct pci_device_id *misc_ids = amd_nb_misc_ids; struct pci_dev *misc; struct amd_northbridge *nb; - u16 misc_count = 0; u16 i; if (amd_northbridges.num) @@ -196,25 +195,30 @@ static int amd_cache_northbridges(void) misc_ids = hygon_nb_misc_ids; } - misc = NULL; - while ((misc = next_northbridge(misc, misc_ids))) - misc_count++; + amd_northbridges.num = amd_num_nodes(); - if (!misc_count) - return -ENODEV; - - nb = kcalloc(misc_count, sizeof(struct amd_northbridge), GFP_KERNEL); + nb = kcalloc(amd_northbridges.num, sizeof(struct amd_northbridge), GFP_KERNEL); if (!nb) return -ENOMEM; amd_northbridges.nb = nb; - amd_northbridges.num = misc_count; misc = NULL; for (i = 0; i < amd_northbridges.num; i++) { node_to_amd_nb(i)->root = amd_node_get_root(i); node_to_amd_nb(i)->misc = misc = next_northbridge(misc, misc_ids); + + /* + * Each Northbridge must have a 'misc' device. + * If not, then uninitialize everything. + */ + if (!node_to_amd_nb(i)->misc) { + amd_northbridges.num = 0; + kfree(nb); + return -ENODEV; + } + node_to_amd_nb(i)->link = amd_node_get_func(i, 4); } From 49b96fc0dddc7b3a01c6707fcaad06fc520402ac Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Tue, 7 Jan 2025 22:28:42 +0000 Subject: [PATCH 212/224] x86/amd_nb: Simplify function 3 search Use the newly introduced helper function to look up "function 3". Drop unused PCI IDs and code. 
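All of these lookups rest on the fixed slot convention: AMD node N's device sits at segment 0, bus 0, slot 0x18 + N. The forward and reverse mappings are one line each (the reverse form is the one a later patch in this series moves into k10temp):

	devfn = PCI_DEVFN(AMD_NODE0_PCI_SLOT + node, func);	/* node -> devfn */
	node  = PCI_SLOT(pdev->devfn) - AMD_NODE0_PCI_SLOT;	/* devfn -> node */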
Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20250107222847.3300430-8-yazen.ghannam@amd.com --- arch/x86/kernel/amd_nb.c | 46 +--------------------------------------- 1 file changed, 1 insertion(+), 45 deletions(-) diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 6371fe96b988..e335d89ddad7 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -29,31 +29,6 @@ static const struct pci_device_id amd_nb_misc_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M60H_NB_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F3) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_DF_F3) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_DF_F3) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M30H_DF_F3) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M60H_DF_F3) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_MA0H_DF_F3) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F3) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M70H_DF_F3) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_DF_F3) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M10H_DF_F3) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M40H_DF_F3) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M50H_DF_F3) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M60H_DF_F3) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M70H_DF_F3) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M78H_DF_F3) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M00H_DF_F3) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M20H_DF_F3) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M60H_DF_F3) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M70H_DF_F3) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI200_DF_F3) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI300_DF_F3) }, - {} -}; - -static const struct pci_device_id hygon_nb_misc_ids[] = { - { PCI_DEVICE(PCI_VENDOR_ID_HYGON, PCI_DEVICE_ID_AMD_17H_DF_F3) }, {} }; @@ -84,17 +59,6 @@ struct amd_northbridge *node_to_amd_nb(int node) } EXPORT_SYMBOL_GPL(node_to_amd_nb); -static struct pci_dev *next_northbridge(struct pci_dev *dev, - const struct pci_device_id *ids) -{ - do { - dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev); - if (!dev) - break; - } while (!pci_match_id(ids, dev)); - return dev; -} - /* * SMN accesses may fail in ways that are difficult to detect here in the called * functions amd_smn_read() and amd_smn_write(). 
Therefore, callers must do @@ -183,18 +147,12 @@ EXPORT_SYMBOL_GPL(amd_smn_write); static int amd_cache_northbridges(void) { - const struct pci_device_id *misc_ids = amd_nb_misc_ids; - struct pci_dev *misc; struct amd_northbridge *nb; u16 i; if (amd_northbridges.num) return 0; - if (boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) { - misc_ids = hygon_nb_misc_ids; - } - amd_northbridges.num = amd_num_nodes(); nb = kcalloc(amd_northbridges.num, sizeof(struct amd_northbridge), GFP_KERNEL); @@ -203,11 +161,9 @@ static int amd_cache_northbridges(void) amd_northbridges.nb = nb; - misc = NULL; for (i = 0; i < amd_northbridges.num; i++) { node_to_amd_nb(i)->root = amd_node_get_root(i); - node_to_amd_nb(i)->misc = misc = - next_northbridge(misc, misc_ids); + node_to_amd_nb(i)->misc = amd_node_get_func(i, 3); /* * Each Northbridge must have a 'misc' device. From 7dd57db495d49c004fffc77265ffbaccf340aa20 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Fri, 6 Dec 2024 16:12:02 +0000 Subject: [PATCH 213/224] x86/amd_nb, hwmon: (k10temp): Simplify amd_pci_dev_to_node_id() amd_pci_dev_to_node_id() tries to find the AMD node ID of a device by searching and counting devices. The AMD node ID of an AMD node device is simply its slot number minus the AMD node 0 slot number. Simplify this function and move it to k10temp.c. [ Yazen: Update commit message and simplify function. ] Signed-off-by: Mario Limonciello Co-developed-by: Yazen Ghannam Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Acked-by: Guenter Roeck Link: https://lore.kernel.org/r/20241206161210.163701-10-yazen.ghannam@amd.com --- arch/x86/include/asm/amd_nb.h | 17 ----------------- drivers/hwmon/k10temp.c | 5 +++++ 2 files changed, 5 insertions(+), 17 deletions(-) diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h index b48dc6975da2..094c3be81a8d 100644 --- a/arch/x86/include/asm/amd_nb.h +++ b/arch/x86/include/asm/amd_nb.h @@ -82,23 +82,6 @@ u16 amd_nb_num(void); bool amd_nb_has_feature(unsigned int feature); struct amd_northbridge *node_to_amd_nb(int node); -static inline u16 amd_pci_dev_to_node_id(struct pci_dev *pdev) -{ - struct pci_dev *misc; - int i; - - for (i = 0; i != amd_nb_num(); i++) { - misc = node_to_amd_nb(i)->misc; - - if (pci_domain_nr(misc->bus) == pci_domain_nr(pdev->bus) && - PCI_SLOT(misc->devfn) == PCI_SLOT(pdev->devfn)) - return i; - } - - WARN(1, "Unable to find AMD Northbridge id for %s\n", pci_name(pdev)); - return 0; -} - static inline bool amd_gart_present(void) { if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) diff --git a/drivers/hwmon/k10temp.c b/drivers/hwmon/k10temp.c index 7dc19c5d62ac..cefa8cd184c8 100644 --- a/drivers/hwmon/k10temp.c +++ b/drivers/hwmon/k10temp.c @@ -150,6 +150,11 @@ static void read_tempreg_nb_f15(struct pci_dev *pdev, u32 *regval) F15H_M60H_REPORTED_TEMP_CTRL_OFFSET, regval); } +static u16 amd_pci_dev_to_node_id(struct pci_dev *pdev) +{ + return PCI_SLOT(pdev->devfn) - AMD_NODE0_PCI_SLOT; +} + static void read_tempreg_nb_zen(struct pci_dev *pdev, u32 *regval) { if (amd_smn_read(amd_pci_dev_to_node_id(pdev), From d6caeafaa324e6aba5ed2ca1a416340c2fd061a2 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Fri, 6 Dec 2024 16:12:03 +0000 Subject: [PATCH 214/224] x86/amd_nb: Move SMN access code to a new amd_node driver MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SMN access was bolted into amd_nb mostly as convenience. This has limitations though that require incurring tech debt to keep it working. 
Move SMN access to the newly introduced AMD Node driver. Signed-off-by: Mario Limonciello Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Acked-by: Ilpo Järvinen # pdx86 Acked-by: Shyam Sundar S K # PMF, PMC Link: https://lore.kernel.org/r/20241206161210.163701-11-yazen.ghannam@amd.com --- MAINTAINERS | 1 + arch/x86/include/asm/amd_nb.h | 3 - arch/x86/include/asm/amd_node.h | 3 + arch/x86/kernel/amd_nb.c | 89 --------------------------- arch/x86/kernel/amd_node.c | 90 ++++++++++++++++++++++++++++ arch/x86/pci/fixup.c | 4 +- drivers/edac/Kconfig | 1 + drivers/edac/amd64_edac.c | 1 + drivers/hwmon/Kconfig | 2 +- drivers/hwmon/k10temp.c | 2 +- drivers/platform/x86/amd/pmc/Kconfig | 2 +- drivers/platform/x86/amd/pmc/pmc.c | 3 +- drivers/platform/x86/amd/pmf/Kconfig | 2 +- drivers/platform/x86/amd/pmf/core.c | 2 +- drivers/ras/amd/atl/Kconfig | 1 + drivers/ras/amd/atl/internal.h | 1 + 16 files changed, 107 insertions(+), 100 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 290989ab9f72..27a5bc2fc49b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1122,6 +1122,7 @@ S: Supported F: drivers/i2c/busses/i2c-amd-asf-plat.c AMD NODE DRIVER +M: Mario Limonciello M: Yazen Ghannam L: linux-kernel@vger.kernel.org S: Supported diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h index 094c3be81a8d..5e0333534abc 100644 --- a/arch/x86/include/asm/amd_nb.h +++ b/arch/x86/include/asm/amd_nb.h @@ -21,9 +21,6 @@ extern int amd_numa_init(void); extern int amd_get_subcaches(int); extern int amd_set_subcaches(int, unsigned long); -int __must_check amd_smn_read(u16 node, u32 address, u32 *value); -int __must_check amd_smn_write(u16 node, u32 address, u32 value); - struct amd_l3_cache { unsigned indices; u8 subcaches[4]; diff --git a/arch/x86/include/asm/amd_node.h b/arch/x86/include/asm/amd_node.h index 419a0ad13ef2..113ad3e8ee40 100644 --- a/arch/x86/include/asm/amd_node.h +++ b/arch/x86/include/asm/amd_node.h @@ -30,4 +30,7 @@ static inline u16 amd_num_nodes(void) return topology_amd_nodes_per_pkg() * topology_max_packages(); } +int __must_check amd_smn_read(u16 node, u32 address, u32 *value); +int __must_check amd_smn_write(u16 node, u32 address, u32 value); + #endif /*_ASM_X86_AMD_NODE_H_*/ diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index e335d89ddad7..11fac09e3a8c 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -15,9 +15,6 @@ #include #include -/* Protect the PCI config register pairs used for SMN. */ -static DEFINE_MUTEX(smn_mutex); - static u32 *flush_words; static const struct pci_device_id amd_nb_misc_ids[] = { @@ -59,92 +56,6 @@ struct amd_northbridge *node_to_amd_nb(int node) } EXPORT_SYMBOL_GPL(node_to_amd_nb); -/* - * SMN accesses may fail in ways that are difficult to detect here in the called - * functions amd_smn_read() and amd_smn_write(). Therefore, callers must do - * their own checking based on what behavior they expect. - * - * For SMN reads, the returned value may be zero if the register is Read-as-Zero. - * Or it may be a "PCI Error Response", e.g. all 0xFFs. The "PCI Error Response" - * can be checked here, and a proper error code can be returned. - * - * But the Read-as-Zero response cannot be verified here. A value of 0 may be - * correct in some cases, so callers must check that this correct is for the - * register/fields they need. - * - * For SMN writes, success can be determined through a "write and read back" - * However, this is not robust when done here. 
- * - * Possible issues: - * - * 1) Bits that are "Write-1-to-Clear". In this case, the read value should - * *not* match the write value. - * - * 2) Bits that are "Read-as-Zero"/"Writes-Ignored". This information cannot be - * known here. - * - * 3) Bits that are "Reserved / Set to 1". Ditto above. - * - * Callers of amd_smn_write() should do the "write and read back" check - * themselves, if needed. - * - * For #1, they can see if their target bits got cleared. - * - * For #2 and #3, they can check if their target bits got set as intended. - * - * This matches what is done for RDMSR/WRMSR. As long as there's no #GP, then - * the operation is considered a success, and the caller does their own - * checking. - */ -static int __amd_smn_rw(u16 node, u32 address, u32 *value, bool write) -{ - struct pci_dev *root; - int err = -ENODEV; - - if (node >= amd_northbridges.num) - goto out; - - root = node_to_amd_nb(node)->root; - if (!root) - goto out; - - mutex_lock(&smn_mutex); - - err = pci_write_config_dword(root, 0x60, address); - if (err) { - pr_warn("Error programming SMN address 0x%x.\n", address); - goto out_unlock; - } - - err = (write ? pci_write_config_dword(root, 0x64, *value) - : pci_read_config_dword(root, 0x64, value)); - -out_unlock: - mutex_unlock(&smn_mutex); - -out: - return err; -} - -int __must_check amd_smn_read(u16 node, u32 address, u32 *value) -{ - int err = __amd_smn_rw(node, address, value, false); - - if (PCI_POSSIBLE_ERROR(*value)) { - err = -ENODEV; - *value = 0; - } - - return err; -} -EXPORT_SYMBOL_GPL(amd_smn_read); - -int __must_check amd_smn_write(u16 node, u32 address, u32 value) -{ - return __amd_smn_rw(node, address, &value, true); -} -EXPORT_SYMBOL_GPL(amd_smn_write); - static int amd_cache_northbridges(void) { struct amd_northbridge *nb; diff --git a/arch/x86/kernel/amd_node.c b/arch/x86/kernel/amd_node.c index 4eea8c7d8090..95e5ca0acc90 100644 --- a/arch/x86/kernel/amd_node.c +++ b/arch/x86/kernel/amd_node.c @@ -8,6 +8,7 @@ * Author: Yazen Ghannam */ +#include #include /* @@ -88,3 +89,92 @@ struct pci_dev *amd_node_get_root(u16 node) pci_dbg(root, "is root for AMD node %u\n", node); return root; } + +/* Protect the PCI config register pairs used for SMN. */ +static DEFINE_MUTEX(smn_mutex); + +/* + * SMN accesses may fail in ways that are difficult to detect here in the called + * functions amd_smn_read() and amd_smn_write(). Therefore, callers must do + * their own checking based on what behavior they expect. + * + * For SMN reads, the returned value may be zero if the register is Read-as-Zero. + * Or it may be a "PCI Error Response", e.g. all 0xFFs. The "PCI Error Response" + * can be checked here, and a proper error code can be returned. + * + * But the Read-as-Zero response cannot be verified here. A value of 0 may be + * correct in some cases, so callers must check that this correct is for the + * register/fields they need. + * + * For SMN writes, success can be determined through a "write and read back" + * However, this is not robust when done here. + * + * Possible issues: + * + * 1) Bits that are "Write-1-to-Clear". In this case, the read value should + * *not* match the write value. + * + * 2) Bits that are "Read-as-Zero"/"Writes-Ignored". This information cannot be + * known here. + * + * 3) Bits that are "Reserved / Set to 1". Ditto above. + * + * Callers of amd_smn_write() should do the "write and read back" check + * themselves, if needed. + * + * For #1, they can see if their target bits got cleared. 
+ * + * For #2 and #3, they can check if their target bits got set as intended. + * + * This matches what is done for RDMSR/WRMSR. As long as there's no #GP, then + * the operation is considered a success, and the caller does their own + * checking. + */ +static int __amd_smn_rw(u16 node, u32 address, u32 *value, bool write) +{ + struct pci_dev *root; + int err = -ENODEV; + + if (node >= amd_nb_num()) + goto out; + + root = node_to_amd_nb(node)->root; + if (!root) + goto out; + + mutex_lock(&smn_mutex); + + err = pci_write_config_dword(root, 0x60, address); + if (err) { + pr_warn("Error programming SMN address 0x%x.\n", address); + goto out_unlock; + } + + err = (write ? pci_write_config_dword(root, 0x64, *value) + : pci_read_config_dword(root, 0x64, value)); + +out_unlock: + mutex_unlock(&smn_mutex); + +out: + return err; +} + +int __must_check amd_smn_read(u16 node, u32 address, u32 *value) +{ + int err = __amd_smn_rw(node, address, value, false); + + if (PCI_POSSIBLE_ERROR(*value)) { + err = -ENODEV; + *value = 0; + } + + return err; +} +EXPORT_SYMBOL_GPL(amd_smn_read); + +int __must_check amd_smn_write(u16 node, u32 address, u32 value) +{ + return __amd_smn_rw(node, address, &value, true); +} +EXPORT_SYMBOL_GPL(amd_smn_write); diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index 0681ecfe3430..592fb9d97e77 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -9,7 +9,7 @@ #include #include #include -#include +#include #include #include @@ -828,7 +828,7 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ATI, 0x7910, rs690_fix_64bit_dma); #endif -#ifdef CONFIG_AMD_NB +#ifdef CONFIG_AMD_NODE #define AMD_15B8_RCC_DEV2_EPF0_STRAP2 0x10136008 #define AMD_15B8_RCC_DEV2_EPF0_STRAP2_NO_SOFT_RESET_DEV2_F0_MASK 0x00000080L diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig index 06f7b43a6f78..cb97d7bdae31 100644 --- a/drivers/edac/Kconfig +++ b/drivers/edac/Kconfig @@ -78,6 +78,7 @@ config EDAC_GHES config EDAC_AMD64 tristate "AMD64 (Opteron, Athlon64)" depends on AMD_NB && EDAC_DECODE_MCE + depends on AMD_NODE imply AMD_ATL help Support for error detection and correction of DRAM ECC errors on diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index ddfbdb66b794..29465088639c 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -2,6 +2,7 @@ #include #include "amd64_edac.h" #include +#include static struct edac_pci_ctl_info *pci_ctl; diff --git a/drivers/hwmon/Kconfig b/drivers/hwmon/Kconfig index dd376602f3f1..ea13ea482a63 100644 --- a/drivers/hwmon/Kconfig +++ b/drivers/hwmon/Kconfig @@ -324,7 +324,7 @@ config SENSORS_K8TEMP config SENSORS_K10TEMP tristate "AMD Family 10h+ temperature sensor" - depends on X86 && PCI && AMD_NB + depends on X86 && PCI && AMD_NODE help If you say yes here you get support for the temperature sensor(s) inside your CPU. 
Supported are later revisions of diff --git a/drivers/hwmon/k10temp.c b/drivers/hwmon/k10temp.c index cefa8cd184c8..d0b4cc9a5011 100644 --- a/drivers/hwmon/k10temp.c +++ b/drivers/hwmon/k10temp.c @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include MODULE_DESCRIPTION("AMD Family 10h+ CPU core temperature monitor"); diff --git a/drivers/platform/x86/amd/pmc/Kconfig b/drivers/platform/x86/amd/pmc/Kconfig index 94f9563d8be7..eeffdafd686e 100644 --- a/drivers/platform/x86/amd/pmc/Kconfig +++ b/drivers/platform/x86/amd/pmc/Kconfig @@ -5,7 +5,7 @@ config AMD_PMC tristate "AMD SoC PMC driver" - depends on ACPI && PCI && RTC_CLASS && AMD_NB + depends on ACPI && PCI && RTC_CLASS && AMD_NODE depends on SUSPEND select SERIO help diff --git a/drivers/platform/x86/amd/pmc/pmc.c b/drivers/platform/x86/amd/pmc/pmc.c index 26b878ee5191..941b7753dd78 100644 --- a/drivers/platform/x86/amd/pmc/pmc.c +++ b/drivers/platform/x86/amd/pmc/pmc.c @@ -10,7 +10,6 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include #include #include #include @@ -28,6 +27,8 @@ #include #include +#include + #include "pmc.h" /* SMU communication registers */ diff --git a/drivers/platform/x86/amd/pmf/Kconfig b/drivers/platform/x86/amd/pmf/Kconfig index 99d67cdbd91e..25b8f7ae3abd 100644 --- a/drivers/platform/x86/amd/pmf/Kconfig +++ b/drivers/platform/x86/amd/pmf/Kconfig @@ -7,7 +7,7 @@ config AMD_PMF tristate "AMD Platform Management Framework" depends on ACPI && PCI depends on POWER_SUPPLY - depends on AMD_NB + depends on AMD_NODE select ACPI_PLATFORM_PROFILE depends on TEE && AMDTEE depends on AMD_SFH_HID diff --git a/drivers/platform/x86/amd/pmf/core.c b/drivers/platform/x86/amd/pmf/core.c index 06a97c533cb8..7f88f3121cf5 100644 --- a/drivers/platform/x86/amd/pmf/core.c +++ b/drivers/platform/x86/amd/pmf/core.c @@ -8,13 +8,13 @@ * Author: Shyam Sundar S K */ -#include #include #include #include #include #include #include +#include #include "pmf.h" /* PMF-SMU communication registers */ diff --git a/drivers/ras/amd/atl/Kconfig b/drivers/ras/amd/atl/Kconfig index 551680073e43..6e03942cd7da 100644 --- a/drivers/ras/amd/atl/Kconfig +++ b/drivers/ras/amd/atl/Kconfig @@ -10,6 +10,7 @@ config AMD_ATL tristate "AMD Address Translation Library" depends on AMD_NB && X86_64 && RAS + depends on AMD_NODE depends on MEMORY_FAILURE default N help diff --git a/drivers/ras/amd/atl/internal.h b/drivers/ras/amd/atl/internal.h index 143d04c779a8..f9be26d25348 100644 --- a/drivers/ras/amd/atl/internal.h +++ b/drivers/ras/amd/atl/internal.h @@ -18,6 +18,7 @@ #include #include +#include #include "reg_fields.h" From 35df797665cb69e68a3a99e499e75e73efbd4f77 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Fri, 6 Dec 2024 16:12:04 +0000 Subject: [PATCH 215/224] x86/amd_node: Update __amd_smn_rw() error paths Use guard(mutex) and convert PCI error codes to common ones. 
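guard(mutex) from <linux/cleanup.h> ties the unlock to scope exit, so each early return needs no goto unwinding, while pcibios_err_to_errno() converts the PCIBIOS_* codes returned by the config-space accessors into standard negative errnos. A minimal sketch of the combined pattern (illustrative; demo_read() and demo_mutex are not from the patch):

	#include <linux/cleanup.h>
	#include <linux/mutex.h>
	#include <linux/pci.h>

	static DEFINE_MUTEX(demo_mutex);

	static int demo_read(struct pci_dev *dev, u32 *val)
	{
		int err;

		guard(mutex)(&demo_mutex);	/* dropped automatically on any return */

		err = pci_read_config_dword(dev, 0x64, val);
		return pcibios_err_to_errno(err);	/* PCIBIOS_* -> -Exxx */
	}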
Suggested-by: Tom Lendacky Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20241206161210.163701-12-yazen.ghannam@amd.com --- arch/x86/kernel/amd_node.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/amd_node.c b/arch/x86/kernel/amd_node.c index 95e5ca0acc90..0cca541e18d5 100644 --- a/arch/x86/kernel/amd_node.c +++ b/arch/x86/kernel/amd_node.c @@ -136,28 +136,24 @@ static int __amd_smn_rw(u16 node, u32 address, u32 *value, bool write) int err = -ENODEV; if (node >= amd_nb_num()) - goto out; + return err; root = node_to_amd_nb(node)->root; if (!root) - goto out; + return err; - mutex_lock(&smn_mutex); + guard(mutex)(&smn_mutex); err = pci_write_config_dword(root, 0x60, address); if (err) { pr_warn("Error programming SMN address 0x%x.\n", address); - goto out_unlock; + return pcibios_err_to_errno(err); } err = (write ? pci_write_config_dword(root, 0x64, *value) : pci_read_config_dword(root, 0x64, value)); -out_unlock: - mutex_unlock(&smn_mutex); - -out: - return err; + return pcibios_err_to_errno(err); } int __must_check amd_smn_read(u16 node, u32 address, u32 *value) From 77466b798d59d6761501ff36094cf430d3876549 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Fri, 6 Dec 2024 16:12:05 +0000 Subject: [PATCH 216/224] x86/amd_node: Remove dependency on AMD_NB Cache the root devices locally so that there are no more dependencies on AMD_NB. Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20241206161210.163701-13-yazen.ghannam@amd.com --- arch/x86/kernel/amd_node.c | 42 +++++++++++++++++++++++++++++++++++--- 1 file changed, 39 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/amd_node.c b/arch/x86/kernel/amd_node.c index 0cca541e18d5..45077e2e6f2f 100644 --- a/arch/x86/kernel/amd_node.c +++ b/arch/x86/kernel/amd_node.c @@ -8,7 +8,6 @@ * Author: Yazen Ghannam */ -#include #include /* @@ -90,6 +89,8 @@ struct pci_dev *amd_node_get_root(u16 node) return root; } +static struct pci_dev **amd_roots; + /* Protect the PCI config register pairs used for SMN. */ static DEFINE_MUTEX(smn_mutex); @@ -135,10 +136,10 @@ static int __amd_smn_rw(u16 node, u32 address, u32 *value, bool write) struct pci_dev *root; int err = -ENODEV; - if (node >= amd_nb_num()) + if (node >= amd_num_nodes()) return err; - root = node_to_amd_nb(node)->root; + root = amd_roots[node]; if (!root) return err; @@ -174,3 +175,38 @@ int __must_check amd_smn_write(u16 node, u32 address, u32 value) return __amd_smn_rw(node, address, &value, true); } EXPORT_SYMBOL_GPL(amd_smn_write); + +static int amd_cache_roots(void) +{ + u16 node, num_nodes = amd_num_nodes(); + + amd_roots = kcalloc(num_nodes, sizeof(*amd_roots), GFP_KERNEL); + if (!amd_roots) + return -ENOMEM; + + for (node = 0; node < num_nodes; node++) + amd_roots[node] = amd_node_get_root(node); + + return 0; +} + +static int __init amd_smn_init(void) +{ + int err; + + if (!cpu_feature_enabled(X86_FEATURE_ZEN)) + return 0; + + guard(mutex)(&smn_mutex); + + if (amd_roots) + return 0; + + err = amd_cache_roots(); + if (err) + return err; + + return 0; +} + +fs_initcall(amd_smn_init); From 79821b907f8d7fbc991554fc940075dc1b29a0f4 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Fri, 6 Dec 2024 16:12:06 +0000 Subject: [PATCH 217/224] x86/amd_node: Use defines for SMN register offsets There are more than one SMN index/data pair available for software use. The register offsets are different, but the protocol is the same. 
Use defines for the SMN offset values and allow the index/data offsets to be passed to the read/write helper function. This eases code reuse with other SMN users in the kernel. Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20241206161210.163701-14-yazen.ghannam@amd.com --- arch/x86/kernel/amd_node.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/amd_node.c b/arch/x86/kernel/amd_node.c index 45077e2e6f2f..d2ec7fd555c5 100644 --- a/arch/x86/kernel/amd_node.c +++ b/arch/x86/kernel/amd_node.c @@ -94,6 +94,9 @@ static struct pci_dev **amd_roots; /* Protect the PCI config register pairs used for SMN. */ static DEFINE_MUTEX(smn_mutex); +#define SMN_INDEX_OFFSET 0x60 +#define SMN_DATA_OFFSET 0x64 + /* * SMN accesses may fail in ways that are difficult to detect here in the called * functions amd_smn_read() and amd_smn_write(). Therefore, callers must do @@ -131,7 +134,7 @@ static DEFINE_MUTEX(smn_mutex); * the operation is considered a success, and the caller does their own * checking. */ -static int __amd_smn_rw(u16 node, u32 address, u32 *value, bool write) +static int __amd_smn_rw(u8 i_off, u8 d_off, u16 node, u32 address, u32 *value, bool write) { struct pci_dev *root; int err = -ENODEV; @@ -145,21 +148,21 @@ static int __amd_smn_rw(u16 node, u32 address, u32 *value, bool write) guard(mutex)(&smn_mutex); - err = pci_write_config_dword(root, 0x60, address); + err = pci_write_config_dword(root, i_off, address); if (err) { pr_warn("Error programming SMN address 0x%x.\n", address); return pcibios_err_to_errno(err); } - err = (write ? pci_write_config_dword(root, 0x64, *value) - : pci_read_config_dword(root, 0x64, value)); + err = (write ? pci_write_config_dword(root, d_off, *value) + : pci_read_config_dword(root, d_off, value)); return pcibios_err_to_errno(err); } int __must_check amd_smn_read(u16 node, u32 address, u32 *value) { - int err = __amd_smn_rw(node, address, value, false); + int err = __amd_smn_rw(SMN_INDEX_OFFSET, SMN_DATA_OFFSET, node, address, value, false); if (PCI_POSSIBLE_ERROR(*value)) { err = -ENODEV; @@ -172,7 +175,7 @@ EXPORT_SYMBOL_GPL(amd_smn_read); int __must_check amd_smn_write(u16 node, u32 address, u32 value) { - return __amd_smn_rw(node, address, &value, true); + return __amd_smn_rw(SMN_INDEX_OFFSET, SMN_DATA_OFFSET, node, address, &value, true); } EXPORT_SYMBOL_GPL(amd_smn_write); From 73bbf3b0fbba9aa27fef07a1fbd837661a863f03 Mon Sep 17 00:00:00 2001 From: Nikunj A Dadhania Date: Mon, 6 Jan 2025 18:16:30 +0530 Subject: [PATCH 218/224] x86/tsc: Init the TSC for Secure TSC guests Use the GUEST_TSC_FREQ MSR to discover the TSC frequency instead of relying on kvm-clock based frequency calibration. Override both CPU and TSC frequency calibration callbacks with securetsc_get_tsc_khz(). Since the difference between CPU base and TSC frequency does not apply in this case, the same callback is being used. 
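GUEST_TSC_FREQ reports the frequency in MHz, so the only arithmetic needed is a scale to the kHz units the calibration callbacks return; for example, an MSR value of 2995 becomes 2,995,000 kHz, i.e. a 2.995 GHz TSC:

	rdmsrl(MSR_AMD64_GUEST_TSC_FREQ, tsc_freq_mhz);			/* e.g. 2995 MHz */
	snp_tsc_freq_khz = (unsigned long)(tsc_freq_mhz * 1000);	/* 2995000 kHz */

Registering the same securetsc_get_tsc_khz() for both calibrate_cpu and calibrate_tsc is deliberate: with Secure TSC the CPU base and TSC frequencies coincide, so one callback serves both.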
[ bp: Carve out from https://lore.kernel.org/r/20250106124633.1418972-11-nikunj@amd.com ] Signed-off-by: Nikunj A Dadhania Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20250106124633.1418972-11-nikunj@amd.com --- arch/x86/coco/sev/core.c | 21 +++++++++++++++++++++ arch/x86/include/asm/sev.h | 2 ++ arch/x86/kernel/tsc.c | 4 ++++ 3 files changed, 27 insertions(+) diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c index 106bdeda58c5..65d676c0f7bc 100644 --- a/arch/x86/coco/sev/core.c +++ b/arch/x86/coco/sev/core.c @@ -103,6 +103,7 @@ static u64 secrets_pa __ro_after_init; */ static u64 snp_tsc_scale __ro_after_init; static u64 snp_tsc_offset __ro_after_init; +static u64 snp_tsc_freq_khz __ro_after_init; /* #VC handler runtime per-CPU data */ struct sev_es_runtime_data { @@ -3278,3 +3279,23 @@ void __init snp_secure_tsc_prepare(void) pr_debug("SecureTSC enabled"); } + +static unsigned long securetsc_get_tsc_khz(void) +{ + return snp_tsc_freq_khz; +} + +void __init snp_secure_tsc_init(void) +{ + unsigned long long tsc_freq_mhz; + + if (!cc_platform_has(CC_ATTR_GUEST_SNP_SECURE_TSC)) + return; + + setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); + rdmsrl(MSR_AMD64_GUEST_TSC_FREQ, tsc_freq_mhz); + snp_tsc_freq_khz = (unsigned long)(tsc_freq_mhz * 1000); + + x86_platform.calibrate_cpu = securetsc_get_tsc_khz; + x86_platform.calibrate_tsc = securetsc_get_tsc_khz; +} diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index bdcdaac4df1c..5d9685f92e5c 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -482,6 +482,7 @@ int snp_send_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_req *req struct snp_guest_request_ioctl *rio); void __init snp_secure_tsc_prepare(void); +void __init snp_secure_tsc_init(void); #else /* !CONFIG_AMD_MEM_ENCRYPT */ @@ -524,6 +525,7 @@ static inline void snp_msg_free(struct snp_msg_desc *mdesc) { } static inline int snp_send_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_req *req, struct snp_guest_request_ioctl *rio) { return -ENODEV; } static inline void __init snp_secure_tsc_prepare(void) { } +static inline void __init snp_secure_tsc_init(void) { } #endif /* CONFIG_AMD_MEM_ENCRYPT */ diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 67aeaba4ba9c..0864b314c26a 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -30,6 +30,7 @@ #include #include #include +#include unsigned int __read_mostly cpu_khz; /* TSC clocks / usec, not used here */ EXPORT_SYMBOL(cpu_khz); @@ -1515,6 +1516,9 @@ void __init tsc_early_init(void) /* Don't change UV TSC multi-chassis synchronization */ if (is_early_uv_system()) return; + + snp_secure_tsc_init(); + if (!determine_cpu_tsc_frequencies(true)) return; tsc_enable_sched_clock(); From 0563ee35ae2c9cfb0c6a7b2c0ddf7d9372bb8a98 Mon Sep 17 00:00:00 2001 From: Nikunj A Dadhania Date: Mon, 6 Jan 2025 18:16:33 +0530 Subject: [PATCH 219/224] x86/sev: Add the Secure TSC feature for SNP guests Now that all the required plumbing is done for enabling Secure TSC, add it to the SNP features present list. 
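The gate works subtractively: any feature bit the hypervisor reports in the SNP status that the mask does not contain is treated as unsupported and boot is refused. In essence (a sketch of the check; the real snp_get_unsupported_features() in the diff below may additionally verify that SNP is enabled and limit itself to features requiring guest-side support):

	u64 snp_get_unsupported_features(u64 status)
	{
		/* Bits set by the hypervisor but not implemented by this kernel. */
		return status & ~SNP_FEATURES_PRESENT;
	}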
Signed-off-by: Nikunj A Dadhania Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Tom Lendacky Tested-by: Peter Gonda Link: https://lore.kernel.org/r/20250106124633.1418972-14-nikunj@amd.com --- arch/x86/boot/compressed/sev.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index cd44e120fe53..bb55934c1cee 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -401,7 +401,8 @@ finish: * by the guest kernel. As and when a new feature is implemented in the * guest kernel, a corresponding bit should be added to the mask. */ -#define SNP_FEATURES_PRESENT MSR_AMD64_SNP_DEBUG_SWAP +#define SNP_FEATURES_PRESENT (MSR_AMD64_SNP_DEBUG_SWAP | \ + MSR_AMD64_SNP_SECURE_TSC) u64 snp_get_unsupported_features(u64 status) { From a937f384c9da493e526ad896ef4e8054526d2941 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Jan 2025 11:26:48 +0100 Subject: [PATCH 220/224] cleanup, tags: Create tags for the cleanup primitives Oleg reported that it is hard to find the definition of things like: __free(argv) without having to do 'git grep "DEFINE_FREE(argv,"'. Add tag generation for the various macros in cleanup.h. Notably 'DEFINE_FREE(argv, ...)' will now generate a 'cleanup_argv' tag, while all the others, eg. 'DEFINE_GUARD(mutex, ...)' will generate 'class_mutex' like tags. Reported-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250106102647.GB20870@noisy.programming.kicks-ass.net --- scripts/tags.sh | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/scripts/tags.sh b/scripts/tags.sh index b21236377998..7939aea731f1 100755 --- a/scripts/tags.sh +++ b/scripts/tags.sh @@ -212,6 +212,13 @@ regex_c=( '/^SEQCOUNT_LOCKTYPE(\([^,]*\),[[:space:]]*\([^,]*\),[^)]*)/seqcount_\2_init/' '/^\ Date: Thu, 9 Jan 2025 12:47:03 +0100 Subject: [PATCH 221/224] MAINTAINERS: Add static_call_inline.c to STATIC BRANCH/CALL Commit 8fd4ddda2f49 ("static_call: Don't make __static_call_return0 static") split static_call.c and created static_call_inline.c. This was not reflected in MAINTAINERS. Fix it by changing the MAINTAINERS line to be a glob: static_call*.c. Signed-off-by: Jiri Slaby (SUSE) Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250109114703.426577-1-jirislaby@kernel.org --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 9bcd4e72a2dc..7da973afe26b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -22450,7 +22450,7 @@ F: arch/*/kernel/static_call.c F: include/linux/jump_label*.h F: include/linux/static_call*.h F: kernel/jump_label.c -F: kernel/static_call.c +F: kernel/static_call*.c STI AUDIO (ASoC) DRIVERS M: Arnaud Pouliquen From 3f710be02ea648001ba18fb2c9fa7765e743dec2 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 8 Jan 2025 06:30:16 -0800 Subject: [PATCH 222/224] perf/x86/intel/uncore: Clean up func_id The below warning may be triggered on GNR when the PCIE uncore units are exposed. WARNING: CPU: 4 PID: 1 at arch/x86/events/intel/uncore.c:1169 uncore_pci_pmu_register+0x158/0x190 The current uncore driver assumes that all the devices in the same PMU have the exact same devfn. It's true for the previous platforms. But it doesn't work for the new PCIE uncore units on GNR. The assumption doesn't make sense. There is no reason to limit the devices from the same PMU to the same devfn. Also, the current code just throws the warning, but still registers the device. 
The WARN_ON_ONCE() should be removed. The func_id is used by the later event_init() to check if a event->pmu has valid devices. For cpu and mmio uncore PMUs, they are always valid. For pci uncore PMUs, it's set when the PMU is registered. It can be replaced by the pmu->registered. Clean up the func_id. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Tested-by: Eric Hu Link: https://lkml.kernel.org/r/20250108143017.1793781-1-kan.liang@linux.intel.com --- arch/x86/events/intel/uncore.c | 20 +++++++------------- arch/x86/events/intel/uncore.h | 1 - arch/x86/events/intel/uncore_snb.c | 2 +- 3 files changed, 8 insertions(+), 15 deletions(-) diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index d98fac567684..24372cf7fdfb 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -745,7 +745,7 @@ static int uncore_pmu_event_init(struct perf_event *event) pmu = uncore_event_to_pmu(event); /* no device found for this pmu */ - if (pmu->func_id < 0) + if (!pmu->registered) return -ENOENT; /* Sampling not supported yet */ @@ -992,7 +992,7 @@ static void uncore_types_exit(struct intel_uncore_type **types) uncore_type_exit(*types); } -static int __init uncore_type_init(struct intel_uncore_type *type, bool setid) +static int __init uncore_type_init(struct intel_uncore_type *type) { struct intel_uncore_pmu *pmus; size_t size; @@ -1005,7 +1005,6 @@ static int __init uncore_type_init(struct intel_uncore_type *type, bool setid) size = uncore_max_dies() * sizeof(struct intel_uncore_box *); for (i = 0; i < type->num_boxes; i++) { - pmus[i].func_id = setid ? i : -1; pmus[i].pmu_idx = i; pmus[i].type = type; pmus[i].boxes = kzalloc(size, GFP_KERNEL); @@ -1055,12 +1054,12 @@ err: } static int __init -uncore_types_init(struct intel_uncore_type **types, bool setid) +uncore_types_init(struct intel_uncore_type **types) { int ret; for (; *types; types++) { - ret = uncore_type_init(*types, setid); + ret = uncore_type_init(*types); if (ret) return ret; } @@ -1160,11 +1159,6 @@ static int uncore_pci_pmu_register(struct pci_dev *pdev, if (!box) return -ENOMEM; - if (pmu->func_id < 0) - pmu->func_id = pdev->devfn; - else - WARN_ON_ONCE(pmu->func_id != pdev->devfn); - atomic_inc(&box->refcnt); box->dieid = die; box->pci_dev = pdev; @@ -1410,7 +1404,7 @@ static int __init uncore_pci_init(void) goto err; } - ret = uncore_types_init(uncore_pci_uncores, false); + ret = uncore_types_init(uncore_pci_uncores); if (ret) goto errtype; @@ -1678,7 +1672,7 @@ static int __init uncore_cpu_init(void) { int ret; - ret = uncore_types_init(uncore_msr_uncores, true); + ret = uncore_types_init(uncore_msr_uncores); if (ret) goto err; @@ -1697,7 +1691,7 @@ static int __init uncore_mmio_init(void) struct intel_uncore_type **types = uncore_mmio_uncores; int ret; - ret = uncore_types_init(types, true); + ret = uncore_types_init(types); if (ret) goto err; diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index 79ff32e13dcc..3dcb88c0ecfa 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -125,7 +125,6 @@ struct intel_uncore_pmu { struct pmu pmu; char name[UNCORE_PMU_NAME_LEN]; int pmu_idx; - int func_id; bool registered; atomic_t activeboxes; cpumask_t cpu_mask; diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c index 3934e1e4e3b1..edb7fd50efe0 100644 --- a/arch/x86/events/intel/uncore_snb.c +++ b/arch/x86/events/intel/uncore_snb.c @@ -910,7 +910,7 @@ static int 
 
 	pmu = uncore_event_to_pmu(event);
 	/* no device found for this pmu */
-	if (pmu->func_id < 0)
+	if (!pmu->registered)
 		return -ENOENT;
 
 	/* Sampling not supported yet */

From 6d642735cdb6cdb814d2b6c81652caa53ce04842 Mon Sep 17 00:00:00 2001
From: Kan Liang
Date: Wed, 8 Jan 2025 06:30:17 -0800
Subject: [PATCH 223/224] perf/x86/intel/uncore: Support more units on Granite Rapids

The same CXL PMON support is also available on GNR. Apply
spr_uncore_cxlcm and spr_uncore_cxldp to GNR as well.

The other units were broken on early HW samples, so they were ignored
in the early enabling patch. The issue has been fixed and verified on
the later production HW. Add UPI, B2UPI, B2HOT, PCIEX16 and PCIEX8 for
GNR.

Signed-off-by: Kan Liang
Signed-off-by: Peter Zijlstra (Intel)
Tested-by: Eric Hu
Link: https://lkml.kernel.org/r/20250108143017.1793781-2-kan.liang@linux.intel.com
---
 arch/x86/events/intel/uncore_snbep.c | 48 ++++++++++++++++++----------
 1 file changed, 32 insertions(+), 16 deletions(-)

diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
index ca98744343b8..60973c209c0e 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -6684,17 +6684,8 @@ void spr_uncore_mmio_init(void)
 
 /* GNR uncore support */
 #define UNCORE_GNR_NUM_UNCORE_TYPES	23
-#define UNCORE_GNR_TYPE_15		15
-#define UNCORE_GNR_B2UPI		18
-#define UNCORE_GNR_TYPE_21		21
-#define UNCORE_GNR_TYPE_22		22
 
 int gnr_uncore_units_ignore[] = {
-	UNCORE_SPR_UPI,
-	UNCORE_GNR_TYPE_15,
-	UNCORE_GNR_B2UPI,
-	UNCORE_GNR_TYPE_21,
-	UNCORE_GNR_TYPE_22,
 	UNCORE_IGNORE_END
 };
 
@@ -6703,6 +6694,31 @@ static struct intel_uncore_type gnr_uncore_ubox = {
 	.attr_update	= uncore_alias_groups,
 };
 
+static struct intel_uncore_type gnr_uncore_pciex8 = {
+	SPR_UNCORE_PCI_COMMON_FORMAT(),
+	.name = "pciex8",
+};
+
+static struct intel_uncore_type gnr_uncore_pciex16 = {
+	SPR_UNCORE_PCI_COMMON_FORMAT(),
+	.name = "pciex16",
+};
+
+static struct intel_uncore_type gnr_uncore_upi = {
+	SPR_UNCORE_PCI_COMMON_FORMAT(),
+	.name = "upi",
+};
+
+static struct intel_uncore_type gnr_uncore_b2upi = {
+	SPR_UNCORE_PCI_COMMON_FORMAT(),
+	.name = "b2upi",
+};
+
+static struct intel_uncore_type gnr_uncore_b2hot = {
+	.name = "b2hot",
+	.attr_update	= uncore_alias_groups,
+};
+
 static struct intel_uncore_type gnr_uncore_b2cmi = {
 	SPR_UNCORE_PCI_COMMON_FORMAT(),
 	.name = "b2cmi",
@@ -6727,21 +6743,21 @@ static struct intel_uncore_type *gnr_uncores[UNCORE_GNR_NUM_UNCORE_TYPES] = {
 	&gnr_uncore_ubox,
 	&spr_uncore_imc,
 	NULL,
+	&gnr_uncore_upi,
 	NULL,
 	NULL,
 	NULL,
+	&spr_uncore_cxlcm,
+	&spr_uncore_cxldp,
 	NULL,
-	NULL,
-	NULL,
-	NULL,
-	NULL,
+	&gnr_uncore_b2hot,
 	&gnr_uncore_b2cmi,
 	&gnr_uncore_b2cxl,
-	NULL,
+	&gnr_uncore_b2upi,
 	NULL,
 	&gnr_uncore_mdf_sbo,
-	NULL,
-	NULL,
+	&gnr_uncore_pciex16,
+	&gnr_uncore_pciex8,
 };
 
 static struct freerunning_counters gnr_iio_freerunning[] = {

From b709eb872e19a19607bbb6d2975bc264d59735cf Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes
Date: Fri, 3 Jan 2025 15:31:51 +0000
Subject: [PATCH 224/224] perf: map pages in advance

We are adjusting struct page to make it smaller, removing unneeded
fields which correctly belong to struct folio.

Two of those fields are page->index and page->mapping. Perf is
currently making use of both of these. This is unnecessary. This patch
eliminates this.

Perf establishes its own internally controlled memory-mapped pages
using vm_ops hooks.
The first page in the mapping is the read/write user control page, and
the rest of the mapping consists of read-only pages.

The VMA is backed by kernel memory either from the buddy allocator or
vmalloc depending on configuration. It is intended to be mapped
read/write, but because it has a page_mkwrite() hook,
vma_wants_writenotify() indicates that it should be mapped read-only.

When a write fault occurs, the provided page_mkwrite() hook,
perf_mmap_fault() (doing double duty handling faults as well), uses the
vmf->pgoff field to determine if this is the first page, allowing for
the desired read/write first page, read-only rest mapping.

For this to work the implementation has to carefully work around
faulting logic. When a page is write-faulted, the fault() hook is
called first, then its page_mkwrite() hook is called (to allow for
dirty tracking in file systems).

On fault we set the folio's mapping in perf_mmap_fault(); this is
because when do_page_mkwrite() is subsequently invoked, it treats a
missing mapping as an indicator that the fault should be retried. We
also set the folio's index so, given the folio is being treated as faux
user memory, it correctly references its offset within the VMA.

This explains why the mapping and index fields are used - but it's not
necessary.

We preallocate pages when perf_mmap() is called for the first time via
rb_alloc(), and further allocate auxiliary pages via rb_aux_alloc() as
needed if the mapping requires it.

This allocation is done in the f_ops->mmap() hook provided in
perf_mmap(), and so we can instead simply map all the memory right away
here - there's no point in handling (read) page faults when we don't
demand-page nor need to be notified about them (perf does not).

This patch therefore changes this logic to map everything when the
mmap() hook is called, establishing a PFN map. It implements
vm_ops->pfn_mkwrite() to provide the required read/write vs. read-only
behaviour, which does not require the previously implemented
workarounds.

While it is not ideal to use a VM_PFNMAP here, doing anything else will
result in the page_mkwrite() hook needing to be provided, which
requires the same page->mapping hack this patch seeks to undo. It will
also result in the pages being treated as folios and placed on the
rmap, which really does not make sense for these mappings.

Semantically it makes sense to establish this as some kind of special
mapping, as the pages are managed by perf and are not strictly user
pages, but currently the only means by which we can do so functionally
while maintaining the required R/W and R/O behaviour is a PFN map.

There should be no change to actual functionality as a result of this
change.
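As context for the control-page behaviour described above, here is a
minimal userspace sketch (not part of the patch; the event choice and
buffer size are arbitrary). Consuming the ring buffer means storing to
data_tail in page 0, which is precisely the write that must be allowed,
while the data pages from page 1 onward stay read-only:

/* perf_mmap_demo.c: the user side of the mapping discussed above. */
#include <linux/perf_event.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	struct perf_event_attr attr;
	long page = sysconf(_SC_PAGESIZE);
	size_t len = (1 + 8) * page;	/* control page + 2^3 data pages */

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_SOFTWARE;
	attr.config = PERF_COUNT_SW_CPU_CLOCK;
	attr.sample_period = 1000000;
	attr.sample_type = PERF_SAMPLE_IP;

	int fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}

	void *base = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (base == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* Page 0 is struct perf_event_mmap_page, the read/write control page. */
	struct perf_event_mmap_page *mp = base;
	uint64_t head = __atomic_load_n(&mp->data_head, __ATOMIC_ACQUIRE);

	/*
	 * Advancing data_tail is a store into page 0: the write that the
	 * mkwrite hook must permit. A store into any data page would
	 * instead take a fault that the kernel refuses.
	 */
	__atomic_store_n(&mp->data_tail, head, __ATOMIC_RELEASE);

	munmap(base, len);
	close(fd);
	return 0;
}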
Signed-off-by: Lorenzo Stoakes
Signed-off-by: Peter Zijlstra (Intel)
Link: https://lkml.kernel.org/r/20250103153151.124163-1-lorenzo.stoakes@oracle.com
---
 kernel/events/core.c        | 118 +++++++++++++++++++++++++-----------
 kernel/events/ring_buffer.c |  19 +-----
 2 files changed, 82 insertions(+), 55 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index b2bc67791f84..bcb09e011e9e 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6277,41 +6277,6 @@ unlock:
 }
 EXPORT_SYMBOL_GPL(perf_event_update_userpage);
 
-static vm_fault_t perf_mmap_fault(struct vm_fault *vmf)
-{
-	struct perf_event *event = vmf->vma->vm_file->private_data;
-	struct perf_buffer *rb;
-	vm_fault_t ret = VM_FAULT_SIGBUS;
-
-	if (vmf->flags & FAULT_FLAG_MKWRITE) {
-		if (vmf->pgoff == 0)
-			ret = 0;
-		return ret;
-	}
-
-	rcu_read_lock();
-	rb = rcu_dereference(event->rb);
-	if (!rb)
-		goto unlock;
-
-	if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
-		goto unlock;
-
-	vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
-	if (!vmf->page)
-		goto unlock;
-
-	get_page(vmf->page);
-	vmf->page->mapping = vmf->vma->vm_file->f_mapping;
-	vmf->page->index   = vmf->pgoff;
-
-	ret = 0;
-unlock:
-	rcu_read_unlock();
-
-	return ret;
-}
-
 static void ring_buffer_attach(struct perf_event *event,
 			       struct perf_buffer *rb)
 {
@@ -6551,13 +6516,87 @@ out_put:
 	ring_buffer_put(rb); /* could be last */
 }
 
+static vm_fault_t perf_mmap_pfn_mkwrite(struct vm_fault *vmf)
+{
+	/* The first page is the user control page, others are read-only. */
+	return vmf->pgoff == 0 ? 0 : VM_FAULT_SIGBUS;
+}
+
 static const struct vm_operations_struct perf_mmap_vmops = {
 	.open		= perf_mmap_open,
 	.close		= perf_mmap_close, /* non mergeable */
-	.fault		= perf_mmap_fault,
-	.page_mkwrite	= perf_mmap_fault,
+	.pfn_mkwrite	= perf_mmap_pfn_mkwrite,
 };
 
+static int map_range(struct perf_buffer *rb, struct vm_area_struct *vma)
+{
+	unsigned long nr_pages = vma_pages(vma);
+	int err = 0;
+	unsigned long pagenum;
+
+	/*
+	 * We map this as a VM_PFNMAP VMA.
+	 *
+	 * This is not ideal as this is designed broadly for mappings of PFNs
+	 * referencing memory-mapped I/O ranges or non-system RAM, i.e. for
+	 * which !pfn_valid(pfn).
+	 *
+	 * We are mapping kernel-allocated memory (memory we manage ourselves)
+	 * which would more ideally be mapped using vm_insert_page() or a
+	 * similar mechanism, that is as a VM_MIXEDMAP mapping.
+	 *
+	 * However this won't work here, because:
+	 *
+	 * 1. It uses vma->vm_page_prot, but this field has not been completely
+	 *    set up at the point of the f_op->mmap() hook, so we are unable to
+	 *    indicate that this should be mapped CoW in order that the
+	 *    mkwrite() hook can be invoked to make the first page R/W and the
+	 *    rest R/O as desired.
+	 *
+	 * 2. Anything other than a VM_PFNMAP of valid PFNs will result in
+	 *    vm_normal_page() returning a struct page * pointer, which means
+	 *    vm_ops->page_mkwrite() will be invoked rather than
+	 *    vm_ops->pfn_mkwrite(), and this means we have to set page->mapping
+	 *    to work around retry logic in the fault handler, however this
+	 *    field is no longer allowed to be used within struct page.
+	 *
+	 * 3. Having a struct page * made available in the fault logic also
+	 *    means that the page gets put on the rmap and becomes
+	 *    inappropriately accessible and subject to map and ref counting.
+	 *
+	 * Ideally we would have a mechanism that could explicitly express our
+	 * desires, but this is not currently the case, so we instead use
+	 * VM_PFNMAP.
+	 *
+	 * We manage the lifetime of these mappings with internal refcounts (see
+	 * perf_mmap_open() and perf_mmap_close()) so we ensure the lifetime of
+	 * this mapping is maintained correctly.
+	 */
+	for (pagenum = 0; pagenum < nr_pages; pagenum++) {
+		unsigned long va = vma->vm_start + PAGE_SIZE * pagenum;
+		struct page *page = perf_mmap_to_page(rb, vma->vm_pgoff + pagenum);
+
+		if (page == NULL) {
+			err = -EINVAL;
+			break;
+		}
+
+		/* Map readonly, perf_mmap_pfn_mkwrite() called on write fault. */
+		err = remap_pfn_range(vma, va, page_to_pfn(page), PAGE_SIZE,
+				      vm_get_page_prot(vma->vm_flags & ~VM_SHARED));
+		if (err)
+			break;
+	}
+
+#ifdef CONFIG_MMU
+	/* Clear any partial mappings on error. */
+	if (err)
+		zap_page_range_single(vma, vma->vm_start,
+				      nr_pages * PAGE_SIZE, NULL);
+#endif
+
+	return err;
+}
+
 static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	struct perf_event *event = file->private_data;
@@ -6682,6 +6721,8 @@ again:
 			goto again;
 		}
 
+		/* We need the rb to map pages. */
+		rb = event->rb;
 		goto unlock;
 	}
 
@@ -6776,6 +6817,9 @@ aux_unlock:
 	vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP);
 	vma->vm_ops = &perf_mmap_vmops;
 
+	if (!ret)
+		ret = map_range(rb, vma);
+
 	if (event->pmu->event_mapped)
 		event->pmu->event_mapped(event, vma->vm_mm);
 
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 4f46f688d0d4..180509132d4b 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -643,7 +643,6 @@ static void rb_free_aux_page(struct perf_buffer *rb, int idx)
 	struct page *page = virt_to_page(rb->aux_pages[idx]);
 
 	ClearPagePrivate(page);
-	page->mapping = NULL;
 	__free_page(page);
 }
 
@@ -819,7 +818,6 @@ static void perf_mmap_free_page(void *addr)
 {
 	struct page *page = virt_to_page(addr);
 
-	page->mapping = NULL;
 	__free_page(page);
 }
 
@@ -890,28 +888,13 @@ __perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff)
 	return vmalloc_to_page((void *)rb->user_page + pgoff * PAGE_SIZE);
 }
 
-static void perf_mmap_unmark_page(void *addr)
-{
-	struct page *page = vmalloc_to_page(addr);
-
-	page->mapping = NULL;
-}
-
 static void rb_free_work(struct work_struct *work)
 {
 	struct perf_buffer *rb;
-	void *base;
-	int i, nr;
 
 	rb = container_of(work, struct perf_buffer, work);
-	nr = data_page_nr(rb);
-	base = rb->user_page;
-	/* The '<=' counts in the user page. */
-	for (i = 0; i <= nr; i++)
-		perf_mmap_unmark_page(base + (i * PAGE_SIZE));
-
-	vfree(base);
+	vfree(rb->user_page);
 	kfree(rb);
 }
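For reference, the technique map_range() relies on can be distilled into
the following hypothetical sketch (all demo_* names are invented; this is
a restatement of the pattern under the same assumptions as the patch, not
kernel code taken from it): pre-populate the PTEs read-only at mmap() time
and let pfn_mkwrite() decide which pages may be upgraded to writable.

#include <linux/mm.h>

static vm_fault_t demo_pfn_mkwrite(struct vm_fault *vmf)
{
	/* Allow write upgrades only on page 0 of the mapping. */
	return vmf->pgoff == 0 ? 0 : VM_FAULT_SIGBUS;
}

static const struct vm_operations_struct demo_vm_ops = {
	.pfn_mkwrite = demo_pfn_mkwrite,
};

/* pages[] holds the nr_pages kernel-allocated pages backing the region. */
static int demo_map_pages(struct vm_area_struct *vma, struct page **pages,
			  unsigned long nr_pages)
{
	unsigned long i;

	for (i = 0; i < nr_pages; i++) {
		unsigned long va = vma->vm_start + i * PAGE_SIZE;
		/*
		 * Clearing VM_SHARED when computing the protection installs
		 * read-only PTEs, so the first write to any page takes the
		 * write-notify path and invokes demo_pfn_mkwrite().
		 * remap_pfn_range() itself marks the VMA VM_IO | VM_PFNMAP.
		 */
		int err = remap_pfn_range(vma, va, page_to_pfn(pages[i]),
					  PAGE_SIZE,
					  vm_get_page_prot(vma->vm_flags & ~VM_SHARED));
		if (err)
			return err;
	}

	vma->vm_ops = &demo_vm_ops;
	return 0;
}

The design cost, as the comment in map_range() spells out, is that a
VM_PFNMAP nominally describes memory without struct pages; here the pages
do exist, but keeping them invisible to vm_normal_page() is exactly what
keeps them off the rmap and keeps page_mkwrite(), and with it the
page->mapping workaround, out of the picture.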