From e77ab93e0e042f2fc1cad77255a3365c9c803362 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Sun, 8 Dec 2024 23:26:49 +0900 Subject: [PATCH 01/56] kprobes: Reduce preempt disable scope in check_kprobe_address_safe() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit a189d0350f387 ("kprobes: disable preempt for module_text_address() and kernel_text_address()") introduced a preempt_disable() region to protect against concurrent module unloading. However, this region also includes the call to jump_label_text_reserved(), which takes a long time, up to 400us, iterating over approximately 6000 jump tables. The scope protected by preempt_disable() is larger than necessary. core_kernel_text() does not need to be protected as it does not interact with module code at all. Only the scope from __module_text_address() to try_module_get() needs to be protected. By limiting the critical section to __module_text_address() and try_module_get(), the function responsible for the latency spike remains preemptible. This works fine even when !CONFIG_MODULES, as in that case try_module_get() will always return true and that block can be optimized away. Limit the critical section to __module_text_address() and try_module_get(). Use guard(preempt)() for easier error handling. While at it, also remove a spurious '*probed_mod = NULL' in an error path. On errors the output parameter is never inspected by the caller. Some error paths were clearing the parameter and some were not; align them for clarity. Link: https://lore.kernel.org/all/20241121-kprobes-preempt-v1-1-fd581ee7fcbb@linutronix.de/ Signed-off-by: Thomas Weißschuh Reviewed-by: Sebastian Andrzej Siewior Signed-off-by: Masami Hiramatsu (Google) --- kernel/kprobes.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/kernel/kprobes.c b/kernel/kprobes.c index b027a4030976..cb9dbdafbbcf 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include @@ -1566,16 +1567,25 @@ static int check_kprobe_address_safe(struct kprobe *p, if (ret) return ret; jump_label_lock(); - preempt_disable(); /* Ensure the address is in a text area, and find a module if exists. */ *probed_mod = NULL; if (!core_kernel_text((unsigned long) p->addr)) { + guard(preempt)(); *probed_mod = __module_text_address((unsigned long) p->addr); if (!(*probed_mod)) { ret = -EINVAL; goto out; } + + /* + * We must hold a refcount of the probed module while updating + * its code to prohibit unexpected unloading. + */ + if (unlikely(!try_module_get(*probed_mod))) { + ret = -ENOENT; + goto out; + } } /* Ensure it is not in reserved area. */ if (in_gate_area_no_mm((unsigned long) p->addr) || @@ -1584,21 +1594,13 @@ static int check_kprobe_address_safe(struct kprobe *p, static_call_text_reserved(p->addr, p->addr) || find_bug((unsigned long)p->addr) || is_cfi_preamble_symbol((unsigned long)p->addr)) { + module_put(*probed_mod); ret = -EINVAL; goto out; } /* Get module refcount and reject __init functions for loaded modules. */ if (IS_ENABLED(CONFIG_MODULES) && *probed_mod) { - /* - * We must hold a refcount of the probed module while updating - * its code to prohibit unexpected unloading. - */ - if (unlikely(!try_module_get(*probed_mod))) { - ret = -ENOENT; - goto out; - } - /* * If the module freed '.init.text', we couldn't insert * kprobes in there.
@@ -1606,13 +1608,11 @@ static int check_kprobe_address_safe(struct kprobe *p, if (within_module_init((unsigned long)p->addr, *probed_mod) && !module_is_coming(*probed_mod)) { module_put(*probed_mod); - *probed_mod = NULL; ret = -ENOENT; } } out: - preempt_enable(); jump_label_unlock(); return ret; From b6b68a3e64b368c5ec788109f6a25b312478e552 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Sat, 30 Nov 2024 01:47:58 +0900 Subject: [PATCH 02/56] kprobes: Adopt guard() and scoped_guard() Use guard() or scoped_guard() for critical sections rather than discrete lock/unlock pairs. Link: https://lore.kernel.org/all/173289887835.73724.608223217359025939.stgit@devnote2/ Signed-off-by: Masami Hiramatsu (Google) --- kernel/kprobes.c | 215 +++++++++++++++++++++-------------------------- 1 file changed, 94 insertions(+), 121 deletions(-) diff --git a/kernel/kprobes.c b/kernel/kprobes.c index cb9dbdafbbcf..62b5b08d809d 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -141,10 +141,9 @@ static int collect_garbage_slots(struct kprobe_insn_cache *c); kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c) { struct kprobe_insn_page *kip; - kprobe_opcode_t *slot = NULL; /* Since the slot array is not protected by rcu, we need a mutex */ - mutex_lock(&c->mutex); + guard(mutex)(&c->mutex); retry: rcu_read_lock(); list_for_each_entry_rcu(kip, &c->pages, list) { @@ -155,9 +154,8 @@ kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c) if (kip->slot_used[i] == SLOT_CLEAN) { kip->slot_used[i] = SLOT_USED; kip->nused++; - slot = kip->insns + (i * c->insn_size); rcu_read_unlock(); - goto out; + return kip->insns + (i * c->insn_size); } } /* kip->nused is broken. Fix it. */ @@ -174,12 +172,12 @@ kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c) /* All out of space. Need to allocate a new page. */ kip = kmalloc(struct_size(kip, slot_used, slots_per_page(c)), GFP_KERNEL); if (!kip) - goto out; + return NULL; kip->insns = c->alloc(); if (!kip->insns) { kfree(kip); - goto out; + return NULL; } INIT_LIST_HEAD(&kip->list); memset(kip->slot_used, SLOT_CLEAN, slots_per_page(c)); @@ -188,14 +186,12 @@ kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c) kip->ngarbage = 0; kip->cache = c; list_add_rcu(&kip->list, &c->pages); - slot = kip->insns; /* Record the perf ksymbol register event after adding the page */ perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL, (unsigned long)kip->insns, PAGE_SIZE, false, c->sym); -out: - mutex_unlock(&c->mutex); - return slot; + + return kip->insns; } /* Return true if all garbages are collected, otherwise false. 
*/ @@ -256,7 +252,7 @@ void __free_insn_slot(struct kprobe_insn_cache *c, struct kprobe_insn_page *kip; long idx; - mutex_lock(&c->mutex); + guard(mutex)(&c->mutex); rcu_read_lock(); list_for_each_entry_rcu(kip, &c->pages, list) { idx = ((long)slot - (long)kip->insns) / @@ -282,7 +278,6 @@ out: collect_one_slot(kip, idx); } } - mutex_unlock(&c->mutex); } /* @@ -638,10 +633,9 @@ static void kprobe_optimizer(struct work_struct *work) mutex_unlock(&kprobe_mutex); } -/* Wait for completing optimization and unoptimization */ -void wait_for_kprobe_optimizer(void) +static void wait_for_kprobe_optimizer_locked(void) { - mutex_lock(&kprobe_mutex); + lockdep_assert_held(&kprobe_mutex); while (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list)) { mutex_unlock(&kprobe_mutex); @@ -653,8 +647,14 @@ void wait_for_kprobe_optimizer(void) mutex_lock(&kprobe_mutex); } +} - mutex_unlock(&kprobe_mutex); +/* Wait for completing optimization and unoptimization */ +void wait_for_kprobe_optimizer(void) +{ + guard(mutex)(&kprobe_mutex); + + wait_for_kprobe_optimizer_locked(); } bool optprobe_queued_unopt(struct optimized_kprobe *op) @@ -884,10 +884,10 @@ static void optimize_all_kprobes(void) struct kprobe *p; unsigned int i; - mutex_lock(&kprobe_mutex); + guard(mutex)(&kprobe_mutex); /* If optimization is already allowed, just return. */ if (kprobes_allow_optimization) - goto out; + return; cpus_read_lock(); kprobes_allow_optimization = true; @@ -899,8 +899,6 @@ static void optimize_all_kprobes(void) } cpus_read_unlock(); pr_info("kprobe jump-optimization is enabled. All kprobes are optimized if possible.\n"); -out: - mutex_unlock(&kprobe_mutex); } #ifdef CONFIG_SYSCTL @@ -910,12 +908,10 @@ static void unoptimize_all_kprobes(void) struct kprobe *p; unsigned int i; - mutex_lock(&kprobe_mutex); + guard(mutex)(&kprobe_mutex); /* If optimization is already prohibited, just return. */ - if (!kprobes_allow_optimization) { - mutex_unlock(&kprobe_mutex); + if (!kprobes_allow_optimization) return; - } cpus_read_lock(); kprobes_allow_optimization = false; @@ -927,10 +923,8 @@ static void unoptimize_all_kprobes(void) } } cpus_read_unlock(); - mutex_unlock(&kprobe_mutex); - /* Wait for unoptimizing completion. */ - wait_for_kprobe_optimizer(); + wait_for_kprobe_optimizer_locked(); pr_info("kprobe jump-optimization is disabled. All kprobes are based on software breakpoint.\n"); } @@ -942,7 +936,7 @@ static int proc_kprobes_optimization_handler(const struct ctl_table *table, { int ret; - mutex_lock(&kprobe_sysctl_mutex); + guard(mutex)(&kprobe_sysctl_mutex); sysctl_kprobes_optimization = kprobes_allow_optimization ? 
1 : 0; ret = proc_dointvec_minmax(table, write, buffer, length, ppos); @@ -950,7 +944,6 @@ static int proc_kprobes_optimization_handler(const struct ctl_table *table, optimize_all_kprobes(); else unoptimize_all_kprobes(); - mutex_unlock(&kprobe_sysctl_mutex); return ret; } @@ -1025,7 +1018,8 @@ static void __disarm_kprobe(struct kprobe *p, bool reopt) #define __arm_kprobe(p) arch_arm_kprobe(p) #define __disarm_kprobe(p, o) arch_disarm_kprobe(p) #define kprobe_disarmed(p) kprobe_disabled(p) -#define wait_for_kprobe_optimizer() do {} while (0) +#define wait_for_kprobe_optimizer_locked() \ + lockdep_assert_held(&kprobe_mutex) static int reuse_unused_kprobe(struct kprobe *ap) { @@ -1489,6 +1483,7 @@ invalid: static kprobe_opcode_t *kprobe_addr(struct kprobe *p) { bool on_func_entry; + return _kprobe_addr(p->addr, p->symbol_name, p->offset, &on_func_entry); } @@ -1523,14 +1518,12 @@ valid: */ static inline int warn_kprobe_rereg(struct kprobe *p) { - int ret = 0; + guard(mutex)(&kprobe_mutex); - mutex_lock(&kprobe_mutex); if (WARN_ON_ONCE(__get_valid_kprobe(p))) - ret = -EINVAL; - mutex_unlock(&kprobe_mutex); + return -EINVAL; - return ret; + return 0; } static int check_ftrace_location(struct kprobe *p) @@ -1618,15 +1611,52 @@ out: return ret; } -int register_kprobe(struct kprobe *p) +static int __register_kprobe(struct kprobe *p) { int ret; struct kprobe *old_p; + + guard(mutex)(&kprobe_mutex); + + old_p = get_kprobe(p->addr); + if (old_p) + /* Since this may unoptimize 'old_p', locking 'text_mutex'. */ + return register_aggr_kprobe(old_p, p); + + cpus_read_lock(); + /* Prevent text modification */ + mutex_lock(&text_mutex); + ret = prepare_kprobe(p); + mutex_unlock(&text_mutex); + cpus_read_unlock(); + if (ret) + return ret; + + INIT_HLIST_NODE(&p->hlist); + hlist_add_head_rcu(&p->hlist, + &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); + + if (!kprobes_all_disarmed && !kprobe_disabled(p)) { + ret = arm_kprobe(p); + if (ret) { + hlist_del_rcu(&p->hlist); + synchronize_rcu(); + } + } + + /* Try to optimize kprobe */ + try_to_optimize_kprobe(p); + return 0; +} + +int register_kprobe(struct kprobe *p) +{ + int ret; struct module *probed_mod; kprobe_opcode_t *addr; bool on_func_entry; - /* Adjust probe address from symbol */ + /* Canonicalize probe address from symbol */ addr = _kprobe_addr(p->addr, p->symbol_name, p->offset, &on_func_entry); if (IS_ERR(addr)) return PTR_ERR(addr); @@ -1638,6 +1668,8 @@ int register_kprobe(struct kprobe *p) /* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */ p->flags &= KPROBE_FLAG_DISABLED; + if (on_func_entry) + p->flags |= KPROBE_FLAG_ON_FUNC_ENTRY; p->nmissed = 0; INIT_LIST_HEAD(&p->list); @@ -1645,44 +1677,7 @@ int register_kprobe(struct kprobe *p) if (ret) return ret; - mutex_lock(&kprobe_mutex); - - if (on_func_entry) - p->flags |= KPROBE_FLAG_ON_FUNC_ENTRY; - - old_p = get_kprobe(p->addr); - if (old_p) { - /* Since this may unoptimize 'old_p', locking 'text_mutex'. 
*/ - ret = register_aggr_kprobe(old_p, p); - goto out; - } - - cpus_read_lock(); - /* Prevent text modification */ - mutex_lock(&text_mutex); - ret = prepare_kprobe(p); - mutex_unlock(&text_mutex); - cpus_read_unlock(); - if (ret) - goto out; - - INIT_HLIST_NODE(&p->hlist); - hlist_add_head_rcu(&p->hlist, - &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); - - if (!kprobes_all_disarmed && !kprobe_disabled(p)) { - ret = arm_kprobe(p); - if (ret) { - hlist_del_rcu(&p->hlist); - synchronize_rcu(); - goto out; - } - } - - /* Try to optimize kprobe */ - try_to_optimize_kprobe(p); -out: - mutex_unlock(&kprobe_mutex); + ret = __register_kprobe(p); if (probed_mod) module_put(probed_mod); @@ -1858,12 +1853,11 @@ void unregister_kprobes(struct kprobe **kps, int num) if (num <= 0) return; - mutex_lock(&kprobe_mutex); - for (i = 0; i < num; i++) - if (__unregister_kprobe_top(kps[i]) < 0) - kps[i]->addr = NULL; - mutex_unlock(&kprobe_mutex); - + scoped_guard(mutex, &kprobe_mutex) { + for (i = 0; i < num; i++) + if (__unregister_kprobe_top(kps[i]) < 0) + kps[i]->addr = NULL; + } synchronize_rcu(); for (i = 0; i < num; i++) if (kps[i]->addr) @@ -2302,8 +2296,9 @@ void unregister_kretprobes(struct kretprobe **rps, int num) if (num <= 0) return; - mutex_lock(&kprobe_mutex); for (i = 0; i < num; i++) { + guard(mutex)(&kprobe_mutex); + if (__unregister_kprobe_top(&rps[i]->kp) < 0) rps[i]->kp.addr = NULL; #ifdef CONFIG_KRETPROBE_ON_RETHOOK @@ -2312,7 +2307,6 @@ void unregister_kretprobes(struct kretprobe **rps, int num) rcu_assign_pointer(rps[i]->rph->rp, NULL); #endif } - mutex_unlock(&kprobe_mutex); synchronize_rcu(); for (i = 0; i < num; i++) { @@ -2393,18 +2387,14 @@ static void kill_kprobe(struct kprobe *p) /* Disable one kprobe */ int disable_kprobe(struct kprobe *kp) { - int ret = 0; struct kprobe *p; - mutex_lock(&kprobe_mutex); + guard(mutex)(&kprobe_mutex); /* Disable this kprobe */ p = __disable_kprobe(kp); - if (IS_ERR(p)) - ret = PTR_ERR(p); - mutex_unlock(&kprobe_mutex); - return ret; + return IS_ERR(p) ? PTR_ERR(p) : 0; } EXPORT_SYMBOL_GPL(disable_kprobe); @@ -2414,20 +2404,16 @@ int enable_kprobe(struct kprobe *kp) int ret = 0; struct kprobe *p; - mutex_lock(&kprobe_mutex); + guard(mutex)(&kprobe_mutex); /* Check whether specified probe is valid. */ p = __get_valid_kprobe(kp); - if (unlikely(p == NULL)) { - ret = -EINVAL; - goto out; - } + if (unlikely(p == NULL)) + return -EINVAL; - if (kprobe_gone(kp)) { + if (kprobe_gone(kp)) /* This kprobe has gone, we couldn't enable it. */ - ret = -EINVAL; - goto out; - } + return -EINVAL; if (p != kp) kp->flags &= ~KPROBE_FLAG_DISABLED; @@ -2441,8 +2427,6 @@ int enable_kprobe(struct kprobe *kp) kp->flags |= KPROBE_FLAG_DISABLED; } } -out: - mutex_unlock(&kprobe_mutex); return ret; } EXPORT_SYMBOL_GPL(enable_kprobe); @@ -2630,11 +2614,11 @@ static int kprobes_module_callback(struct notifier_block *nb, unsigned int i; int checkcore = (val == MODULE_STATE_GOING); - if (val == MODULE_STATE_COMING) { - mutex_lock(&kprobe_mutex); + guard(mutex)(&kprobe_mutex); + + if (val == MODULE_STATE_COMING) add_module_kprobe_blacklist(mod); - mutex_unlock(&kprobe_mutex); - } + if (val != MODULE_STATE_GOING && val != MODULE_STATE_LIVE) return NOTIFY_DONE; @@ -2644,7 +2628,6 @@ static int kprobes_module_callback(struct notifier_block *nb, * notified, only '.init.text' section would be freed. We need to * disable kprobes which have been inserted in the sections. 
*/ - mutex_lock(&kprobe_mutex); for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; hlist_for_each_entry(p, head, hlist) @@ -2667,7 +2650,6 @@ static int kprobes_module_callback(struct notifier_block *nb, } if (val == MODULE_STATE_GOING) remove_module_kprobe_blacklist(mod); - mutex_unlock(&kprobe_mutex); return NOTIFY_DONE; } @@ -2695,7 +2677,7 @@ void kprobe_free_init_mem(void) struct kprobe *p; int i; - mutex_lock(&kprobe_mutex); + guard(mutex)(&kprobe_mutex); /* Kill all kprobes on initmem because the target code has been freed. */ for (i = 0; i < KPROBE_TABLE_SIZE; i++) { @@ -2705,8 +2687,6 @@ void kprobe_free_init_mem(void) kill_kprobe(p); } } - - mutex_unlock(&kprobe_mutex); } static int __init init_kprobes(void) @@ -2902,11 +2882,11 @@ static int arm_all_kprobes(void) unsigned int i, total = 0, errors = 0; int err, ret = 0; - mutex_lock(&kprobe_mutex); + guard(mutex)(&kprobe_mutex); /* If kprobes are armed, just return */ if (!kprobes_all_disarmed) - goto already_enabled; + return 0; /* * optimize_kprobe() called by arm_kprobe() checks @@ -2936,8 +2916,6 @@ static int arm_all_kprobes(void) else pr_info("Kprobes globally enabled\n"); -already_enabled: - mutex_unlock(&kprobe_mutex); return ret; } @@ -2948,13 +2926,11 @@ static int disarm_all_kprobes(void) unsigned int i, total = 0, errors = 0; int err, ret = 0; - mutex_lock(&kprobe_mutex); + guard(mutex)(&kprobe_mutex); /* If kprobes are already disarmed, just return */ - if (kprobes_all_disarmed) { - mutex_unlock(&kprobe_mutex); + if (kprobes_all_disarmed) return 0; - } kprobes_all_disarmed = true; @@ -2979,11 +2955,8 @@ static int disarm_all_kprobes(void) else pr_info("Kprobes globally disabled\n"); - mutex_unlock(&kprobe_mutex); - /* Wait for disarming all kprobes by optimizer */ - wait_for_kprobe_optimizer(); - + wait_for_kprobe_optimizer_locked(); return ret; } From 93bc1de21e68bad7d0b0bdbd26998c714e9bf96b Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Sat, 30 Nov 2024 01:48:08 +0900 Subject: [PATCH 03/56] tracing/kprobe: Adopt guard() and scoped_guard() Use guard() or scoped_guard() in kprobe events for critical sections rather than discrete lock/unlock pairs. 
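The shape of that conversion, as a minimal sketch (check_event() and update_event() are illustrative stand-ins, not functions from this patch):

    /* Before: every error path has to remember to unlock */
    static int register_something(void)
    {
            int ret;

            mutex_lock(&event_mutex);
            ret = check_event();
            if (ret < 0)
                    goto out;
            ret = update_event();
    out:
            mutex_unlock(&event_mutex);
            return ret;
    }

    /* After: guard() from <linux/cleanup.h> takes the lock now and
     * releases it automatically when the scope is left, on any return.
     */
    static int register_something(void)
    {
            int ret;

            guard(mutex)(&event_mutex);

            ret = check_event();
            if (ret < 0)
                    return ret;
            return update_event();
    }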
Link: https://lore.kernel.org/all/173289888883.73724.6586200652276577583.stgit@devnote2/ Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/trace_kprobe.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 263fac44d3ca..bae26eb14449 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -634,7 +634,7 @@ static int register_trace_kprobe(struct trace_kprobe *tk) struct trace_kprobe *old_tk; int ret; - mutex_lock(&event_mutex); + guard(mutex)(&event_mutex); old_tk = find_trace_kprobe(trace_probe_name(&tk->tp), trace_probe_group_name(&tk->tp)); @@ -642,11 +642,9 @@ static int register_trace_kprobe(struct trace_kprobe *tk) if (trace_kprobe_is_return(tk) != trace_kprobe_is_return(old_tk)) { trace_probe_log_set_index(0); trace_probe_log_err(0, DIFF_PROBE_TYPE); - ret = -EEXIST; - } else { - ret = append_trace_kprobe(tk, old_tk); + return -EEXIST; } - goto end; + return append_trace_kprobe(tk, old_tk); } /* Register new event */ @@ -657,7 +655,7 @@ static int register_trace_kprobe(struct trace_kprobe *tk) trace_probe_log_err(0, EVENT_EXIST); } else pr_warn("Failed to register probe event(%d)\n", ret); - goto end; + return ret; } /* Register k*probe */ @@ -672,8 +670,6 @@ static int register_trace_kprobe(struct trace_kprobe *tk) else dyn_event_add(&tk->devent, trace_probe_event_call(&tk->tp)); -end: - mutex_unlock(&event_mutex); return ret; } @@ -706,7 +702,7 @@ static int trace_kprobe_module_callback(struct notifier_block *nb, return NOTIFY_DONE; /* Update probes on coming module */ - mutex_lock(&event_mutex); + guard(mutex)(&event_mutex); for_each_trace_kprobe(tk, pos) { if (trace_kprobe_within_module(tk, mod)) { /* Don't need to check busy - this should have gone. */ @@ -718,7 +714,6 @@ static int trace_kprobe_module_callback(struct notifier_block *nb, module_name(mod), ret); } } - mutex_unlock(&event_mutex); return NOTIFY_DONE; } @@ -1968,13 +1963,12 @@ static __init void enable_boot_kprobe_events(void) struct trace_kprobe *tk; struct dyn_event *pos; - mutex_lock(&event_mutex); + guard(mutex)(&event_mutex); for_each_trace_kprobe(tk, pos) { list_for_each_entry(file, &tr->events, list) if (file->event_call == trace_probe_event_call(&tk->tp)) trace_event_enable_disable(file, 1, 0); } - mutex_unlock(&event_mutex); } static __init void setup_boot_kprobe_events(void) From c776572f1ec6e8817261c4623220f5282483370e Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Sat, 30 Nov 2024 01:48:19 +0900 Subject: [PATCH 04/56] tracing/uprobe: Adopt guard() and scoped_guard() Use guard() or scoped_guard() in uprobe events for critical sections rather than discrete lock/unlock pairs. 
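Where only part of a function forms the critical section, scoped_guard() bounds the lock to an explicit block instead of the whole function; a minimal sketch (lookup_and_update() is an illustrative stand-in; the eprobe patch later in this series uses exactly this form):

    int ret;

    scoped_guard(mutex, &event_mutex) {
            /* event_mutex is held only inside this block */
            ret = lookup_and_update();
    }
    /* event_mutex has been released here */
    if (ret < 0)
            return ret;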
Link: https://lore.kernel.org/all/173289889911.73724.12457932738419630525.stgit@devnote2/ Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/trace_uprobe.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index fed382b7881b..e91a4248d97b 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -498,11 +498,11 @@ static int register_trace_uprobe(struct trace_uprobe *tu) struct trace_uprobe *old_tu; int ret; - mutex_lock(&event_mutex); + guard(mutex)(&event_mutex); ret = validate_ref_ctr_offset(tu); if (ret) - goto end; + return ret; /* register as an event */ old_tu = find_probe_event(trace_probe_name(&tu->tp), @@ -511,11 +511,9 @@ static int register_trace_uprobe(struct trace_uprobe *tu) if (is_ret_probe(tu) != is_ret_probe(old_tu)) { trace_probe_log_set_index(0); trace_probe_log_err(0, DIFF_PROBE_TYPE); - ret = -EEXIST; - } else { - ret = append_trace_uprobe(tu, old_tu); + return -EEXIST; } - goto end; + return append_trace_uprobe(tu, old_tu); } ret = register_uprobe_event(tu); @@ -525,14 +523,11 @@ static int register_trace_uprobe(struct trace_uprobe *tu) trace_probe_log_err(0, EVENT_EXIST); } else pr_warn("Failed to register probe event(%d)\n", ret); - goto end; + return ret; } dyn_event_add(&tu->devent, trace_probe_event_call(&tu->tp)); -end: - mutex_unlock(&event_mutex); - return ret; } From 36364aa071c14ffa187d8ef92c090cd61b18af3d Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Sat, 30 Nov 2024 01:48:30 +0900 Subject: [PATCH 05/56] tracing/eprobe: Adopt guard() and scoped_guard() Use guard() or scoped_guard() in eprobe events for critical sections rather than discrete lock/unlock pairs. Link: https://lore.kernel.org/all/173289890996.73724.17421347964110362029.stgit@devnote2/ Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/trace_eprobe.c | 36 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index be8be0c1aaf0..82fd637cfc19 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -917,10 +917,10 @@ static int __trace_eprobe_create(int argc, const char *argv[]) goto error; } - mutex_lock(&event_mutex); - event_call = find_and_get_event(sys_name, sys_event); - ep = alloc_event_probe(group, event, event_call, argc - 2); - mutex_unlock(&event_mutex); + scoped_guard(mutex, &event_mutex) { + event_call = find_and_get_event(sys_name, sys_event); + ep = alloc_event_probe(group, event, event_call, argc - 2); + } if (IS_ERR(ep)) { ret = PTR_ERR(ep); @@ -952,23 +952,21 @@ static int __trace_eprobe_create(int argc, const char *argv[]) if (ret < 0) goto error; init_trace_eprobe_call(ep); - mutex_lock(&event_mutex); - ret = trace_probe_register_event_call(&ep->tp); - if (ret) { - if (ret == -EEXIST) { - trace_probe_log_set_index(0); - trace_probe_log_err(0, EVENT_EXIST); + scoped_guard(mutex, &event_mutex) { + ret = trace_probe_register_event_call(&ep->tp); + if (ret) { + if (ret == -EEXIST) { + trace_probe_log_set_index(0); + trace_probe_log_err(0, EVENT_EXIST); + } + goto error; + } + ret = dyn_event_add(&ep->devent, &ep->tp.event->call); + if (ret < 0) { + trace_probe_unregister_event_call(&ep->tp); + goto error; } - mutex_unlock(&event_mutex); - goto error; } - ret = dyn_event_add(&ep->devent, &ep->tp.event->call); - if (ret < 0) { - trace_probe_unregister_event_call(&ep->tp); - mutex_unlock(&event_mutex); - goto error; - } 
- mutex_unlock(&event_mutex); return ret; parse_error: ret = -EINVAL; From 7d137e604aaacf2723ced2fca0b46b3f563e1d5e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 23 Dec 2024 13:46:19 -0500 Subject: [PATCH 06/56] fgraph: Remove unnecessary disabling of interrupts and recursion The function graph tracer disables interrupts as well as prevents recursion via NMIs when recording the graph tracer code. There's no reason to do this today. That disabling goes back to 2008 when the function graph tracer was first introduced and recursion protection wasn't part of the code. Today, there's no reason to disable interrupts or prevent the code from recursing as the infrastructure can easily handle it. Before this change: ~# echo function_graph > /sys/kernel/tracing/current_tracer ~# perf stat -r 10 ./hackbench 10 Time: 4.240 Time: 4.236 Time: 4.106 Time: 4.014 Time: 4.314 Time: 3.830 Time: 4.063 Time: 4.323 Time: 3.763 Time: 3.727 Performance counter stats for '/work/c/hackbench 10' (10 runs): 33,937.20 msec task-clock # 7.008 CPUs utilized ( +- 1.85% ) 18,220 context-switches # 536.874 /sec ( +- 6.41% ) 624 cpu-migrations # 18.387 /sec ( +- 9.07% ) 11,319 page-faults # 333.528 /sec ( +- 1.97% ) 76,657,643,617 cycles # 2.259 GHz ( +- 0.40% ) 141,403,302,768 instructions # 1.84 insn per cycle ( +- 0.37% ) 25,518,463,888 branches # 751.932 M/sec ( +- 0.35% ) 156,151,050 branch-misses # 0.61% of all branches ( +- 0.63% ) 4.8423 +- 0.0892 seconds time elapsed ( +- 1.84% ) After this change: ~# echo function_graph > /sys/kernel/tracing/current_tracer ~# perf stat -r 10 ./hackbench 10 Time: 3.340 Time: 3.192 Time: 3.129 Time: 2.579 Time: 2.589 Time: 2.798 Time: 2.791 Time: 2.955 Time: 3.044 Time: 3.065 Performance counter stats for './hackbench 10' (10 runs): 24,416.30 msec task-clock # 6.996 CPUs utilized ( +- 2.74% ) 16,764 context-switches # 686.590 /sec ( +- 5.85% ) 469 cpu-migrations # 19.208 /sec ( +- 6.14% ) 11,519 page-faults # 471.775 /sec ( +- 1.92% ) 53,895,628,450 cycles # 2.207 GHz ( +- 0.52% ) 105,552,664,638 instructions # 1.96 insn per cycle ( +- 0.47% ) 17,808,672,667 branches # 729.376 M/sec ( +- 0.48% ) 133,075,435 branch-misses # 0.75% of all branches ( +- 0.59% ) 3.490 +- 0.112 seconds time elapsed ( +- 3.22% ) Also removed unneeded "unlikely()" around the retaddr code. 
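Condensed from the hunks below, the pattern change in both trace_graph_entry() and trace_graph_return() is:

    /* Before: interrupts off, atomic counter as the recursion gate */
    local_irq_save(flags);
    disabled = atomic_inc_return(&data->disabled);
    if (likely(disabled == 1))
            ret = __trace_graph_entry(tr, trace, tracing_gen_ctx_flags(flags));
    atomic_dec(&data->disabled);
    local_irq_restore(flags);

    /* After: disabling preemption is enough; the generic ftrace
     * recursion protection prevents re-entry, so the counter is
     * only read here, never modified.
     */
    preempt_disable_notrace();
    disabled = atomic_read(&data->disabled);
    if (likely(!disabled))
            ret = __trace_graph_entry(tr, trace, tracing_gen_ctx());
    preempt_enable_notrace();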
Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Link: https://lore.kernel.org/20241223184941.204074053@goodmis.org Fixes: 9cd2992f2d6c8 ("fgraph: Have set_graph_notrace only affect function_graph tracer") # Performance only Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_functions_graph.c | 37 +++++++++++----------------- 1 file changed, 15 insertions(+), 22 deletions(-) diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 5504b5e4e7b4..f513603d7df9 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -181,10 +181,9 @@ int trace_graph_entry(struct ftrace_graph_ent *trace, struct trace_array *tr = gops->private; struct trace_array_cpu *data; struct fgraph_times *ftimes; - unsigned long flags; unsigned int trace_ctx; long disabled; - int ret; + int ret = 0; int cpu; if (*task_var & TRACE_GRAPH_NOTRACE) @@ -235,25 +234,21 @@ int trace_graph_entry(struct ftrace_graph_ent *trace, if (tracing_thresh) return 1; - local_irq_save(flags); + preempt_disable_notrace(); cpu = raw_smp_processor_id(); data = per_cpu_ptr(tr->array_buffer.data, cpu); - disabled = atomic_inc_return(&data->disabled); - if (likely(disabled == 1)) { - trace_ctx = tracing_gen_ctx_flags(flags); - if (unlikely(IS_ENABLED(CONFIG_FUNCTION_GRAPH_RETADDR) && - tracer_flags_is_set(TRACE_GRAPH_PRINT_RETADDR))) { + disabled = atomic_read(&data->disabled); + if (likely(!disabled)) { + trace_ctx = tracing_gen_ctx(); + if (IS_ENABLED(CONFIG_FUNCTION_GRAPH_RETADDR) && + tracer_flags_is_set(TRACE_GRAPH_PRINT_RETADDR)) { unsigned long retaddr = ftrace_graph_top_ret_addr(current); - ret = __trace_graph_retaddr_entry(tr, trace, trace_ctx, retaddr); - } else + } else { ret = __trace_graph_entry(tr, trace, trace_ctx); - } else { - ret = 0; + } } - - atomic_dec(&data->disabled); - local_irq_restore(flags); + preempt_enable_notrace(); return ret; } @@ -320,7 +315,6 @@ void trace_graph_return(struct ftrace_graph_ret *trace, struct trace_array *tr = gops->private; struct trace_array_cpu *data; struct fgraph_times *ftimes; - unsigned long flags; unsigned int trace_ctx; long disabled; int size; @@ -341,16 +335,15 @@ void trace_graph_return(struct ftrace_graph_ret *trace, trace->calltime = ftimes->calltime; - local_irq_save(flags); + preempt_disable_notrace(); cpu = raw_smp_processor_id(); data = per_cpu_ptr(tr->array_buffer.data, cpu); - disabled = atomic_inc_return(&data->disabled); - if (likely(disabled == 1)) { - trace_ctx = tracing_gen_ctx_flags(flags); + disabled = atomic_read(&data->disabled); + if (likely(!disabled)) { + trace_ctx = tracing_gen_ctx(); __trace_graph_return(tr, trace, trace_ctx); } - atomic_dec(&data->disabled); - local_irq_restore(flags); + preempt_enable_notrace(); } static void trace_graph_thresh_return(struct ftrace_graph_ret *trace, From ac8c3b02fc33be9deda48532326d301333d39f16 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 23 Dec 2024 13:46:20 -0500 Subject: [PATCH 07/56] ftrace: Do not disable interrupts in profiler The function profiler disables interrupts before processing. This was there since the profiler was introduced back in 2009 when there were recursion issues to deal with. The function tracer is much more robust today and has its own internal recursion protection. There's no reason to disable interrupts in the function profiler. Instead, just disable preemption and use the guard() infrastructure while at it. 
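Note that guard() is built on the compiler's cleanup attribute (see <linux/cleanup.h>), so it emits the same enable/disable pairs that would otherwise be written by hand at every exit point. A self-contained userspace analogue of the mechanism, for illustration only (all names here are made up):

    #include <stdio.h>
    #include <pthread.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

    static void unlock_fn(pthread_mutex_t **l)
    {
            pthread_mutex_unlock(*l);
    }

    /* Rough analogue of the kernel's guard(mutex)(&lock) */
    #define MUTEX_GUARD(l) \
            pthread_mutex_t *_g __attribute__((cleanup(unlock_fn))) = \
                    (pthread_mutex_lock(l), (l))

    static int counter;

    static int bump(void)
    {
            MUTEX_GUARD(&lock);
            if (counter >= 10)
                    return -1;      /* unlock_fn() still runs here */
            return ++counter;       /* ...and here */
    }

    int main(void)
    {
            printf("%d\n", bump());
            return 0;
    }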
Before this change: ~# echo 1 > /sys/kernel/tracing/function_profile_enabled ~# perf stat -r 10 ./hackbench 10 Time: 3.099 Time: 2.556 Time: 2.500 Time: 2.705 Time: 2.985 Time: 2.959 Time: 2.859 Time: 2.621 Time: 2.742 Time: 2.631 Performance counter stats for '/work/c/hackbench 10' (10 runs): 23,156.77 msec task-clock # 6.951 CPUs utilized ( +- 2.36% ) 18,306 context-switches # 790.525 /sec ( +- 5.95% ) 495 cpu-migrations # 21.376 /sec ( +- 8.61% ) 11,522 page-faults # 497.565 /sec ( +- 1.80% ) 47,967,124,606 cycles # 2.071 GHz ( +- 0.41% ) 80,009,078,371 instructions # 1.67 insn per cycle ( +- 0.34% ) 16,389,249,798 branches # 707.752 M/sec ( +- 0.36% ) 139,943,109 branch-misses # 0.85% of all branches ( +- 0.61% ) 3.332 +- 0.101 seconds time elapsed ( +- 3.04% ) After this change: ~# echo 1 > /sys/kernel/tracing/function_profile_enabled ~# perf stat -r 10 ./hackbench 10 Time: 1.869 Time: 1.428 Time: 1.575 Time: 1.569 Time: 1.685 Time: 1.511 Time: 1.611 Time: 1.672 Time: 1.724 Time: 1.715 Performance counter stats for '/work/c/hackbench 10' (10 runs): 13,578.21 msec task-clock # 6.931 CPUs utilized ( +- 2.23% ) 12,736 context-switches # 937.973 /sec ( +- 3.86% ) 341 cpu-migrations # 25.114 /sec ( +- 5.27% ) 11,378 page-faults # 837.960 /sec ( +- 1.74% ) 27,638,039,036 cycles # 2.035 GHz ( +- 0.27% ) 45,107,762,498 instructions # 1.63 insn per cycle ( +- 0.23% ) 8,623,868,018 branches # 635.125 M/sec ( +- 0.27% ) 125,738,443 branch-misses # 1.46% of all branches ( +- 0.32% ) 1.9590 +- 0.0484 seconds time elapsed ( +- 2.47% ) Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Link: https://lore.kernel.org/20241223184941.373853944@goodmis.org Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ftrace.c | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 9b17efb1a87d..63a9ffa65e17 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -789,27 +789,24 @@ function_profile_call(unsigned long ip, unsigned long parent_ip, { struct ftrace_profile_stat *stat; struct ftrace_profile *rec; - unsigned long flags; if (!ftrace_profile_enabled) return; - local_irq_save(flags); + guard(preempt_notrace)(); stat = this_cpu_ptr(&ftrace_profile_stats); if (!stat->hash || !ftrace_profile_enabled) - goto out; + return; rec = ftrace_find_profiled_func(stat, ip); if (!rec) { rec = ftrace_profile_alloc(stat, ip); if (!rec) - goto out; + return; } rec->counter++; - out: - local_irq_restore(flags); } #ifdef CONFIG_FUNCTION_GRAPH_TRACER @@ -856,19 +853,19 @@ static void profile_graph_return(struct ftrace_graph_ret *trace, unsigned long long calltime; unsigned long long rettime = trace_clock_local(); struct ftrace_profile *rec; - unsigned long flags; int size; - local_irq_save(flags); + guard(preempt_notrace)(); + stat = this_cpu_ptr(&ftrace_profile_stats); if (!stat->hash || !ftrace_profile_enabled) - goto out; + return; profile_data = fgraph_retrieve_data(gops->idx, &size); /* If the calltime was zero'd ignore it */ if (!profile_data || !profile_data->calltime) - goto out; + return; calltime = rettime - profile_data->calltime; @@ -896,9 +893,6 @@ static void profile_graph_return(struct ftrace_graph_ret *trace, rec->time += calltime; rec->time_squared += calltime * calltime; } - - out: - local_irq_restore(flags); } static struct fgraph_ops fprofiler_ops = { From 77e53cb2fcf2c5e81f06f4a41783d8752cf9d038 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 23 Dec 2024 13:46:21 -0500 
Subject: [PATCH 08/56] ftrace: Remove unneeded goto jumps There are some goto jumps that exit a function just to return a value. The code after the label doesn't free anything, nor does it drop any locks. It simply returns the variable that was set before the jump. Remove these unneeded goto jumps. Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Link: https://lore.kernel.org/20241223184941.544855549@goodmis.org Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ftrace.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 63a9ffa65e17..2c1691aa1d2f 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1669,14 +1669,12 @@ unsigned long ftrace_location(unsigned long ip) loc = ftrace_location_range(ip, ip); if (!loc) { if (!kallsyms_lookup_size_offset(ip, &size, &offset)) - goto out; + return 0; /* map sym+0 to __fentry__ */ if (!offset) loc = ftrace_location_range(ip, ip + size - 1); } - -out: return loc; } @@ -2071,7 +2069,7 @@ rollback: continue; if (rec == end) - goto err_out; + return -EBUSY; in_old = !!ftrace_lookup_ip(old_hash, rec->ip); in_new = !!ftrace_lookup_ip(new_hash, rec->ip); @@ -2084,7 +2082,6 @@ rollback: rec->flags |= FTRACE_FL_IPMODIFY; } while_for_each_ftrace_rec(); -err_out: return -EBUSY; } @@ -5720,12 +5717,10 @@ ftrace_regex_write(struct file *file, const char __user *ubuf, parser->idx, enable); trace_parser_clear(parser); if (ret < 0) - goto out; + return ret; } - ret = read; - out: - return ret; + return read; } ssize_t From 1d95fd9d6b1a1750da2dec1485a4c58f5a7d3ebd Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 23 Dec 2024 13:46:22 -0500 Subject: [PATCH 09/56] ftrace: Switch ftrace.c code over to use guard() There are a few functions in ftrace.c that have "goto out" or equivalent on error in order to release locks that were taken. This can be error prone or simply make the code more complex. Switch every location that ends with unlocking a mutex on error over to using the guard(mutex)() infrastructure to let the compiler worry about releasing locks. This makes the code easier to read and understand.
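The same <linux/cleanup.h> machinery also covers allocations, which the trace.c patch later in this series uses via __free() and no_free_ptr(); a condensed sketch of that pattern (based on the tracing_snapshot_cond_enable() conversion):

    /* Freed automatically on every return path ... */
    struct cond_snapshot *cond_snapshot __free(kfree) =
            kzalloc(sizeof(*cond_snapshot), GFP_KERNEL);

    if (!cond_snapshot)
            return -ENOMEM;

    if (tr->current_trace->use_max_tr)
            return -EBUSY;          /* kfree() runs here automatically */

    /* ... unless ownership is handed off, which disables the auto-free */
    tr->cond_snapshot = no_free_ptr(cond_snapshot);
    return 0;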
Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Link: https://lore.kernel.org/20241223184941.718001540@goodmis.org Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ftrace.c | 101 +++++++++++++++--------------------------- 1 file changed, 36 insertions(+), 65 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 2c1691aa1d2f..6ebc76bafd38 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -536,24 +536,21 @@ static int function_stat_show(struct seq_file *m, void *v) { struct ftrace_profile *rec = v; char str[KSYM_SYMBOL_LEN]; - int ret = 0; #ifdef CONFIG_FUNCTION_GRAPH_TRACER static struct trace_seq s; unsigned long long avg; unsigned long long stddev; #endif - mutex_lock(&ftrace_profile_lock); + guard(mutex)(&ftrace_profile_lock); /* we raced with function_profile_reset() */ - if (unlikely(rec->counter == 0)) { - ret = -EBUSY; - goto out; - } + if (unlikely(rec->counter == 0)) + return -EBUSY; #ifdef CONFIG_FUNCTION_GRAPH_TRACER avg = div64_ul(rec->time, rec->counter); if (tracing_thresh && (avg < tracing_thresh)) - goto out; + return 0; #endif kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); @@ -590,10 +587,8 @@ static int function_stat_show(struct seq_file *m, void *v) trace_print_seq(m, &s); #endif seq_putc(m, '\n'); -out: - mutex_unlock(&ftrace_profile_lock); - return ret; + return 0; } static void ftrace_profile_reset(struct ftrace_profile_stat *stat) @@ -944,20 +939,16 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf, val = !!val; - mutex_lock(&ftrace_profile_lock); + guard(mutex)(&ftrace_profile_lock); if (ftrace_profile_enabled ^ val) { if (val) { ret = ftrace_profile_init(); - if (ret < 0) { - cnt = ret; - goto out; - } + if (ret < 0) + return ret; ret = register_ftrace_profiler(); - if (ret < 0) { - cnt = ret; - goto out; - } + if (ret < 0) + return ret; ftrace_profile_enabled = 1; } else { ftrace_profile_enabled = 0; @@ -968,8 +959,6 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf, unregister_ftrace_profiler(); } } - out: - mutex_unlock(&ftrace_profile_lock); *ppos += cnt; @@ -5610,20 +5599,15 @@ static DEFINE_MUTEX(ftrace_cmd_mutex); __init int register_ftrace_command(struct ftrace_func_command *cmd) { struct ftrace_func_command *p; - int ret = 0; - mutex_lock(&ftrace_cmd_mutex); + guard(mutex)(&ftrace_cmd_mutex); list_for_each_entry(p, &ftrace_commands, list) { - if (strcmp(cmd->name, p->name) == 0) { - ret = -EBUSY; - goto out_unlock; - } + if (strcmp(cmd->name, p->name) == 0) + return -EBUSY; } list_add(&cmd->list, &ftrace_commands); - out_unlock: - mutex_unlock(&ftrace_cmd_mutex); - return ret; + return 0; } /* @@ -5633,20 +5617,17 @@ __init int register_ftrace_command(struct ftrace_func_command *cmd) __init int unregister_ftrace_command(struct ftrace_func_command *cmd) { struct ftrace_func_command *p, *n; - int ret = -ENODEV; - mutex_lock(&ftrace_cmd_mutex); + guard(mutex)(&ftrace_cmd_mutex); + list_for_each_entry_safe(p, n, &ftrace_commands, list) { if (strcmp(cmd->name, p->name) == 0) { - ret = 0; list_del_init(&p->list); - goto out_unlock; + return 0; } } - out_unlock: - mutex_unlock(&ftrace_cmd_mutex); - return ret; + return -ENODEV; } static int ftrace_process_regex(struct ftrace_iterator *iter, @@ -5656,7 +5637,7 @@ static int ftrace_process_regex(struct ftrace_iterator *iter, struct trace_array *tr = iter->ops->private; char *func, *command, *next = buff; struct ftrace_func_command *p; - int ret = -EINVAL; + int ret; func = strsep(&next, ":"); @@ -5673,17 
+5654,14 @@ static int ftrace_process_regex(struct ftrace_iterator *iter, command = strsep(&next, ":"); - mutex_lock(&ftrace_cmd_mutex); - list_for_each_entry(p, &ftrace_commands, list) { - if (strcmp(p->name, command) == 0) { - ret = p->func(tr, hash, func, command, next, enable); - goto out_unlock; - } - } - out_unlock: - mutex_unlock(&ftrace_cmd_mutex); + guard(mutex)(&ftrace_cmd_mutex); - return ret; + list_for_each_entry(p, &ftrace_commands, list) { + if (strcmp(p->name, command) == 0) + return p->func(tr, hash, func, command, next, enable); + } + + return -EINVAL; } static ssize_t @@ -8280,7 +8258,7 @@ pid_write(struct file *filp, const char __user *ubuf, if (!cnt) return 0; - mutex_lock(&ftrace_lock); + guard(mutex)(&ftrace_lock); switch (type) { case TRACE_PIDS: @@ -8296,14 +8274,13 @@ pid_write(struct file *filp, const char __user *ubuf, lockdep_is_held(&ftrace_lock)); break; default: - ret = -EINVAL; WARN_ON_ONCE(1); - goto out; + return -EINVAL; } ret = trace_pid_write(filtered_pids, &pid_list, ubuf, cnt); if (ret < 0) - goto out; + return ret; switch (type) { case TRACE_PIDS: @@ -8332,11 +8309,8 @@ pid_write(struct file *filp, const char __user *ubuf, ftrace_update_pid_func(); ftrace_startup_all(0); - out: - mutex_unlock(&ftrace_lock); - if (ret > 0) - *ppos += ret; + *ppos += ret; return ret; } @@ -8739,17 +8713,17 @@ static int ftrace_enable_sysctl(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { - int ret = -ENODEV; + int ret; - mutex_lock(&ftrace_lock); + guard(mutex)(&ftrace_lock); if (unlikely(ftrace_disabled)) - goto out; + return -ENODEV; ret = proc_dointvec(table, write, buffer, lenp, ppos); if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled)) - goto out; + return ret; if (ftrace_enabled) { @@ -8763,8 +8737,7 @@ ftrace_enable_sysctl(const struct ctl_table *table, int write, } else { if (is_permanent_ops_registered()) { ftrace_enabled = true; - ret = -EBUSY; - goto out; + return -EBUSY; } /* stopping ftrace calls (just send to ftrace_stub) */ @@ -8774,9 +8747,7 @@ ftrace_enable_sysctl(const struct ctl_table *table, int write, } last_ftrace_enabled = !!ftrace_enabled; - out: - mutex_unlock(&ftrace_lock); - return ret; + return 0; } static struct ctl_table ftrace_sysctls[] = { From d576aec24df9f58ed0ebe2ff854daafe837f0225 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Tue, 10 Dec 2024 11:08:57 +0900 Subject: [PATCH 10/56] fgraph: Get ftrace recursion lock in function_graph_enter Get the ftrace recursion lock in the generic function_graph_enter() instead of in each architecture's code. This means all function_graph tracer callbacks now run in a non-preemptive state. On x86 and powerpc this is already the case by default, but on the other architectures this will be new. Cc: Alexei Starovoitov Cc: Florent Revest Cc: Martin KaFai Lau Cc: bpf Cc: Jiri Olsa Cc: Alan Maguire Cc: Mark Rutland Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Christophe Leroy Cc: Naveen N Rao Cc: Madhavan Srinivasan Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: x86@kernel.org Cc: "H.
Peter Anvin" Cc: Mathieu Desnoyers Link: https://lore.kernel.org/173379653720.973433.18438622234884980494.stgit@devnote2 Signed-off-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- arch/powerpc/kernel/trace/ftrace.c | 6 ------ arch/powerpc/kernel/trace/ftrace_64_pg.c | 6 ------ arch/x86/kernel/ftrace.c | 7 ------- kernel/trace/fgraph.c | 8 +++++++- 4 files changed, 7 insertions(+), 20 deletions(-) diff --git a/arch/powerpc/kernel/trace/ftrace.c b/arch/powerpc/kernel/trace/ftrace.c index 5ccd791761e8..e41daf2c4a31 100644 --- a/arch/powerpc/kernel/trace/ftrace.c +++ b/arch/powerpc/kernel/trace/ftrace.c @@ -658,7 +658,6 @@ void ftrace_graph_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct ftrace_regs *fregs) { unsigned long sp = arch_ftrace_regs(fregs)->regs.gpr[1]; - int bit; if (unlikely(ftrace_graph_is_dead())) goto out; @@ -666,14 +665,9 @@ void ftrace_graph_func(unsigned long ip, unsigned long parent_ip, if (unlikely(atomic_read(¤t->tracing_graph_pause))) goto out; - bit = ftrace_test_recursion_trylock(ip, parent_ip); - if (bit < 0) - goto out; - if (!function_graph_enter(parent_ip, ip, 0, (unsigned long *)sp)) parent_ip = ppc_function_entry(return_to_handler); - ftrace_test_recursion_unlock(bit); out: arch_ftrace_regs(fregs)->regs.link = parent_ip; } diff --git a/arch/powerpc/kernel/trace/ftrace_64_pg.c b/arch/powerpc/kernel/trace/ftrace_64_pg.c index 98787376eb87..8fb860b90ae1 100644 --- a/arch/powerpc/kernel/trace/ftrace_64_pg.c +++ b/arch/powerpc/kernel/trace/ftrace_64_pg.c @@ -790,7 +790,6 @@ static unsigned long __prepare_ftrace_return(unsigned long parent, unsigned long ip, unsigned long sp) { unsigned long return_hooker; - int bit; if (unlikely(ftrace_graph_is_dead())) goto out; @@ -798,16 +797,11 @@ __prepare_ftrace_return(unsigned long parent, unsigned long ip, unsigned long sp if (unlikely(atomic_read(¤t->tracing_graph_pause))) goto out; - bit = ftrace_test_recursion_trylock(ip, parent); - if (bit < 0) - goto out; - return_hooker = ppc_function_entry(return_to_handler); if (!function_graph_enter(parent, ip, 0, (unsigned long *)sp)) parent = return_hooker; - ftrace_test_recursion_unlock(bit); out: return parent; } diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 4dd0ad6c94d6..33f50c80f481 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -615,7 +615,6 @@ void prepare_ftrace_return(unsigned long ip, unsigned long *parent, unsigned long frame_pointer) { unsigned long return_hooker = (unsigned long)&return_to_handler; - int bit; /* * When resuming from suspend-to-ram, this function can be indirectly @@ -635,14 +634,8 @@ void prepare_ftrace_return(unsigned long ip, unsigned long *parent, if (unlikely(atomic_read(¤t->tracing_graph_pause))) return; - bit = ftrace_test_recursion_trylock(ip, *parent); - if (bit < 0) - return; - if (!function_graph_enter(*parent, ip, frame_pointer, parent)) *parent = return_hooker; - - ftrace_test_recursion_unlock(bit); } #ifdef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index ddedcb50917f..5c68d6109119 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -650,8 +650,13 @@ int function_graph_enter(unsigned long ret, unsigned long func, struct ftrace_graph_ent trace; unsigned long bitmap = 0; int offset; + int bit; int i; + bit = ftrace_test_recursion_trylock(func, ret); + if (bit < 0) + return -EBUSY; + trace.func = func; trace.depth = ++current->curr_ret_depth; @@ -697,12 +702,13 @@ int 
function_graph_enter(unsigned long ret, unsigned long func, * flag, set that bit always. */ set_bitmap(current, offset, bitmap | BIT(0)); - + ftrace_test_recursion_unlock(bit); return 0; out_ret: current->curr_ret_stack -= FGRAPH_FRAME_OFFSET + 1; out: current->curr_ret_depth--; + ftrace_test_recursion_unlock(bit); return -EBUSY; } From d33b10c0c73adca00f72bf4a153a07b7f5f34715 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 24 Dec 2024 22:14:13 -0500 Subject: [PATCH 11/56] tracing: Switch trace.c code over to use guard() There are several functions in trace.c that have "goto out;" or equivalent on error in order to release locks or free values that were allocated. This can be error prone or simply make the code more complex. Switch every location that ends with unlocking a mutex or freeing on error over to using the guard(mutex)() and __free() infrastructure to let the compiler worry about releasing locks. This makes the code easier to read and understand. There is one place that should probably return an error but instead returns 0. This patch does not change that return value, as the changes here only perform the conversion without altering the logic; fixing that location will have to come later. Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Peter Zijlstra Cc: Andrew Morton Acked-by: Masami Hiramatsu (Google) Link: https://lore.kernel.org/20241224221413.7b8c68c3@batman.local.home Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 266 +++++++++++++++---------------------------- 1 file changed, 94 insertions(+), 172 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 957f941a08e7..e6e1de69af01 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -535,19 +536,16 @@ LIST_HEAD(ftrace_trace_arrays); int trace_array_get(struct trace_array *this_tr) { struct trace_array *tr; - int ret = -ENODEV; - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); list_for_each_entry(tr, &ftrace_trace_arrays, list) { if (tr == this_tr) { tr->ref++; - ret = 0; - break; + return 0; } } - mutex_unlock(&trace_types_lock); - return ret; + return -ENODEV; } static void __trace_array_put(struct trace_array *this_tr) @@ -1443,22 +1441,20 @@ EXPORT_SYMBOL_GPL(tracing_snapshot_alloc); int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, cond_update_fn_t update) { - struct cond_snapshot *cond_snapshot; - int ret = 0; + struct cond_snapshot *cond_snapshot __free(kfree) = + kzalloc(sizeof(*cond_snapshot), GFP_KERNEL); + int ret; - cond_snapshot = kzalloc(sizeof(*cond_snapshot), GFP_KERNEL); if (!cond_snapshot) return -ENOMEM; cond_snapshot->cond_data = cond_data; cond_snapshot->update = update; - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); - if (tr->current_trace->use_max_tr) { - ret = -EBUSY; - goto fail_unlock; - } + if (tr->current_trace->use_max_tr) + return -EBUSY; /* * The cond_snapshot can only change to NULL without the @@ -1468,29 +1464,20 @@ int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, * do safely with only holding the trace_types_lock and not * having to take the max_lock.
*/ - if (tr->cond_snapshot) { - ret = -EBUSY; - goto fail_unlock; - } + if (tr->cond_snapshot) + return -EBUSY; ret = tracing_arm_snapshot_locked(tr); if (ret) - goto fail_unlock; + return ret; local_irq_disable(); arch_spin_lock(&tr->max_lock); - tr->cond_snapshot = cond_snapshot; + tr->cond_snapshot = no_free_ptr(cond_snapshot); arch_spin_unlock(&tr->max_lock); local_irq_enable(); - mutex_unlock(&trace_types_lock); - - return ret; - - fail_unlock: - mutex_unlock(&trace_types_lock); - kfree(cond_snapshot); - return ret; + return 0; } EXPORT_SYMBOL_GPL(tracing_snapshot_cond_enable); @@ -2203,10 +2190,10 @@ static __init int init_trace_selftests(void) selftests_can_run = true; - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); if (list_empty(&postponed_selftests)) - goto out; + return 0; pr_info("Running postponed tracer tests:\n"); @@ -2235,9 +2222,6 @@ static __init int init_trace_selftests(void) } tracing_selftest_running = false; - out: - mutex_unlock(&trace_types_lock); - return 0; } core_initcall(init_trace_selftests); @@ -2807,7 +2791,7 @@ int tracepoint_printk_sysctl(const struct ctl_table *table, int write, int save_tracepoint_printk; int ret; - mutex_lock(&tracepoint_printk_mutex); + guard(mutex)(&tracepoint_printk_mutex); save_tracepoint_printk = tracepoint_printk; ret = proc_dointvec(table, write, buffer, lenp, ppos); @@ -2820,16 +2804,13 @@ int tracepoint_printk_sysctl(const struct ctl_table *table, int write, tracepoint_printk = 0; if (save_tracepoint_printk == tracepoint_printk) - goto out; + return ret; if (tracepoint_printk) static_key_enable(&tracepoint_printk_key.key); else static_key_disable(&tracepoint_printk_key.key); - out: - mutex_unlock(&tracepoint_printk_mutex); - return ret; } @@ -5123,7 +5104,8 @@ static int tracing_trace_options_show(struct seq_file *m, void *v) u32 tracer_flags; int i; - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); + tracer_flags = tr->current_trace->flags->val; trace_opts = tr->current_trace->flags->opts; @@ -5140,7 +5122,6 @@ static int tracing_trace_options_show(struct seq_file *m, void *v) else seq_printf(m, "no%s\n", trace_opts[i].name); } - mutex_unlock(&trace_types_lock); return 0; } @@ -5805,7 +5786,7 @@ trace_insert_eval_map_file(struct module *mod, struct trace_eval_map **start, return; } - mutex_lock(&trace_eval_mutex); + guard(mutex)(&trace_eval_mutex); if (!trace_eval_maps) trace_eval_maps = map_array; @@ -5829,8 +5810,6 @@ trace_insert_eval_map_file(struct module *mod, struct trace_eval_map **start, map_array++; } memset(map_array, 0, sizeof(*map_array)); - - mutex_unlock(&trace_eval_mutex); } static void trace_create_eval_file(struct dentry *d_tracer) @@ -5994,23 +5973,18 @@ ssize_t tracing_resize_ring_buffer(struct trace_array *tr, { int ret; - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); if (cpu_id != RING_BUFFER_ALL_CPUS) { /* make sure, this cpu is enabled in the mask */ - if (!cpumask_test_cpu(cpu_id, tracing_buffer_mask)) { - ret = -EINVAL; - goto out; - } + if (!cpumask_test_cpu(cpu_id, tracing_buffer_mask)) + return -EINVAL; } ret = __tracing_resize_ring_buffer(tr, size, cpu_id); if (ret < 0) ret = -ENOMEM; -out: - mutex_unlock(&trace_types_lock); - return ret; } @@ -6102,9 +6076,9 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf) #ifdef CONFIG_TRACER_MAX_TRACE bool had_max_tr; #endif - int ret = 0; + int ret; - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); update_last_data(tr); @@ -6112,7 +6086,7 @@ int 
tracing_set_tracer(struct trace_array *tr, const char *buf) ret = __tracing_resize_ring_buffer(tr, trace_buf_size, RING_BUFFER_ALL_CPUS); if (ret < 0) - goto out; + return ret; ret = 0; } @@ -6120,12 +6094,11 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf) if (strcmp(t->name, buf) == 0) break; } - if (!t) { - ret = -EINVAL; - goto out; - } + if (!t) + return -EINVAL; + if (t == tr->current_trace) - goto out; + return 0; #ifdef CONFIG_TRACER_SNAPSHOT if (t->use_max_tr) { @@ -6136,27 +6109,23 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf) arch_spin_unlock(&tr->max_lock); local_irq_enable(); if (ret) - goto out; + return ret; } #endif /* Some tracers won't work on kernel command line */ if (system_state < SYSTEM_RUNNING && t->noboot) { pr_warn("Tracer '%s' is not allowed on command line, ignored\n", t->name); - goto out; + return 0; } /* Some tracers are only allowed for the top level buffer */ - if (!trace_ok_for_array(t, tr)) { - ret = -EINVAL; - goto out; - } + if (!trace_ok_for_array(t, tr)) + return -EINVAL; /* If trace pipe files are being read, we can't change the tracer */ - if (tr->trace_ref) { - ret = -EBUSY; - goto out; - } + if (tr->trace_ref) + return -EBUSY; trace_branch_disable(); @@ -6187,7 +6156,7 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf) if (!had_max_tr && t->use_max_tr) { ret = tracing_arm_snapshot_locked(tr); if (ret) - goto out; + return ret; } #else tr->current_trace = &nop_trace; @@ -6200,17 +6169,15 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf) if (t->use_max_tr) tracing_disarm_snapshot(tr); #endif - goto out; + return ret; } } tr->current_trace = t; tr->current_trace->enabled++; trace_branch_enable(tr); - out: - mutex_unlock(&trace_types_lock); - return ret; + return 0; } static ssize_t @@ -6288,22 +6255,18 @@ tracing_thresh_write(struct file *filp, const char __user *ubuf, struct trace_array *tr = filp->private_data; int ret; - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); ret = tracing_nsecs_write(&tracing_thresh, ubuf, cnt, ppos); if (ret < 0) - goto out; + return ret; if (tr->current_trace->update_thresh) { ret = tr->current_trace->update_thresh(tr); if (ret < 0) - goto out; + return ret; } - ret = cnt; -out: - mutex_unlock(&trace_types_lock); - - return ret; + return cnt; } #ifdef CONFIG_TRACER_MAX_TRACE @@ -6522,31 +6485,29 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, * This is just a matter of traces coherency, the ring buffer itself * is protected. 
*/ - mutex_lock(&iter->mutex); + guard(mutex)(&iter->mutex); /* return any leftover data */ sret = trace_seq_to_user(&iter->seq, ubuf, cnt); if (sret != -EBUSY) - goto out; + return sret; trace_seq_init(&iter->seq); if (iter->trace->read) { sret = iter->trace->read(iter, filp, ubuf, cnt, ppos); if (sret) - goto out; + return sret; } waitagain: sret = tracing_wait_pipe(filp); if (sret <= 0) - goto out; + return sret; /* stop when tracing is finished */ - if (trace_empty(iter)) { - sret = 0; - goto out; - } + if (trace_empty(iter)) + return 0; if (cnt >= TRACE_SEQ_BUFFER_SIZE) cnt = TRACE_SEQ_BUFFER_SIZE - 1; @@ -6610,9 +6571,6 @@ waitagain: if (sret == -EBUSY) goto waitagain; -out: - mutex_unlock(&iter->mutex); - return sret; } @@ -7204,25 +7162,19 @@ u64 tracing_event_time_stamp(struct trace_buffer *buffer, struct ring_buffer_eve */ int tracing_set_filter_buffering(struct trace_array *tr, bool set) { - int ret = 0; - - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); if (set && tr->no_filter_buffering_ref++) - goto out; + return 0; if (!set) { - if (WARN_ON_ONCE(!tr->no_filter_buffering_ref)) { - ret = -EINVAL; - goto out; - } + if (WARN_ON_ONCE(!tr->no_filter_buffering_ref)) + return -EINVAL; --tr->no_filter_buffering_ref; } - out: - mutex_unlock(&trace_types_lock); - return ret; + return 0; } struct ftrace_buffer_info { @@ -7298,12 +7250,10 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, if (ret) return ret; - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); - if (tr->current_trace->use_max_tr) { - ret = -EBUSY; - goto out; - } + if (tr->current_trace->use_max_tr) + return -EBUSY; local_irq_disable(); arch_spin_lock(&tr->max_lock); @@ -7312,24 +7262,20 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, arch_spin_unlock(&tr->max_lock); local_irq_enable(); if (ret) - goto out; + return ret; switch (val) { case 0: - if (iter->cpu_file != RING_BUFFER_ALL_CPUS) { - ret = -EINVAL; - break; - } + if (iter->cpu_file != RING_BUFFER_ALL_CPUS) + return -EINVAL; if (tr->allocated_snapshot) free_snapshot(tr); break; case 1: /* Only allow per-cpu swap if the ring buffer supports it */ #ifndef CONFIG_RING_BUFFER_ALLOW_SWAP - if (iter->cpu_file != RING_BUFFER_ALL_CPUS) { - ret = -EINVAL; - break; - } + if (iter->cpu_file != RING_BUFFER_ALL_CPUS) + return -EINVAL; #endif if (tr->allocated_snapshot) ret = resize_buffer_duplicate_size(&tr->max_buffer, @@ -7337,7 +7283,7 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, ret = tracing_arm_snapshot_locked(tr); if (ret) - break; + return ret; /* Now, we're going to swap */ if (iter->cpu_file == RING_BUFFER_ALL_CPUS) { @@ -7364,8 +7310,7 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, *ppos += cnt; ret = cnt; } -out: - mutex_unlock(&trace_types_lock); + return ret; } @@ -7751,12 +7696,11 @@ void tracing_log_err(struct trace_array *tr, len += sizeof(CMD_PREFIX) + 2 * sizeof("\n") + strlen(cmd) + 1; - mutex_lock(&tracing_err_log_lock); + guard(mutex)(&tracing_err_log_lock); + err = get_tracing_log_err(tr, len); - if (PTR_ERR(err) == -ENOMEM) { - mutex_unlock(&tracing_err_log_lock); + if (PTR_ERR(err) == -ENOMEM) return; - } snprintf(err->loc, TRACING_LOG_LOC_MAX, "%s: error: ", loc); snprintf(err->cmd, len, "\n" CMD_PREFIX "%s\n", cmd); @@ -7767,7 +7711,6 @@ void tracing_log_err(struct trace_array *tr, err->info.ts = local_clock(); list_add_tail(&err->list, &tr->err_log); - 
mutex_unlock(&tracing_err_log_lock); } static void clear_tracing_err_log(struct trace_array *tr) @@ -9511,20 +9454,17 @@ static int instance_mkdir(const char *name) struct trace_array *tr; int ret; - mutex_lock(&event_mutex); - mutex_lock(&trace_types_lock); + guard(mutex)(&event_mutex); + guard(mutex)(&trace_types_lock); ret = -EEXIST; if (trace_array_find(name)) - goto out_unlock; + return -EEXIST; tr = trace_array_create(name); ret = PTR_ERR_OR_ZERO(tr); -out_unlock: - mutex_unlock(&trace_types_lock); - mutex_unlock(&event_mutex); return ret; } @@ -9574,24 +9514,23 @@ struct trace_array *trace_array_get_by_name(const char *name, const char *system { struct trace_array *tr; - mutex_lock(&event_mutex); - mutex_lock(&trace_types_lock); + guard(mutex)(&event_mutex); + guard(mutex)(&trace_types_lock); list_for_each_entry(tr, &ftrace_trace_arrays, list) { - if (tr->name && strcmp(tr->name, name) == 0) - goto out_unlock; + if (tr->name && strcmp(tr->name, name) == 0) { + tr->ref++; + return tr; + } } tr = trace_array_create_systems(name, systems, 0, 0); if (IS_ERR(tr)) tr = NULL; -out_unlock: - if (tr) + else tr->ref++; - mutex_unlock(&trace_types_lock); - mutex_unlock(&event_mutex); return tr; } EXPORT_SYMBOL_GPL(trace_array_get_by_name); @@ -9642,48 +9581,36 @@ static int __remove_instance(struct trace_array *tr) int trace_array_destroy(struct trace_array *this_tr) { struct trace_array *tr; - int ret; if (!this_tr) return -EINVAL; - mutex_lock(&event_mutex); - mutex_lock(&trace_types_lock); + guard(mutex)(&event_mutex); + guard(mutex)(&trace_types_lock); - ret = -ENODEV; /* Making sure trace array exists before destroying it. */ list_for_each_entry(tr, &ftrace_trace_arrays, list) { - if (tr == this_tr) { - ret = __remove_instance(tr); - break; - } + if (tr == this_tr) + return __remove_instance(tr); } - mutex_unlock(&trace_types_lock); - mutex_unlock(&event_mutex); - - return ret; + return -ENODEV; } EXPORT_SYMBOL_GPL(trace_array_destroy); static int instance_rmdir(const char *name) { struct trace_array *tr; - int ret; - mutex_lock(&event_mutex); - mutex_lock(&trace_types_lock); + guard(mutex)(&event_mutex); + guard(mutex)(&trace_types_lock); - ret = -ENODEV; tr = trace_array_find(name); - if (tr) - ret = __remove_instance(tr); + if (!tr) + return -ENODEV; - mutex_unlock(&trace_types_lock); - mutex_unlock(&event_mutex); - - return ret; + return __remove_instance(tr); } static __init void create_trace_instances(struct dentry *d_tracer) @@ -9696,19 +9623,16 @@ static __init void create_trace_instances(struct dentry *d_tracer) if (MEM_FAIL(!trace_instance_dir, "Failed to create instances directory\n")) return; - mutex_lock(&event_mutex); - mutex_lock(&trace_types_lock); + guard(mutex)(&event_mutex); + guard(mutex)(&trace_types_lock); list_for_each_entry(tr, &ftrace_trace_arrays, list) { if (!tr->name) continue; if (MEM_FAIL(trace_array_create_dir(tr) < 0, "Failed to create instance directory\n")) - break; + return; } - - mutex_unlock(&trace_types_lock); - mutex_unlock(&event_mutex); } static void @@ -9922,7 +9846,7 @@ static void trace_module_remove_evals(struct module *mod) if (!mod->num_trace_evals) return; - mutex_lock(&trace_eval_mutex); + guard(mutex)(&trace_eval_mutex); map = trace_eval_maps; @@ -9934,12 +9858,10 @@ static void trace_module_remove_evals(struct module *mod) map = map->tail.next; } if (!map) - goto out; + return; *last = trace_eval_jmp_to_tail(map)->tail.next; kfree(map); - out: - mutex_unlock(&trace_eval_mutex); } #else static inline void trace_module_remove_evals(struct 
module *mod) { } From d1e27ee9c6f21ccbb3f2d910171427ceb66a0af1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 19 Dec 2024 15:12:00 -0500 Subject: [PATCH 12/56] tracing: Return -EINVAL if a boot tracer tries to enable the mmiotracer at boot The mmiotracer is not set to be enabled at boot up from the kernel command line. If the boot command line tries to enable that tracer, it will fail to be enabled. The return code is currently zero when that happens so the caller just thinks it was enabled. Return -EINVAL in this case. Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Peter Zijlstra Link: https://lore.kernel.org/20241219201344.854254394@goodmis.org Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e6e1de69af01..0aaf442271e9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6116,7 +6116,7 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf) if (system_state < SYSTEM_RUNNING && t->noboot) { pr_warn("Tracer '%s' is not allowed on command line, ignored\n", t->name); - return 0; + return -EINVAL; } /* Some tracers are only allowed for the top level buffer */ From cad1d5bd2cb9921189749b5d796026c768f56236 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 19 Dec 2024 15:12:01 -0500 Subject: [PATCH 13/56] tracing: Have event_enable_write() just return error on error The event_enable_write() function is inconsistent in how it returns errors. Sometimes it updates the ppos parameter and sometimes it doesn't. Simplify the code to just return an error or the count if there isn't an error. Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Peter Zijlstra Link: https://lore.kernel.org/20241219201345.025284170@goodmis.org Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 1545cc8b49d0..f4eff49faef6 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1549,18 +1549,18 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, switch (val) { case 0: case 1: - ret = -ENODEV; mutex_lock(&event_mutex); file = event_file_file(filp); if (likely(file)) { ret = tracing_update_buffers(file->tr); - if (ret < 0) { - mutex_unlock(&event_mutex); - return ret; - } - ret = ftrace_event_enable_disable(file, val); + if (ret >= 0) + ret = ftrace_event_enable_disable(file, val); + } else { + ret = -ENODEV; } mutex_unlock(&event_mutex); + if (ret < 0) + return ret; break; default: @@ -1569,7 +1569,7 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, *ppos += cnt; - return ret ? ret : cnt; + return cnt; } static ssize_t From c949dfb97443b0aee0cfe138049a17e66bbc62e9 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 19 Dec 2024 15:12:02 -0500 Subject: [PATCH 14/56] tracing: Simplify event_enable_func() goto out_free logic The event_enable_func() function allocates the data descriptor early in the function just to assign its data->count value via: kstrtoul(number, 0, &data->count); This makes the code more complex as there are several error paths before the data descriptor is actually used. This means there needs to be a goto out_free; to clean it up. Use a local variable "count" to do the update and move the data allocation just before it is used. 
This removes the "out_free" label as the data can be freed on the failure path where it is used. Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Peter Zijlstra Link: https://lore.kernel.org/20241219201345.190820140@goodmis.org Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index f4eff49faef6..43e9545b5cf3 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -3758,6 +3758,7 @@ event_enable_func(struct trace_array *tr, struct ftrace_hash *hash, struct trace_event_file *file; struct ftrace_probe_ops *ops; struct event_probe_data *data; + unsigned long count = -1; const char *system; const char *event; char *number; @@ -3798,14 +3799,6 @@ event_enable_func(struct trace_array *tr, struct ftrace_hash *hash, ret = -ENOMEM; - data = kzalloc(sizeof(*data), GFP_KERNEL); - if (!data) - goto out; - - data->enable = enable; - data->count = -1; - data->file = file; - if (!param) goto out_reg; @@ -3813,28 +3806,36 @@ event_enable_func(struct trace_array *tr, struct ftrace_hash *hash, ret = -EINVAL; if (!strlen(number)) - goto out_free; + goto out; /* * We use the callback data field (which is a pointer) * as our counter. */ - ret = kstrtoul(number, 0, &data->count); + ret = kstrtoul(number, 0, &count); if (ret) - goto out_free; + goto out; out_reg: /* Don't let event modules unload while probe registered */ ret = trace_event_try_get_ref(file->event_call); if (!ret) { ret = -EBUSY; - goto out_free; + goto out; } ret = __ftrace_event_enable_disable(file, 1, 1); if (ret < 0) goto out_put; + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + goto out_put; + + data->enable = enable; + data->count = count; + data->file = file; + ret = register_ftrace_function_probe(glob, tr, ops, data); /* * The above returns on success the # of functions enabled, @@ -3853,11 +3854,10 @@ event_enable_func(struct trace_array *tr, struct ftrace_hash *hash, return ret; out_disable: + kfree(data); __ftrace_event_enable_disable(file, 0, 1); out_put: trace_event_put_ref(file->event_call); - out_free: - kfree(data); goto out; } From 4b8d63e5b61dc2ee7958fb36d41c643f56de0d4d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 19 Dec 2024 15:12:03 -0500 Subject: [PATCH 15/56] tracing: Simplify event_enable_func() goto_reg logic Currently there's an "out_reg:" label that gets jumped to if there are no parameters to process. Instead, make it a proper "if (param) { }" block as there's not much to do for the parameter processing, and remove the "out_reg:" label.
Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Peter Zijlstra Link: https://lore.kernel.org/20241219201345.354746196@goodmis.org Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 43e9545b5cf3..86db6ee6f26c 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -3799,24 +3799,22 @@ event_enable_func(struct trace_array *tr, struct ftrace_hash *hash, ret = -ENOMEM; - if (!param) - goto out_reg; + if (param) { + number = strsep(&param, ":"); - number = strsep(&param, ":"); + ret = -EINVAL; + if (!strlen(number)) + goto out; - ret = -EINVAL; - if (!strlen(number)) - goto out; + /* + * We use the callback data field (which is a pointer) + * as our counter. + */ + ret = kstrtoul(number, 0, &count); + if (ret) + goto out; + } - /* - * We use the callback data field (which is a pointer) - * as our counter. - */ - ret = kstrtoul(number, 0, &count); - if (ret) - goto out; - - out_reg: /* Don't let event modules unload while probe registered */ ret = trace_event_try_get_ref(file->event_call); if (!ret) { From 59980d9b0b2dbe8945734162bb3014eac8b885bd Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 19 Dec 2024 15:12:04 -0500 Subject: [PATCH 16/56] tracing: Switch trace_events.c code over to use guard() There are several functions in trace_events.c that have "goto out;" or equivalent on error in order to release locks that were taken. This can be error prone or just simply make the code more complex. Switch every location that ends with unlocking a mutex on error over to using the guard(mutex)() infrastructure to let the compiler worry about releasing locks. This makes the code easier to read and understand. Some locations did some simple arithmetic after releasing the lock. Since holding the mutex while processing the file position (*ppos += cnt;) causes no real overhead, the lock is now simply held over this logic too.
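For readers not yet familiar with the cleanup API, here is a minimal sketch of the pattern all of these conversions follow; the function and lock names are hypothetical, not taken from this patch:

    #include <linux/cleanup.h>
    #include <linux/mutex.h>

    static DEFINE_MUTEX(example_lock);

    static int example_set_value(int val)
    {
            guard(mutex)(&example_lock);    /* locks now, unlocks at any return */

            if (val < 0)
                    return -EINVAL;         /* no "goto out" / mutex_unlock() needed */

            /* ... update state while the lock is held ... */
            return 0;
    }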
Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Peter Zijlstra Link: https://lore.kernel.org/20241219201345.522546095@goodmis.org Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events.c | 105 ++++++++++++++---------------------- 1 file changed, 39 insertions(+), 66 deletions(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 86db6ee6f26c..047d2775184b 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1546,19 +1546,18 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, if (ret) return ret; + guard(mutex)(&event_mutex); + switch (val) { case 0: case 1: - mutex_lock(&event_mutex); file = event_file_file(filp); - if (likely(file)) { - ret = tracing_update_buffers(file->tr); - if (ret >= 0) - ret = ftrace_event_enable_disable(file, val); - } else { - ret = -ENODEV; - } - mutex_unlock(&event_mutex); + if (!file) + return -ENODEV; + ret = tracing_update_buffers(file->tr); + if (ret < 0) + return ret; + ret = ftrace_event_enable_disable(file, val); if (ret < 0) return ret; break; @@ -2145,7 +2144,7 @@ event_pid_write(struct file *filp, const char __user *ubuf, if (ret < 0) return ret; - mutex_lock(&event_mutex); + guard(mutex)(&event_mutex); if (type == TRACE_PIDS) { filtered_pids = rcu_dereference_protected(tr->filtered_pids, @@ -2161,7 +2160,7 @@ event_pid_write(struct file *filp, const char __user *ubuf, ret = trace_pid_write(filtered_pids, &pid_list, ubuf, cnt); if (ret < 0) - goto out; + return ret; if (type == TRACE_PIDS) rcu_assign_pointer(tr->filtered_pids, pid_list); @@ -2186,11 +2185,7 @@ event_pid_write(struct file *filp, const char __user *ubuf, */ on_each_cpu(ignore_task_cpu, tr, 1); - out: - mutex_unlock(&event_mutex); - - if (ret > 0) - *ppos += ret; + *ppos += ret; return ret; } @@ -3257,13 +3252,13 @@ int trace_add_event_call(struct trace_event_call *call) int ret; lockdep_assert_held(&event_mutex); - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); ret = __register_event(call, NULL); - if (ret >= 0) - __add_event_to_tracers(call); + if (ret < 0) + return ret; - mutex_unlock(&trace_types_lock); + __add_event_to_tracers(call); return ret; } EXPORT_SYMBOL_GPL(trace_add_event_call); @@ -3517,30 +3512,21 @@ struct trace_event_file *trace_get_event_file(const char *instance, return ERR_PTR(ret); } - mutex_lock(&event_mutex); + guard(mutex)(&event_mutex); file = find_event_file(tr, system, event); if (!file) { trace_array_put(tr); - ret = -EINVAL; - goto out; + return ERR_PTR(-EINVAL); } /* Don't let event modules unload while in use */ ret = trace_event_try_get_ref(file->event_call); if (!ret) { trace_array_put(tr); - ret = -EBUSY; - goto out; + return ERR_PTR(-EBUSY); } - ret = 0; - out: - mutex_unlock(&event_mutex); - - if (ret) - file = ERR_PTR(ret); - return file; } EXPORT_SYMBOL_GPL(trace_get_event_file); @@ -3778,12 +3764,11 @@ event_enable_func(struct trace_array *tr, struct ftrace_hash *hash, event = strsep(¶m, ":"); - mutex_lock(&event_mutex); + guard(mutex)(&event_mutex); - ret = -EINVAL; file = find_event_file(tr, system, event); if (!file) - goto out; + return -EINVAL; enable = strcmp(cmd, ENABLE_EVENT_STR) == 0; @@ -3792,19 +3777,14 @@ event_enable_func(struct trace_array *tr, struct ftrace_hash *hash, else ops = param ? 
&event_disable_count_probe_ops : &event_disable_probe_ops; - if (glob[0] == '!') { - ret = unregister_ftrace_function_probe_func(glob+1, tr, ops); - goto out; - } - - ret = -ENOMEM; + if (glob[0] == '!') + return unregister_ftrace_function_probe_func(glob+1, tr, ops); if (param) { number = strsep(¶m, ":"); - ret = -EINVAL; if (!strlen(number)) - goto out; + return -EINVAL; /* * We use the callback data field (which is a pointer) @@ -3812,20 +3792,19 @@ event_enable_func(struct trace_array *tr, struct ftrace_hash *hash, */ ret = kstrtoul(number, 0, &count); if (ret) - goto out; + return ret; } /* Don't let event modules unload while probe registered */ ret = trace_event_try_get_ref(file->event_call); - if (!ret) { - ret = -EBUSY; - goto out; - } + if (!ret) + return -EBUSY; ret = __ftrace_event_enable_disable(file, 1, 1); if (ret < 0) goto out_put; + ret = -ENOMEM; data = kzalloc(sizeof(*data), GFP_KERNEL); if (!data) goto out_put; @@ -3840,23 +3819,20 @@ event_enable_func(struct trace_array *tr, struct ftrace_hash *hash, * but if it didn't find any functions it returns zero. * Consider no functions a failure too. */ - if (!ret) { - ret = -ENOENT; - goto out_disable; - } else if (ret < 0) - goto out_disable; - /* Just return zero, not the number of enabled functions */ - ret = 0; - out: - mutex_unlock(&event_mutex); - return ret; - out_disable: + /* Just return zero, not the number of enabled functions */ + if (ret > 0) + return 0; + kfree(data); + + if (!ret) + ret = -ENOENT; + __ftrace_event_enable_disable(file, 0, 1); out_put: trace_event_put_ref(file->event_call); - goto out; + return ret; } static struct ftrace_func_command event_enable_cmd = { @@ -4079,20 +4055,17 @@ early_event_add_tracer(struct dentry *parent, struct trace_array *tr) { int ret; - mutex_lock(&event_mutex); + guard(mutex)(&event_mutex); ret = create_event_toplevel_files(parent, tr); if (ret) - goto out_unlock; + return ret; down_write(&trace_event_sem); __trace_early_add_event_dirs(tr); up_write(&trace_event_sem); - out_unlock: - mutex_unlock(&event_mutex); - - return ret; + return 0; } /* Must be called with event_mutex held */ From 2b36a97aeeb71b1e4a48bfedc7f21f44aeb1e6fb Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 19 Dec 2024 15:12:05 -0500 Subject: [PATCH 17/56] tracing: Switch trace_events_hist.c code over to use guard() There are a couple functions in trace_events_hist.c that have "goto out" or equivalent on error in order to release locks that were taken. This can be error prone or just simply make the code more complex. Switch every location that ends with unlocking a mutex on error over to using the guard(mutex)() infrastructure to let the compiler worry about releasing locks. This makes the code easier to read and understand. 
Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Peter Zijlstra Link: https://lore.kernel.org/20241219201345.694601480@goodmis.org Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_hist.c | 32 ++++++++++---------------------- 1 file changed, 10 insertions(+), 22 deletions(-) diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 9c058aa8baf3..879b58892b9d 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -5594,25 +5594,19 @@ static int hist_show(struct seq_file *m, void *v) { struct event_trigger_data *data; struct trace_event_file *event_file; - int n = 0, ret = 0; + int n = 0; - mutex_lock(&event_mutex); + guard(mutex)(&event_mutex); event_file = event_file_file(m->private); - if (unlikely(!event_file)) { - ret = -ENODEV; - goto out_unlock; - } + if (unlikely(!event_file)) + return -ENODEV; list_for_each_entry(data, &event_file->triggers, list) { if (data->cmd_ops->trigger_type == ETT_EVENT_HIST) hist_trigger_show(m, data, n++); } - - out_unlock: - mutex_unlock(&event_mutex); - - return ret; + return 0; } static int event_hist_open(struct inode *inode, struct file *file) @@ -5873,25 +5867,19 @@ static int hist_debug_show(struct seq_file *m, void *v) { struct event_trigger_data *data; struct trace_event_file *event_file; - int n = 0, ret = 0; + int n = 0; - mutex_lock(&event_mutex); + guard(mutex)(&event_mutex); event_file = event_file_file(m->private); - if (unlikely(!event_file)) { - ret = -ENODEV; - goto out_unlock; - } + if (unlikely(!event_file)) + return -ENODEV; list_for_each_entry(data, &event_file->triggers, list) { if (data->cmd_ops->trigger_type == ETT_EVENT_HIST) hist_trigger_debug_show(m, data, n++); } - - out_unlock: - mutex_unlock(&event_mutex); - - return ret; + return 0; } static int event_hist_debug_open(struct inode *inode, struct file *file) From 63c72641683891c5087c77e9ae7a8b43433214e7 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 20 Dec 2024 11:06:21 -0500 Subject: [PATCH 18/56] tracing: Switch trace_events_trigger.c code over to use guard() There are a few functions in trace_events_trigger.c that have "goto out" or equivalent on error in order to release locks that were taken. This can be error prone or just simply make the code more complex. Switch every location that ends with unlocking a mutex on error over to using the guard(mutex)() infrastructure to let the compiler worry about releasing locks. This makes the code easier to read and understand. Also use __free() to free a temporary buffer in event_trigger_regex_write().
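The __free(kfree) annotation works the same way for allocations, tying the buffer's lifetime to the variable's scope. A minimal sketch, again with hypothetical names rather than code from this patch:

    #include <linux/cleanup.h>
    #include <linux/slab.h>

    static int example_format(size_t len)
    {
            char *buf __free(kfree) = kmalloc(len, GFP_KERNEL);

            if (!buf)
                    return -ENOMEM;

            /* ... fill and use buf ... */
            return 0;       /* buf is kfree()d automatically on every return path */
    }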
Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Peter Zijlstra Link: https://lore.kernel.org/20241220110621.639d3bc8@gandalf.local.home Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_trigger.c | 75 +++++++++++------------------ 1 file changed, 27 insertions(+), 48 deletions(-) diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index a5e3d6acf1e1..d45448947094 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -211,12 +211,10 @@ static int event_trigger_regex_open(struct inode *inode, struct file *file) if (ret) return ret; - mutex_lock(&event_mutex); + guard(mutex)(&event_mutex); - if (unlikely(!event_file_file(file))) { - mutex_unlock(&event_mutex); + if (unlikely(!event_file_file(file))) return -ENODEV; - } if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { @@ -239,8 +237,6 @@ static int event_trigger_regex_open(struct inode *inode, struct file *file) } } - mutex_unlock(&event_mutex); - return ret; } @@ -248,7 +244,6 @@ int trigger_process_regex(struct trace_event_file *file, char *buff) { char *command, *next; struct event_command *p; - int ret = -EINVAL; next = buff = skip_spaces(buff); command = strsep(&next, ": \t"); @@ -259,17 +254,14 @@ int trigger_process_regex(struct trace_event_file *file, char *buff) } command = (command[0] != '!') ? command : command + 1; - mutex_lock(&trigger_cmd_mutex); - list_for_each_entry(p, &trigger_commands, list) { - if (strcmp(p->name, command) == 0) { - ret = p->parse(p, file, buff, command, next); - goto out_unlock; - } - } - out_unlock: - mutex_unlock(&trigger_cmd_mutex); + guard(mutex)(&trigger_cmd_mutex); - return ret; + list_for_each_entry(p, &trigger_commands, list) { + if (strcmp(p->name, command) == 0) + return p->parse(p, file, buff, command, next); + } + + return -EINVAL; } static ssize_t event_trigger_regex_write(struct file *file, @@ -278,7 +270,7 @@ static ssize_t event_trigger_regex_write(struct file *file, { struct trace_event_file *event_file; ssize_t ret; - char *buf; + char *buf __free(kfree) = NULL; if (!cnt) return 0; @@ -292,24 +284,18 @@ static ssize_t event_trigger_regex_write(struct file *file, strim(buf); - mutex_lock(&event_mutex); - event_file = event_file_file(file); - if (unlikely(!event_file)) { - mutex_unlock(&event_mutex); - kfree(buf); - return -ENODEV; - } - ret = trigger_process_regex(event_file, buf); - mutex_unlock(&event_mutex); + guard(mutex)(&event_mutex); - kfree(buf); + event_file = event_file_file(file); + if (unlikely(!event_file)) + return -ENODEV; + + ret = trigger_process_regex(event_file, buf); if (ret < 0) - goto out; + return ret; *ppos += cnt; - ret = cnt; - out: - return ret; + return cnt; } static int event_trigger_regex_release(struct inode *inode, struct file *file) @@ -359,20 +345,16 @@ const struct file_operations event_trigger_fops = { __init int register_event_command(struct event_command *cmd) { struct event_command *p; - int ret = 0; - mutex_lock(&trigger_cmd_mutex); + guard(mutex)(&trigger_cmd_mutex); + list_for_each_entry(p, &trigger_commands, list) { - if (strcmp(cmd->name, p->name) == 0) { - ret = -EBUSY; - goto out_unlock; - } + if (strcmp(cmd->name, p->name) == 0) + return -EBUSY; } list_add(&cmd->list, &trigger_commands); - out_unlock: - mutex_unlock(&trigger_cmd_mutex); - return ret; + return 0; } /* @@ -382,20 +364,17 @@ __init int register_event_command(struct event_command *cmd) __init int unregister_event_command(struct event_command *cmd) { struct event_command *p, 
*n; - int ret = -ENODEV; - mutex_lock(&trigger_cmd_mutex); + guard(mutex)(&trigger_cmd_mutex); + list_for_each_entry_safe(p, n, &trigger_commands, list) { if (strcmp(cmd->name, p->name) == 0) { - ret = 0; list_del_init(&p->list); - goto out_unlock; + return 0; } } - out_unlock: - mutex_unlock(&trigger_cmd_mutex); - return ret; + return -ENODEV; } /** From 076796f74eac6eec2da6168836ff6baa8d878297 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 19 Dec 2024 15:12:08 -0500 Subject: [PATCH 19/56] tracing: Switch trace_events_filter.c code over to use guard() There are a couple functions in trace_events_filter.c that have "goto out" or equivalent on error in order to release locks that were taken. This can be error prone or just simply make the code more complex. Switch every location that ends with unlocking a mutex on error over to using the guard(mutex)() infrastructure to let the compiler worry about releasing locks. This makes the code easier to read and understand. Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Peter Zijlstra Link: https://lore.kernel.org/20241219201346.200737679@goodmis.org Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_filter.c | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 78051de581e7..0993dfc1c5c1 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -2405,13 +2405,11 @@ int apply_subsystem_event_filter(struct trace_subsystem_dir *dir, struct event_filter *filter = NULL; int err = 0; - mutex_lock(&event_mutex); + guard(mutex)(&event_mutex); /* Make sure the system still has events */ - if (!dir->nr_events) { - err = -ENODEV; - goto out_unlock; - } + if (!dir->nr_events) + return -ENODEV; if (!strcmp(strstrip(filter_string), "0")) { filter_free_subsystem_preds(dir, tr); @@ -2422,7 +2420,7 @@ int apply_subsystem_event_filter(struct trace_subsystem_dir *dir, tracepoint_synchronize_unregister(); filter_free_subsystem_filters(dir, tr); __free_filter(filter); - goto out_unlock; + return 0; } err = create_system_filter(dir, filter_string, &filter); @@ -2434,8 +2432,6 @@ int apply_subsystem_event_filter(struct trace_subsystem_dir *dir, __free_filter(system->filter); system->filter = filter; } -out_unlock: - mutex_unlock(&event_mutex); return err; } @@ -2612,17 +2608,15 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id, struct event_filter *filter = NULL; struct trace_event_call *call; - mutex_lock(&event_mutex); + guard(mutex)(&event_mutex); call = event->tp_event; - err = -EINVAL; if (!call) - goto out_unlock; + return -EINVAL; - err = -EEXIST; if (event->filter) - goto out_unlock; + return -EEXIST; err = create_filter(NULL, call, filter_str, false, &filter); if (err) @@ -2637,9 +2631,6 @@ free_filter: if (err || ftrace_event_is_function(call)) __free_filter(filter); -out_unlock: - mutex_unlock(&event_mutex); - return err; } From a2e27e1bb19eb7c1790af7c8b6f7298ec524c1bb Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 19 Dec 2024 15:12:09 -0500 Subject: [PATCH 20/56] tracing: Switch trace_events_synth.c code over to use guard() There are a couple functions in trace_events_synth.c that have "goto out" or equivalent on error in order to release locks that were taken. This can be error prone or just simply make the code more complex. 
Switch every location that ends with unlocking a mutex on error over to using the guard(mutex)() infrastructure to let the compiler worry about releasing locks. This makes the code easier to read and understand. Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Peter Zijlstra Link: https://lore.kernel.org/20241219201346.371082515@goodmis.org Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_synth.c | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index c82b401a294d..e3f7d09e5512 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -49,16 +49,11 @@ static char *last_cmd; static int errpos(const char *str) { - int ret = 0; - - mutex_lock(&lastcmd_mutex); + guard(mutex)(&lastcmd_mutex); if (!str || !last_cmd) - goto out; + return 0; - ret = err_pos(last_cmd, str); - out: - mutex_unlock(&lastcmd_mutex); - return ret; + return err_pos(last_cmd, str); } static void last_cmd_set(const char *str) @@ -74,14 +69,12 @@ static void last_cmd_set(const char *str) static void synth_err(u8 err_type, u16 err_pos) { - mutex_lock(&lastcmd_mutex); + guard(mutex)(&lastcmd_mutex); if (!last_cmd) - goto out; + return; tracing_log_err(NULL, "synthetic_events", last_cmd, err_text, err_type, err_pos); - out: - mutex_unlock(&lastcmd_mutex); } static int create_synth_event(const char *raw_command); From 930d2b32c0af6895ba4c6ca6404e7f7b6dc214ed Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 25 Dec 2024 17:25:41 -0500 Subject: [PATCH 21/56] tracing: Switch trace_osnoise.c code over to use guard() and __free() The osnoise_hotplug_workfn() grabs two mutexes and cpus_read_lock(). It has various gotos to handle unlocking them. Switch them over to guard() and let the compiler worry about it. The osnoise_cpus_read() has a temporary mask_str allocated and there are some gotos to make sure it gets freed on error paths. Switch that over to __free() to let the compiler worry about it.
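Note that stacked guards, as in osnoise_hotplug_workfn() below, are released in reverse order of declaration on every return path. Condensed from the hunk that follows:

    guard(mutex)(&trace_types_lock);
    /* ... */
    guard(mutex)(&interface_lock);
    guard(cpus_read_lock)();        /* pairs cpus_read_lock()/cpus_read_unlock() */

    if (!cpu_online(cpu))
            return;                 /* all three dropped, innermost first */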
Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Peter Zijlstra Link: https://lore.kernel.org/20241225222931.517329690@goodmis.org Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_osnoise.c | 40 ++++++++++++------------------------ 1 file changed, 13 insertions(+), 27 deletions(-) diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index b9f96c77527d..b25c30b05dd0 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -2083,26 +2083,21 @@ static void osnoise_hotplug_workfn(struct work_struct *dummy) { unsigned int cpu = smp_processor_id(); - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); if (!osnoise_has_registered_instances()) - goto out_unlock_trace; + return; - mutex_lock(&interface_lock); - cpus_read_lock(); + guard(mutex)(&interface_lock); + guard(cpus_read_lock)(); if (!cpu_online(cpu)) - goto out_unlock; + return; + if (!cpumask_test_cpu(cpu, &osnoise_cpumask)) - goto out_unlock; + return; start_kthread(cpu); - -out_unlock: - cpus_read_unlock(); - mutex_unlock(&interface_lock); -out_unlock_trace: - mutex_unlock(&trace_types_lock); } static DECLARE_WORK(osnoise_hotplug_work, osnoise_hotplug_workfn); @@ -2300,31 +2295,22 @@ static ssize_t osnoise_cpus_read(struct file *filp, char __user *ubuf, size_t count, loff_t *ppos) { - char *mask_str; + char *mask_str __free(kfree) = NULL; int len; - mutex_lock(&interface_lock); + guard(mutex)(&interface_lock); len = snprintf(NULL, 0, "%*pbl\n", cpumask_pr_args(&osnoise_cpumask)) + 1; mask_str = kmalloc(len, GFP_KERNEL); - if (!mask_str) { - count = -ENOMEM; - goto out_unlock; - } + if (!mask_str) + return -ENOMEM; len = snprintf(mask_str, len, "%*pbl\n", cpumask_pr_args(&osnoise_cpumask)); - if (len >= count) { - count = -EINVAL; - goto out_free; - } + if (len >= count) + return -EINVAL; count = simple_read_from_buffer(ubuf, count, ppos, mask_str, len); -out_free: - kfree(mask_str); -out_unlock: - mutex_unlock(&interface_lock); - return count; } From 6c05353e4ff5875807f1a00f8d95e68b3d1e4d7f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 25 Dec 2024 17:25:42 -0500 Subject: [PATCH 22/56] tracing: Switch trace_stack.c code over to use guard() The function stack_trace_sysctl() uses a goto on the error path to jump to the mutex_unlock() code. Replace the logic to use guard() and let the compiler worry about it. 
Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Peter Zijlstra Link: https://lore.kernel.org/20241225222931.684913592@goodmis.org Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_stack.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 7f9572a37333..14c6f272c4d8 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -520,20 +520,18 @@ stack_trace_sysctl(const struct ctl_table *table, int write, void *buffer, int was_enabled; int ret; - mutex_lock(&stack_sysctl_mutex); + guard(mutex)(&stack_sysctl_mutex); was_enabled = !!stack_tracer_enabled; ret = proc_dointvec(table, write, buffer, lenp, ppos); if (ret || !write || (was_enabled == !!stack_tracer_enabled)) - goto out; + return ret; if (stack_tracer_enabled) register_ftrace_function(&trace_ops); else unregister_ftrace_function(&trace_ops); - out: - mutex_unlock(&stack_sysctl_mutex); return ret; } From 08b767317192e7a20d6d95ff7eca6d9bbc48c192 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 19 Dec 2024 15:12:12 -0500 Subject: [PATCH 23/56] tracing: Switch trace_stat.c code over to use guard() There are a couple functions in trace_stat.c that have "goto out" or equivalent on error in order to release locks that were taken. This can be error prone or just simply make the code more complex. Switch every location that ends with unlocking a mutex on error over to using the guard(mutex)() infrastructure to let the compiler worry about releasing locks. This makes the code easier to read and understand. Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Peter Zijlstra Link: https://lore.kernel.org/20241219201346.870318466@goodmis.org Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_stat.c | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index bb247beec447..b3b5586f104d 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -128,7 +128,7 @@ static int stat_seq_init(struct stat_session *session) int ret = 0; int i; - mutex_lock(&session->stat_mutex); + guard(mutex)(&session->stat_mutex); __reset_stat_session(session); if (!ts->stat_cmp) @@ -136,11 +136,11 @@ static int stat_seq_init(struct stat_session *session) stat = ts->stat_start(ts); if (!stat) - goto exit; + return 0; ret = insert_stat(root, stat, ts->stat_cmp); if (ret) - goto exit; + return ret; /* * Iterate over the tracer stat entries and store them in an rbtree. @@ -157,13 +157,10 @@ static int stat_seq_init(struct stat_session *session) goto exit_free_rbtree; } -exit: - mutex_unlock(&session->stat_mutex); return ret; exit_free_rbtree: __reset_stat_session(session); - mutex_unlock(&session->stat_mutex); return ret; } @@ -308,7 +305,7 @@ static int init_stat_file(struct stat_session *session) int register_stat_tracer(struct tracer_stat *trace) { struct stat_session *session, *node; - int ret = -EINVAL; + int ret; if (!trace) return -EINVAL; @@ -316,18 +313,18 @@ int register_stat_tracer(struct tracer_stat *trace) if (!trace->stat_start || !trace->stat_next || !trace->stat_show) return -EINVAL; + guard(mutex)(&all_stat_sessions_mutex); + /* Already registered? 
*/ - mutex_lock(&all_stat_sessions_mutex); list_for_each_entry(node, &all_stat_sessions, session_list) { if (node->ts == trace) - goto out; + return -EINVAL; } - ret = -ENOMEM; /* Init the session */ session = kzalloc(sizeof(*session), GFP_KERNEL); if (!session) - goto out; + return -ENOMEM; session->ts = trace; INIT_LIST_HEAD(&session->session_list); @@ -336,16 +333,13 @@ int register_stat_tracer(struct tracer_stat *trace) ret = init_stat_file(session); if (ret) { destroy_session(session); - goto out; + return ret; } - ret = 0; /* Register */ list_add_tail(&session->session_list, &all_stat_sessions); - out: - mutex_unlock(&all_stat_sessions_mutex); - return ret; + return 0; } void unregister_stat_tracer(struct tracer_stat *trace) From 9e49ca756d207f4313fb7af48648a67da8e4e250 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 20 Dec 2024 10:33:13 -0500 Subject: [PATCH 24/56] tracing/string: Create and use __free(argv_free) in trace_dynevent.c The function dyn_event_release() uses argv_split() which must be freed via argv_free(). It contains several error paths that do a goto out to call argv_free() for cleanup. This makes the code complex and error prone. Create a new __free() directive __free(argv_free) that will call argv_free() for data allocated with argv_split(), and use it in the dyn_event_release() function. Cc: Kees Cook Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Peter Zijlstra Cc: Andy Shevchenko Cc: linux-hardening@vger.kernel.org Link: https://lore.kernel.org/20241220103313.4a74ec8e@gandalf.local.home Signed-off-by: Steven Rostedt (Google) --- include/linux/string.h | 3 +++ kernel/trace/trace_dynevent.c | 23 +++++++---------------- 2 files changed, 10 insertions(+), 16 deletions(-) diff --git a/include/linux/string.h b/include/linux/string.h index 493ac4862c77..86d5d352068b 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -4,6 +4,7 @@ #include #include +#include /* for DEFINE_FREE() */ #include /* for inline */ #include /* for size_t */ #include /* for NULL */ @@ -312,6 +313,8 @@ extern void *kmemdup_array(const void *src, size_t count, size_t element_size, g extern char **argv_split(gfp_t gfp, const char *str, int *argcp); extern void argv_free(char **argv); +DEFINE_FREE(argv_free, char **, if (!IS_ERR_OR_NULL(_T)) argv_free(_T)) + /* lib/cmdline.c */ extern int get_option(char **str, int *pint); extern char *get_options(const char *str, int nints, int *ints); diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c index 4376887e0d8a..a322e4f249a5 100644 --- a/kernel/trace/trace_dynevent.c +++ b/kernel/trace/trace_dynevent.c @@ -74,24 +74,19 @@ int dyn_event_release(const char *raw_command, struct dyn_event_operations *type struct dyn_event *pos, *n; char *system = NULL, *event, *p; int argc, ret = -ENOENT; - char **argv; + char **argv __free(argv_free) = argv_split(GFP_KERNEL, raw_command, &argc); - argv = argv_split(GFP_KERNEL, raw_command, &argc); if (!argv) return -ENOMEM; if (argv[0][0] == '-') { - if (argv[0][1] != ':') { - ret = -EINVAL; - goto out; - } + if (argv[0][1] != ':') + return -EINVAL; event = &argv[0][2]; } else { event = strchr(argv[0], ':'); - if (!event) { - ret = -EINVAL; - goto out; - } + if (!event) + return -EINVAL; event++; } @@ -101,10 +96,8 @@ int dyn_event_release(const char *raw_command, struct dyn_event_operations *type event = p + 1; *p = '\0'; } - if (!system && event[0] == '\0') { - ret = -EINVAL; - goto out; - } + if (!system && event[0] == '\0') + return -EINVAL; mutex_lock(&event_mutex); 
for_each_dyn_event_safe(pos, n) { @@ -120,8 +113,6 @@ int dyn_event_release(const char *raw_command, struct dyn_event_operations *type } tracing_reset_all_online_cpus(); mutex_unlock(&event_mutex); -out: - argv_free(argv); return ret; } From cff6d93eab00bacf8b6bffdef775fc2de0273c96 Mon Sep 17 00:00:00 2001 From: Alice Ryhl Date: Thu, 12 Dec 2024 13:12:37 +0000 Subject: [PATCH 25/56] tracepoint: Reduce duplication of __DO_TRACE_CALL The logic for invoking __DO_TRACE_CALL was extracted to a static inline function called __rust_do_trace_##name so that Rust can call it directly. This logic does not include the static branch, to avoid a function call when the tracepoint is disabled. Since the C code needs to perform the same logic after checking the static key, this logic is currently duplicated. Thus, remove this duplication by having C call the static inline function too. Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20241212131237.1988409-1-aliceryhl@google.com Signed-off-by: Alice Ryhl Signed-off-by: Steven Rostedt (Google) --- include/linux/tracepoint.h | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 76d9055b2cff..a351763e6965 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -218,7 +218,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) #define __DEFINE_RUST_DO_TRACE(name, proto, args) \ notrace void rust_do_trace_##name(proto) \ { \ - __rust_do_trace_##name(args); \ + __do_trace_##name(args); \ } /* @@ -268,7 +268,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) #define __DECLARE_TRACE(name, proto, args, cond, data_proto) \ __DECLARE_TRACE_COMMON(name, PARAMS(proto), PARAMS(args), PARAMS(data_proto)) \ - static inline void __rust_do_trace_##name(proto) \ + static inline void __do_trace_##name(proto) \ { \ if (cond) { \ guard(preempt_notrace)(); \ @@ -277,12 +277,8 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) } \ static inline void trace_##name(proto) \ { \ - if (static_branch_unlikely(&__tracepoint_##name.key)) { \ - if (cond) { \ - guard(preempt_notrace)(); \ - __DO_TRACE_CALL(name, TP_ARGS(args)); \ - } \ - } \ + if (static_branch_unlikely(&__tracepoint_##name.key)) \ + __do_trace_##name(args); \ if (IS_ENABLED(CONFIG_LOCKDEP) && (cond)) { \ WARN_ONCE(!rcu_is_watching(), \ "RCU not watching for tracepoint"); \ @@ -291,7 +287,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) #define __DECLARE_TRACE_SYSCALL(name, proto, args, data_proto) \ __DECLARE_TRACE_COMMON(name, PARAMS(proto), PARAMS(args), PARAMS(data_proto)) \ - static inline void __rust_do_trace_##name(proto) \ + static inline void __do_trace_##name(proto) \ { \ guard(rcu_tasks_trace)(); \ __DO_TRACE_CALL(name, TP_ARGS(args)); \ @@ -299,10 +295,8 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) static inline void trace_##name(proto) \ { \ might_fault(); \ - if (static_branch_unlikely(&__tracepoint_##name.key)) { \ - guard(rcu_tasks_trace)(); \ - __DO_TRACE_CALL(name, TP_ARGS(args)); \ - } \ + if (static_branch_unlikely(&__tracepoint_##name.key)) \ + __do_trace_##name(args); \ if (IS_ENABLED(CONFIG_LOCKDEP)) { \ WARN_ONCE(!rcu_is_watching(), \ "RCU not watching for tracepoint"); \ From 6c432b56a16a0727561211a137f37ec47f96f1d0 Mon Sep 17 00:00:00 2001 From: Gabriele Monaco Date: Fri, 27 Dec 2024 15:47:45 +0100 Subject: [PATCH 
26/56] verification/dot2k: Fix template directory detection dot2k can be run as installed (e.g. make install) or from the kernel tree. In the former case it looks for templates in a known location; in the latter, the PWD has to be `/tools/verification` to properly import Python modules. The current version looks for the template in the wrong directory in this latter case. This patch adjusts the directory where dot2k looks for templates if run from the kernel tree (i.e. not installed). Additionally we fix a few simple pylint warnings in boolean expressions. Cc: Juri Lelli Cc: Thomas Gleixner Cc: John Kacur Link: https://lore.kernel.org/20241227144752.362911-2-gmonaco@redhat.com Signed-off-by: Gabriele Monaco Signed-off-by: Steven Rostedt (Google) --- tools/verification/dot2/dot2k.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tools/verification/dot2/dot2k.py b/tools/verification/dot2/dot2k.py index 016550fccf1f..f6d02e3406a3 100644 --- a/tools/verification/dot2/dot2k.py +++ b/tools/verification/dot2/dot2k.py @@ -14,14 +14,14 @@ import os class dot2k(Dot2c): monitor_types = { "global" : 1, "per_cpu" : 2, "per_task" : 3 } - monitor_templates_dir = "dot2k/rv_templates/" + monitor_templates_dir = "dot2/dot2k_templates/" monitor_type = "per_cpu" def __init__(self, file_path, MonitorType): super().__init__(file_path) self.monitor_type = self.monitor_types.get(MonitorType) - if self.monitor_type == None: + if self.monitor_type is None: raise Exception("Unknown monitor type: %s" % MonitorType) self.monitor_type = MonitorType @@ -31,7 +31,7 @@ class dot2k(Dot2c): def __fill_rv_templates_dir(self): - if os.path.exists(self.monitor_templates_dir) == True: + if os.path.exists(self.monitor_templates_dir): return if platform.system() != "Linux": @@ -39,11 +39,11 @@ class dot2k(Dot2c): kernel_path = "/lib/modules/%s/build/tools/verification/dot2/dot2k_templates/" % (platform.release()) - if os.path.exists(kernel_path) == True: + if os.path.exists(kernel_path): self.monitor_templates_dir = kernel_path return - if os.path.exists("/usr/share/dot2/dot2k_templates/") == True: + if os.path.exists("/usr/share/dot2/dot2k_templates/"): self.monitor_templates_dir = "/usr/share/dot2/dot2k_templates/" return @@ -98,7 +98,7 @@ class dot2k(Dot2c): def fill_main_c(self): main_c = self.main_c min_type = self.get_minimun_type() - nr_events = self.events.__len__() + nr_events = len(self.events) tracepoint_handlers = self.fill_tracepoint_handlers_skel() tracepoint_attach = self.fill_tracepoint_attach_probe() tracepoint_detach = self.fill_tracepoint_detach_helper() @@ -160,8 +160,8 @@ class dot2k(Dot2c): def __get_main_name(self): path = "%s/%s" % (self.name, "main.c") - if os.path.exists(path) == False: - return "main.c" + if not os.path.exists(path): + return "main.c" return "__main.c" def print_files(self): From ca08e071c59d96cb1db19b20ba70e9db7b9d5791 Mon Sep 17 00:00:00 2001 From: Gabriele Monaco Date: Fri, 27 Dec 2024 15:47:46 +0100 Subject: [PATCH 27/56] verification/dot2k: Unify main.c templates dot2k has 3 templates, one per monitor type, but the only difference among them is the `DECLARE_DA_MON_*` call; keeping 3 almost identical templates requires more work whenever we introduce a change. This patch removes the 3 dot2k templates and replaces them with a generic one; the model type is then adjusted from the script.
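Concretely, only the declaration line still depends on the monitor type. For a hypothetical per-CPU monitor named "wip" whose minimum state type is char (both names assumed here for illustration), the unified template would expand to:

    static struct rv_monitor rv_wip;
    DECLARE_DA_MON_PER_CPU(wip, char);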
Cc: Juri Lelli Cc: Thomas Gleixner Cc: John Kacur Link: https://lore.kernel.org/20241227144752.362911-3-gmonaco@redhat.com Signed-off-by: Gabriele Monaco Signed-off-by: Steven Rostedt (Google) --- tools/verification/dot2/dot2k.py | 7 +- .../dot2k_templates/{main_global.c => main.c} | 2 +- .../dot2/dot2k_templates/main_per_cpu.c | 91 ------------------- .../dot2/dot2k_templates/main_per_task.c | 91 ------------------- 4 files changed, 7 insertions(+), 184 deletions(-) rename tools/verification/dot2/dot2k_templates/{main_global.c => main.c} (97%) delete mode 100644 tools/verification/dot2/dot2k_templates/main_per_cpu.c delete mode 100644 tools/verification/dot2/dot2k_templates/main_per_task.c diff --git a/tools/verification/dot2/dot2k.py b/tools/verification/dot2/dot2k.py index f6d02e3406a3..15d6f7048f8d 100644 --- a/tools/verification/dot2/dot2k.py +++ b/tools/verification/dot2/dot2k.py @@ -26,7 +26,7 @@ class dot2k(Dot2c): self.monitor_type = MonitorType self.__fill_rv_templates_dir() - self.main_c = self.__open_file(self.monitor_templates_dir + "main_" + MonitorType + ".c") + self.main_c = self.__open_file(self.monitor_templates_dir + "main.c") self.enum_suffix = "_%s" % self.name def __fill_rv_templates_dir(self): @@ -69,6 +69,9 @@ class dot2k(Dot2c): # cut off the last \n return string[:-1] + def fill_monitor_type(self): + return self.monitor_type.upper() + def fill_tracepoint_handlers_skel(self): buff = [] for event in self.events: @@ -97,12 +100,14 @@ class dot2k(Dot2c): def fill_main_c(self): main_c = self.main_c + monitor_type = self.fill_monitor_type() min_type = self.get_minimun_type() nr_events = len(self.events) tracepoint_handlers = self.fill_tracepoint_handlers_skel() tracepoint_attach = self.fill_tracepoint_attach_probe() tracepoint_detach = self.fill_tracepoint_detach_helper() + main_c = main_c.replace("MONITOR_TYPE", monitor_type) main_c = main_c.replace("MIN_TYPE", min_type) main_c = main_c.replace("MODEL_NAME", self.name) main_c = main_c.replace("NR_EVENTS", str(nr_events)) diff --git a/tools/verification/dot2/dot2k_templates/main_global.c b/tools/verification/dot2/dot2k_templates/main.c similarity index 97% rename from tools/verification/dot2/dot2k_templates/main_global.c rename to tools/verification/dot2/dot2k_templates/main.c index a5658bfb9044..2419a6f89cd8 100644 --- a/tools/verification/dot2/dot2k_templates/main_global.c +++ b/tools/verification/dot2/dot2k_templates/main.c @@ -28,7 +28,7 @@ * The rv monitor reference is needed for the monitor declaration. */ static struct rv_monitor rv_MODEL_NAME; -DECLARE_DA_MON_GLOBAL(MODEL_NAME, MIN_TYPE); +DECLARE_DA_MON_MONITOR_TYPE(MODEL_NAME, MIN_TYPE); /* * This is the instrumentation part of the monitor. diff --git a/tools/verification/dot2/dot2k_templates/main_per_cpu.c b/tools/verification/dot2/dot2k_templates/main_per_cpu.c deleted file mode 100644 index 03539a97633f..000000000000 --- a/tools/verification/dot2/dot2k_templates/main_per_cpu.c +++ /dev/null @@ -1,91 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include -#include -#include -#include -#include -#include -#include - -#define MODULE_NAME "MODEL_NAME" - -/* - * XXX: include required tracepoint headers, e.g., - * #include - */ -#include - -/* - * This is the self-generated part of the monitor. Generally, there is no need - * to touch this section. - */ -#include "MODEL_NAME.h" - -/* - * Declare the deterministic automata monitor. - * - * The rv monitor reference is needed for the monitor declaration. 
- */ -static struct rv_monitor rv_MODEL_NAME; -DECLARE_DA_MON_PER_CPU(MODEL_NAME, MIN_TYPE); - -/* - * This is the instrumentation part of the monitor. - * - * This is the section where manual work is required. Here the kernel events - * are translated into model's event. - * - */ -TRACEPOINT_HANDLERS_SKEL -static int enable_MODEL_NAME(void) -{ - int retval; - - retval = da_monitor_init_MODEL_NAME(); - if (retval) - return retval; - -TRACEPOINT_ATTACH - - return 0; -} - -static void disable_MODEL_NAME(void) -{ - rv_MODEL_NAME.enabled = 0; - -TRACEPOINT_DETACH - - da_monitor_destroy_MODEL_NAME(); -} - -/* - * This is the monitor register section. - */ -static struct rv_monitor rv_MODEL_NAME = { - .name = "MODEL_NAME", - .description = "auto-generated MODEL_NAME", - .enable = enable_MODEL_NAME, - .disable = disable_MODEL_NAME, - .reset = da_monitor_reset_all_MODEL_NAME, - .enabled = 0, -}; - -static int __init register_MODEL_NAME(void) -{ - rv_register_monitor(&rv_MODEL_NAME); - return 0; -} - -static void __exit unregister_MODEL_NAME(void) -{ - rv_unregister_monitor(&rv_MODEL_NAME); -} - -module_init(register_MODEL_NAME); -module_exit(unregister_MODEL_NAME); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("dot2k: auto-generated"); -MODULE_DESCRIPTION("MODEL_NAME"); diff --git a/tools/verification/dot2/dot2k_templates/main_per_task.c b/tools/verification/dot2/dot2k_templates/main_per_task.c deleted file mode 100644 index ffd92af87a86..000000000000 --- a/tools/verification/dot2/dot2k_templates/main_per_task.c +++ /dev/null @@ -1,91 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include -#include -#include -#include -#include -#include -#include - -#define MODULE_NAME "MODEL_NAME" - -/* - * XXX: include required tracepoint headers, e.g., - * #include - */ -#include - -/* - * This is the self-generated part of the monitor. Generally, there is no need - * to touch this section. - */ -#include "MODEL_NAME.h" - -/* - * Declare the deterministic automata monitor. - * - * The rv monitor reference is needed for the monitor declaration. - */ -static struct rv_monitor rv_MODEL_NAME; -DECLARE_DA_MON_PER_TASK(MODEL_NAME, MIN_TYPE); - -/* - * This is the instrumentation part of the monitor. - * - * This is the section where manual work is required. Here the kernel events - * are translated into model's event. - * - */ -TRACEPOINT_HANDLERS_SKEL -static int enable_MODEL_NAME(void) -{ - int retval; - - retval = da_monitor_init_MODEL_NAME(); - if (retval) - return retval; - -TRACEPOINT_ATTACH - - return 0; -} - -static void disable_MODEL_NAME(void) -{ - rv_MODEL_NAME.enabled = 0; - -TRACEPOINT_DETACH - - da_monitor_destroy_MODEL_NAME(); -} - -/* - * This is the monitor register section. 
- */ -static struct rv_monitor rv_MODEL_NAME = { - .name = "MODEL_NAME", - .description = "auto-generated MODEL_NAME", - .enable = enable_MODEL_NAME, - .disable = disable_MODEL_NAME, - .reset = da_monitor_reset_all_MODEL_NAME, - .enabled = 0, -}; - -static int __init register_MODEL_NAME(void) -{ - rv_register_monitor(&rv_MODEL_NAME); - return 0; -} - -static void __exit unregister_MODEL_NAME(void) -{ - rv_unregister_monitor(&rv_MODEL_NAME); -} - -module_init(register_MODEL_NAME); -module_exit(unregister_MODEL_NAME); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("dot2k: auto-generated"); -MODULE_DESCRIPTION("MODEL_NAME"); From 91f3407e13b89b7391ebc5b6143fd22edd901041 Mon Sep 17 00:00:00 2001 From: Gabriele Monaco Date: Fri, 27 Dec 2024 15:47:47 +0100 Subject: [PATCH 28/56] verification/dot2k: More robust template variables The dot2k templates currently have variables that are automatically filled by the script marked as an uppercase VARIABLE. This requires some care while adding new variables to avoid using valid keywords and get them unexpectedly substituted. This patch switches the variables to the %%VARIABLE%% notation to make the pattern substitution more robust. Cc: Juri Lelli Cc: Thomas Gleixner Cc: John Kacur Link: https://lore.kernel.org/20241227144752.362911-4-gmonaco@redhat.com Signed-off-by: Gabriele Monaco Signed-off-by: Steven Rostedt (Google) --- tools/verification/dot2/dot2k.py | 14 +++--- .../verification/dot2/dot2k_templates/main.c | 50 +++++++++---------- 2 files changed, 32 insertions(+), 32 deletions(-) diff --git a/tools/verification/dot2/dot2k.py b/tools/verification/dot2/dot2k.py index 15d6f7048f8d..c88b3c011706 100644 --- a/tools/verification/dot2/dot2k.py +++ b/tools/verification/dot2/dot2k.py @@ -107,13 +107,13 @@ class dot2k(Dot2c): tracepoint_attach = self.fill_tracepoint_attach_probe() tracepoint_detach = self.fill_tracepoint_detach_helper() - main_c = main_c.replace("MONITOR_TYPE", monitor_type) - main_c = main_c.replace("MIN_TYPE", min_type) - main_c = main_c.replace("MODEL_NAME", self.name) - main_c = main_c.replace("NR_EVENTS", str(nr_events)) - main_c = main_c.replace("TRACEPOINT_HANDLERS_SKEL", tracepoint_handlers) - main_c = main_c.replace("TRACEPOINT_ATTACH", tracepoint_attach) - main_c = main_c.replace("TRACEPOINT_DETACH", tracepoint_detach) + main_c = main_c.replace("%%MONITOR_TYPE%%", monitor_type) + main_c = main_c.replace("%%MIN_TYPE%%", min_type) + main_c = main_c.replace("%%MODEL_NAME%%", self.name) + main_c = main_c.replace("%%NR_EVENTS%%", str(nr_events)) + main_c = main_c.replace("%%TRACEPOINT_HANDLERS_SKEL%%", tracepoint_handlers) + main_c = main_c.replace("%%TRACEPOINT_ATTACH%%", tracepoint_attach) + main_c = main_c.replace("%%TRACEPOINT_DETACH%%", tracepoint_detach) return main_c diff --git a/tools/verification/dot2/dot2k_templates/main.c b/tools/verification/dot2/dot2k_templates/main.c index 2419a6f89cd8..4a05fef7f3c7 100644 --- a/tools/verification/dot2/dot2k_templates/main.c +++ b/tools/verification/dot2/dot2k_templates/main.c @@ -8,7 +8,7 @@ #include #include -#define MODULE_NAME "MODEL_NAME" +#define MODULE_NAME "%%MODEL_NAME%%" /* * XXX: include required tracepoint headers, e.g., @@ -20,15 +20,15 @@ * This is the self-generated part of the monitor. Generally, there is no need * to touch this section. */ -#include "MODEL_NAME.h" +#include "%%MODEL_NAME%%.h" /* * Declare the deterministic automata monitor. * * The rv monitor reference is needed for the monitor declaration. 
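As an illustration, here is a template line and the code generated from it for a hypothetical monitor named "wip"; only the delimited marker is rewritten, so an ordinary word that happens to match a variable name can no longer be clobbered:

    #define MODULE_NAME "%%MODEL_NAME%%"    /* template */
    #define MODULE_NAME "wip"               /* generated */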
*/ -static struct rv_monitor rv_MODEL_NAME; -DECLARE_DA_MON_MONITOR_TYPE(MODEL_NAME, MIN_TYPE); +static struct rv_monitor rv_%%MODEL_NAME%%; +DECLARE_DA_MON_%%MONITOR_TYPE%%(%%MODEL_NAME%%, %%MIN_TYPE%%); /* * This is the instrumentation part of the monitor. @@ -37,55 +37,55 @@ DECLARE_DA_MON_MONITOR_TYPE(MODEL_NAME, MIN_TYPE); * are translated into model's event. * */ -TRACEPOINT_HANDLERS_SKEL -static int enable_MODEL_NAME(void) +%%TRACEPOINT_HANDLERS_SKEL%% +static int enable_%%MODEL_NAME%%(void) { int retval; - retval = da_monitor_init_MODEL_NAME(); + retval = da_monitor_init_%%MODEL_NAME%%(); if (retval) return retval; -TRACEPOINT_ATTACH +%%TRACEPOINT_ATTACH%% return 0; } -static void disable_MODEL_NAME(void) +static void disable_%%MODEL_NAME%%(void) { - rv_MODEL_NAME.enabled = 0; + rv_%%MODEL_NAME%%.enabled = 0; -TRACEPOINT_DETACH +%%TRACEPOINT_DETACH%% - da_monitor_destroy_MODEL_NAME(); + da_monitor_destroy_%%MODEL_NAME%%(); } /* * This is the monitor register section. */ -static struct rv_monitor rv_MODEL_NAME = { - .name = "MODEL_NAME", - .description = "auto-generated MODEL_NAME", - .enable = enable_MODEL_NAME, - .disable = disable_MODEL_NAME, - .reset = da_monitor_reset_all_MODEL_NAME, +static struct rv_monitor rv_%%MODEL_NAME%% = { + .name = "%%MODEL_NAME%%", + .description = "auto-generated %%MODEL_NAME%%", + .enable = enable_%%MODEL_NAME%%, + .disable = disable_%%MODEL_NAME%%, + .reset = da_monitor_reset_all_%%MODEL_NAME%%, .enabled = 0, }; -static int __init register_MODEL_NAME(void) +static int __init register_%%MODEL_NAME%%(void) { - rv_register_monitor(&rv_MODEL_NAME); + rv_register_monitor(&rv_%%MODEL_NAME%%); return 0; } -static void __exit unregister_MODEL_NAME(void) +static void __exit unregister_%%MODEL_NAME%%(void) { - rv_unregister_monitor(&rv_MODEL_NAME); + rv_unregister_monitor(&rv_%%MODEL_NAME%%); } -module_init(register_MODEL_NAME); -module_exit(unregister_MODEL_NAME); +module_init(register_%%MODEL_NAME%%); +module_exit(unregister_%%MODEL_NAME%%); MODULE_LICENSE("GPL"); MODULE_AUTHOR("dot2k: auto-generated"); -MODULE_DESCRIPTION("MODEL_NAME"); +MODULE_DESCRIPTION("%%MODEL_NAME%%"); From 64b3e5f0d45329bc593e13b64dcdcf836da006cd Mon Sep 17 00:00:00 2001 From: Gabriele Monaco Date: Fri, 27 Dec 2024 15:47:48 +0100 Subject: [PATCH 29/56] verification/dot2k: Add support for name and description options The dot2k command includes options to set a model name with -n and a description with -D, however those are not used in practice. This patch allows to specify a custom model name (by default the name of the dot file without extension) and a description which overrides the one in the C file. 
Cc: Juri Lelli Cc: Thomas Gleixner Cc: John Kacur Link: https://lore.kernel.org/20241227144752.362911-5-gmonaco@redhat.com Signed-off-by: Gabriele Monaco Signed-off-by: Steven Rostedt (Google) --- tools/verification/dot2/automata.py | 4 ++-- tools/verification/dot2/dot2c.py | 4 ++-- tools/verification/dot2/dot2k | 6 +----- tools/verification/dot2/dot2k.py | 8 +++++--- tools/verification/dot2/dot2k_templates/main.c | 4 ++-- 5 files changed, 12 insertions(+), 14 deletions(-) diff --git a/tools/verification/dot2/automata.py b/tools/verification/dot2/automata.py index bdeb98baa8b0..f6921cf3c914 100644 --- a/tools/verification/dot2/automata.py +++ b/tools/verification/dot2/automata.py @@ -19,9 +19,9 @@ class Automata: invalid_state_str = "INVALID_STATE" - def __init__(self, file_path): + def __init__(self, file_path, model_name=None): self.__dot_path = file_path - self.name = self.__get_model_name() + self.name = model_name or self.__get_model_name() self.__dot_lines = self.__open_dot() self.states, self.initial_state, self.final_states = self.__get_state_variables() self.events = self.__get_event_variables() diff --git a/tools/verification/dot2/dot2c.py b/tools/verification/dot2/dot2c.py index 87d8a1e1470c..fa2816ac7b61 100644 --- a/tools/verification/dot2/dot2c.py +++ b/tools/verification/dot2/dot2c.py @@ -22,8 +22,8 @@ class Dot2c(Automata): struct_automaton_def = "automaton" var_automaton_def = "aut" - def __init__(self, file_path): - super().__init__(file_path) + def __init__(self, file_path, model_name=None): + super().__init__(file_path, model_name) self.line_length = 100 def __buff_to_string(self, buff): diff --git a/tools/verification/dot2/dot2k b/tools/verification/dot2/dot2k index d4d7e52d549e..827b62b8d5e1 100644 --- a/tools/verification/dot2/dot2k +++ b/tools/verification/dot2/dot2k @@ -25,16 +25,12 @@ if __name__ == '__main__': print("Opening and parsing the dot file %s" % params.dot_file) try: - monitor=dot2k(params.dot_file, params.monitor_type) + monitor=dot2k(params.dot_file, params.monitor_type, vars(params)) except Exception as e: print('Error: '+ str(e)) print("Sorry : :-(") sys.exit(1) - # easier than using argparse action. 
- if params.model_name != None: - print(params.model_name) - print("Writing the monitor into the directory %s" % monitor.name) monitor.print_files() print("Almost done, checklist") diff --git a/tools/verification/dot2/dot2k.py b/tools/verification/dot2/dot2k.py index c88b3c011706..d48ad86a035a 100644 --- a/tools/verification/dot2/dot2k.py +++ b/tools/verification/dot2/dot2k.py @@ -17,17 +17,18 @@ class dot2k(Dot2c): monitor_templates_dir = "dot2/dot2k_templates/" monitor_type = "per_cpu" - def __init__(self, file_path, MonitorType): - super().__init__(file_path) + def __init__(self, file_path, MonitorType, extra_params={}): + super().__init__(file_path, extra_params.get("model_name")) self.monitor_type = self.monitor_types.get(MonitorType) if self.monitor_type is None: - raise Exception("Unknown monitor type: %s" % MonitorType) + raise ValueError("Unknown monitor type: %s" % MonitorType) self.monitor_type = MonitorType self.__fill_rv_templates_dir() self.main_c = self.__open_file(self.monitor_templates_dir + "main.c") self.enum_suffix = "_%s" % self.name + self.description = extra_params.get("description", self.name) or "auto-generated" def __fill_rv_templates_dir(self): @@ -114,6 +115,7 @@ class dot2k(Dot2c): main_c = main_c.replace("%%TRACEPOINT_HANDLERS_SKEL%%", tracepoint_handlers) main_c = main_c.replace("%%TRACEPOINT_ATTACH%%", tracepoint_attach) main_c = main_c.replace("%%TRACEPOINT_DETACH%%", tracepoint_detach) + main_c = main_c.replace("%%DESCRIPTION%%", self.description) return main_c

diff --git a/tools/verification/dot2/dot2k_templates/main.c b/tools/verification/dot2/dot2k_templates/main.c index 4a05fef7f3c7..704617168578 100644 --- a/tools/verification/dot2/dot2k_templates/main.c +++ b/tools/verification/dot2/dot2k_templates/main.c @@ -65,7 +65,7 @@ static void disable_%%MODEL_NAME%%(void) */ static struct rv_monitor rv_%%MODEL_NAME%% = { .name = "%%MODEL_NAME%%", - .description = "auto-generated %%MODEL_NAME%%", + .description = "%%DESCRIPTION%%", .enable = enable_%%MODEL_NAME%%, .disable = disable_%%MODEL_NAME%%, .reset = da_monitor_reset_all_%%MODEL_NAME%%, @@ -88,4 +88,4 @@ module_exit(unregister_%%MODEL_NAME%%); MODULE_LICENSE("GPL"); MODULE_AUTHOR("dot2k: auto-generated"); -MODULE_DESCRIPTION("%%MODEL_NAME%%"); +MODULE_DESCRIPTION("%%MODEL_NAME%%: %%DESCRIPTION%%");
From bc3d482dcc062963e7dc20565be2a887e5fc9a2d Mon Sep 17 00:00:00 2001 From: Gabriele Monaco Date: Fri, 27 Dec 2024 15:47:49 +0100 Subject: [PATCH 30/56] rv: Simplify manual steps in monitor creation

While creating a new monitor in RV, besides generating code from dot2k, there are a few manual steps which can be tedious and error-prone, such as adding the tracepoints, Makefile lines, and Kconfig entries.

This patch restructures the existing monitors to keep some files in the monitor's folder itself, so that they can be automatically generated by future versions of dot2k.

Monitors now have their own Kconfig and tracepoint snippets. For simplicity, the main tracepoint definition is moved to the RV directory; it defines only the tracepoint classes and includes the monitor-specific tracepoints, which reside in the monitor directory.

Tracepoints and Kconfig no longer need to be copied and adapted from existing ones but only need to be included in the main files.

The Makefile remains untouched, since there is little advantage in having a separate Makefile for each monitor that holds a single line and is included from the main RV Makefile.
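
The resulting layout, sketched from the diffstat below for the wip monitor (wwnr follows the same pattern), looks roughly like this:

  kernel/trace/rv/
    Kconfig        <- sources each monitor's Kconfig
    Makefile
    rv_trace.h     <- tracepoint classes; includes the per-monitor snippets
    monitors/wip/
      Kconfig
      wip.c
      wip_trace.h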
Cc: Juri Lelli Cc: Thomas Gleixner Cc: John Kacur Link: https://lore.kernel.org/20241227144752.362911-6-gmonaco@redhat.com Signed-off-by: Gabriele Monaco Signed-off-by: Steven Rostedt (Google) --- kernel/trace/rv/Kconfig | 25 ++----------------- kernel/trace/rv/Makefile | 2 ++ kernel/trace/rv/monitors/wip/Kconfig | 12 +++++++++ kernel/trace/rv/monitors/wip/wip.c | 2 +- kernel/trace/rv/monitors/wip/wip_trace.h | 15 +++++++++++ kernel/trace/rv/monitors/wwnr/Kconfig | 11 ++++++++ kernel/trace/rv/monitors/wwnr/wwnr.c | 2 +- kernel/trace/rv/monitors/wwnr/wwnr_trace.h | 16 ++++++++++++ kernel/trace/rv/rv.c | 2 +- .../events/rv.h => kernel/trace/rv/rv_trace.h | 22 +++------------- 10 files changed, 65 insertions(+), 44 deletions(-) create mode 100644 kernel/trace/rv/monitors/wip/Kconfig create mode 100644 kernel/trace/rv/monitors/wip/wip_trace.h create mode 100644 kernel/trace/rv/monitors/wwnr/Kconfig create mode 100644 kernel/trace/rv/monitors/wwnr/wwnr_trace.h rename include/trace/events/rv.h => kernel/trace/rv/rv_trace.h (79%) diff --git a/kernel/trace/rv/Kconfig b/kernel/trace/rv/Kconfig index 831779607e84..1cca47531f00 100644 --- a/kernel/trace/rv/Kconfig +++ b/kernel/trace/rv/Kconfig @@ -25,30 +25,9 @@ menuconfig RV For further information, see: Documentation/trace/rv/runtime-verification.rst -config RV_MON_WIP - depends on RV - depends on PREEMPT_TRACER - select DA_MON_EVENTS_IMPLICIT - bool "wip monitor" - help - Enable wip (wakeup in preemptive) sample monitor that illustrates - the usage of per-cpu monitors, and one limitation of the - preempt_disable/enable events. +source "kernel/trace/rv/monitors/wip/Kconfig" - For further information, see: - Documentation/trace/rv/monitor_wip.rst - -config RV_MON_WWNR - depends on RV - select DA_MON_EVENTS_ID - bool "wwnr monitor" - help - Enable wwnr (wakeup while not running) sample monitor, this is a - sample monitor that illustrates the usage of per-task monitor. - The model is borken on purpose: it serves to test reactors. - - For further information, see: - Documentation/trace/rv/monitor_wwnr.rst +source "kernel/trace/rv/monitors/wwnr/Kconfig" config RV_REACTORS bool "Runtime verification reactors" diff --git a/kernel/trace/rv/Makefile b/kernel/trace/rv/Makefile index 963d14875b45..645434146a88 100644 --- a/kernel/trace/rv/Makefile +++ b/kernel/trace/rv/Makefile @@ -1,5 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 +ccflags-y += -I $(src) # needed for trace events + obj-$(CONFIG_RV) += rv.o obj-$(CONFIG_RV_MON_WIP) += monitors/wip/wip.o obj-$(CONFIG_RV_MON_WWNR) += monitors/wwnr/wwnr.o diff --git a/kernel/trace/rv/monitors/wip/Kconfig b/kernel/trace/rv/monitors/wip/Kconfig new file mode 100644 index 000000000000..3ef664b5cd90 --- /dev/null +++ b/kernel/trace/rv/monitors/wip/Kconfig @@ -0,0 +1,12 @@ +config RV_MON_WIP + depends on RV + depends on PREEMPT_TRACER + select DA_MON_EVENTS_IMPLICIT + bool "wip monitor" + help + Enable wip (wakeup in preemptive) sample monitor that illustrates + the usage of per-cpu monitors, and one limitation of the + preempt_disable/enable events. 
+ + For further information, see: + Documentation/trace/rv/monitor_wip.rst diff --git a/kernel/trace/rv/monitors/wip/wip.c b/kernel/trace/rv/monitors/wip/wip.c index b2b49a27e886..db7389157c87 100644 --- a/kernel/trace/rv/monitors/wip/wip.c +++ b/kernel/trace/rv/monitors/wip/wip.c @@ -10,7 +10,7 @@ #define MODULE_NAME "wip" -#include +#include #include #include diff --git a/kernel/trace/rv/monitors/wip/wip_trace.h b/kernel/trace/rv/monitors/wip/wip_trace.h new file mode 100644 index 000000000000..aa2162f47a4c --- /dev/null +++ b/kernel/trace/rv/monitors/wip/wip_trace.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Snippet to be included in rv_trace.h + */ + +#ifdef CONFIG_RV_MON_WIP +DEFINE_EVENT(event_da_monitor, event_wip, + TP_PROTO(char *state, char *event, char *next_state, bool final_state), + TP_ARGS(state, event, next_state, final_state)); + +DEFINE_EVENT(error_da_monitor, error_wip, + TP_PROTO(char *state, char *event), + TP_ARGS(state, event)); +#endif /* CONFIG_RV_MON_WIP */ diff --git a/kernel/trace/rv/monitors/wwnr/Kconfig b/kernel/trace/rv/monitors/wwnr/Kconfig new file mode 100644 index 000000000000..ee741aa6d6b8 --- /dev/null +++ b/kernel/trace/rv/monitors/wwnr/Kconfig @@ -0,0 +1,11 @@ +config RV_MON_WWNR + depends on RV + select DA_MON_EVENTS_ID + bool "wwnr monitor" + help + Enable wwnr (wakeup while not running) sample monitor, this is a + sample monitor that illustrates the usage of per-task monitor. + The model is borken on purpose: it serves to test reactors. + + For further information, see: + Documentation/trace/rv/monitor_wwnr.rst diff --git a/kernel/trace/rv/monitors/wwnr/wwnr.c b/kernel/trace/rv/monitors/wwnr/wwnr.c index 0e43dd2db685..3b16994a9984 100644 --- a/kernel/trace/rv/monitors/wwnr/wwnr.c +++ b/kernel/trace/rv/monitors/wwnr/wwnr.c @@ -10,7 +10,7 @@ #define MODULE_NAME "wwnr" -#include +#include #include #include "wwnr.h" diff --git a/kernel/trace/rv/monitors/wwnr/wwnr_trace.h b/kernel/trace/rv/monitors/wwnr/wwnr_trace.h new file mode 100644 index 000000000000..fc97ec7476ad --- /dev/null +++ b/kernel/trace/rv/monitors/wwnr/wwnr_trace.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Snippet to be included in rv_trace.h + */ + +#ifdef CONFIG_RV_MON_WWNR +/* id is the pid of the task */ +DEFINE_EVENT(event_da_monitor_id, event_wwnr, + TP_PROTO(int id, char *state, char *event, char *next_state, bool final_state), + TP_ARGS(id, state, event, next_state, final_state)); + +DEFINE_EVENT(error_da_monitor_id, error_wwnr, + TP_PROTO(int id, char *state, char *event), + TP_ARGS(id, state, event)); +#endif /* CONFIG_RV_MON_WWNR */ diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c index 279c70e1bd74..8657fc8806e7 100644 --- a/kernel/trace/rv/rv.c +++ b/kernel/trace/rv/rv.c @@ -145,7 +145,7 @@ #ifdef CONFIG_DA_MON_EVENTS #define CREATE_TRACE_POINTS -#include +#include #endif #include "rv.h" diff --git a/include/trace/events/rv.h b/kernel/trace/rv/rv_trace.h similarity index 79% rename from include/trace/events/rv.h rename to kernel/trace/rv/rv_trace.h index 56592da9301c..3442dc59490f 100644 --- a/include/trace/events/rv.h +++ b/kernel/trace/rv/rv_trace.h @@ -57,15 +57,8 @@ DECLARE_EVENT_CLASS(error_da_monitor, __entry->state) ); -#ifdef CONFIG_RV_MON_WIP -DEFINE_EVENT(event_da_monitor, event_wip, - TP_PROTO(char *state, char *event, char *next_state, bool final_state), - TP_ARGS(state, event, next_state, final_state)); +#include -DEFINE_EVENT(error_da_monitor, error_wip, - TP_PROTO(char *state, char *event), - 
TP_ARGS(state, event)); -#endif /* CONFIG_RV_MON_WIP */ #endif /* CONFIG_DA_MON_EVENTS_IMPLICIT */ #ifdef CONFIG_DA_MON_EVENTS_ID @@ -123,20 +116,13 @@ DECLARE_EVENT_CLASS(error_da_monitor_id, __entry->state) ); -#ifdef CONFIG_RV_MON_WWNR -/* id is the pid of the task */ -DEFINE_EVENT(event_da_monitor_id, event_wwnr, - TP_PROTO(int id, char *state, char *event, char *next_state, bool final_state), - TP_ARGS(id, state, event, next_state, final_state)); - -DEFINE_EVENT(error_da_monitor_id, error_wwnr, - TP_PROTO(int id, char *state, char *event), - TP_ARGS(id, state, event)); -#endif /* CONFIG_RV_MON_WWNR */ +#include #endif /* CONFIG_DA_MON_EVENTS_ID */ #endif /* _TRACE_RV_H */ /* This part ust be outside protection */ #undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE rv_trace #include From 9c6cfe80980056042f1f80d65c74806021708989 Mon Sep 17 00:00:00 2001 From: Gabriele Monaco Date: Fri, 27 Dec 2024 15:47:50 +0100 Subject: [PATCH 31/56] verification/dot2k: Simplify manual steps in monitor creation This patch reduces and simplifies the manual steps still needed in creating a new RV monitor. It extends the dot2k script to create a tracepoint snippet and a Kconfig file for the newly generated monitor. Those files can be kept in the monitor's directory but shall be included in the main tracepoint header and Kconfig. Together with the checklist, dot2k now suggests the lines to add to those files for inclusion and the Makefile line to compile the new monitor: Writing the monitor into the directory monitor_name Almost done, checklist - Edit the monitor_name/monitor_name.c to add the instrumentation - Edit kernel/trace/rv/rv_trace.h: Add this line where other tracepoints are included and DA_MON_EVENTS_ID is defined: #include - Edit kernel/trace/rv/Makefile: Add this line where other monitors are included: obj-$(CONFIG_RV_MON_MONITOR_NAME) += monitors/monitor_name/monitor_name.o - Edit kernel/trace/rv/Kconfig: Add this line where other monitors are included: source "kernel/trace/rv/monitors/monitor_name/Kconfig" - Move monitor_name/ to the kernel's monitor directory (kernel/trace/rv/monitors) Cc: Juri Lelli Cc: Thomas Gleixner Cc: John Kacur Link: https://lore.kernel.org/20241227144752.362911-7-gmonaco@redhat.com Signed-off-by: Gabriele Monaco Signed-off-by: Steven Rostedt (Google) --- tools/verification/dot2/dot2k | 8 +- tools/verification/dot2/dot2k.py | 86 +++++++++++++++++++ .../verification/dot2/dot2k_templates/Kconfig | 6 ++ .../verification/dot2/dot2k_templates/main.c | 2 +- .../verification/dot2/dot2k_templates/trace.h | 13 +++ 5 files changed, 110 insertions(+), 5 deletions(-) create mode 100644 tools/verification/dot2/dot2k_templates/Kconfig create mode 100644 tools/verification/dot2/dot2k_templates/trace.h diff --git a/tools/verification/dot2/dot2k b/tools/verification/dot2/dot2k index 827b62b8d5e1..190c974edd0a 100644 --- a/tools/verification/dot2/dot2k +++ b/tools/verification/dot2/dot2k @@ -35,7 +35,7 @@ if __name__ == '__main__': monitor.print_files() print("Almost done, checklist") print(" - Edit the %s/%s.c to add the instrumentation" % (monitor.name, monitor.name)) - print(" - Edit include/trace/events/rv.h to add the tracepoint entry") - print(" - Move it to the kernel's monitor directory") - print(" - Edit kernel/trace/rv/Makefile") - print(" - Edit kernel/trace/rv/Kconfig") + print(monitor.fill_tracepoint_tooltip()) + print(monitor.fill_makefile_tooltip()) + print(monitor.fill_kconfig_tooltip()) + print(" - Move %s/ to the kernel's monitor 
directory (%s/monitors)" % (monitor.name, monitor.rv_dir)) diff --git a/tools/verification/dot2/dot2k.py b/tools/verification/dot2/dot2k.py index d48ad86a035a..dc56cd1fb0b4 100644 --- a/tools/verification/dot2/dot2k.py +++ b/tools/verification/dot2/dot2k.py @@ -15,6 +15,7 @@ import os class dot2k(Dot2c): monitor_types = { "global" : 1, "per_cpu" : 2, "per_task" : 3 } monitor_templates_dir = "dot2/dot2k_templates/" + rv_dir = "kernel/trace/rv" monitor_type = "per_cpu" def __init__(self, file_path, MonitorType, extra_params={}): @@ -27,6 +28,8 @@ class dot2k(Dot2c): self.monitor_type = MonitorType self.__fill_rv_templates_dir() self.main_c = self.__open_file(self.monitor_templates_dir + "main.c") + self.trace_h = self.__open_file(self.monitor_templates_dir + "trace.h") + self.kconfig = self.__open_file(self.monitor_templates_dir + "Kconfig") self.enum_suffix = "_%s" % self.name self.description = extra_params.get("description", self.name) or "auto-generated" @@ -144,6 +147,82 @@ class dot2k(Dot2c): return self.__buff_to_string(buff) + def fill_monitor_class_type(self): + if self.monitor_type == "per_task": + return "DA_MON_EVENTS_ID" + return "DA_MON_EVENTS_IMPLICIT" + + def fill_monitor_class(self): + if self.monitor_type == "per_task": + return "da_monitor_id" + return "da_monitor" + + def fill_tracepoint_args_skel(self, tp_type): + buff = [] + tp_args_event = [ + ("char *", "state"), + ("char *", "event"), + ("char *", "next_state"), + ("bool ", "final_state"), + ] + tp_args_error = [ + ("char *", "state"), + ("char *", "event"), + ] + tp_args_id = ("int ", "id") + tp_args = tp_args_event if tp_type == "event" else tp_args_error + if self.monitor_type == "per_task": + tp_args.insert(0, tp_args_id) + tp_proto_c = ", ".join([a+b for a,b in tp_args]) + tp_args_c = ", ".join([b for a,b in tp_args]) + buff.append(" TP_PROTO(%s)," % tp_proto_c) + buff.append(" TP_ARGS(%s)" % tp_args_c) + return self.__buff_to_string(buff) + + def fill_trace_h(self): + trace_h = self.trace_h + monitor_class = self.fill_monitor_class() + monitor_class_type = self.fill_monitor_class_type() + tracepoint_args_skel_event = self.fill_tracepoint_args_skel("event") + tracepoint_args_skel_error = self.fill_tracepoint_args_skel("error") + trace_h = trace_h.replace("%%MODEL_NAME%%", self.name) + trace_h = trace_h.replace("%%MODEL_NAME_UP%%", self.name.upper()) + trace_h = trace_h.replace("%%MONITOR_CLASS%%", monitor_class) + trace_h = trace_h.replace("%%MONITOR_CLASS_TYPE%%", monitor_class_type) + trace_h = trace_h.replace("%%TRACEPOINT_ARGS_SKEL_EVENT%%", tracepoint_args_skel_event) + trace_h = trace_h.replace("%%TRACEPOINT_ARGS_SKEL_ERROR%%", tracepoint_args_skel_error) + return trace_h + + def fill_kconfig(self): + kconfig = self.kconfig + monitor_class_type = self.fill_monitor_class_type() + kconfig = kconfig.replace("%%MODEL_NAME%%", self.name) + kconfig = kconfig.replace("%%MODEL_NAME_UP%%", self.name.upper()) + kconfig = kconfig.replace("%%MONITOR_CLASS_TYPE%%", monitor_class_type) + kconfig = kconfig.replace("%%DESCRIPTION%%", self.description) + return kconfig + + def fill_tracepoint_tooltip(self): + monitor_class_type = self.fill_monitor_class_type() + return """ - Edit %s/rv_trace.h: +Add this line where other tracepoints are included and %s is defined: +#include +""" % (self.rv_dir, monitor_class_type, self.name, self.name) + + def fill_kconfig_tooltip(self): + return """ - Edit %s/Kconfig: +Add this line where other monitors are included: +source \"kernel/trace/rv/monitors/%s/Kconfig\" +""" % (self.rv_dir, 
self.name) + + def fill_makefile_tooltip(self): + name = self.name + name_up = name.upper() + return """ - Edit %s/Makefile: +Add this line where other monitors are included: +obj-$(CONFIG_RV_MON_%s) += monitors/%s/%s.o +""" % (self.rv_dir, name_up, name, name) + def __create_directory(self): try: os.mkdir(self.name) @@ -182,3 +261,10 @@ class dot2k(Dot2c): path = "%s.h" % self.name self.__create_file(path, model_h) + + trace_h = self.fill_trace_h() + path = "%s_trace.h" % self.name + self.__create_file(path, trace_h) + + kconfig = self.fill_kconfig() + self.__create_file("Kconfig", kconfig)

diff --git a/tools/verification/dot2/dot2k_templates/Kconfig b/tools/verification/dot2/dot2k_templates/Kconfig new file mode 100644 index 000000000000..90cdc1e9379e --- /dev/null +++ b/tools/verification/dot2/dot2k_templates/Kconfig @@ -0,0 +1,6 @@ +config RV_MON_%%MODEL_NAME_UP%% + depends on RV + select %%MONITOR_CLASS_TYPE%% + bool "%%MODEL_NAME%% monitor" + help + %%DESCRIPTION%%

diff --git a/tools/verification/dot2/dot2k_templates/main.c b/tools/verification/dot2/dot2k_templates/main.c index 704617168578..9605ca994416 100644 --- a/tools/verification/dot2/dot2k_templates/main.c +++ b/tools/verification/dot2/dot2k_templates/main.c @@ -14,7 +14,7 @@ * XXX: include required tracepoint headers, e.g., * #include */ -#include +#include /* * This is the self-generated part of the monitor. Generally, there is no need

diff --git a/tools/verification/dot2/dot2k_templates/trace.h b/tools/verification/dot2/dot2k_templates/trace.h new file mode 100644 index 000000000000..87d3a1308926 --- /dev/null +++ b/tools/verification/dot2/dot2k_templates/trace.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Snippet to be included in rv_trace.h + */ + +#ifdef CONFIG_RV_MON_%%MODEL_NAME_UP%% +DEFINE_EVENT(event_%%MONITOR_CLASS%%, event_%%MODEL_NAME%%, +%%TRACEPOINT_ARGS_SKEL_EVENT%%); + +DEFINE_EVENT(error_%%MONITOR_CLASS%%, error_%%MODEL_NAME%%, +%%TRACEPOINT_ARGS_SKEL_ERROR%%); +#endif /* CONFIG_RV_MON_%%MODEL_NAME_UP%% */
From de6f45c2dd226269fe9886290a139533c817c5bc Mon Sep 17 00:00:00 2001 From: Gabriele Monaco Date: Fri, 27 Dec 2024 15:47:51 +0100 Subject: [PATCH 32/56] verification/dot2k: Auto patch current kernel source

dot2k suggests a list of changes to the kernel tree while adding a monitor: editing the tracepoint header, the Makefile and the Kconfig, and moving the monitor folder. Those changes can easily be applied automatically.

Add a flag to dot2k to alter the kernel source.

The kernel source directory can either be assumed from the PWD, or from the running kernel, if installed.

This feature works best if the kernel tree is a git repository, so that it's easier to make sure there are no unintended changes.

The main RV files (e.g. Makefile) now have a comment placeholder that can be useful for manual editing (e.g. to know where to add new monitors) and that is used by the script to append the required lines.

We also slightly adapt the file handling functions in dot2k: __open_file is now called __read_file and also closes the file before returning the content; __create_file is now a more general __write_file; we no longer return on FileExistsError (not thrown while opening), and a new __create_file simply calls __write_file, specifying the monitor folder in the path.
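
As an example, running the script from tools/verification inside a kernel git tree with the new flag (the -d dot-file flag is assumed here) would both generate the monitor and patch the placeholder locations in place:

  $ cd tools/verification
  $ ./dot2/dot2k -d wip.dot -t per_cpu -a
  $ git -C ../.. diff kernel/trace/rv

Because the script only appends lines at the "# Add new monitors here" comment placeholders, a git diff makes it easy to verify that no unintended changes were made.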
Cc: Juri Lelli Cc: Thomas Gleixner Cc: John Kacur Link: https://lore.kernel.org/20241227144752.362911-8-gmonaco@redhat.com Signed-off-by: Gabriele Monaco Signed-off-by: Steven Rostedt (Google) --- kernel/trace/rv/Kconfig | 2 +- kernel/trace/rv/Makefile | 1 + kernel/trace/rv/rv_trace.h | 2 + tools/verification/dot2/dot2k | 5 +- tools/verification/dot2/dot2k.py | 92 +++++++++++++++++++++++++++----- 5 files changed, 86 insertions(+), 16 deletions(-) diff --git a/kernel/trace/rv/Kconfig b/kernel/trace/rv/Kconfig index 1cca47531f00..8226352a0062 100644 --- a/kernel/trace/rv/Kconfig +++ b/kernel/trace/rv/Kconfig @@ -26,8 +26,8 @@ menuconfig RV Documentation/trace/rv/runtime-verification.rst source "kernel/trace/rv/monitors/wip/Kconfig" - source "kernel/trace/rv/monitors/wwnr/Kconfig" +# Add new monitors here config RV_REACTORS bool "Runtime verification reactors" diff --git a/kernel/trace/rv/Makefile b/kernel/trace/rv/Makefile index 645434146a88..188b64668e1f 100644 --- a/kernel/trace/rv/Makefile +++ b/kernel/trace/rv/Makefile @@ -5,6 +5,7 @@ ccflags-y += -I $(src) # needed for trace events obj-$(CONFIG_RV) += rv.o obj-$(CONFIG_RV_MON_WIP) += monitors/wip/wip.o obj-$(CONFIG_RV_MON_WWNR) += monitors/wwnr/wwnr.o +# Add new monitors here obj-$(CONFIG_RV_REACTORS) += rv_reactors.o obj-$(CONFIG_RV_REACT_PRINTK) += reactor_printk.o obj-$(CONFIG_RV_REACT_PANIC) += reactor_panic.o diff --git a/kernel/trace/rv/rv_trace.h b/kernel/trace/rv/rv_trace.h index 3442dc59490f..5e65097423ba 100644 --- a/kernel/trace/rv/rv_trace.h +++ b/kernel/trace/rv/rv_trace.h @@ -58,6 +58,7 @@ DECLARE_EVENT_CLASS(error_da_monitor, ); #include +// Add new monitors based on CONFIG_DA_MON_EVENTS_IMPLICIT here #endif /* CONFIG_DA_MON_EVENTS_IMPLICIT */ @@ -117,6 +118,7 @@ DECLARE_EVENT_CLASS(error_da_monitor_id, ); #include +// Add new monitors based on CONFIG_DA_MON_EVENTS_ID here #endif /* CONFIG_DA_MON_EVENTS_ID */ #endif /* _TRACE_RV_H */ diff --git a/tools/verification/dot2/dot2k b/tools/verification/dot2/dot2k index 190c974edd0a..559ba191a1f6 100644 --- a/tools/verification/dot2/dot2k +++ b/tools/verification/dot2/dot2k @@ -21,6 +21,9 @@ if __name__ == '__main__': parser.add_argument('-t', "--monitor_type", dest="monitor_type", required=True) parser.add_argument('-n', "--model_name", dest="model_name", required=False) parser.add_argument("-D", "--description", dest="description", required=False) + parser.add_argument("-a", "--auto_patch", dest="auto_patch", + action="store_true", required=False, + help="Patch the kernel in place") params = parser.parse_args() print("Opening and parsing the dot file %s" % params.dot_file) @@ -38,4 +41,4 @@ if __name__ == '__main__': print(monitor.fill_tracepoint_tooltip()) print(monitor.fill_makefile_tooltip()) print(monitor.fill_kconfig_tooltip()) - print(" - Move %s/ to the kernel's monitor directory (%s/monitors)" % (monitor.name, monitor.rv_dir)) + print(monitor.fill_monitor_tooltip()) diff --git a/tools/verification/dot2/dot2k.py b/tools/verification/dot2/dot2k.py index dc56cd1fb0b4..83f4d49853a2 100644 --- a/tools/verification/dot2/dot2k.py +++ b/tools/verification/dot2/dot2k.py @@ -27,11 +27,14 @@ class dot2k(Dot2c): self.monitor_type = MonitorType self.__fill_rv_templates_dir() - self.main_c = self.__open_file(self.monitor_templates_dir + "main.c") - self.trace_h = self.__open_file(self.monitor_templates_dir + "trace.h") - self.kconfig = self.__open_file(self.monitor_templates_dir + "Kconfig") + self.main_c = self.__read_file(self.monitor_templates_dir + "main.c") + self.trace_h = 
self.__read_file(self.monitor_templates_dir + "trace.h") + self.kconfig = self.__read_file(self.monitor_templates_dir + "Kconfig") self.enum_suffix = "_%s" % self.name self.description = extra_params.get("description", self.name) or "auto-generated" + self.auto_patch = extra_params.get("auto_patch") + if self.auto_patch: + self.__fill_rv_kernel_dir() def __fill_rv_templates_dir(self): @@ -39,7 +42,7 @@ class dot2k(Dot2c): return if platform.system() != "Linux": - raise Exception("I can only run on Linux.") + raise OSError("I can only run on Linux.") kernel_path = "/lib/modules/%s/build/tools/verification/dot2/dot2k_templates/" % (platform.release()) @@ -51,17 +54,43 @@ class dot2k(Dot2c): self.monitor_templates_dir = "/usr/share/dot2/dot2k_templates/" return - raise Exception("Could not find the template directory, do you have the kernel source installed?") + raise FileNotFoundError("Could not find the template directory, do you have the kernel source installed?") + def __fill_rv_kernel_dir(self): - def __open_file(self, path): + # first try if we are running in the kernel tree root + if os.path.exists(self.rv_dir): + return + + # offset if we are running inside the kernel tree from verification/dot2 + kernel_path = os.path.join("../..", self.rv_dir) + + if os.path.exists(kernel_path): + self.rv_dir = kernel_path + return + + if platform.system() != "Linux": + raise OSError("I can only run on Linux.") + + kernel_path = os.path.join("/lib/modules/%s/build" % platform.release(), self.rv_dir) + + # if the current kernel is from a distro this may not be a full kernel tree + # verify that one of the files we are going to modify is available + if os.path.exists(os.path.join(kernel_path, "rv_trace.h")): + self.rv_dir = kernel_path + return + + raise FileNotFoundError("Could not find the rv directory, do you have the kernel source installed?") + + def __read_file(self, path): try: - fd = open(path) + fd = open(path, 'r') except OSError: raise Exception("Cannot open the file: %s" % path) content = fd.read() + fd.close() return content def __buff_to_string(self, buff): @@ -202,14 +231,32 @@ class dot2k(Dot2c): kconfig = kconfig.replace("%%DESCRIPTION%%", self.description) return kconfig + def __patch_file(self, file, marker, line): + file_to_patch = os.path.join(self.rv_dir, file) + content = self.__read_file(file_to_patch) + content = content.replace(marker, line + "\n" + marker) + self.__write_file(file_to_patch, content) + def fill_tracepoint_tooltip(self): monitor_class_type = self.fill_monitor_class_type() + if self.auto_patch: + self.__patch_file("rv_trace.h", + "// Add new monitors based on CONFIG_%s here" % monitor_class_type, + "#include " % (self.name, self.name)) + return " - Patching %s/rv_trace.h, double check the result" % self.rv_dir + return """ - Edit %s/rv_trace.h: Add this line where other tracepoints are included and %s is defined: #include """ % (self.rv_dir, monitor_class_type, self.name, self.name) def fill_kconfig_tooltip(self): + if self.auto_patch: + self.__patch_file("Kconfig", + "# Add new monitors here", + "source \"kernel/trace/rv/monitors/%s/Kconfig\"" % (self.name)) + return " - Patching %s/Kconfig, double check the result" % self.rv_dir + return """ - Edit %s/Kconfig: Add this line where other monitors are included: source \"kernel/trace/rv/monitors/%s/Kconfig\" @@ -218,32 +265,49 @@ source \"kernel/trace/rv/monitors/%s/Kconfig\" def fill_makefile_tooltip(self): name = self.name name_up = name.upper() + if self.auto_patch: + self.__patch_file("Makefile", + "# Add new 
monitors here", + "obj-$(CONFIG_RV_MON_%s) += monitors/%s/%s.o" % (name_up, name, name)) + return " - Patching %s/Makefile, double check the result" % self.rv_dir + return """ - Edit %s/Makefile: Add this line where other monitors are included: obj-$(CONFIG_RV_MON_%s) += monitors/%s/%s.o """ % (self.rv_dir, name_up, name, name) + def fill_monitor_tooltip(self): + if self.auto_patch: + return " - Monitor created in %s/monitors/%s" % (self.rv_dir, self. name) + return " - Move %s/ to the kernel's monitor directory (%s/monitors)" % (self.name, self.rv_dir) + def __create_directory(self): + path = self.name + if self.auto_patch: + path = os.path.join(self.rv_dir, "monitors", path) try: - os.mkdir(self.name) + os.mkdir(path) except FileExistsError: return except: print("Fail creating the output dir: %s" % self.name) - def __create_file(self, file_name, content): - path = "%s/%s" % (self.name, file_name) + def __write_file(self, file_name, content): try: - file = open(path, 'w') - except FileExistsError: - return + file = open(file_name, 'w') except: - print("Fail creating file: %s" % path) + print("Fail writing to file: %s" % file_name) file.write(content) file.close() + def __create_file(self, file_name, content): + path = "%s/%s" % (self.name, file_name) + if self.auto_patch: + path = os.path.join(self.rv_dir, "monitors", path) + self.__write_file(path, content) + def __get_main_name(self): path = "%s/%s" % (self.name, "main.c") if not os.path.exists(path): From 87c5d7f5e5938f713bde4e7435e6b207372a7f8e Mon Sep 17 00:00:00 2001 From: Gabriele Monaco Date: Fri, 27 Dec 2024 15:47:52 +0100 Subject: [PATCH 33/56] verification/dot2k: Implement event type detection Currently dot2k treats all events equally and registers them with a general da_handle_event. This is however just part of the work because some events are necessary to understand when the monitor is entering the initial state. Specifically, the da_handle_start_event takes care of setting the monitor in the initial state and da_handle_start_run_event also registers the current event in the newly enabled monitor. da_handle_start_event can be used on events that only lead to the initial state (as it is currently done in the example monitors), while da_handle_start_run_event could be used on events that are only valid from the initial one. Failing to set at least one of those functions to handle events makes the monitor useless, since it will never be activated. This patch adapts dot2k to parse the events that surely lead to the initial state and set da_handle_start_event for those, if no such event is found but some events are only valid in the initial event, we instead set da_handle_start_run_event (it isn't necessary to set both). We still add a comment to warn the user to make sure this change is matching the model definition. 
Cc: Juri Lelli Cc: Thomas Gleixner Cc: John Kacur Link: https://lore.kernel.org/20241227144752.362911-9-gmonaco@redhat.com Signed-off-by: Gabriele Monaco Signed-off-by: Steven Rostedt (Google) --- tools/verification/dot2/automata.py | 32 +++++++++++++++++++++++++++++ tools/verification/dot2/dot2k.py | 11 ++++++++-- 2 files changed, 41 insertions(+), 2 deletions(-) diff --git a/tools/verification/dot2/automata.py b/tools/verification/dot2/automata.py index f6921cf3c914..d9a3fe2b74bf 100644 --- a/tools/verification/dot2/automata.py +++ b/tools/verification/dot2/automata.py @@ -26,6 +26,7 @@ class Automata: self.states, self.initial_state, self.final_states = self.__get_state_variables() self.events = self.__get_event_variables() self.function = self.__create_matrix() + self.events_start, self.events_start_run = self.__store_init_events() def __get_model_name(self): basename = ntpath.basename(self.__dot_path) @@ -172,3 +173,34 @@ class Automata: cursor += 1 return matrix + + def __store_init_events(self): + events_start = [False] * len(self.events) + events_start_run = [False] * len(self.events) + for i, _ in enumerate(self.events): + curr_event_will_init = 0 + curr_event_from_init = False + curr_event_used = 0 + for j, _ in enumerate(self.states): + if self.function[j][i] != self.invalid_state_str: + curr_event_used += 1 + if self.function[j][i] == self.initial_state: + curr_event_will_init += 1 + if self.function[0][i] != self.invalid_state_str: + curr_event_from_init = True + # this event always leads to init + if curr_event_will_init and curr_event_used == curr_event_will_init: + events_start[i] = True + # this event is only called from init + if curr_event_from_init and curr_event_used == 1: + events_start_run[i] = True + return events_start, events_start_run + + def is_start_event(self, event): + return self.events_start[self.events.index(event)] + + def is_start_run_event(self, event): + # prefer handle_start_event if there + if any(self.events_start): + return False + return self.events_start_run[self.events.index(event)] diff --git a/tools/verification/dot2/dot2k.py b/tools/verification/dot2/dot2k.py index 83f4d49853a2..7547eb290b7d 100644 --- a/tools/verification/dot2/dot2k.py +++ b/tools/verification/dot2/dot2k.py @@ -110,11 +110,18 @@ class dot2k(Dot2c): for event in self.events: buff.append("static void handle_%s(void *data, /* XXX: fill header */)" % event) buff.append("{") + handle = "handle_event" + if self.is_start_event(event): + buff.append("\t/* XXX: validate that this event always leads to the initial state */") + handle = "handle_start_event" + elif self.is_start_run_event(event): + buff.append("\t/* XXX: validate that this event is only valid in the initial state */") + handle = "handle_start_run_event" if self.monitor_type == "per_task": buff.append("\tstruct task_struct *p = /* XXX: how do I get p? 
*/;"); - buff.append("\tda_handle_event_%s(p, %s%s);" % (self.name, event, self.enum_suffix)); + buff.append("\tda_%s_%s(p, %s%s);" % (handle, self.name, event, self.enum_suffix)); else: - buff.append("\tda_handle_event_%s(%s%s);" % (self.name, event, self.enum_suffix)); + buff.append("\tda_%s_%s(%s%s);" % (handle, self.name, event, self.enum_suffix)); buff.append("}") buff.append("") return self.__buff_to_string(buff) From 668d6fd5bc988fc0130ec97eb350f30a56dff39a Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Mon, 9 Dec 2024 11:41:11 +0900 Subject: [PATCH 34/56] jump_label: Define guard() for jump_label_lock Link: https://lore.kernel.org/all/173371207108.480397.12818384744149153972.stgit@devnote2/ Signed-off-by: Masami Hiramatsu (Google) --- include/linux/jump_label.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index f5a2727ca4a9..fdb79dd1ebd8 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -75,6 +75,7 @@ #include #include +#include extern bool static_key_initialized; @@ -347,6 +348,8 @@ static inline void static_key_disable(struct static_key *key) #endif /* CONFIG_JUMP_LABEL */ +DEFINE_LOCK_GUARD_0(jump_label_lock, jump_label_lock(), jump_label_unlock()) + #define STATIC_KEY_INIT STATIC_KEY_INIT_FALSE #define jump_label_enabled static_key_enabled From d08f1d46d23f11dbe3071b3a332efc9db621e2d6 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Mon, 9 Dec 2024 11:41:26 +0900 Subject: [PATCH 35/56] kprobes: Use guard() for external locks Use guard() for text_mutex, cpu_read_lock, and jump_label_lock in the kprobes. Link: https://lore.kernel.org/all/173371208663.480397.7535769878667655223.stgit@devnote2/ Signed-off-by: Masami Hiramatsu (Google) --- kernel/kprobes.c | 207 ++++++++++++++++++++--------------------------- 1 file changed, 89 insertions(+), 118 deletions(-) diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 62b5b08d809d..004eb8326520 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -596,41 +596,38 @@ static void kick_kprobe_optimizer(void) /* Kprobe jump optimizer */ static void kprobe_optimizer(struct work_struct *work) { - mutex_lock(&kprobe_mutex); - cpus_read_lock(); - mutex_lock(&text_mutex); + guard(mutex)(&kprobe_mutex); - /* - * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed) - * kprobes before waiting for quiesence period. - */ - do_unoptimize_kprobes(); + scoped_guard(cpus_read_lock) { + guard(mutex)(&text_mutex); - /* - * Step 2: Wait for quiesence period to ensure all potentially - * preempted tasks to have normally scheduled. Because optprobe - * may modify multiple instructions, there is a chance that Nth - * instruction is preempted. In that case, such tasks can return - * to 2nd-Nth byte of jump instruction. This wait is for avoiding it. - * Note that on non-preemptive kernel, this is transparently converted - * to synchronoze_sched() to wait for all interrupts to have completed. - */ - synchronize_rcu_tasks(); + /* + * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed) + * kprobes before waiting for quiesence period. + */ + do_unoptimize_kprobes(); - /* Step 3: Optimize kprobes after quiesence period */ - do_optimize_kprobes(); + /* + * Step 2: Wait for quiesence period to ensure all potentially + * preempted tasks to have normally scheduled. Because optprobe + * may modify multiple instructions, there is a chance that Nth + * instruction is preempted. 
In that case, such tasks can return + * to 2nd-Nth byte of jump instruction. This wait is for avoiding it. + * Note that on non-preemptive kernel, this is transparently converted + * to synchronoze_sched() to wait for all interrupts to have completed. + */ + synchronize_rcu_tasks(); - /* Step 4: Free cleaned kprobes after quiesence period */ - do_free_cleaned_kprobes(); + /* Step 3: Optimize kprobes after quiesence period */ + do_optimize_kprobes(); - mutex_unlock(&text_mutex); - cpus_read_unlock(); + /* Step 4: Free cleaned kprobes after quiesence period */ + do_free_cleaned_kprobes(); + } /* Step 5: Kick optimizer again if needed */ if (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list)) kick_kprobe_optimizer(); - - mutex_unlock(&kprobe_mutex); } static void wait_for_kprobe_optimizer_locked(void) @@ -853,29 +850,24 @@ static void try_to_optimize_kprobe(struct kprobe *p) return; /* For preparing optimization, jump_label_text_reserved() is called. */ - cpus_read_lock(); - jump_label_lock(); - mutex_lock(&text_mutex); + guard(cpus_read_lock)(); + guard(jump_label_lock)(); + guard(mutex)(&text_mutex); ap = alloc_aggr_kprobe(p); if (!ap) - goto out; + return; op = container_of(ap, struct optimized_kprobe, kp); if (!arch_prepared_optinsn(&op->optinsn)) { /* If failed to setup optimizing, fallback to kprobe. */ arch_remove_optimized_kprobe(op); kfree(op); - goto out; + return; } init_aggr_kprobe(ap, p); optimize_kprobe(ap); /* This just kicks optimizer thread. */ - -out: - mutex_unlock(&text_mutex); - jump_label_unlock(); - cpus_read_unlock(); } static void optimize_all_kprobes(void) @@ -1158,12 +1150,9 @@ static int arm_kprobe(struct kprobe *kp) if (unlikely(kprobe_ftrace(kp))) return arm_kprobe_ftrace(kp); - cpus_read_lock(); - mutex_lock(&text_mutex); + guard(cpus_read_lock)(); + guard(mutex)(&text_mutex); __arm_kprobe(kp); - mutex_unlock(&text_mutex); - cpus_read_unlock(); - return 0; } @@ -1172,12 +1161,9 @@ static int disarm_kprobe(struct kprobe *kp, bool reopt) if (unlikely(kprobe_ftrace(kp))) return disarm_kprobe_ftrace(kp); - cpus_read_lock(); - mutex_lock(&text_mutex); + guard(cpus_read_lock)(); + guard(mutex)(&text_mutex); __disarm_kprobe(kp, reopt); - mutex_unlock(&text_mutex); - cpus_read_unlock(); - return 0; } @@ -1294,63 +1280,56 @@ static int register_aggr_kprobe(struct kprobe *orig_p, struct kprobe *p) int ret = 0; struct kprobe *ap = orig_p; - cpus_read_lock(); + scoped_guard(cpus_read_lock) { + /* For preparing optimization, jump_label_text_reserved() is called */ + guard(jump_label_lock)(); + guard(mutex)(&text_mutex); - /* For preparing optimization, jump_label_text_reserved() is called */ - jump_label_lock(); - mutex_lock(&text_mutex); - - if (!kprobe_aggrprobe(orig_p)) { - /* If 'orig_p' is not an 'aggr_kprobe', create new one. */ - ap = alloc_aggr_kprobe(orig_p); - if (!ap) { - ret = -ENOMEM; - goto out; + if (!kprobe_aggrprobe(orig_p)) { + /* If 'orig_p' is not an 'aggr_kprobe', create new one. */ + ap = alloc_aggr_kprobe(orig_p); + if (!ap) + return -ENOMEM; + init_aggr_kprobe(ap, orig_p); + } else if (kprobe_unused(ap)) { + /* This probe is going to die. Rescue it */ + ret = reuse_unused_kprobe(ap); + if (ret) + return ret; } - init_aggr_kprobe(ap, orig_p); - } else if (kprobe_unused(ap)) { - /* This probe is going to die. Rescue it */ - ret = reuse_unused_kprobe(ap); - if (ret) - goto out; - } - if (kprobe_gone(ap)) { - /* - * Attempting to insert new probe at the same location that - * had a probe in the module vaddr area which already - * freed. 
So, the instruction slot has already been - * released. We need a new slot for the new probe. - */ - ret = arch_prepare_kprobe(ap); - if (ret) + if (kprobe_gone(ap)) { /* - * Even if fail to allocate new slot, don't need to - * free the 'ap'. It will be used next time, or - * freed by unregister_kprobe(). + * Attempting to insert new probe at the same location that + * had a probe in the module vaddr area which already + * freed. So, the instruction slot has already been + * released. We need a new slot for the new probe. */ - goto out; + ret = arch_prepare_kprobe(ap); + if (ret) + /* + * Even if fail to allocate new slot, don't need to + * free the 'ap'. It will be used next time, or + * freed by unregister_kprobe(). + */ + return ret; - /* Prepare optimized instructions if possible. */ - prepare_optimized_kprobe(ap); + /* Prepare optimized instructions if possible. */ + prepare_optimized_kprobe(ap); - /* - * Clear gone flag to prevent allocating new slot again, and - * set disabled flag because it is not armed yet. - */ - ap->flags = (ap->flags & ~KPROBE_FLAG_GONE) - | KPROBE_FLAG_DISABLED; + /* + * Clear gone flag to prevent allocating new slot again, and + * set disabled flag because it is not armed yet. + */ + ap->flags = (ap->flags & ~KPROBE_FLAG_GONE) + | KPROBE_FLAG_DISABLED; + } + + /* Copy the insn slot of 'p' to 'ap'. */ + copy_kprobe(ap, p); + ret = add_new_kprobe(ap, p); } - /* Copy the insn slot of 'p' to 'ap'. */ - copy_kprobe(ap, p); - ret = add_new_kprobe(ap, p); - -out: - mutex_unlock(&text_mutex); - jump_label_unlock(); - cpus_read_unlock(); - if (ret == 0 && kprobe_disabled(ap) && !kprobe_disabled(p)) { ap->flags &= ~KPROBE_FLAG_DISABLED; if (!kprobes_all_disarmed) { @@ -1559,26 +1538,23 @@ static int check_kprobe_address_safe(struct kprobe *p, ret = check_ftrace_location(p); if (ret) return ret; - jump_label_lock(); + + guard(jump_label_lock)(); /* Ensure the address is in a text area, and find a module if exists. */ *probed_mod = NULL; if (!core_kernel_text((unsigned long) p->addr)) { guard(preempt)(); *probed_mod = __module_text_address((unsigned long) p->addr); - if (!(*probed_mod)) { - ret = -EINVAL; - goto out; - } + if (!(*probed_mod)) + return -EINVAL; /* * We must hold a refcount of the probed module while updating * its code to prohibit unexpected unloading. */ - if (unlikely(!try_module_get(*probed_mod))) { - ret = -ENOENT; - goto out; - } + if (unlikely(!try_module_get(*probed_mod))) + return -ENOENT; } /* Ensure it is not in reserved area. */ if (in_gate_area_no_mm((unsigned long) p->addr) || @@ -1588,8 +1564,7 @@ static int check_kprobe_address_safe(struct kprobe *p, find_bug((unsigned long)p->addr) || is_cfi_preamble_symbol((unsigned long)p->addr)) { module_put(*probed_mod); - ret = -EINVAL; - goto out; + return -EINVAL; } /* Get module refcount and reject __init functions for loaded modules. */ @@ -1601,14 +1576,11 @@ static int check_kprobe_address_safe(struct kprobe *p, if (within_module_init((unsigned long)p->addr, *probed_mod) && !module_is_coming(*probed_mod)) { module_put(*probed_mod); - ret = -ENOENT; + return -ENOENT; } } -out: - jump_label_unlock(); - - return ret; + return 0; } static int __register_kprobe(struct kprobe *p) @@ -1623,14 +1595,13 @@ static int __register_kprobe(struct kprobe *p) /* Since this may unoptimize 'old_p', locking 'text_mutex'. 
*/ return register_aggr_kprobe(old_p, p); - cpus_read_lock(); - /* Prevent text modification */ - mutex_lock(&text_mutex); - ret = prepare_kprobe(p); - mutex_unlock(&text_mutex); - cpus_read_unlock(); - if (ret) - return ret; + scoped_guard(cpus_read_lock) { + /* Prevent text modification */ + guard(mutex)(&text_mutex); + ret = prepare_kprobe(p); + if (ret) + return ret; + } INIT_HLIST_NODE(&p->hlist); hlist_add_head_rcu(&p->hlist, From b53506351b6ce8a467e03cb1d2145e9038b43936 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Mon, 9 Dec 2024 11:41:38 +0900 Subject: [PATCH 36/56] kprobes: Use guard for rcu_read_lock Use guard(rcu) for rcu_read_lock so that it can remove unneeded gotos and make it more structured. Link: https://lore.kernel.org/all/173371209846.480397.3852648910271029695.stgit@devnote2/ Signed-off-by: Masami Hiramatsu (Google) --- kernel/kprobes.c | 64 ++++++++++++++++++++++++++---------------------- 1 file changed, 35 insertions(+), 29 deletions(-) diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 004eb8326520..a24587e8f91a 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -144,30 +144,26 @@ kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c) /* Since the slot array is not protected by rcu, we need a mutex */ guard(mutex)(&c->mutex); - retry: - rcu_read_lock(); - list_for_each_entry_rcu(kip, &c->pages, list) { - if (kip->nused < slots_per_page(c)) { - int i; + do { + guard(rcu)(); + list_for_each_entry_rcu(kip, &c->pages, list) { + if (kip->nused < slots_per_page(c)) { + int i; - for (i = 0; i < slots_per_page(c); i++) { - if (kip->slot_used[i] == SLOT_CLEAN) { - kip->slot_used[i] = SLOT_USED; - kip->nused++; - rcu_read_unlock(); - return kip->insns + (i * c->insn_size); + for (i = 0; i < slots_per_page(c); i++) { + if (kip->slot_used[i] == SLOT_CLEAN) { + kip->slot_used[i] = SLOT_USED; + kip->nused++; + return kip->insns + (i * c->insn_size); + } } + /* kip->nused is broken. Fix it. */ + kip->nused = slots_per_page(c); + WARN_ON(1); } - /* kip->nused is broken. Fix it. */ - kip->nused = slots_per_page(c); - WARN_ON(1); } - } - rcu_read_unlock(); - /* If there are any garbage slots, collect it and try again. */ - if (c->nr_garbage && collect_garbage_slots(c) == 0) - goto retry; + } while (c->nr_garbage && collect_garbage_slots(c) == 0); /* All out of space. Need to allocate a new page. */ kip = kmalloc(struct_size(kip, slot_used, slots_per_page(c)), GFP_KERNEL); @@ -246,25 +242,35 @@ static int collect_garbage_slots(struct kprobe_insn_cache *c) return 0; } -void __free_insn_slot(struct kprobe_insn_cache *c, - kprobe_opcode_t *slot, int dirty) +static long __find_insn_page(struct kprobe_insn_cache *c, + kprobe_opcode_t *slot, struct kprobe_insn_page **pkip) { - struct kprobe_insn_page *kip; + struct kprobe_insn_page *kip = NULL; long idx; - guard(mutex)(&c->mutex); - rcu_read_lock(); + guard(rcu)(); list_for_each_entry_rcu(kip, &c->pages, list) { idx = ((long)slot - (long)kip->insns) / (c->insn_size * sizeof(kprobe_opcode_t)); - if (idx >= 0 && idx < slots_per_page(c)) - goto out; + if (idx >= 0 && idx < slots_per_page(c)) { + *pkip = kip; + return idx; + } } /* Could not find this slot. 
*/ WARN_ON(1); - kip = NULL; -out: - rcu_read_unlock(); + *pkip = NULL; + return -1; +} + +void __free_insn_slot(struct kprobe_insn_cache *c, + kprobe_opcode_t *slot, int dirty) +{ + struct kprobe_insn_page *kip = NULL; + long idx; + + guard(mutex)(&c->mutex); + idx = __find_insn_page(c, slot, &kip); /* Mark and sweep: this may sleep */ if (kip) { /* Check double free */
From 8d60a731e49346d9d2b94b175093773e1b19b32e Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Mon, 9 Dec 2024 11:41:52 +0900 Subject: [PATCH 37/56] kprobes: Remove unneeded goto

Remove unneeded gotos. Since the labels referred to by these gotos each have only one reference, we can replace those gotos with the referenced code.

Link: https://lore.kernel.org/all/173371211203.480397.13988907319659165160.stgit@devnote2/ Signed-off-by: Masami Hiramatsu (Google) --- kernel/kprobes.c | 45 +++++++++++++++++++++------------------------ 1 file changed, 21 insertions(+), 24 deletions(-)

diff --git a/kernel/kprobes.c b/kernel/kprobes.c index a24587e8f91a..34cbbb2206f4 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1071,20 +1071,18 @@ static int __arm_kprobe_ftrace(struct kprobe *p, struct ftrace_ops *ops, if (*cnt == 0) { ret = register_ftrace_function(ops); - if (WARN(ret < 0, "Failed to register kprobe-ftrace (error %d)\n", ret)) - goto err_ftrace; + if (WARN(ret < 0, "Failed to register kprobe-ftrace (error %d)\n", ret)) { + /* + * At this point, sinec ops is not registered, we should be sefe from + * registering empty filter. + */ + ftrace_set_filter_ip(ops, (unsigned long)p->addr, 1, 0); + return ret; + } } (*cnt)++; return ret; - -err_ftrace: - /* - * At this point, sinec ops is not registered, we should be sefe from - * registering empty filter. - */ - ftrace_set_filter_ip(ops, (unsigned long)p->addr, 1, 0); - return ret; } static int arm_kprobe_ftrace(struct kprobe *p) @@ -1428,7 +1426,7 @@ _kprobe_addr(kprobe_opcode_t *addr, const char *symbol_name, unsigned long offset, bool *on_func_entry) { if ((symbol_name && addr) || (!symbol_name && !addr)) - goto invalid; + return ERR_PTR(-EINVAL); if (symbol_name) { /* @@ -1458,11 +1456,10 @@ _kprobe_addr(kprobe_opcode_t *addr, const char *symbol_name, * at the start of the function. */ addr = arch_adjust_kprobe_addr((unsigned long)addr, offset, on_func_entry); - if (addr) - return addr; + if (!addr) + return ERR_PTR(-EINVAL); -invalid: - return ERR_PTR(-EINVAL); + return addr; } static kprobe_opcode_t *kprobe_addr(struct kprobe *p) @@ -1486,15 +1483,15 @@ static struct kprobe *__get_valid_kprobe(struct kprobe *p) if (unlikely(!ap)) return NULL; - if (p != ap) { - list_for_each_entry(list_p, &ap->list, list) - if (list_p == p) - /* kprobe p is a valid probe */ - goto valid; - return NULL; - } -valid: - return ap; + if (p == ap) + return ap; + + list_for_each_entry(list_p, &ap->list, list) + if (list_p == p) + /* kprobe p is a valid probe */ + return ap; + + return NULL; } /*
From 5965d3949a7abb836c576735437bbf9436e998a8 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Mon, 9 Dec 2024 11:42:04 +0900 Subject: [PATCH 38/56] kprobes: Remove remaining gotos

Remove remaining gotos from kprobes.c to clean up the code. This does not use cleanup macros, but changes the code flow to avoid gotos.
Link: https://lore.kernel.org/all/173371212474.480397.5684523564137819115.stgit@devnote2/ Signed-off-by: Masami Hiramatsu (Google) --- kernel/kprobes.c | 65 ++++++++++++++++++++++++------------------------ 1 file changed, 32 insertions(+), 33 deletions(-)

diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 34cbbb2206f4..030569210670 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1730,29 +1730,31 @@ static int __unregister_kprobe_top(struct kprobe *p) if (IS_ERR(ap)) return PTR_ERR(ap); - if (ap == p) - /* - * This probe is an independent(and non-optimized) kprobe - * (not an aggrprobe). Remove from the hash list. - */ - goto disarmed; + WARN_ON(ap != p && !kprobe_aggrprobe(ap)); - /* Following process expects this probe is an aggrprobe */ - WARN_ON(!kprobe_aggrprobe(ap)); - - if (list_is_singular(&ap->list) && kprobe_disarmed(ap)) + /* + * If the probe is an independent(and non-optimized) kprobe + * (not an aggrprobe), the last kprobe on the aggrprobe, or + * kprobe is already disarmed, just remove from the hash list. + */ + if (ap == p || + (list_is_singular(&ap->list) && kprobe_disarmed(ap))) { /* * !disarmed could be happen if the probe is under delayed * unoptimizing. */ - goto disarmed; - else { - /* If disabling probe has special handlers, update aggrprobe */ - if (p->post_handler && !kprobe_gone(p)) { - list_for_each_entry(list_p, &ap->list, list) { - if ((list_p != p) && (list_p->post_handler)) - goto noclean; - } + hlist_del_rcu(&ap->hlist); + return 0; + } + + /* If disabling probe has special handlers, update aggrprobe */ + if (p->post_handler && !kprobe_gone(p)) { + list_for_each_entry(list_p, &ap->list, list) { + if ((list_p != p) && (list_p->post_handler)) + break; + } + /* No other probe has post_handler */ + if (list_entry_is_head(list_p, &ap->list, list)) { /* * For the kprobe-on-ftrace case, we keep the * post_handler setting to identify this aggrprobe @@ -1761,24 +1763,21 @@ static int __unregister_kprobe_top(struct kprobe *p) if (!kprobe_ftrace(ap)) ap->post_handler = NULL; } -noclean: - /* - * Remove from the aggrprobe: this path will do nothing in - * __unregister_kprobe_bottom(). - */ - list_del_rcu(&p->list); - if (!kprobe_disabled(ap) && !kprobes_all_disarmed) - /* - * Try to optimize this probe again, because post - * handler may have been changed. - */ - optimize_kprobe(ap); } + + /* + * Remove from the aggrprobe: this path will do nothing in + * __unregister_kprobe_bottom(). + */ + list_del_rcu(&p->list); + if (!kprobe_disabled(ap) && !kprobes_all_disarmed) + /* + * Try to optimize this probe again, because post + * handler may have been changed. + */ + optimize_kprobe(ap); return 0; -disarmed: - hlist_del_rcu(&ap->hlist); - return 0; } static void __unregister_kprobe_bottom(struct kprobe *p)
From 22bec11a569983f39c6061cb82279e7de9e3bdfc Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 6 Jan 2025 11:11:43 -0500 Subject: [PATCH 39/56] tracing: Fix using ret variable in tracing_set_tracer()

When the function tracing_set_tracer() switched over to using the guard() infrastructure, it did not need to save the 'ret' variable and would just return the value when an error arose, instead of setting ret and jumping to an out label.

When CONFIG_TRACER_SNAPSHOT is enabled, it had code that expected the "ret" variable to be initialized to zero and had set 'ret' while holding an arch_spin_lock() (not used by guard), and then upon releasing the lock it would check 'ret' and exit if set.
But because ret was only set when an error occurred while holding the locks, 'ret' would be used uninitialized if there was no error. The code in the CONFIG_TRACER_SNAPSHOT block should be self-contained. Make sure 'ret' is also set when no error occurred.

Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20250106111143.2f90ff65@gandalf.local.home Reported-by: kernel test robot Reported-by: Dan Carpenter Closes: https://lore.kernel.org/r/202412271654.nJVBuwmF-lkp@intel.com/ Fixes: d33b10c0c73ad ("tracing: Switch trace.c code over to use guard()") Signed-off-by: Steven Rostedt (Google) Acked-by: Masami Hiramatsu (Google) --- kernel/trace/trace.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0aaf442271e9..5aeb898054e7 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6104,8 +6104,7 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf) if (t->use_max_tr) { local_irq_disable(); arch_spin_lock(&tr->max_lock); - if (tr->cond_snapshot) - ret = -EBUSY; + ret = tr->cond_snapshot ? -EBUSY : 0; arch_spin_unlock(&tr->max_lock); local_irq_enable(); if (ret)
From 1bd13edbbed6e7e396f1aab92b224a4775218e68 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Fri, 27 Dec 2024 13:07:57 +0900 Subject: [PATCH 40/56] tracing/hist: Add poll(POLLIN) support on hist file

Add poll syscall support on the `hist` file. A waiter will be woken up with POLLIN when the histogram is updated.

Currently, there is no way to wait for a specific event in userspace. So the user needs to peek at the `trace` file periodically, or wait on `trace_pipe`. But it is not a good idea to peek at the `trace` file for an event that happens randomly. And `trace_pipe` does not return until a page is filled with events.

This allows a user to wait for a specific event on the `hist` file. The user can set a histogram trigger on the event they want to monitor and poll() on its `hist` file. Since this poll() returns POLLIN, the next poll() will return immediately unless a read() happens on that hist file.

NOTE: To read the hist file again, you must set the file offset to 0, but if you are just monitoring the event, you may not need to read the histogram.
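
A minimal userspace sketch of this usage (the event path is only an example, error handling is omitted, and a hist trigger is assumed to be already set on the event):

  #include <fcntl.h>
  #include <poll.h>
  #include <unistd.h>

  int main(void)
  {
  	int fd = open("/sys/kernel/tracing/events/sched/sched_switch/hist", O_RDONLY);
  	struct pollfd pfd = { .fd = fd, .events = POLLIN };
  	char buf[4096];

  	poll(&pfd, 1, -1);          /* blocks until the histogram is updated */
  	lseek(fd, 0, SEEK_SET);     /* rewind: required to read the hist again */
  	read(fd, buf, sizeof(buf)); /* consume the update so the next poll() blocks */
  	close(fd);
  	return 0;
  }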
Cc: Shuah Khan Cc: Mathieu Desnoyers Link: https://lore.kernel.org/173527247756.464571.14236296701625509931.stgit@devnote2 Signed-off-by: Masami Hiramatsu (Google) Reviewed-by: Tom Zanussi Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_events.h | 14 +++++++ kernel/trace/trace_events.c | 14 +++++++ kernel/trace/trace_events_hist.c | 70 ++++++++++++++++++++++++++++++-- 3 files changed, 95 insertions(+), 3 deletions(-) diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 91b8ffbdfa8c..02cde1174487 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -673,6 +673,20 @@ struct trace_event_file { atomic_t tm_ref; /* trigger-mode reference counter */ }; +#ifdef CONFIG_HIST_TRIGGERS +extern struct irq_work hist_poll_work; +extern wait_queue_head_t hist_poll_wq; + +static inline void hist_poll_wakeup(void) +{ + if (wq_has_sleeper(&hist_poll_wq)) + irq_work_queue(&hist_poll_work); +} + +#define hist_poll_wait(file, wait) \ + poll_wait(file, &hist_poll_wq, wait) +#endif + #define __TRACE_EVENT_FLAGS(name, value) \ static int __init trace_init_flags_##name(void) \ { \ diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 047d2775184b..2b9222e7bd5a 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -3094,6 +3094,20 @@ static bool event_in_systems(struct trace_event_call *call, return !*p || isspace(*p) || *p == ','; } +#ifdef CONFIG_HIST_TRIGGERS +/* + * Wake up waiter on the hist_poll_wq from irq_work because the hist trigger + * may happen in any context. + */ +static void hist_poll_event_irq_work(struct irq_work *work) +{ + wake_up_all(&hist_poll_wq); +} + +DEFINE_IRQ_WORK(hist_poll_work, hist_poll_event_irq_work); +DECLARE_WAIT_QUEUE_HEAD(hist_poll_wq); +#endif + static struct trace_event_file * trace_create_new_event(struct trace_event_call *call, struct trace_array *tr) diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 879b58892b9d..af4be28f01e0 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -5311,6 +5311,8 @@ static void event_hist_trigger(struct event_trigger_data *data, if (resolve_var_refs(hist_data, key, var_ref_vals, true)) hist_trigger_actions(hist_data, elt, buffer, rec, rbe, key, var_ref_vals); + + hist_poll_wakeup(); } static void hist_trigger_stacktrace_print(struct seq_file *m, @@ -5590,15 +5592,36 @@ static void hist_trigger_show(struct seq_file *m, n_entries, (u64)atomic64_read(&hist_data->map->drops)); } +struct hist_file_data { + struct file *file; + u64 last_read; +}; + +static u64 get_hist_hit_count(struct trace_event_file *event_file) +{ + struct hist_trigger_data *hist_data; + struct event_trigger_data *data; + u64 ret = 0; + + list_for_each_entry(data, &event_file->triggers, list) { + if (data->cmd_ops->trigger_type == ETT_EVENT_HIST) { + hist_data = data->private_data; + ret += atomic64_read(&hist_data->map->hits); + } + } + return ret; +} + static int hist_show(struct seq_file *m, void *v) { + struct hist_file_data *hist_file = m->private; struct event_trigger_data *data; struct trace_event_file *event_file; int n = 0; guard(mutex)(&event_mutex); - event_file = event_file_file(m->private); + event_file = event_file_file(hist_file->file); if (unlikely(!event_file)) return -ENODEV; @@ -5606,27 +5629,68 @@ static int hist_show(struct seq_file *m, void *v) if (data->cmd_ops->trigger_type == ETT_EVENT_HIST) hist_trigger_show(m, data, n++); } + hist_file->last_read = 
get_hist_hit_count(event_file); + return 0; } +static __poll_t event_hist_poll(struct file *file, struct poll_table_struct *wait) +{ + struct trace_event_file *event_file; + struct seq_file *m = file->private_data; + struct hist_file_data *hist_file = m->private; + + guard(mutex)(&event_mutex); + + event_file = event_file_data(file); + if (!event_file) + return EPOLLERR; + + hist_poll_wait(file, wait); + + if (hist_file->last_read != get_hist_hit_count(event_file)) + return EPOLLIN | EPOLLRDNORM; + + return 0; +} + +static int event_hist_release(struct inode *inode, struct file *file) +{ + struct seq_file *m = file->private_data; + struct hist_file_data *hist_file = m->private; + + kfree(hist_file); + return tracing_single_release_file_tr(inode, file); +} + static int event_hist_open(struct inode *inode, struct file *file) { + struct hist_file_data *hist_file; int ret; ret = tracing_open_file_tr(inode, file); if (ret) return ret; + hist_file = kzalloc(sizeof(*hist_file), GFP_KERNEL); + if (!hist_file) + return -ENOMEM; + hist_file->file = file; + /* Clear private_data to avoid warning in single_open() */ file->private_data = NULL; - return single_open(file, hist_show, file); + ret = single_open(file, hist_show, hist_file); + if (ret) + kfree(hist_file); + return ret; } const struct file_operations event_hist_fops = { .open = event_hist_open, .read = seq_read, .llseek = seq_lseek, - .release = tracing_single_release_file_tr, + .release = event_hist_release, + .poll = event_hist_poll, }; #ifdef CONFIG_HIST_TRIGGERS_DEBUG From 66fc6f521a0b91051ce6968a216a30bc52267bf8 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Fri, 27 Dec 2024 13:08:07 +0900 Subject: [PATCH 41/56] tracing/hist: Support POLLPRI event for poll on histogram Since POLLIN will not be flushed until the hist file is read, the user needs to repeatedly read() and poll() on the hist file for monitoring the event continuously. But the read() is somewhat redundant when the user is only monitoring for event updates. Add POLLPRI poll event on the hist file so the event returns when a histogram is updated after open(), poll() or read(). Thus it is possible to wait for the next event without having to issue a read(). Cc: Shuah Khan Cc: Mathieu Desnoyers Link: https://lore.kernel.org/173527248770.464571.2536902137325258133.stgit@devnote2 Signed-off-by: Masami Hiramatsu (Google) Reviewed-by: Tom Zanussi Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_hist.c | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index af4be28f01e0..261163b00137 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -5595,6 +5595,7 @@ static void hist_trigger_show(struct seq_file *m, struct hist_file_data { struct file *file; u64 last_read; + u64 last_act; }; static u64 get_hist_hit_count(struct trace_event_file *event_file) @@ -5630,6 +5631,11 @@ static int hist_show(struct seq_file *m, void *v) hist_trigger_show(m, data, n++); } hist_file->last_read = get_hist_hit_count(event_file); + /* + * Update last_act too so that poll()/POLLPRI can wait for the next + * event after any syscall on hist file. 
+ */ + hist_file->last_act = hist_file->last_read; return 0; } @@ -5639,6 +5645,8 @@ static __poll_t event_hist_poll(struct file *file, struct poll_table_struct *wai struct trace_event_file *event_file; struct seq_file *m = file->private_data; struct hist_file_data *hist_file = m->private; + __poll_t ret = 0; + u64 cnt; guard(mutex)(&event_mutex); @@ -5648,10 +5656,15 @@ static __poll_t event_hist_poll(struct file *file, struct poll_table_struct *wai hist_poll_wait(file, wait); - if (hist_file->last_read != get_hist_hit_count(event_file)) - return EPOLLIN | EPOLLRDNORM; + cnt = get_hist_hit_count(event_file); + if (hist_file->last_read != cnt) + ret |= EPOLLIN | EPOLLRDNORM; + if (hist_file->last_act != cnt) { + hist_file->last_act = cnt; + ret |= EPOLLPRI; + } - return 0; + return ret; } static int event_hist_release(struct inode *inode, struct file *file) @@ -5665,6 +5678,7 @@ static int event_hist_release(struct inode *inode, struct file *file) static int event_hist_open(struct inode *inode, struct file *file) { + struct trace_event_file *event_file; struct hist_file_data *hist_file; int ret; @@ -5672,16 +5686,25 @@ static int event_hist_open(struct inode *inode, struct file *file) if (ret) return ret; + guard(mutex)(&event_mutex); + + event_file = event_file_data(file); + if (!event_file) + return -ENODEV; + hist_file = kzalloc(sizeof(*hist_file), GFP_KERNEL); if (!hist_file) return -ENOMEM; + hist_file->file = file; + hist_file->last_act = get_hist_hit_count(event_file); /* Clear private_data to avoid warning in single_open() */ file->private_data = NULL; ret = single_open(file, hist_show, hist_file); if (ret) kfree(hist_file); + return ret; } From 80c3e28528ff9f269937fcfe73895213a2e14905 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Sun, 29 Dec 2024 22:24:39 +0900 Subject: [PATCH 42/56] selftests/tracing: Add hist poll() support test Add a testcase for poll() on hist file. This introduces a helper binary to the ftracetest, because there is no good way to reliably execute poll() on hist file. Cc: Shuah Khan Cc: Tom Zanussi Cc: Mathieu Desnoyers Link: https://lore.kernel.org/173547867935.569911.10127126796879854182.stgit@devnote2 Signed-off-by: Masami Hiramatsu (Google) Reviewed-by: Shuah Khan Signed-off-by: Steven Rostedt (Google) --- tools/testing/selftests/ftrace/Makefile | 2 + tools/testing/selftests/ftrace/poll.c | 74 +++++++++++++++++++ .../test.d/trigger/trigger-hist-poll.tc | 74 +++++++++++++++++++ 3 files changed, 150 insertions(+) create mode 100644 tools/testing/selftests/ftrace/poll.c create mode 100644 tools/testing/selftests/ftrace/test.d/trigger/trigger-hist-poll.tc diff --git a/tools/testing/selftests/ftrace/Makefile b/tools/testing/selftests/ftrace/Makefile index a1e955d2de4c..49d96bb16355 100644 --- a/tools/testing/selftests/ftrace/Makefile +++ b/tools/testing/selftests/ftrace/Makefile @@ -6,4 +6,6 @@ TEST_PROGS := ftracetest-ktap TEST_FILES := test.d settings EXTRA_CLEAN := $(OUTPUT)/logs/* +TEST_GEN_PROGS = poll + include ../lib.mk diff --git a/tools/testing/selftests/ftrace/poll.c b/tools/testing/selftests/ftrace/poll.c new file mode 100644 index 000000000000..53258f7515e7 --- /dev/null +++ b/tools/testing/selftests/ftrace/poll.c @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Simple poll on a file. + * + * Copyright (c) 2024 Google LLC. 
+ */ + +#include <errno.h> +#include <fcntl.h> +#include <poll.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#define BUFSIZE 4096 + +/* + * Usage: + * poll [-I|-P] [-t timeout] FILE + */ +int main(int argc, char *argv[]) +{ + struct pollfd pfd = {.events = POLLIN}; + char buf[BUFSIZE]; + int timeout = -1; + int ret, opt; + + while ((opt = getopt(argc, argv, "IPt:")) != -1) { + switch (opt) { + case 'I': + pfd.events = POLLIN; + break; + case 'P': + pfd.events = POLLPRI; + break; + case 't': + timeout = atoi(optarg); + break; + default: + fprintf(stderr, "Usage: %s [-I|-P] [-t timeout] FILE\n", + argv[0]); + return -1; + } + } + if (optind >= argc) { + fprintf(stderr, "Error: Polling file is not specified\n"); + return -1; + } + + pfd.fd = open(argv[optind], O_RDONLY); + if (pfd.fd < 0) { + fprintf(stderr, "failed to open %s", argv[optind]); + perror("open"); + return -1; + } + + /* Reset poll by read if POLLIN is specified. */ + if (pfd.events & POLLIN) + do {} while (read(pfd.fd, buf, BUFSIZE) == BUFSIZE); + + ret = poll(&pfd, 1, timeout); + if (ret < 0 && errno != EINTR) { + perror("poll"); + return -1; + } + close(pfd.fd); + + /* If timeout happened (ret == 0), exit code is 1 */ + if (ret == 0) + return 1; + + return 0; +} diff --git a/tools/testing/selftests/ftrace/test.d/trigger/trigger-hist-poll.tc b/tools/testing/selftests/ftrace/test.d/trigger/trigger-hist-poll.tc new file mode 100644 index 000000000000..8d275e3238d9 --- /dev/null +++ b/tools/testing/selftests/ftrace/test.d/trigger/trigger-hist-poll.tc @@ -0,0 +1,74 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# description: event trigger - test poll wait on histogram +# requires: set_event events/sched/sched_process_free/trigger events/sched/sched_process_free/hist +# flags: instance + +POLL=${FTRACETEST_ROOT}/poll + +if [ ! -x ${POLL} ]; then + echo "poll program is not compiled!" + exit_unresolved +fi + +EVENT=events/sched/sched_process_free/ + +# Check poll ops is supported. Before implementing poll on hist file, it +# returns soon with POLLIN | POLLOUT, but not POLLPRI. + +# This must wait >1 sec and return 1 (timeout). +set +e +${POLL} -I -t 1000 ${EVENT}/hist +ret=$? +set -e +if [ ${ret} != 1 ]; then + echo "poll on hist file is not supported" + exit_unsupported +fi + +# Test POLLIN +echo > trace +echo 'hist:key=comm if comm =="sleep"' > ${EVENT}/trigger +echo 1 > ${EVENT}/enable + +# This sleep command will exit after 2 seconds. +sleep 2 & +BGPID=$! +# if timeout happens, poll returns 1. +${POLL} -I -t 4000 ${EVENT}/hist +echo 0 > tracing_on + +if [ -d /proc/${BGPID} ]; then + echo "poll exits too soon" + kill -KILL ${BGPID} ||: + exit_fail +fi + +if ! grep -qw "sleep" trace; then + echo "poll exits before event happens" + exit_fail +fi + +# Test POLLPRI +echo > trace +echo 1 > tracing_on + +# This sleep command will exit after 2 seconds. +sleep 2 & +BGPID=$! +# if timeout happens, poll returns 1. +${POLL} -P -t 4000 ${EVENT}/hist +echo 0 > tracing_on + +if [ -d /proc/${BGPID} ]; then + echo "poll exits too soon" + kill -KILL ${BGPID} ||: + exit_fail +fi + +if ! grep -qw "sleep" trace; then + echo "poll exits before event happens" + exit_fail +fi + +exit_pass From 28b24394c6e9a3166fcb4480cba054562526657c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sun, 5 Jan 2025 11:22:12 -0500 Subject: [PATCH 43/56] scripts/sorttable: Remove unused macro defines The code of sorttable.h was copied from recordmcount.h, which defined a bunch of ELF macros so that they could be used by both the 32bit and 64bit functions.
But several macros that sorttable.h does not use were copied over as well. Remove them to clean up the code. Cc: bpf Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Masahiro Yamada Cc: Nathan Chancellor Cc: Nicolas Schier Cc: Zheng Yejian Cc: Martin Kelly Cc: Christophe Leroy Cc: Josh Poimboeuf Link: https://lore.kernel.org/20250105162344.128870118@goodmis.org Signed-off-by: Steven Rostedt (Google) --- scripts/sorttable.h | 27 --------------------------- 1 file changed, 27 deletions(-) diff --git a/scripts/sorttable.h b/scripts/sorttable.h index a7c5445baf00..14d0c4d843e8 100644 --- a/scripts/sorttable.h +++ b/scripts/sorttable.h @@ -27,19 +27,10 @@ #undef Elf_Ehdr #undef Elf_Shdr #undef Elf_Rel -#undef Elf_Rela #undef Elf_Sym -#undef ELF_R_SYM -#undef Elf_r_sym -#undef ELF_R_INFO -#undef Elf_r_info -#undef ELF_ST_BIND #undef ELF_ST_TYPE -#undef fn_ELF_R_SYM -#undef fn_ELF_R_INFO #undef uint_t #undef _r -#undef _w #ifdef SORTTABLE_64 # define extable_ent_size 16 @@ -52,19 +43,10 @@ # define Elf_Ehdr Elf64_Ehdr # define Elf_Shdr Elf64_Shdr # define Elf_Rel Elf64_Rel -# define Elf_Rela Elf64_Rela # define Elf_Sym Elf64_Sym -# define ELF_R_SYM ELF64_R_SYM -# define Elf_r_sym Elf64_r_sym -# define ELF_R_INFO ELF64_R_INFO -# define Elf_r_info Elf64_r_info -# define ELF_ST_BIND ELF64_ST_BIND # define ELF_ST_TYPE ELF64_ST_TYPE -# define fn_ELF_R_SYM fn_ELF64_R_SYM -# define fn_ELF_R_INFO fn_ELF64_R_INFO # define uint_t uint64_t # define _r r8 -# define _w w8 #else # define extable_ent_size 8 # define compare_extable compare_extable_32 @@ -76,19 +58,10 @@ # define Elf_Ehdr Elf32_Ehdr # define Elf_Shdr Elf32_Shdr # define Elf_Rel Elf32_Rel -# define Elf_Rela Elf32_Rela # define Elf_Sym Elf32_Sym -# define ELF_R_SYM ELF32_R_SYM -# define Elf_r_sym Elf32_r_sym -# define ELF_R_INFO ELF32_R_INFO -# define Elf_r_info Elf32_r_info -# define ELF_ST_BIND ELF32_ST_BIND # define ELF_ST_TYPE ELF32_ST_TYPE -# define fn_ELF_R_SYM fn_ELF32_R_SYM -# define fn_ELF_R_INFO fn_ELF32_R_INFO # define uint_t uint32_t # define _r r -# define _w w #endif #if defined(SORTTABLE_64) && defined(UNWINDER_ORC_ENABLED) From 4f48a28b37d594dab38092514a42ae9f4b781553 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sun, 5 Jan 2025 11:22:13 -0500 Subject: [PATCH 44/56] scripts/sorttable: Remove unused write functions The code of sorttable.h was copied from recordmcount.h, which defined various write functions for different sizes (2, 4, 8 byte lengths). But sorttable only uses the 4 byte writes. Remove the extra versions as they are not used.
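For context, the 4-byte writers that remain (w()/wle()/wbe()) are thin wrappers around byte-order-aware unaligned stores. A rough standalone equivalent of the little-endian case, shown only as an illustration and not as the kernel's actual put_unaligned_le32() implementation:

#include <stdint.h>
#include <string.h>

/* Store a 32-bit value as little-endian bytes, whatever the host order. */
static void store_le32(uint32_t val, void *p)
{
	uint8_t b[4] = {
		val & 0xff,
		(val >> 8) & 0xff,
		(val >> 16) & 0xff,
		(val >> 24) & 0xff,
	};

	memcpy(p, b, sizeof(b));	/* memcpy tolerates unaligned targets */
}

The big-endian variant simply reverses the byte order; sorttable picks one of the two at runtime from the ELF header's data-encoding byte.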
Cc: bpf Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Masahiro Yamada Cc: Nathan Chancellor Cc: Nicolas Schier Cc: Zheng Yejian Cc: Martin Kelly Cc: Christophe Leroy Cc: Josh Poimboeuf Link: https://lore.kernel.org/20250105162344.314385504@goodmis.org Signed-off-by: Steven Rostedt (Google) --- scripts/sorttable.c | 26 -------------------------- 1 file changed, 26 deletions(-) diff --git a/scripts/sorttable.c b/scripts/sorttable.c index 83cdb843d92f..4dcdbf7a5e26 100644 --- a/scripts/sorttable.c +++ b/scripts/sorttable.c @@ -68,8 +68,6 @@ static uint32_t (*r)(const uint32_t *); static uint16_t (*r2)(const uint16_t *); static uint64_t (*r8)(const uint64_t *); static void (*w)(uint32_t, uint32_t *); -static void (*w2)(uint16_t, uint16_t *); -static void (*w8)(uint64_t, uint64_t *); typedef void (*table_sort_t)(char *, int); /* @@ -146,31 +144,11 @@ static void wbe(uint32_t val, uint32_t *x) put_unaligned_be32(val, x); } -static void w2be(uint16_t val, uint16_t *x) -{ - put_unaligned_be16(val, x); -} - -static void w8be(uint64_t val, uint64_t *x) -{ - put_unaligned_be64(val, x); -} - static void wle(uint32_t val, uint32_t *x) { put_unaligned_le32(val, x); } -static void w2le(uint16_t val, uint16_t *x) -{ - put_unaligned_le16(val, x); -} - -static void w8le(uint64_t val, uint64_t *x) -{ - put_unaligned_le64(val, x); -} - /* * Move reserved section indices SHN_LORESERVE..SHN_HIRESERVE out of * the way to -256..-1, to avoid conflicting with real section @@ -277,16 +255,12 @@ static int do_file(char const *const fname, void *addr) r2 = r2le; r8 = r8le; w = wle; - w2 = w2le; - w8 = w8le; break; case ELFDATA2MSB: r = rbe; r2 = r2be; r8 = r8be; w = wbe; - w2 = w2be; - w8 = w8be; break; default: fprintf(stderr, "unrecognized ELF data encoding %d: %s\n", From 6f2c2f93a190467cebd6ebd03feb49514fead5ca Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sun, 5 Jan 2025 11:22:14 -0500 Subject: [PATCH 45/56] scripts/sorttable: Remove unneeded Elf_Rel The code had references to initialize the Elf_Rel relocation tables, but it was never used. Remove it. 
Cc: bpf Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Masahiro Yamada Cc: Nathan Chancellor Cc: Nicolas Schier Cc: Zheng Yejian Cc: Martin Kelly Cc: Christophe Leroy Cc: Josh Poimboeuf Link: https://lore.kernel.org/20250105162344.515342233@goodmis.org Signed-off-by: Steven Rostedt (Google) --- scripts/sorttable.h | 23 ++--------------------- 1 file changed, 2 insertions(+), 21 deletions(-) diff --git a/scripts/sorttable.h b/scripts/sorttable.h index 14d0c4d843e8..18d07fdb2716 100644 --- a/scripts/sorttable.h +++ b/scripts/sorttable.h @@ -26,7 +26,6 @@ #undef Elf_Addr #undef Elf_Ehdr #undef Elf_Shdr -#undef Elf_Rel #undef Elf_Sym #undef ELF_ST_TYPE #undef uint_t @@ -42,7 +41,6 @@ # define Elf_Addr Elf64_Addr # define Elf_Ehdr Elf64_Ehdr # define Elf_Shdr Elf64_Shdr -# define Elf_Rel Elf64_Rel # define Elf_Sym Elf64_Sym # define ELF_ST_TYPE ELF64_ST_TYPE # define uint_t uint64_t @@ -57,7 +55,6 @@ # define Elf_Addr Elf32_Addr # define Elf_Ehdr Elf32_Ehdr # define Elf_Shdr Elf32_Shdr -# define Elf_Rel Elf32_Rel # define Elf_Sym Elf32_Sym # define ELF_ST_TYPE ELF32_ST_TYPE # define uint_t uint32_t @@ -248,14 +245,10 @@ static int do_sort(Elf_Ehdr *ehdr, Elf32_Word *symtab_shndx = NULL; Elf_Sym *sort_needed_sym = NULL; Elf_Shdr *sort_needed_sec; - Elf_Rel *relocs = NULL; - int relocs_size = 0; uint32_t *sort_needed_loc; const char *secstrings; const char *strtab; char *extab_image; - int extab_index = 0; - int i; int idx; unsigned int shnum; unsigned int shstrndx; @@ -279,23 +272,15 @@ static int do_sort(Elf_Ehdr *ehdr, if (shnum == SHN_UNDEF) shnum = _r(&shdr[0].sh_size); - for (i = 0, s = shdr; s < shdr + shnum; i++, s++) { + for (s = shdr; s < shdr + shnum; s++) { idx = r(&s->sh_name); - if (!strcmp(secstrings + idx, "__ex_table")) { + if (!strcmp(secstrings + idx, "__ex_table")) extab_sec = s; - extab_index = i; - } if (!strcmp(secstrings + idx, ".symtab")) symtab_sec = s; if (!strcmp(secstrings + idx, ".strtab")) strtab_sec = s; - if ((r(&s->sh_type) == SHT_REL || - r(&s->sh_type) == SHT_RELA) && - r(&s->sh_info) == extab_index) { - relocs = (void *)ehdr + _r(&s->sh_offset); - relocs_size = _r(&s->sh_size); - } if (r(&s->sh_type) == SHT_SYMTAB_SHNDX) symtab_shndx = (Elf32_Word *)((const char *)ehdr + _r(&s->sh_offset)); @@ -397,10 +382,6 @@ static int do_sort(Elf_Ehdr *ehdr, extable_ent_size, compare_extable); } - /* If there were relocations, we no longer need them. */ - if (relocs) - memset(relocs, 0, relocs_size); - /* find the flag main_extable_sort_needed */ for (sym = (void *)ehdr + _r(&symtab_sec->sh_offset); sym < sym + _r(&symtab_sec->sh_size) / sizeof(Elf_Sym); From 66990c003306c240d570b3ba274ec4f68cf18c91 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sun, 5 Jan 2025 11:22:15 -0500 Subject: [PATCH 46/56] scripts/sorttable: Have the ORC code use the _r() functions to read The ORC code reads the section information directly from the file. This currently works because the default read function is for 64bit little endian machines. But if for some reason that ever changes, this will break. Instead of having a surprise breakage, use the _r() functions that will read the values from the file properly. 
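The hazard being fixed here: dereferencing `s->sh_size` directly reads the on-disk field in host byte order, which only coincidentally works when host and file agree (64-bit little-endian). As a standalone illustration of what an endianness-dispatched reader boils down to (an assumed shape, not the exact sorttable helpers):

#include <stdint.h>

/* Assemble a 64-bit value from explicit little-endian byte order. */
static uint64_t read_le64(const void *p)
{
	const uint8_t *b = p;
	uint64_t v = 0;

	for (int i = 7; i >= 0; i--)
		v = (v << 8) | b[i];
	return v;
}

/* Same, for big-endian files. */
static uint64_t read_be64(const void *p)
{
	const uint8_t *b = p;
	uint64_t v = 0;

	for (int i = 0; i < 8; i++)
		v = (v << 8) | b[i];
	return v;
}

/* Selected once from e_ident[EI_DATA], then used for every 64-bit field. */
static uint64_t (*read64)(const void *) = read_le64;

Routing every field access through such a pointer keeps the tool working when it processes a vmlinux whose endianness differs from the build host's.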
Cc: bpf Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Masahiro Yamada Cc: Nathan Chancellor Cc: Nicolas Schier Cc: Zheng Yejian Cc: Martin Kelly Cc: Christophe Leroy Cc: Josh Poimboeuf Link: https://lore.kernel.org/20250105162344.721480386@goodmis.org Signed-off-by: Steven Rostedt (Google) --- scripts/sorttable.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/sorttable.h b/scripts/sorttable.h index 18d07fdb2716..58f7ab5f5644 100644 --- a/scripts/sorttable.h +++ b/scripts/sorttable.h @@ -299,14 +299,14 @@ static int do_sort(Elf_Ehdr *ehdr, #if defined(SORTTABLE_64) && defined(UNWINDER_ORC_ENABLED) /* locate the ORC unwind tables */ if (!strcmp(secstrings + idx, ".orc_unwind_ip")) { - orc_ip_size = s->sh_size; + orc_ip_size = _r(&s->sh_size); g_orc_ip_table = (int *)((void *)ehdr + - s->sh_offset); + _r(&s->sh_offset)); } if (!strcmp(secstrings + idx, ".orc_unwind")) { - orc_size = s->sh_size; + orc_size = _r(&s->sh_size); g_orc_table = (struct orc_entry *)((void *)ehdr + - s->sh_offset); + _r(&s->sh_offset)); } #endif } /* for loop */ From 7ffc0d0819f438779ed592e2e2e3576f43ce14f0 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sun, 5 Jan 2025 11:22:16 -0500 Subject: [PATCH 47/56] scripts/sorttable: Make compare_extable() into two functions Instead of having compare_extable() be part of the sorttable.h header, where it gets defined twice, just define it twice in sorttable.c since it is a very simple function. Then it can use the proper read functions for the word size and endianness, and the Elf_Addr macro can be removed from sorttable.h. Also add a micro optimization. Instead of: if (a < b) return -1; if (a > b) return 1; return 0; That can be shortened to: if (a < b) return -1; return a > b; Cc: bpf Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Masahiro Yamada Cc: Nathan Chancellor Cc: Nicolas Schier Cc: Zheng Yejian Cc: Martin Kelly Cc: Christophe Leroy Cc: Josh Poimboeuf Link: https://lore.kernel.org/20250105162344.945299671@goodmis.org Signed-off-by: Steven Rostedt (Google) --- scripts/sorttable.c | 20 ++++++++++++++++++++ scripts/sorttable.h | 14 -------------- 2 files changed, 20 insertions(+), 14 deletions(-) diff --git a/scripts/sorttable.c b/scripts/sorttable.c index 4dcdbf7a5e26..3e2c17e91485 100644 --- a/scripts/sorttable.c +++ b/scripts/sorttable.c @@ -173,6 +173,26 @@ static inline unsigned int get_secindex(unsigned int shndx, return r(&symtab_shndx_start[sym_offs]); } +static int compare_extable_32(const void *a, const void *b) +{ + Elf32_Addr av = r(a); + Elf32_Addr bv = r(b); + + if (av < bv) + return -1; + return av > bv; +} + +static int compare_extable_64(const void *a, const void *b) +{ + Elf64_Addr av = r8(a); + Elf64_Addr bv = r8(b); + + if (av < bv) + return -1; + return av > bv; +} + /* 32 bit and 64 bit are very similar */ #include "sorttable.h" #define SORTTABLE_64 diff --git a/scripts/sorttable.h b/scripts/sorttable.h index 58f7ab5f5644..36655ff16b39 100644 --- a/scripts/sorttable.h +++ b/scripts/sorttable.h @@ -23,7 +23,6 @@ #undef sort_mcount_loc #undef elf_mcount_loc #undef do_sort -#undef Elf_Addr #undef Elf_Ehdr #undef Elf_Shdr #undef Elf_Sym @@ -38,7 +37,6 @@ # define sort_mcount_loc sort_mcount_loc_64 # define elf_mcount_loc elf_mcount_loc_64 # define do_sort do_sort_64 -# define Elf_Addr Elf64_Addr # define Elf_Ehdr Elf64_Ehdr # define Elf_Shdr Elf64_Shdr # define
Elf_Sym Elf64_Sym @@ -52,7 +50,6 @@ # define sort_mcount_loc sort_mcount_loc_32 # define elf_mcount_loc elf_mcount_loc_32 # define do_sort do_sort_32 -# define Elf_Addr Elf32_Addr # define Elf_Ehdr Elf32_Ehdr # define Elf_Shdr Elf32_Shdr # define Elf_Sym Elf32_Sym @@ -160,17 +157,6 @@ static void *sort_orctable(void *arg) } #endif -static int compare_extable(const void *a, const void *b) -{ - Elf_Addr av = _r(a); - Elf_Addr bv = _r(b); - - if (av < bv) - return -1; - if (av > bv) - return 1; - return 0; -} #ifdef MCOUNT_SORT_ENABLED pthread_t mcount_sort_thread; From 157fb5b3cfd2cb5950314f926a76e567fc1921c5 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sun, 5 Jan 2025 11:22:17 -0500 Subject: [PATCH 48/56] scripts/sorttable: Convert Elf_Ehdr to union In order to remove the double #include of sorttable.h for 64 and 32 bit to create duplicate functions for both, replace the Elf_Ehdr macro with a union that defines both Elf64_Ehdr and Elf32_Ehdr, with field e64 for the 64bit version, and e32 for the 32bit version. Then a macro etype can be used instead to get to the proper value. This will eventually be replaced with just single functions that can handle both 32bit and 64bit ELF parsing. Cc: bpf Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Masahiro Yamada Cc: Nathan Chancellor Cc: Nicolas Schier Cc: Zheng Yejian Cc: Martin Kelly Cc: Christophe Leroy Cc: Josh Poimboeuf Link: https://lore.kernel.org/20250105162345.148224465@goodmis.org Signed-off-by: Steven Rostedt (Google) --- scripts/sorttable.c | 36 ++++++++++++++++++++---------------- scripts/sorttable.h | 12 ++++++------ 2 files changed, 26 insertions(+), 22 deletions(-) diff --git a/scripts/sorttable.c b/scripts/sorttable.c index 3e2c17e91485..67cbbfc8214d 100644 --- a/scripts/sorttable.c +++ b/scripts/sorttable.c @@ -64,6 +64,11 @@ #define EM_LOONGARCH 258 #endif +typedef union { + Elf32_Ehdr e32; + Elf64_Ehdr e64; +} Elf_Ehdr; + static uint32_t (*r)(const uint32_t *); static uint16_t (*r2)(const uint16_t *); static uint64_t (*r8)(const uint64_t *); @@ -266,10 +271,10 @@ static void sort_relative_table_with_data(char *extab_image, int image_size) static int do_file(char const *const fname, void *addr) { int rc = -1; - Elf32_Ehdr *ehdr = addr; + Elf_Ehdr *ehdr = addr; table_sort_t custom_sort = NULL; - switch (ehdr->e_ident[EI_DATA]) { + switch (ehdr->e32.e_ident[EI_DATA]) { case ELFDATA2LSB: r = rle; r2 = r2le; @@ -284,18 +289,18 @@ static int do_file(char const *const fname, void *addr) break; default: fprintf(stderr, "unrecognized ELF data encoding %d: %s\n", - ehdr->e_ident[EI_DATA], fname); + ehdr->e32.e_ident[EI_DATA], fname); return -1; } - if (memcmp(ELFMAG, ehdr->e_ident, SELFMAG) != 0 || - (r2(&ehdr->e_type) != ET_EXEC && r2(&ehdr->e_type) != ET_DYN) || - ehdr->e_ident[EI_VERSION] != EV_CURRENT) { + if (memcmp(ELFMAG, ehdr->e32.e_ident, SELFMAG) != 0 || + (r2(&ehdr->e32.e_type) != ET_EXEC && r2(&ehdr->e32.e_type) != ET_DYN) || + ehdr->e32.e_ident[EI_VERSION] != EV_CURRENT) { fprintf(stderr, "unrecognized ET_EXEC/ET_DYN file %s\n", fname); return -1; } - switch (r2(&ehdr->e_machine)) { + switch (r2(&ehdr->e32.e_machine)) { case EM_386: case EM_AARCH64: case EM_LOONGARCH: @@ -318,14 +323,14 @@ static int do_file(char const *const fname, void *addr) break; default: fprintf(stderr, "unrecognized e_machine %d %s\n", - r2(&ehdr->e_machine), fname); + r2(&ehdr->e32.e_machine), fname); return -1; } - switch (ehdr->e_ident[EI_CLASS]) { + switch 
(ehdr->e32.e_ident[EI_CLASS]) { case ELFCLASS32: - if (r2(&ehdr->e_ehsize) != sizeof(Elf32_Ehdr) || - r2(&ehdr->e_shentsize) != sizeof(Elf32_Shdr)) { + if (r2(&ehdr->e32.e_ehsize) != sizeof(Elf32_Ehdr) || + r2(&ehdr->e32.e_shentsize) != sizeof(Elf32_Shdr)) { fprintf(stderr, "unrecognized ET_EXEC/ET_DYN file: %s\n", fname); break; @@ -334,20 +339,19 @@ static int do_file(char const *const fname, void *addr) break; case ELFCLASS64: { - Elf64_Ehdr *const ghdr = (Elf64_Ehdr *)ehdr; - if (r2(&ghdr->e_ehsize) != sizeof(Elf64_Ehdr) || - r2(&ghdr->e_shentsize) != sizeof(Elf64_Shdr)) { + if (r2(&ehdr->e64.e_ehsize) != sizeof(Elf64_Ehdr) || + r2(&ehdr->e64.e_shentsize) != sizeof(Elf64_Shdr)) { fprintf(stderr, "unrecognized ET_EXEC/ET_DYN file: %s\n", fname); break; } - rc = do_sort_64(ghdr, fname, custom_sort); + rc = do_sort_64(ehdr, fname, custom_sort); } break; default: fprintf(stderr, "unrecognized ELF class %d %s\n", - ehdr->e_ident[EI_CLASS], fname); + ehdr->e32.e_ident[EI_CLASS], fname); break; } diff --git a/scripts/sorttable.h b/scripts/sorttable.h index 36655ff16b39..be8b529498fb 100644 --- a/scripts/sorttable.h +++ b/scripts/sorttable.h @@ -23,12 +23,12 @@ #undef sort_mcount_loc #undef elf_mcount_loc #undef do_sort -#undef Elf_Ehdr #undef Elf_Shdr #undef Elf_Sym #undef ELF_ST_TYPE #undef uint_t #undef _r +#undef etype #ifdef SORTTABLE_64 # define extable_ent_size 16 @@ -37,12 +37,12 @@ # define sort_mcount_loc sort_mcount_loc_64 # define elf_mcount_loc elf_mcount_loc_64 # define do_sort do_sort_64 -# define Elf_Ehdr Elf64_Ehdr # define Elf_Shdr Elf64_Shdr # define Elf_Sym Elf64_Sym # define ELF_ST_TYPE ELF64_ST_TYPE # define uint_t uint64_t # define _r r8 +# define etype e64 #else # define extable_ent_size 8 # define compare_extable compare_extable_32 @@ -50,12 +50,12 @@ # define sort_mcount_loc sort_mcount_loc_32 # define elf_mcount_loc elf_mcount_loc_32 # define do_sort do_sort_32 -# define Elf_Ehdr Elf32_Ehdr # define Elf_Shdr Elf32_Shdr # define Elf_Sym Elf32_Sym # define ELF_ST_TYPE ELF32_ST_TYPE # define uint_t uint32_t # define _r r +# define etype e32 #endif #if defined(SORTTABLE_64) && defined(UNWINDER_ORC_ENABLED) @@ -222,7 +222,7 @@ static int do_sort(Elf_Ehdr *ehdr, table_sort_t custom_sort) { int rc = -1; - Elf_Shdr *s, *shdr = (Elf_Shdr *)((char *)ehdr + _r(&ehdr->e_shoff)); + Elf_Shdr *s, *shdr = (Elf_Shdr *)((char *)ehdr + _r(&ehdr->etype.e_shoff)); Elf_Shdr *strtab_sec = NULL; Elf_Shdr *symtab_sec = NULL; Elf_Shdr *extab_sec = NULL; @@ -249,12 +249,12 @@ static int do_sort(Elf_Ehdr *ehdr, unsigned int orc_num_entries = 0; #endif - shstrndx = r2(&ehdr->e_shstrndx); + shstrndx = r2(&ehdr->etype.e_shstrndx); if (shstrndx == SHN_XINDEX) shstrndx = r(&shdr[0].sh_link); secstrings = (const char *)ehdr + _r(&shdr[shstrndx].sh_offset); - shnum = r2(&ehdr->e_shnum); + shnum = r2(&ehdr->etype.e_shnum); if (shnum == SHN_UNDEF) shnum = _r(&shdr[0].sh_size); From 545f6cf8f4c9a268e0bab2637f1d279679befdbf Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sun, 5 Jan 2025 11:22:18 -0500 Subject: [PATCH 49/56] scripts/sorttable: Replace Elf_Shdr Macro with a union In order to remove the double #include of sorttable.h for 64 and 32 bit to create duplicate functions for both, replace the Elf_Shdr macro with a union that defines both Elf64_Shdr and Elf32_Shdr, with field e64 for the 64bit version, and e32 for the 32bit version. It can then use the macro etype to get the proper value. 
This will eventually be replaced with just single functions that can handle both 32bit and 64bit ELF parsing. Cc: bpf Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Masahiro Yamada Cc: Nathan Chancellor Cc: Nicolas Schier Cc: Zheng Yejian Cc: Martin Kelly Cc: Christophe Leroy Cc: Josh Poimboeuf Link: https://lore.kernel.org/20250105162345.339462681@goodmis.org Signed-off-by: Steven Rostedt (Google) --- scripts/sorttable.c | 10 ++++++ scripts/sorttable.h | 74 +++++++++++++++++++++++++-------------------- 2 files changed, 51 insertions(+), 33 deletions(-) diff --git a/scripts/sorttable.c b/scripts/sorttable.c index 67cbbfc8214d..94497b8ab04c 100644 --- a/scripts/sorttable.c +++ b/scripts/sorttable.c @@ -69,6 +69,11 @@ typedef union { Elf64_Ehdr e64; } Elf_Ehdr; +typedef union { + Elf32_Shdr e32; + Elf64_Shdr e64; +} Elf_Shdr; + static uint32_t (*r)(const uint32_t *); static uint16_t (*r2)(const uint16_t *); static uint64_t (*r8)(const uint64_t *); @@ -198,6 +203,11 @@ static int compare_extable_64(const void *a, const void *b) return av > bv; } +static inline void *get_index(void *start, int entsize, int index) +{ + return start + (entsize * index); +} + /* 32 bit and 64 bit are very similar */ #include "sorttable.h" #define SORTTABLE_64 diff --git a/scripts/sorttable.h b/scripts/sorttable.h index be8b529498fb..3daf37bb6b9a 100644 --- a/scripts/sorttable.h +++ b/scripts/sorttable.h @@ -23,7 +23,6 @@ #undef sort_mcount_loc #undef elf_mcount_loc #undef do_sort -#undef Elf_Shdr #undef Elf_Sym #undef ELF_ST_TYPE #undef uint_t @@ -37,7 +36,6 @@ # define sort_mcount_loc sort_mcount_loc_64 # define elf_mcount_loc elf_mcount_loc_64 # define do_sort do_sort_64 -# define Elf_Shdr Elf64_Shdr # define Elf_Sym Elf64_Sym # define ELF_ST_TYPE ELF64_ST_TYPE # define uint_t uint64_t @@ -50,7 +48,6 @@ # define sort_mcount_loc sort_mcount_loc_32 # define elf_mcount_loc elf_mcount_loc_32 # define do_sort do_sort_32 -# define Elf_Shdr Elf32_Shdr # define Elf_Sym Elf32_Sym # define ELF_ST_TYPE ELF32_ST_TYPE # define uint_t uint32_t @@ -171,8 +168,8 @@ struct elf_mcount_loc { static void *sort_mcount_loc(void *arg) { struct elf_mcount_loc *emloc = (struct elf_mcount_loc *)arg; - uint_t offset = emloc->start_mcount_loc - _r(&(emloc->init_data_sec)->sh_addr) - + _r(&(emloc->init_data_sec)->sh_offset); + uint_t offset = emloc->start_mcount_loc - _r(&(emloc->init_data_sec)->etype.sh_addr) + + _r(&(emloc->init_data_sec)->etype.sh_offset); uint_t count = emloc->stop_mcount_loc - emloc->start_mcount_loc; unsigned char *start_loc = (void *)emloc->ehdr + offset; @@ -222,10 +219,11 @@ static int do_sort(Elf_Ehdr *ehdr, table_sort_t custom_sort) { int rc = -1; - Elf_Shdr *s, *shdr = (Elf_Shdr *)((char *)ehdr + _r(&ehdr->etype.e_shoff)); + Elf_Shdr *shdr_start; Elf_Shdr *strtab_sec = NULL; Elf_Shdr *symtab_sec = NULL; Elf_Shdr *extab_sec = NULL; + Elf_Shdr *string_sec; Elf_Sym *sym; const Elf_Sym *symtab; Elf32_Word *symtab_shndx = NULL; @@ -235,7 +233,10 @@ static int do_sort(Elf_Ehdr *ehdr, const char *secstrings; const char *strtab; char *extab_image; + int sort_need_index; + int shentsize; int idx; + int i; unsigned int shnum; unsigned int shstrndx; #ifdef MCOUNT_SORT_ENABLED @@ -249,34 +250,40 @@ static int do_sort(Elf_Ehdr *ehdr, unsigned int orc_num_entries = 0; #endif + shdr_start = (Elf_Shdr *)((char *)ehdr + _r(&ehdr->etype.e_shoff)); + shentsize = r2(&ehdr->etype.e_shentsize); + shstrndx = r2(&ehdr->etype.e_shstrndx); if (shstrndx == 
SHN_XINDEX) - shstrndx = r(&shdr[0].sh_link); - secstrings = (const char *)ehdr + _r(&shdr[shstrndx].sh_offset); + shstrndx = r(&shdr_start->etype.sh_link); + string_sec = get_index(shdr_start, shentsize, shstrndx); + secstrings = (const char *)ehdr + _r(&string_sec->etype.sh_offset); shnum = r2(&ehdr->etype.e_shnum); if (shnum == SHN_UNDEF) - shnum = _r(&shdr[0].sh_size); + shnum = _r(&shdr_start->etype.sh_size); - for (s = shdr; s < shdr + shnum; s++) { - idx = r(&s->sh_name); + for (i = 0; i < shnum; i++) { + Elf_Shdr *shdr = get_index(shdr_start, shentsize, i); + + idx = r(&shdr->etype.sh_name); if (!strcmp(secstrings + idx, "__ex_table")) - extab_sec = s; + extab_sec = shdr; if (!strcmp(secstrings + idx, ".symtab")) - symtab_sec = s; + symtab_sec = shdr; if (!strcmp(secstrings + idx, ".strtab")) - strtab_sec = s; + strtab_sec = shdr; - if (r(&s->sh_type) == SHT_SYMTAB_SHNDX) + if (r(&shdr->etype.sh_type) == SHT_SYMTAB_SHNDX) symtab_shndx = (Elf32_Word *)((const char *)ehdr + - _r(&s->sh_offset)); + _r(&shdr->etype.sh_offset)); #ifdef MCOUNT_SORT_ENABLED /* locate the .init.data section in vmlinux */ if (!strcmp(secstrings + idx, ".init.data")) { get_mcount_loc(&_start_mcount_loc, &_stop_mcount_loc); mstruct.ehdr = ehdr; - mstruct.init_data_sec = s; + mstruct.init_data_sec = shdr; mstruct.start_mcount_loc = _start_mcount_loc; mstruct.stop_mcount_loc = _stop_mcount_loc; } @@ -285,14 +292,14 @@ static int do_sort(Elf_Ehdr *ehdr, #if defined(SORTTABLE_64) && defined(UNWINDER_ORC_ENABLED) /* locate the ORC unwind tables */ if (!strcmp(secstrings + idx, ".orc_unwind_ip")) { - orc_ip_size = _r(&s->sh_size); + orc_ip_size = _r(&shdr->etype.sh_size); g_orc_ip_table = (int *)((void *)ehdr + - _r(&s->sh_offset)); + _r(&shdr->etype.sh_offset)); } if (!strcmp(secstrings + idx, ".orc_unwind")) { - orc_size = _r(&s->sh_size); + orc_size = _r(&shdr->etype.sh_size); g_orc_table = (struct orc_entry *)((void *)ehdr + - _r(&s->sh_offset)); + _r(&shdr->etype.sh_offset)); } #endif } /* for loop */ @@ -355,22 +362,22 @@ static int do_sort(Elf_Ehdr *ehdr, goto out; } - extab_image = (void *)ehdr + _r(&extab_sec->sh_offset); - strtab = (const char *)ehdr + _r(&strtab_sec->sh_offset); + extab_image = (void *)ehdr + _r(&extab_sec->etype.sh_offset); + strtab = (const char *)ehdr + _r(&strtab_sec->etype.sh_offset); symtab = (const Elf_Sym *)((const char *)ehdr + - _r(&symtab_sec->sh_offset)); + _r(&symtab_sec->etype.sh_offset)); if (custom_sort) { - custom_sort(extab_image, _r(&extab_sec->sh_size)); + custom_sort(extab_image, _r(&extab_sec->etype.sh_size)); } else { - int num_entries = _r(&extab_sec->sh_size) / extable_ent_size; + int num_entries = _r(&extab_sec->etype.sh_size) / extable_ent_size; qsort(extab_image, num_entries, extable_ent_size, compare_extable); } /* find the flag main_extable_sort_needed */ - for (sym = (void *)ehdr + _r(&symtab_sec->sh_offset); - sym < sym + _r(&symtab_sec->sh_size) / sizeof(Elf_Sym); + for (sym = (void *)ehdr + _r(&symtab_sec->etype.sh_offset); + sym < sym + _r(&symtab_sec->etype.sh_size) / sizeof(Elf_Sym); sym++) { if (ELF_ST_TYPE(sym->st_info) != STT_OBJECT) continue; @@ -388,13 +395,14 @@ static int do_sort(Elf_Ehdr *ehdr, goto out; } - sort_needed_sec = &shdr[get_secindex(r2(&sym->st_shndx), - sort_needed_sym - symtab, - symtab_shndx)]; + sort_need_index = get_secindex(r2(&sym->st_shndx), + sort_needed_sym - symtab, + symtab_shndx); + sort_needed_sec = get_index(shdr_start, shentsize, sort_need_index); sort_needed_loc = (void *)ehdr + - _r(&sort_needed_sec->sh_offset) + 
+ _r(&sort_needed_sec->etype.sh_offset) + _r(&sort_needed_sym->st_value) - - _r(&sort_needed_sec->sh_addr); + _r(&sort_needed_sec->etype.sh_addr); /* extable has been sorted, clear the flag */ w(0, sort_needed_loc); From 200d015e73b4da69bcd8212a7c58695452b12bad Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sun, 5 Jan 2025 11:22:19 -0500 Subject: [PATCH 50/56] scripts/sorttable: Convert Elf_Sym MACRO over to a union In order to remove the double #include of sorttable.h for 64 and 32 bit to create duplicate functions for both, replace the Elf_Sym macro with a union that defines both Elf64_Sym and Elf32_Sym, with field e64 for the 64bit version, and e32 for the 32bit version. It can then use the macro etype to get the proper value. This will eventually be replaced with just single functions that can handle both 32bit and 64bit ELF parsing. Cc: bpf Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Masahiro Yamada Cc: Nathan Chancellor Cc: Nicolas Schier Cc: Zheng Yejian Cc: Martin Kelly Cc: Christophe Leroy Cc: Josh Poimboeuf Link: https://lore.kernel.org/20250105162345.528626969@goodmis.org Signed-off-by: Steven Rostedt (Google) --- scripts/sorttable.c | 5 +++++ scripts/sorttable.h | 25 ++++++++++++++----------- 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/scripts/sorttable.c b/scripts/sorttable.c index 94497b8ab04c..57792cf2aa89 100644 --- a/scripts/sorttable.c +++ b/scripts/sorttable.c @@ -74,6 +74,11 @@ typedef union { Elf64_Shdr e64; } Elf_Shdr; +typedef union { + Elf32_Sym e32; + Elf64_Sym e64; +} Elf_Sym; + static uint32_t (*r)(const uint32_t *); static uint16_t (*r2)(const uint16_t *); static uint64_t (*r8)(const uint64_t *); diff --git a/scripts/sorttable.h b/scripts/sorttable.h index 3daf37bb6b9a..cd4429c8a9f4 100644 --- a/scripts/sorttable.h +++ b/scripts/sorttable.h @@ -23,7 +23,6 @@ #undef sort_mcount_loc #undef elf_mcount_loc #undef do_sort -#undef Elf_Sym #undef ELF_ST_TYPE #undef uint_t #undef _r @@ -36,7 +35,6 @@ # define sort_mcount_loc sort_mcount_loc_64 # define elf_mcount_loc elf_mcount_loc_64 # define do_sort do_sort_64 -# define Elf_Sym Elf64_Sym # define ELF_ST_TYPE ELF64_ST_TYPE # define uint_t uint64_t # define _r r8 @@ -48,7 +46,6 @@ # define sort_mcount_loc sort_mcount_loc_32 # define elf_mcount_loc elf_mcount_loc_32 # define do_sort do_sort_32 -# define Elf_Sym Elf32_Sym # define ELF_ST_TYPE ELF32_ST_TYPE # define uint_t uint32_t # define _r r @@ -230,10 +227,13 @@ static int do_sort(Elf_Ehdr *ehdr, Elf_Sym *sort_needed_sym = NULL; Elf_Shdr *sort_needed_sec; uint32_t *sort_needed_loc; + void *sym_start; + void *sym_end; const char *secstrings; const char *strtab; char *extab_image; int sort_need_index; + int symentsize; int shentsize; int idx; int i; @@ -376,12 +376,15 @@ static int do_sort(Elf_Ehdr *ehdr, } /* find the flag main_extable_sort_needed */ - for (sym = (void *)ehdr + _r(&symtab_sec->etype.sh_offset); - sym < sym + _r(&symtab_sec->etype.sh_size) / sizeof(Elf_Sym); - sym++) { - if (ELF_ST_TYPE(sym->st_info) != STT_OBJECT) + sym_start = (void *)ehdr + _r(&symtab_sec->etype.sh_offset); + sym_end = sym_start + _r(&symtab_sec->etype.sh_size); + symentsize = _r(&symtab_sec->etype.sh_entsize); + + for (sym = sym_start; (void *)sym + symentsize < sym_end; + sym = (void *)sym + symentsize) { + if (ELF_ST_TYPE(sym->etype.st_info) != STT_OBJECT) continue; - if (!strcmp(strtab + r(&sym->st_name), + if (!strcmp(strtab + r(&sym->etype.st_name), "main_extable_sort_needed")) 
{ sort_needed_sym = sym; break; @@ -395,13 +398,13 @@ static int do_sort(Elf_Ehdr *ehdr, goto out; } - sort_need_index = get_secindex(r2(&sym->st_shndx), - sort_needed_sym - symtab, + sort_need_index = get_secindex(r2(&sym->etype.st_shndx), + ((void *)sort_needed_sym - (void *)symtab) / symentsize, symtab_shndx); sort_needed_sec = get_index(shdr_start, shentsize, sort_need_index); sort_needed_loc = (void *)ehdr + _r(&sort_needed_sec->etype.sh_offset) + - _r(&sort_needed_sym->st_value) - + _r(&sort_needed_sym->etype.st_value) - _r(&sort_needed_sec->etype.sh_addr); /* extable has been sorted, clear the flag */ From 1dfb59a228dde59ad7d99b2fa2104e90004995c7 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sun, 5 Jan 2025 11:22:20 -0500 Subject: [PATCH 51/56] scripts/sorttable: Add helper functions for Elf_Ehdr In order to remove the double #include of sorttable.h for 64 and 32 bit to create duplicate functions, add helper functions for Elf_Ehdr. This will create a function pointer for each helper that will get assigned to the appropriate function to handle either the 64bit or 32bit version. This also moves the _r()/r() wrappers for the Elf_Ehdr references that handle endian and size differences between the different architectures, into the helper function and out of the open code which is more error prone. Cc: bpf Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Masahiro Yamada Cc: Nathan Chancellor Cc: Nicolas Schier Cc: Zheng Yejian Cc: Martin Kelly Cc: Christophe Leroy Cc: Josh Poimboeuf Link: https://lore.kernel.org/20250105162345.736369526@goodmis.org Signed-off-by: Steven Rostedt (Google) --- scripts/sorttable.c | 25 +++++++++++++++++++++++++ scripts/sorttable.h | 20 ++++++++++++++++---- 2 files changed, 41 insertions(+), 4 deletions(-) diff --git a/scripts/sorttable.c b/scripts/sorttable.c index 57792cf2aa89..5dfa734eff09 100644 --- a/scripts/sorttable.c +++ b/scripts/sorttable.c @@ -85,6 +85,31 @@ static uint64_t (*r8)(const uint64_t *); static void (*w)(uint32_t, uint32_t *); typedef void (*table_sort_t)(char *, int); +static uint64_t ehdr64_shoff(Elf_Ehdr *ehdr) +{ + return r8(&ehdr->e64.e_shoff); +} + +static uint64_t ehdr32_shoff(Elf_Ehdr *ehdr) +{ + return r(&ehdr->e32.e_shoff); +} + +#define EHDR_HALF(fn_name) \ +static uint16_t ehdr64_##fn_name(Elf_Ehdr *ehdr) \ +{ \ + return r2(&ehdr->e64.e_##fn_name); \ +} \ + \ +static uint16_t ehdr32_##fn_name(Elf_Ehdr *ehdr) \ +{ \ + return r2(&ehdr->e32.e_##fn_name); \ +} + +EHDR_HALF(shentsize) +EHDR_HALF(shstrndx) +EHDR_HALF(shnum) + /* * Get the whole file as a programming convenience in order to avoid * malloc+lseek+read+free of many pieces. 
If successful, then mmap diff --git a/scripts/sorttable.h b/scripts/sorttable.h index cd4429c8a9f4..97278c973bc9 100644 --- a/scripts/sorttable.h +++ b/scripts/sorttable.h @@ -27,6 +27,10 @@ #undef uint_t #undef _r #undef etype +#undef ehdr_shoff +#undef ehdr_shentsize +#undef ehdr_shstrndx +#undef ehdr_shnum #ifdef SORTTABLE_64 # define extable_ent_size 16 @@ -39,6 +43,10 @@ # define uint_t uint64_t # define _r r8 # define etype e64 +# define ehdr_shoff ehdr64_shoff +# define ehdr_shentsize ehdr64_shentsize +# define ehdr_shstrndx ehdr64_shstrndx +# define ehdr_shnum ehdr64_shnum #else # define extable_ent_size 8 # define compare_extable compare_extable_32 @@ -50,6 +58,10 @@ # define uint_t uint32_t # define _r r # define etype e32 +# define ehdr_shoff ehdr32_shoff +# define ehdr_shentsize ehdr32_shentsize +# define ehdr_shstrndx ehdr32_shstrndx +# define ehdr_shnum ehdr32_shnum #endif #if defined(SORTTABLE_64) && defined(UNWINDER_ORC_ENABLED) @@ -250,16 +262,16 @@ static int do_sort(Elf_Ehdr *ehdr, unsigned int orc_num_entries = 0; #endif - shdr_start = (Elf_Shdr *)((char *)ehdr + _r(&ehdr->etype.e_shoff)); - shentsize = r2(&ehdr->etype.e_shentsize); + shdr_start = (Elf_Shdr *)((char *)ehdr + ehdr_shoff(ehdr)); + shentsize = ehdr_shentsize(ehdr); - shstrndx = r2(&ehdr->etype.e_shstrndx); + shstrndx = ehdr_shstrndx(ehdr); if (shstrndx == SHN_XINDEX) shstrndx = r(&shdr_start->etype.sh_link); string_sec = get_index(shdr_start, shentsize, shstrndx); secstrings = (const char *)ehdr + _r(&string_sec->etype.sh_offset); - shnum = r2(&ehdr->etype.e_shnum); + shnum = ehdr_shnum(ehdr); if (shnum == SHN_UNDEF) shnum = _r(&shdr_start->etype.sh_size); From 67afb7f504400e5b4e5ff895459fbb3eb63d4450 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sun, 5 Jan 2025 11:22:21 -0500 Subject: [PATCH 52/56] scripts/sorttable: Add helper functions for Elf_Shdr In order to remove the double #include of sorttable.h for 64 and 32 bit to create duplicate functions, add helper functions for Elf_Shdr. This will create a function pointer for each helper that will get assigned to the appropriate function to handle either the 64bit or 32bit version. This also moves the _r()/r() wrappers for the Elf_Shdr references that handle endian and size differences between the different architectures, into the helper function and out of the open code which is more error prone. 
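Combined with the Elf_Ehdr helpers from the previous patch, the direction of the series is a one-time selection of accessors per file. Roughly, the final wiring could look like the following sketch; the shdr64_*/shdr32_* names and the Elf_Ehdr/Elf_Shdr unions come from the patches above, but this selection function itself is illustrative, not a quote of the series:

/* Hypothetical wiring: choose the 32-bit or 64-bit accessors once. */
static uint64_t (*shdr_offset)(Elf_Shdr *shdr);
static uint64_t (*shdr_size)(Elf_Shdr *shdr);
static uint32_t (*shdr_name)(Elf_Shdr *shdr);

static void select_shdr_helpers(Elf_Ehdr *ehdr)
{
	if (ehdr->e32.e_ident[EI_CLASS] == ELFCLASS64) {
		shdr_offset = shdr64_offset;
		shdr_size = shdr64_size;
		shdr_name = shdr64_name;
	} else {
		shdr_offset = shdr32_offset;
		shdr_size = shdr32_size;
		shdr_name = shdr32_name;
	}
}

With that in place, do_sort() no longer needs the 32/64 macro indirection at all, which is what lets the double #include of sorttable.h go away.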
Cc: bpf Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Masahiro Yamada Cc: Nathan Chancellor Cc: Nicolas Schier Cc: Zheng Yejian Cc: Martin Kelly Cc: Christophe Leroy Cc: Josh Poimboeuf Link: https://lore.kernel.org/20250105162345.940924221@goodmis.org Signed-off-by: Steven Rostedt (Google) --- scripts/sorttable.c | 42 +++++++++++++++++++++++++++++ scripts/sorttable.h | 66 +++++++++++++++++++++++++++++---------------- 2 files changed, 85 insertions(+), 23 deletions(-) diff --git a/scripts/sorttable.c b/scripts/sorttable.c index 5dfa734eff09..b2b96ff261d6 100644 --- a/scripts/sorttable.c +++ b/scripts/sorttable.c @@ -110,6 +110,48 @@ EHDR_HALF(shentsize) EHDR_HALF(shstrndx) EHDR_HALF(shnum) +#define SHDR_WORD(fn_name) \ +static uint32_t shdr64_##fn_name(Elf_Shdr *shdr) \ +{ \ + return r(&shdr->e64.sh_##fn_name); \ +} \ + \ +static uint32_t shdr32_##fn_name(Elf_Shdr *shdr) \ +{ \ + return r(&shdr->e32.sh_##fn_name); \ +} + +#define SHDR_ADDR(fn_name) \ +static uint64_t shdr64_##fn_name(Elf_Shdr *shdr) \ +{ \ + return r8(&shdr->e64.sh_##fn_name); \ +} \ + \ +static uint64_t shdr32_##fn_name(Elf_Shdr *shdr) \ +{ \ + return r(&shdr->e32.sh_##fn_name); \ +} + +#define SHDR_WORD(fn_name) \ +static uint32_t shdr64_##fn_name(Elf_Shdr *shdr) \ +{ \ + return r(&shdr->e64.sh_##fn_name); \ +} \ + \ +static uint32_t shdr32_##fn_name(Elf_Shdr *shdr) \ +{ \ + return r(&shdr->e32.sh_##fn_name); \ +} + +SHDR_ADDR(addr) +SHDR_ADDR(offset) +SHDR_ADDR(size) +SHDR_ADDR(entsize) + +SHDR_WORD(link) +SHDR_WORD(name) +SHDR_WORD(type) + /* * Get the whole file as a programming convenience in order to avoid * malloc+lseek+read+free of many pieces. If successful, then mmap diff --git a/scripts/sorttable.h b/scripts/sorttable.h index 97278c973bc9..af3a5f0209a3 100644 --- a/scripts/sorttable.h +++ b/scripts/sorttable.h @@ -31,6 +31,13 @@ #undef ehdr_shentsize #undef ehdr_shstrndx #undef ehdr_shnum +#undef shdr_addr +#undef shdr_offset +#undef shdr_link +#undef shdr_size +#undef shdr_name +#undef shdr_type +#undef shdr_entsize #ifdef SORTTABLE_64 # define extable_ent_size 16 @@ -47,6 +54,13 @@ # define ehdr_shentsize ehdr64_shentsize # define ehdr_shstrndx ehdr64_shstrndx # define ehdr_shnum ehdr64_shnum +# define shdr_addr shdr64_addr +# define shdr_offset shdr64_offset +# define shdr_link shdr64_link +# define shdr_size shdr64_size +# define shdr_name shdr64_name +# define shdr_type shdr64_type +# define shdr_entsize shdr64_entsize #else # define extable_ent_size 8 # define compare_extable compare_extable_32 @@ -62,6 +76,13 @@ # define ehdr_shentsize ehdr32_shentsize # define ehdr_shstrndx ehdr32_shstrndx # define ehdr_shnum ehdr32_shnum +# define shdr_addr shdr32_addr +# define shdr_offset shdr32_offset +# define shdr_link shdr32_link +# define shdr_size shdr32_size +# define shdr_name shdr32_name +# define shdr_type shdr32_type +# define shdr_entsize shdr32_entsize #endif #if defined(SORTTABLE_64) && defined(UNWINDER_ORC_ENABLED) @@ -177,8 +198,8 @@ struct elf_mcount_loc { static void *sort_mcount_loc(void *arg) { struct elf_mcount_loc *emloc = (struct elf_mcount_loc *)arg; - uint_t offset = emloc->start_mcount_loc - _r(&(emloc->init_data_sec)->etype.sh_addr) - + _r(&(emloc->init_data_sec)->etype.sh_offset); + uint_t offset = emloc->start_mcount_loc - shdr_addr(emloc->init_data_sec) + + shdr_offset(emloc->init_data_sec); uint_t count = emloc->stop_mcount_loc - emloc->start_mcount_loc; unsigned char *start_loc = (void *)emloc->ehdr + offset; @@ 
-267,18 +288,18 @@ static int do_sort(Elf_Ehdr *ehdr, shstrndx = ehdr_shstrndx(ehdr); if (shstrndx == SHN_XINDEX) - shstrndx = r(&shdr_start->etype.sh_link); + shstrndx = shdr_link(shdr_start); string_sec = get_index(shdr_start, shentsize, shstrndx); - secstrings = (const char *)ehdr + _r(&string_sec->etype.sh_offset); + secstrings = (const char *)ehdr + shdr_offset(string_sec); shnum = ehdr_shnum(ehdr); if (shnum == SHN_UNDEF) - shnum = _r(&shdr_start->etype.sh_size); + shnum = shdr_size(shdr_start); for (i = 0; i < shnum; i++) { Elf_Shdr *shdr = get_index(shdr_start, shentsize, i); - idx = r(&shdr->etype.sh_name); + idx = shdr_name(shdr); if (!strcmp(secstrings + idx, "__ex_table")) extab_sec = shdr; if (!strcmp(secstrings + idx, ".symtab")) @@ -286,9 +307,9 @@ static int do_sort(Elf_Ehdr *ehdr, if (!strcmp(secstrings + idx, ".strtab")) strtab_sec = shdr; - if (r(&shdr->etype.sh_type) == SHT_SYMTAB_SHNDX) + if (shdr_type(shdr) == SHT_SYMTAB_SHNDX) symtab_shndx = (Elf32_Word *)((const char *)ehdr + - _r(&shdr->etype.sh_offset)); + shdr_offset(shdr)); #ifdef MCOUNT_SORT_ENABLED /* locate the .init.data section in vmlinux */ @@ -304,14 +325,14 @@ static int do_sort(Elf_Ehdr *ehdr, #if defined(SORTTABLE_64) && defined(UNWINDER_ORC_ENABLED) /* locate the ORC unwind tables */ if (!strcmp(secstrings + idx, ".orc_unwind_ip")) { - orc_ip_size = _r(&shdr->etype.sh_size); + orc_ip_size = shdr_size(shdr); g_orc_ip_table = (int *)((void *)ehdr + - _r(&shdr->etype.sh_offset)); + shdr_offset(shdr)); } if (!strcmp(secstrings + idx, ".orc_unwind")) { - orc_size = _r(&shdr->etype.sh_size); + orc_size = shdr_size(shdr); g_orc_table = (struct orc_entry *)((void *)ehdr + - _r(&shdr->etype.sh_offset)); + shdr_offset(shdr)); } #endif } /* for loop */ @@ -374,23 +395,22 @@ static int do_sort(Elf_Ehdr *ehdr, goto out; } - extab_image = (void *)ehdr + _r(&extab_sec->etype.sh_offset); - strtab = (const char *)ehdr + _r(&strtab_sec->etype.sh_offset); - symtab = (const Elf_Sym *)((const char *)ehdr + - _r(&symtab_sec->etype.sh_offset)); + extab_image = (void *)ehdr + shdr_offset(extab_sec); + strtab = (const char *)ehdr + shdr_offset(strtab_sec); + symtab = (const Elf_Sym *)((const char *)ehdr + shdr_offset(symtab_sec)); if (custom_sort) { - custom_sort(extab_image, _r(&extab_sec->etype.sh_size)); + custom_sort(extab_image, shdr_size(extab_sec)); } else { - int num_entries = _r(&extab_sec->etype.sh_size) / extable_ent_size; + int num_entries = shdr_size(extab_sec) / extable_ent_size; qsort(extab_image, num_entries, extable_ent_size, compare_extable); } /* find the flag main_extable_sort_needed */ - sym_start = (void *)ehdr + _r(&symtab_sec->etype.sh_offset); - sym_end = sym_start + _r(&symtab_sec->etype.sh_size); - symentsize = _r(&symtab_sec->etype.sh_entsize); + sym_start = (void *)ehdr + shdr_offset(symtab_sec); + sym_end = sym_start + shdr_size(symtab_sec); + symentsize = shdr_entsize(symtab_sec); for (sym = sym_start; (void *)sym + symentsize < sym_end; sym = (void *)sym + symentsize) { @@ -415,9 +435,9 @@ static int do_sort(Elf_Ehdr *ehdr, symtab_shndx); sort_needed_sec = get_index(shdr_start, shentsize, sort_need_index); sort_needed_loc = (void *)ehdr + - _r(&sort_needed_sec->etype.sh_offset) + + shdr_offset(sort_needed_sec) + _r(&sort_needed_sym->etype.st_value) - - _r(&sort_needed_sec->etype.sh_addr); + shdr_addr(sort_needed_sec); /* extable has been sorted, clear the flag */ w(0, sort_needed_loc); From 17bed33ac12f011f4695059960e1b1d6457229a7 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sun, 5 Jan 
2025 11:22:22 -0500 Subject: [PATCH 53/56] scripts/sorttable: Add helper functions for Elf_Sym In order to remove the double #include of sorttable.h for 64 and 32 bit to create duplicate functions, add helper functions for Elf_Sym. This will create a function pointer for each helper that will get assigned to the appropriate function to handle either the 64bit or 32bit version. This also removes the last references of etype and _r() macros from the sorttable.h file as their references are now just defined in the appropriate architecture version of the helper functions. All read functions now exist in the helper functions which makes it easier to maintain, as the helper functions define the necessary architecture sizes. Cc: bpf Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Masahiro Yamada Cc: Nathan Chancellor Cc: Nicolas Schier Cc: Zheng Yejian Cc: Martin Kelly Cc: Christophe Leroy Cc: Josh Poimboeuf Link: https://lore.kernel.org/20250105162346.185740651@goodmis.org Signed-off-by: Steven Rostedt (Google) --- scripts/sorttable.c | 47 +++++++++++++++++++++++++++++++++++++++++++++ scripts/sorttable.h | 30 +++++++++++++++-------------- 2 files changed, 63 insertions(+), 14 deletions(-) diff --git a/scripts/sorttable.c b/scripts/sorttable.c index b2b96ff261d6..20615de18276 100644 --- a/scripts/sorttable.c +++ b/scripts/sorttable.c @@ -152,6 +152,53 @@ SHDR_WORD(link) SHDR_WORD(name) SHDR_WORD(type) +#define SYM_ADDR(fn_name) \ +static uint64_t sym64_##fn_name(Elf_Sym *sym) \ +{ \ + return r8(&sym->e64.st_##fn_name); \ +} \ + \ +static uint64_t sym32_##fn_name(Elf_Sym *sym) \ +{ \ + return r(&sym->e32.st_##fn_name); \ +} + +#define SYM_WORD(fn_name) \ +static uint32_t sym64_##fn_name(Elf_Sym *sym) \ +{ \ + return r(&sym->e64.st_##fn_name); \ +} \ + \ +static uint32_t sym32_##fn_name(Elf_Sym *sym) \ +{ \ + return r(&sym->e32.st_##fn_name); \ +} + +#define SYM_HALF(fn_name) \ +static uint16_t sym64_##fn_name(Elf_Sym *sym) \ +{ \ + return r2(&sym->e64.st_##fn_name); \ +} \ + \ +static uint16_t sym32_##fn_name(Elf_Sym *sym) \ +{ \ + return r2(&sym->e32.st_##fn_name); \ +} + +static uint8_t sym64_type(Elf_Sym *sym) +{ + return ELF64_ST_TYPE(sym->e64.st_info); +} + +static uint8_t sym32_type(Elf_Sym *sym) +{ + return ELF32_ST_TYPE(sym->e32.st_info); +} + +SYM_ADDR(value) +SYM_WORD(name) +SYM_HALF(shndx) + /* * Get the whole file as a programming convenience in order to avoid * malloc+lseek+read+free of many pieces. 
If successful, then mmap diff --git a/scripts/sorttable.h b/scripts/sorttable.h index af3a5f0209a3..ef7e5161db31 100644 --- a/scripts/sorttable.h +++ b/scripts/sorttable.h @@ -23,10 +23,7 @@ #undef sort_mcount_loc #undef elf_mcount_loc #undef do_sort -#undef ELF_ST_TYPE #undef uint_t -#undef _r -#undef etype #undef ehdr_shoff #undef ehdr_shentsize #undef ehdr_shstrndx @@ -38,6 +35,10 @@ #undef shdr_name #undef shdr_type #undef shdr_entsize +#undef sym_type +#undef sym_name +#undef sym_value +#undef sym_shndx #ifdef SORTTABLE_64 # define extable_ent_size 16 @@ -46,10 +47,7 @@ # define sort_mcount_loc sort_mcount_loc_64 # define elf_mcount_loc elf_mcount_loc_64 # define do_sort do_sort_64 -# define ELF_ST_TYPE ELF64_ST_TYPE # define uint_t uint64_t -# define _r r8 -# define etype e64 # define ehdr_shoff ehdr64_shoff # define ehdr_shentsize ehdr64_shentsize # define ehdr_shstrndx ehdr64_shstrndx @@ -61,6 +59,10 @@ # define shdr_name shdr64_name # define shdr_type shdr64_type # define shdr_entsize shdr64_entsize +# define sym_type sym64_type +# define sym_name sym64_name +# define sym_value sym64_value +# define sym_shndx sym64_shndx #else # define extable_ent_size 8 # define compare_extable compare_extable_32 @@ -68,10 +70,7 @@ # define sort_mcount_loc sort_mcount_loc_32 # define elf_mcount_loc elf_mcount_loc_32 # define do_sort do_sort_32 -# define ELF_ST_TYPE ELF32_ST_TYPE # define uint_t uint32_t -# define _r r -# define etype e32 # define ehdr_shoff ehdr32_shoff # define ehdr_shentsize ehdr32_shentsize # define ehdr_shstrndx ehdr32_shstrndx @@ -83,6 +82,10 @@ # define shdr_name shdr32_name # define shdr_type shdr32_type # define shdr_entsize shdr32_entsize +# define sym_type sym32_type +# define sym_name sym32_name +# define sym_value sym32_value +# define sym_shndx sym32_shndx #endif #if defined(SORTTABLE_64) && defined(UNWINDER_ORC_ENABLED) @@ -414,9 +417,9 @@ static int do_sort(Elf_Ehdr *ehdr, for (sym = sym_start; (void *)sym + symentsize < sym_end; sym = (void *)sym + symentsize) { - if (ELF_ST_TYPE(sym->etype.st_info) != STT_OBJECT) + if (sym_type(sym) != STT_OBJECT) continue; - if (!strcmp(strtab + r(&sym->etype.st_name), + if (!strcmp(strtab + sym_name(sym), "main_extable_sort_needed")) { sort_needed_sym = sym; break; @@ -430,14 +433,13 @@ static int do_sort(Elf_Ehdr *ehdr, goto out; } - sort_need_index = get_secindex(r2(&sym->etype.st_shndx), + sort_need_index = get_secindex(sym_shndx(sym), ((void *)sort_needed_sym - (void *)symtab) / symentsize, symtab_shndx); sort_needed_sec = get_index(shdr_start, shentsize, sort_need_index); sort_needed_loc = (void *)ehdr + shdr_offset(sort_needed_sec) + - _r(&sort_needed_sym->etype.st_value) - - shdr_addr(sort_needed_sec); + sym_value(sort_needed_sym) - shdr_addr(sort_needed_sec); /* extable has been sorted, clear the flag */ w(0, sort_needed_loc); From 1b649e6ab8dc9188d82c64069493afe66ca0edad Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sun, 5 Jan 2025 11:22:23 -0500 Subject: [PATCH 54/56] scripts/sorttable: Use uint64_t for mcount sorting The mcount sorting defines uint_t to uint64_t on 64bit architectures and uint32_t on 32bit architectures. It can work with just using uint64_t as that will hold the values of both, and they are not used to point into the ELF file. sizeof(uint_t) is used for defining the size of the mcount_loc section. Instead of using a type, define long_size and use that instead. 
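For illustration only, here is a minimal standalone sketch of sorting entries whose width is chosen at run time rather than baked in with a typedef, in the spirit of long_size (all names here are made up for the example; the real tool reads values through its r()/r8() byte-order helpers and its compare_extable pointer):

	#include <stdint.h>
	#include <stdlib.h>
	#include <string.h>

	static int long_size;	/* set to 4 for ELFCLASS32, 8 for ELFCLASS64 */

	/* Compare two mcount_loc entries of whichever width the file uses.
	 * Simplified: assumes a little-endian host and target; the real tool
	 * goes through its r()/r8() helpers, which handle byte order. */
	static int compare_loc(const void *a, const void *b)
	{
		uint64_t va = 0, vb = 0;

		memcpy(&va, a, long_size);
		memcpy(&vb, b, long_size);
		return va < vb ? -1 : va > vb ? 1 : 0;
	}

	static void sort_mcount_entries(void *start_loc, uint64_t nbytes)
	{
		/* element size is a run-time argument to qsort() */
		qsort(start_loc, nbytes / long_size, long_size, compare_loc);
	}

The point is that qsort() takes the element size as an ordinary run-time value, so a single uint64_t code path can handle both widths.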
This will allow the header code to be moved into the C file as generic functions and not need to include sorttable.h twice, once for 64bit and once for 32bit. Cc: bpf Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Masahiro Yamada Cc: Nathan Chancellor Cc: Nicolas Schier Cc: Zheng Yejian Cc: Martin Kelly Cc: Christophe Leroy Cc: Josh Poimboeuf Link: https://lore.kernel.org/20250105162346.373528925@goodmis.org Signed-off-by: Steven Rostedt (Google) --- scripts/sorttable.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/scripts/sorttable.h b/scripts/sorttable.h index ef7e5161db31..17a8541a10d6 100644 --- a/scripts/sorttable.h +++ b/scripts/sorttable.h @@ -23,7 +23,6 @@ #undef sort_mcount_loc #undef elf_mcount_loc #undef do_sort -#undef uint_t #undef ehdr_shoff #undef ehdr_shentsize #undef ehdr_shstrndx @@ -39,6 +38,7 @@ #undef sym_name #undef sym_value #undef sym_shndx +#undef long_size #ifdef SORTTABLE_64 # define extable_ent_size 16 @@ -47,7 +47,6 @@ # define sort_mcount_loc sort_mcount_loc_64 # define elf_mcount_loc elf_mcount_loc_64 # define do_sort do_sort_64 -# define uint_t uint64_t # define ehdr_shoff ehdr64_shoff # define ehdr_shentsize ehdr64_shentsize # define ehdr_shstrndx ehdr64_shstrndx @@ -63,6 +62,7 @@ # define sym_name sym64_name # define sym_value sym64_value # define sym_shndx sym64_shndx +# define long_size 8 #else # define extable_ent_size 8 # define compare_extable compare_extable_32 @@ -70,7 +70,6 @@ # define sort_mcount_loc sort_mcount_loc_32 # define elf_mcount_loc elf_mcount_loc_32 # define do_sort do_sort_32 -# define uint_t uint32_t # define ehdr_shoff ehdr32_shoff # define ehdr_shentsize ehdr32_shentsize # define ehdr_shstrndx ehdr32_shstrndx @@ -86,6 +85,7 @@ # define sym_name sym32_name # define sym_value sym32_value # define sym_shndx sym32_shndx +# define long_size 4 #endif #if defined(SORTTABLE_64) && defined(UNWINDER_ORC_ENABLED) @@ -193,25 +193,25 @@ pthread_t mcount_sort_thread; struct elf_mcount_loc { Elf_Ehdr *ehdr; Elf_Shdr *init_data_sec; - uint_t start_mcount_loc; - uint_t stop_mcount_loc; + uint64_t start_mcount_loc; + uint64_t stop_mcount_loc; }; /* Sort the addresses stored between __start_mcount_loc to __stop_mcount_loc in vmlinux */ static void *sort_mcount_loc(void *arg) { struct elf_mcount_loc *emloc = (struct elf_mcount_loc *)arg; - uint_t offset = emloc->start_mcount_loc - shdr_addr(emloc->init_data_sec) + uint64_t offset = emloc->start_mcount_loc - shdr_addr(emloc->init_data_sec) + shdr_offset(emloc->init_data_sec); - uint_t count = emloc->stop_mcount_loc - emloc->start_mcount_loc; + uint64_t count = emloc->stop_mcount_loc - emloc->start_mcount_loc; unsigned char *start_loc = (void *)emloc->ehdr + offset; - qsort(start_loc, count/sizeof(uint_t), sizeof(uint_t), compare_extable); + qsort(start_loc, count/long_size, long_size, compare_extable); return NULL; } /* Get the address of __start_mcount_loc and __stop_mcount_loc in System.map */ -static void get_mcount_loc(uint_t *_start, uint_t *_stop) +static void get_mcount_loc(uint64_t *_start, uint64_t *_stop) { FILE *file_start, *file_stop; char start_buff[20]; @@ -277,8 +277,8 @@ static int do_sort(Elf_Ehdr *ehdr, unsigned int shstrndx; #ifdef MCOUNT_SORT_ENABLED struct elf_mcount_loc mstruct = {0}; - uint_t _start_mcount_loc = 0; - uint_t _stop_mcount_loc = 0; + uint64_t _start_mcount_loc = 0; + uint64_t _stop_mcount_loc = 0; #endif #if defined(SORTTABLE_64) && 
defined(UNWINDER_ORC_ENABLED) unsigned int orc_ip_size = 0; From 58d87678a0f46c6120904b4326aaf5ebf4454c69 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 7 Jan 2025 22:32:17 -0500 Subject: [PATCH 55/56] scripts/sorttable: Move code from sorttable.h into sorttable.c Instead of having the main code live in a header file and included twice with MACROs that define the Elf structures for 64 bit or 32 bit, move the code into the C file now that the Elf structures are defined in a union that has both. All accesses to the Elf structure fields are done through helper function pointers. If the file being parsed is for a 64 bit architecture, all the helper functions point to the 64 bit versions to retrieve the Elf fields. The same is true if the architecture is 32 bit, where the function pointers will point to the 32 bit helper functions. Note, when the value of a field can be either 32 bit or 64 bit, a 64 bit value is always returned, as it works for the 32 bit code as well. This makes the code easier to read and maintain, and it now all exists in sorttable.c, so sorttable.h may be removed. Cc: bpf Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Masahiro Yamada Cc: Nathan Chancellor Cc: Nicolas Schier Cc: Zheng Yejian Cc: Martin Kelly Cc: Christophe Leroy Cc: Josh Poimboeuf Cc: Stephen Rothwell Link: https://lore.kernel.org/20250107223217.6f7f96a5@gandalf.local.home Signed-off-by: Steven Rostedt (Google) --- scripts/sorttable.c | 473 ++++++++++++++++++++++++++++++++++++++++-- scripts/sorttable.h | 485 -------------------------------------------- 2 files changed, 460 insertions(+), 498 deletions(-) delete mode 100644 scripts/sorttable.h diff --git a/scripts/sorttable.c b/scripts/sorttable.c index 20615de18276..ff9b60fc0dd8 100644 --- a/scripts/sorttable.c +++ b/scripts/sorttable.c @@ -327,10 +327,423 @@ static inline void *get_index(void *start, int entsize, int index) return start + (entsize * index); } -/* 32 bit and 64 bit are very similar */ -#include "sorttable.h" -#define SORTTABLE_64 -#include "sorttable.h" + +static int (*compare_extable)(const void *a, const void *b); +static uint64_t (*ehdr_shoff)(Elf_Ehdr *ehdr); +static uint16_t (*ehdr_shstrndx)(Elf_Ehdr *ehdr); +static uint16_t (*ehdr_shentsize)(Elf_Ehdr *ehdr); +static uint16_t (*ehdr_shnum)(Elf_Ehdr *ehdr); +static uint64_t (*shdr_addr)(Elf_Shdr *shdr); +static uint64_t (*shdr_offset)(Elf_Shdr *shdr); +static uint64_t (*shdr_size)(Elf_Shdr *shdr); +static uint64_t (*shdr_entsize)(Elf_Shdr *shdr); +static uint32_t (*shdr_link)(Elf_Shdr *shdr); +static uint32_t (*shdr_name)(Elf_Shdr *shdr); +static uint32_t (*shdr_type)(Elf_Shdr *shdr); +static uint8_t (*sym_type)(Elf_Sym *sym); +static uint32_t (*sym_name)(Elf_Sym *sym); +static uint64_t (*sym_value)(Elf_Sym *sym); +static uint16_t (*sym_shndx)(Elf_Sym *sym); + +static int extable_ent_size; +static int long_size; + + +#ifdef UNWINDER_ORC_ENABLED +/* ORC unwinder only support X86_64 */ +#include + +#define ERRSTR_MAXSZ 256 + +static char g_err[ERRSTR_MAXSZ]; +static int *g_orc_ip_table; +static struct orc_entry *g_orc_table; + +static pthread_t orc_sort_thread; + +static inline unsigned long orc_ip(const int *ip) +{ + return (unsigned long)ip + *ip; +} + +static int orc_sort_cmp(const void *_a, const void *_b) +{ + struct orc_entry *orc_a, *orc_b; + const int *a = g_orc_ip_table + *(int *)_a; + const int *b = g_orc_ip_table + *(int *)_b; + unsigned long a_val = orc_ip(a); + unsigned long b_val = orc_ip(b); + + if
(a_val > b_val) + return 1; + if (a_val < b_val) + return -1; + + /* + * The "weak" section terminator entries need to always be on the left + * to ensure the lookup code skips them in favor of real entries. + * These terminator entries exist to handle any gaps created by + * whitelisted .o files which didn't get objtool generation. + */ + orc_a = g_orc_table + (a - g_orc_ip_table); + orc_b = g_orc_table + (b - g_orc_ip_table); + if (orc_a->type == ORC_TYPE_UNDEFINED && orc_b->type == ORC_TYPE_UNDEFINED) + return 0; + return orc_a->type == ORC_TYPE_UNDEFINED ? -1 : 1; +} + +static void *sort_orctable(void *arg) +{ + int i; + int *idxs = NULL; + int *tmp_orc_ip_table = NULL; + struct orc_entry *tmp_orc_table = NULL; + unsigned int *orc_ip_size = (unsigned int *)arg; + unsigned int num_entries = *orc_ip_size / sizeof(int); + unsigned int orc_size = num_entries * sizeof(struct orc_entry); + + idxs = (int *)malloc(*orc_ip_size); + if (!idxs) { + snprintf(g_err, ERRSTR_MAXSZ, "malloc idxs: %s", + strerror(errno)); + pthread_exit(g_err); + } + + tmp_orc_ip_table = (int *)malloc(*orc_ip_size); + if (!tmp_orc_ip_table) { + snprintf(g_err, ERRSTR_MAXSZ, "malloc tmp_orc_ip_table: %s", + strerror(errno)); + pthread_exit(g_err); + } + + tmp_orc_table = (struct orc_entry *)malloc(orc_size); + if (!tmp_orc_table) { + snprintf(g_err, ERRSTR_MAXSZ, "malloc tmp_orc_table: %s", + strerror(errno)); + pthread_exit(g_err); + } + + /* initialize indices array, convert ip_table to absolute address */ + for (i = 0; i < num_entries; i++) { + idxs[i] = i; + tmp_orc_ip_table[i] = g_orc_ip_table[i] + i * sizeof(int); + } + memcpy(tmp_orc_table, g_orc_table, orc_size); + + qsort(idxs, num_entries, sizeof(int), orc_sort_cmp); + + for (i = 0; i < num_entries; i++) { + if (idxs[i] == i) + continue; + + /* convert back to relative address */ + g_orc_ip_table[i] = tmp_orc_ip_table[idxs[i]] - i * sizeof(int); + g_orc_table[i] = tmp_orc_table[idxs[i]]; + } + + free(idxs); + free(tmp_orc_ip_table); + free(tmp_orc_table); + pthread_exit(NULL); +} +#endif + +#ifdef MCOUNT_SORT_ENABLED +static pthread_t mcount_sort_thread; + +struct elf_mcount_loc { + Elf_Ehdr *ehdr; + Elf_Shdr *init_data_sec; + uint64_t start_mcount_loc; + uint64_t stop_mcount_loc; +}; + +/* Sort the addresses stored between __start_mcount_loc to __stop_mcount_loc in vmlinux */ +static void *sort_mcount_loc(void *arg) +{ + struct elf_mcount_loc *emloc = (struct elf_mcount_loc *)arg; + uint64_t offset = emloc->start_mcount_loc - shdr_addr(emloc->init_data_sec) + + shdr_offset(emloc->init_data_sec); + uint64_t count = emloc->stop_mcount_loc - emloc->start_mcount_loc; + unsigned char *start_loc = (void *)emloc->ehdr + offset; + + qsort(start_loc, count/long_size, long_size, compare_extable); + return NULL; +} + +/* Get the address of __start_mcount_loc and __stop_mcount_loc in System.map */ +static void get_mcount_loc(uint64_t *_start, uint64_t *_stop) +{ + FILE *file_start, *file_stop; + char start_buff[20]; + char stop_buff[20]; + int len = 0; + + file_start = popen(" grep start_mcount System.map | awk '{print $1}' ", "r"); + if (!file_start) { + fprintf(stderr, "get start_mcount_loc error!"); + return; + } + + file_stop = popen(" grep stop_mcount System.map | awk '{print $1}' ", "r"); + if (!file_stop) { + fprintf(stderr, "get stop_mcount_loc error!"); + pclose(file_start); + return; + } + + while (fgets(start_buff, sizeof(start_buff), file_start) != NULL) { + len = strlen(start_buff); + start_buff[len - 1] = '\0'; + } + *_start = strtoul(start_buff, NULL, 16); 
+ + while (fgets(stop_buff, sizeof(stop_buff), file_stop) != NULL) { + len = strlen(stop_buff); + stop_buff[len - 1] = '\0'; + } + *_stop = strtoul(stop_buff, NULL, 16); + + pclose(file_start); + pclose(file_stop); +} +#endif +static int do_sort(Elf_Ehdr *ehdr, + char const *const fname, + table_sort_t custom_sort) +{ + int rc = -1; + Elf_Shdr *shdr_start; + Elf_Shdr *strtab_sec = NULL; + Elf_Shdr *symtab_sec = NULL; + Elf_Shdr *extab_sec = NULL; + Elf_Shdr *string_sec; + Elf_Sym *sym; + const Elf_Sym *symtab; + Elf32_Word *symtab_shndx = NULL; + Elf_Sym *sort_needed_sym = NULL; + Elf_Shdr *sort_needed_sec; + uint32_t *sort_needed_loc; + void *sym_start; + void *sym_end; + const char *secstrings; + const char *strtab; + char *extab_image; + int sort_need_index; + int symentsize; + int shentsize; + int idx; + int i; + unsigned int shnum; + unsigned int shstrndx; +#ifdef MCOUNT_SORT_ENABLED + struct elf_mcount_loc mstruct = {0}; + uint64_t _start_mcount_loc = 0; + uint64_t _stop_mcount_loc = 0; +#endif +#ifdef UNWINDER_ORC_ENABLED + unsigned int orc_ip_size = 0; + unsigned int orc_size = 0; + unsigned int orc_num_entries = 0; +#endif + + shdr_start = (Elf_Shdr *)((char *)ehdr + ehdr_shoff(ehdr)); + shentsize = ehdr_shentsize(ehdr); + + shstrndx = ehdr_shstrndx(ehdr); + if (shstrndx == SHN_XINDEX) + shstrndx = shdr_link(shdr_start); + string_sec = get_index(shdr_start, shentsize, shstrndx); + secstrings = (const char *)ehdr + shdr_offset(string_sec); + + shnum = ehdr_shnum(ehdr); + if (shnum == SHN_UNDEF) + shnum = shdr_size(shdr_start); + + for (i = 0; i < shnum; i++) { + Elf_Shdr *shdr = get_index(shdr_start, shentsize, i); + + idx = shdr_name(shdr); + if (!strcmp(secstrings + idx, "__ex_table")) + extab_sec = shdr; + if (!strcmp(secstrings + idx, ".symtab")) + symtab_sec = shdr; + if (!strcmp(secstrings + idx, ".strtab")) + strtab_sec = shdr; + + if (shdr_type(shdr) == SHT_SYMTAB_SHNDX) + symtab_shndx = (Elf32_Word *)((const char *)ehdr + + shdr_offset(shdr)); + +#ifdef MCOUNT_SORT_ENABLED + /* locate the .init.data section in vmlinux */ + if (!strcmp(secstrings + idx, ".init.data")) { + get_mcount_loc(&_start_mcount_loc, &_stop_mcount_loc); + mstruct.ehdr = ehdr; + mstruct.init_data_sec = shdr; + mstruct.start_mcount_loc = _start_mcount_loc; + mstruct.stop_mcount_loc = _stop_mcount_loc; + } +#endif + +#ifdef UNWINDER_ORC_ENABLED + /* locate the ORC unwind tables */ + if (!strcmp(secstrings + idx, ".orc_unwind_ip")) { + orc_ip_size = shdr_size(shdr); + g_orc_ip_table = (int *)((void *)ehdr + + shdr_offset(shdr)); + } + if (!strcmp(secstrings + idx, ".orc_unwind")) { + orc_size = shdr_size(shdr); + g_orc_table = (struct orc_entry *)((void *)ehdr + + shdr_offset(shdr)); + } +#endif + } /* for loop */ + +#ifdef UNWINDER_ORC_ENABLED + if (!g_orc_ip_table || !g_orc_table) { + fprintf(stderr, + "incomplete ORC unwind tables in file: %s\n", fname); + goto out; + } + + orc_num_entries = orc_ip_size / sizeof(int); + if (orc_ip_size % sizeof(int) != 0 || + orc_size % sizeof(struct orc_entry) != 0 || + orc_num_entries != orc_size / sizeof(struct orc_entry)) { + fprintf(stderr, + "inconsistent ORC unwind table entries in file: %s\n", + fname); + goto out; + } + + /* create thread to sort ORC unwind tables concurrently */ + if (pthread_create(&orc_sort_thread, NULL, + sort_orctable, &orc_ip_size)) { + fprintf(stderr, + "pthread_create orc_sort_thread failed '%s': %s\n", + strerror(errno), fname); + goto out; + } +#endif + +#ifdef MCOUNT_SORT_ENABLED + if (!mstruct.init_data_sec || !_start_mcount_loc || 
!_stop_mcount_loc) { + fprintf(stderr, + "incomplete mcount's sort in file: %s\n", + fname); + goto out; + } + + /* create thread to sort mcount_loc concurrently */ + if (pthread_create(&mcount_sort_thread, NULL, &sort_mcount_loc, &mstruct)) { + fprintf(stderr, + "pthread_create mcount_sort_thread failed '%s': %s\n", + strerror(errno), fname); + goto out; + } +#endif + if (!extab_sec) { + fprintf(stderr, "no __ex_table in file: %s\n", fname); + goto out; + } + + if (!symtab_sec) { + fprintf(stderr, "no .symtab in file: %s\n", fname); + goto out; + } + + if (!strtab_sec) { + fprintf(stderr, "no .strtab in file: %s\n", fname); + goto out; + } + + extab_image = (void *)ehdr + shdr_offset(extab_sec); + strtab = (const char *)ehdr + shdr_offset(strtab_sec); + symtab = (const Elf_Sym *)((const char *)ehdr + shdr_offset(symtab_sec)); + + if (custom_sort) { + custom_sort(extab_image, shdr_size(extab_sec)); + } else { + int num_entries = shdr_size(extab_sec) / extable_ent_size; + qsort(extab_image, num_entries, + extable_ent_size, compare_extable); + } + + /* find the flag main_extable_sort_needed */ + sym_start = (void *)ehdr + shdr_offset(symtab_sec); + sym_end = sym_start + shdr_size(symtab_sec); + symentsize = shdr_entsize(symtab_sec); + + for (sym = sym_start; (void *)sym + symentsize < sym_end; + sym = (void *)sym + symentsize) { + if (sym_type(sym) != STT_OBJECT) + continue; + if (!strcmp(strtab + sym_name(sym), + "main_extable_sort_needed")) { + sort_needed_sym = sym; + break; + } + } + + if (!sort_needed_sym) { + fprintf(stderr, + "no main_extable_sort_needed symbol in file: %s\n", + fname); + goto out; + } + + sort_need_index = get_secindex(sym_shndx(sym), + ((void *)sort_needed_sym - (void *)symtab) / symentsize, + symtab_shndx); + sort_needed_sec = get_index(shdr_start, shentsize, sort_need_index); + sort_needed_loc = (void *)ehdr + + shdr_offset(sort_needed_sec) + + sym_value(sort_needed_sym) - shdr_addr(sort_needed_sec); + + /* extable has been sorted, clear the flag */ + w(0, sort_needed_loc); + rc = 0; + +out: +#ifdef UNWINDER_ORC_ENABLED + if (orc_sort_thread) { + void *retval = NULL; + /* wait for ORC tables sort done */ + rc = pthread_join(orc_sort_thread, &retval); + if (rc) { + fprintf(stderr, + "pthread_join failed '%s': %s\n", + strerror(errno), fname); + } else if (retval) { + rc = -1; + fprintf(stderr, + "failed to sort ORC tables '%s': %s\n", + (char *)retval, fname); + } + } +#endif + +#ifdef MCOUNT_SORT_ENABLED + if (mcount_sort_thread) { + void *retval = NULL; + /* wait for mcount sort done */ + rc = pthread_join(mcount_sort_thread, &retval); + if (rc) { + fprintf(stderr, + "pthread_join failed '%s': %s\n", + strerror(errno), fname); + } else if (retval) { + rc = -1; + fprintf(stderr, + "failed to sort mcount '%s': %s\n", + (char *)retval, fname); + } + } +#endif + return rc; +} static int compare_relative_table(const void *a, const void *b) { @@ -399,7 +812,6 @@ static void sort_relative_table_with_data(char *extab_image, int image_size) static int do_file(char const *const fname, void *addr) { - int rc = -1; Elf_Ehdr *ehdr = addr; table_sort_t custom_sort = NULL; @@ -462,29 +874,64 @@ static int do_file(char const *const fname, void *addr) r2(&ehdr->e32.e_shentsize) != sizeof(Elf32_Shdr)) { fprintf(stderr, "unrecognized ET_EXEC/ET_DYN file: %s\n", fname); - break; + return -1; } - rc = do_sort_32(ehdr, fname, custom_sort); + + compare_extable = compare_extable_32; + ehdr_shoff = ehdr32_shoff; + ehdr_shentsize = ehdr32_shentsize; + ehdr_shstrndx = ehdr32_shstrndx; + 
ehdr_shnum = ehdr32_shnum; + shdr_addr = shdr32_addr; + shdr_offset = shdr32_offset; + shdr_link = shdr32_link; + shdr_size = shdr32_size; + shdr_name = shdr32_name; + shdr_type = shdr32_type; + shdr_entsize = shdr32_entsize; + sym_type = sym32_type; + sym_name = sym32_name; + sym_value = sym32_value; + sym_shndx = sym32_shndx; + long_size = 4; + extable_ent_size = 8; break; case ELFCLASS64: - { if (r2(&ehdr->e64.e_ehsize) != sizeof(Elf64_Ehdr) || r2(&ehdr->e64.e_shentsize) != sizeof(Elf64_Shdr)) { fprintf(stderr, "unrecognized ET_EXEC/ET_DYN file: %s\n", fname); - break; - } - rc = do_sort_64(ehdr, fname, custom_sort); + return -1; } + + compare_extable = compare_extable_64; + ehdr_shoff = ehdr64_shoff; + ehdr_shentsize = ehdr64_shentsize; + ehdr_shstrndx = ehdr64_shstrndx; + ehdr_shnum = ehdr64_shnum; + shdr_addr = shdr64_addr; + shdr_offset = shdr64_offset; + shdr_link = shdr64_link; + shdr_size = shdr64_size; + shdr_name = shdr64_name; + shdr_type = shdr64_type; + shdr_entsize = shdr64_entsize; + sym_type = sym64_type; + sym_name = sym64_name; + sym_value = sym64_value; + sym_shndx = sym64_shndx; + long_size = 8; + extable_ent_size = 16; + break; default: fprintf(stderr, "unrecognized ELF class %d %s\n", ehdr->e32.e_ident[EI_CLASS], fname); - break; + return -1; } - return rc; + return do_sort(ehdr, fname, custom_sort); } int main(int argc, char *argv[]) diff --git a/scripts/sorttable.h b/scripts/sorttable.h deleted file mode 100644 index 17a8541a10d6..000000000000 --- a/scripts/sorttable.h +++ /dev/null @@ -1,485 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * sorttable.h - * - * Added ORC unwind tables sort support and other updates: - * Copyright (C) 1999-2019 Alibaba Group Holding Limited. by: - * Shile Zhang - * - * Copyright 2011 - 2012 Cavium, Inc. - * - * Some of code was taken out of arch/x86/kernel/unwind_orc.c, written by: - * Copyright (C) 2017 Josh Poimboeuf - * - * Some of this code was taken out of recordmcount.h written by: - * - * Copyright 2009 John F. Reiser . All rights reserved. - * Copyright 2010 Steven Rostedt , Red Hat Inc. 
- */ - -#undef extable_ent_size -#undef compare_extable -#undef get_mcount_loc -#undef sort_mcount_loc -#undef elf_mcount_loc -#undef do_sort -#undef ehdr_shoff -#undef ehdr_shentsize -#undef ehdr_shstrndx -#undef ehdr_shnum -#undef shdr_addr -#undef shdr_offset -#undef shdr_link -#undef shdr_size -#undef shdr_name -#undef shdr_type -#undef shdr_entsize -#undef sym_type -#undef sym_name -#undef sym_value -#undef sym_shndx -#undef long_size - -#ifdef SORTTABLE_64 -# define extable_ent_size 16 -# define compare_extable compare_extable_64 -# define get_mcount_loc get_mcount_loc_64 -# define sort_mcount_loc sort_mcount_loc_64 -# define elf_mcount_loc elf_mcount_loc_64 -# define do_sort do_sort_64 -# define ehdr_shoff ehdr64_shoff -# define ehdr_shentsize ehdr64_shentsize -# define ehdr_shstrndx ehdr64_shstrndx -# define ehdr_shnum ehdr64_shnum -# define shdr_addr shdr64_addr -# define shdr_offset shdr64_offset -# define shdr_link shdr64_link -# define shdr_size shdr64_size -# define shdr_name shdr64_name -# define shdr_type shdr64_type -# define shdr_entsize shdr64_entsize -# define sym_type sym64_type -# define sym_name sym64_name -# define sym_value sym64_value -# define sym_shndx sym64_shndx -# define long_size 8 -#else -# define extable_ent_size 8 -# define compare_extable compare_extable_32 -# define get_mcount_loc get_mcount_loc_32 -# define sort_mcount_loc sort_mcount_loc_32 -# define elf_mcount_loc elf_mcount_loc_32 -# define do_sort do_sort_32 -# define ehdr_shoff ehdr32_shoff -# define ehdr_shentsize ehdr32_shentsize -# define ehdr_shstrndx ehdr32_shstrndx -# define ehdr_shnum ehdr32_shnum -# define shdr_addr shdr32_addr -# define shdr_offset shdr32_offset -# define shdr_link shdr32_link -# define shdr_size shdr32_size -# define shdr_name shdr32_name -# define shdr_type shdr32_type -# define shdr_entsize shdr32_entsize -# define sym_type sym32_type -# define sym_name sym32_name -# define sym_value sym32_value -# define sym_shndx sym32_shndx -# define long_size 4 -#endif - -#if defined(SORTTABLE_64) && defined(UNWINDER_ORC_ENABLED) -/* ORC unwinder only support X86_64 */ -#include - -#define ERRSTR_MAXSZ 256 - -char g_err[ERRSTR_MAXSZ]; -int *g_orc_ip_table; -struct orc_entry *g_orc_table; - -pthread_t orc_sort_thread; - -static inline unsigned long orc_ip(const int *ip) -{ - return (unsigned long)ip + *ip; -} - -static int orc_sort_cmp(const void *_a, const void *_b) -{ - struct orc_entry *orc_a, *orc_b; - const int *a = g_orc_ip_table + *(int *)_a; - const int *b = g_orc_ip_table + *(int *)_b; - unsigned long a_val = orc_ip(a); - unsigned long b_val = orc_ip(b); - - if (a_val > b_val) - return 1; - if (a_val < b_val) - return -1; - - /* - * The "weak" section terminator entries need to always be on the left - * to ensure the lookup code skips them in favor of real entries. - * These terminator entries exist to handle any gaps created by - * whitelisted .o files which didn't get objtool generation. - */ - orc_a = g_orc_table + (a - g_orc_ip_table); - orc_b = g_orc_table + (b - g_orc_ip_table); - if (orc_a->type == ORC_TYPE_UNDEFINED && orc_b->type == ORC_TYPE_UNDEFINED) - return 0; - return orc_a->type == ORC_TYPE_UNDEFINED ? 
-1 : 1; -} - -static void *sort_orctable(void *arg) -{ - int i; - int *idxs = NULL; - int *tmp_orc_ip_table = NULL; - struct orc_entry *tmp_orc_table = NULL; - unsigned int *orc_ip_size = (unsigned int *)arg; - unsigned int num_entries = *orc_ip_size / sizeof(int); - unsigned int orc_size = num_entries * sizeof(struct orc_entry); - - idxs = (int *)malloc(*orc_ip_size); - if (!idxs) { - snprintf(g_err, ERRSTR_MAXSZ, "malloc idxs: %s", - strerror(errno)); - pthread_exit(g_err); - } - - tmp_orc_ip_table = (int *)malloc(*orc_ip_size); - if (!tmp_orc_ip_table) { - snprintf(g_err, ERRSTR_MAXSZ, "malloc tmp_orc_ip_table: %s", - strerror(errno)); - pthread_exit(g_err); - } - - tmp_orc_table = (struct orc_entry *)malloc(orc_size); - if (!tmp_orc_table) { - snprintf(g_err, ERRSTR_MAXSZ, "malloc tmp_orc_table: %s", - strerror(errno)); - pthread_exit(g_err); - } - - /* initialize indices array, convert ip_table to absolute address */ - for (i = 0; i < num_entries; i++) { - idxs[i] = i; - tmp_orc_ip_table[i] = g_orc_ip_table[i] + i * sizeof(int); - } - memcpy(tmp_orc_table, g_orc_table, orc_size); - - qsort(idxs, num_entries, sizeof(int), orc_sort_cmp); - - for (i = 0; i < num_entries; i++) { - if (idxs[i] == i) - continue; - - /* convert back to relative address */ - g_orc_ip_table[i] = tmp_orc_ip_table[idxs[i]] - i * sizeof(int); - g_orc_table[i] = tmp_orc_table[idxs[i]]; - } - - free(idxs); - free(tmp_orc_ip_table); - free(tmp_orc_table); - pthread_exit(NULL); -} -#endif - -#ifdef MCOUNT_SORT_ENABLED -pthread_t mcount_sort_thread; - -struct elf_mcount_loc { - Elf_Ehdr *ehdr; - Elf_Shdr *init_data_sec; - uint64_t start_mcount_loc; - uint64_t stop_mcount_loc; -}; - -/* Sort the addresses stored between __start_mcount_loc to __stop_mcount_loc in vmlinux */ -static void *sort_mcount_loc(void *arg) -{ - struct elf_mcount_loc *emloc = (struct elf_mcount_loc *)arg; - uint64_t offset = emloc->start_mcount_loc - shdr_addr(emloc->init_data_sec) - + shdr_offset(emloc->init_data_sec); - uint64_t count = emloc->stop_mcount_loc - emloc->start_mcount_loc; - unsigned char *start_loc = (void *)emloc->ehdr + offset; - - qsort(start_loc, count/long_size, long_size, compare_extable); - return NULL; -} - -/* Get the address of __start_mcount_loc and __stop_mcount_loc in System.map */ -static void get_mcount_loc(uint64_t *_start, uint64_t *_stop) -{ - FILE *file_start, *file_stop; - char start_buff[20]; - char stop_buff[20]; - int len = 0; - - file_start = popen(" grep start_mcount System.map | awk '{print $1}' ", "r"); - if (!file_start) { - fprintf(stderr, "get start_mcount_loc error!"); - return; - } - - file_stop = popen(" grep stop_mcount System.map | awk '{print $1}' ", "r"); - if (!file_stop) { - fprintf(stderr, "get stop_mcount_loc error!"); - pclose(file_start); - return; - } - - while (fgets(start_buff, sizeof(start_buff), file_start) != NULL) { - len = strlen(start_buff); - start_buff[len - 1] = '\0'; - } - *_start = strtoul(start_buff, NULL, 16); - - while (fgets(stop_buff, sizeof(stop_buff), file_stop) != NULL) { - len = strlen(stop_buff); - stop_buff[len - 1] = '\0'; - } - *_stop = strtoul(stop_buff, NULL, 16); - - pclose(file_start); - pclose(file_stop); -} -#endif -static int do_sort(Elf_Ehdr *ehdr, - char const *const fname, - table_sort_t custom_sort) -{ - int rc = -1; - Elf_Shdr *shdr_start; - Elf_Shdr *strtab_sec = NULL; - Elf_Shdr *symtab_sec = NULL; - Elf_Shdr *extab_sec = NULL; - Elf_Shdr *string_sec; - Elf_Sym *sym; - const Elf_Sym *symtab; - Elf32_Word *symtab_shndx = NULL; - Elf_Sym 
*sort_needed_sym = NULL; - Elf_Shdr *sort_needed_sec; - uint32_t *sort_needed_loc; - void *sym_start; - void *sym_end; - const char *secstrings; - const char *strtab; - char *extab_image; - int sort_need_index; - int symentsize; - int shentsize; - int idx; - int i; - unsigned int shnum; - unsigned int shstrndx; -#ifdef MCOUNT_SORT_ENABLED - struct elf_mcount_loc mstruct = {0}; - uint64_t _start_mcount_loc = 0; - uint64_t _stop_mcount_loc = 0; -#endif -#if defined(SORTTABLE_64) && defined(UNWINDER_ORC_ENABLED) - unsigned int orc_ip_size = 0; - unsigned int orc_size = 0; - unsigned int orc_num_entries = 0; -#endif - - shdr_start = (Elf_Shdr *)((char *)ehdr + ehdr_shoff(ehdr)); - shentsize = ehdr_shentsize(ehdr); - - shstrndx = ehdr_shstrndx(ehdr); - if (shstrndx == SHN_XINDEX) - shstrndx = shdr_link(shdr_start); - string_sec = get_index(shdr_start, shentsize, shstrndx); - secstrings = (const char *)ehdr + shdr_offset(string_sec); - - shnum = ehdr_shnum(ehdr); - if (shnum == SHN_UNDEF) - shnum = shdr_size(shdr_start); - - for (i = 0; i < shnum; i++) { - Elf_Shdr *shdr = get_index(shdr_start, shentsize, i); - - idx = shdr_name(shdr); - if (!strcmp(secstrings + idx, "__ex_table")) - extab_sec = shdr; - if (!strcmp(secstrings + idx, ".symtab")) - symtab_sec = shdr; - if (!strcmp(secstrings + idx, ".strtab")) - strtab_sec = shdr; - - if (shdr_type(shdr) == SHT_SYMTAB_SHNDX) - symtab_shndx = (Elf32_Word *)((const char *)ehdr + - shdr_offset(shdr)); - -#ifdef MCOUNT_SORT_ENABLED - /* locate the .init.data section in vmlinux */ - if (!strcmp(secstrings + idx, ".init.data")) { - get_mcount_loc(&_start_mcount_loc, &_stop_mcount_loc); - mstruct.ehdr = ehdr; - mstruct.init_data_sec = shdr; - mstruct.start_mcount_loc = _start_mcount_loc; - mstruct.stop_mcount_loc = _stop_mcount_loc; - } -#endif - -#if defined(SORTTABLE_64) && defined(UNWINDER_ORC_ENABLED) - /* locate the ORC unwind tables */ - if (!strcmp(secstrings + idx, ".orc_unwind_ip")) { - orc_ip_size = shdr_size(shdr); - g_orc_ip_table = (int *)((void *)ehdr + - shdr_offset(shdr)); - } - if (!strcmp(secstrings + idx, ".orc_unwind")) { - orc_size = shdr_size(shdr); - g_orc_table = (struct orc_entry *)((void *)ehdr + - shdr_offset(shdr)); - } -#endif - } /* for loop */ - -#if defined(SORTTABLE_64) && defined(UNWINDER_ORC_ENABLED) - if (!g_orc_ip_table || !g_orc_table) { - fprintf(stderr, - "incomplete ORC unwind tables in file: %s\n", fname); - goto out; - } - - orc_num_entries = orc_ip_size / sizeof(int); - if (orc_ip_size % sizeof(int) != 0 || - orc_size % sizeof(struct orc_entry) != 0 || - orc_num_entries != orc_size / sizeof(struct orc_entry)) { - fprintf(stderr, - "inconsistent ORC unwind table entries in file: %s\n", - fname); - goto out; - } - - /* create thread to sort ORC unwind tables concurrently */ - if (pthread_create(&orc_sort_thread, NULL, - sort_orctable, &orc_ip_size)) { - fprintf(stderr, - "pthread_create orc_sort_thread failed '%s': %s\n", - strerror(errno), fname); - goto out; - } -#endif - -#ifdef MCOUNT_SORT_ENABLED - if (!mstruct.init_data_sec || !_start_mcount_loc || !_stop_mcount_loc) { - fprintf(stderr, - "incomplete mcount's sort in file: %s\n", - fname); - goto out; - } - - /* create thread to sort mcount_loc concurrently */ - if (pthread_create(&mcount_sort_thread, NULL, &sort_mcount_loc, &mstruct)) { - fprintf(stderr, - "pthread_create mcount_sort_thread failed '%s': %s\n", - strerror(errno), fname); - goto out; - } -#endif - if (!extab_sec) { - fprintf(stderr, "no __ex_table in file: %s\n", fname); - goto out; - } - - 
if (!symtab_sec) { - fprintf(stderr, "no .symtab in file: %s\n", fname); - goto out; - } - - if (!strtab_sec) { - fprintf(stderr, "no .strtab in file: %s\n", fname); - goto out; - } - - extab_image = (void *)ehdr + shdr_offset(extab_sec); - strtab = (const char *)ehdr + shdr_offset(strtab_sec); - symtab = (const Elf_Sym *)((const char *)ehdr + shdr_offset(symtab_sec)); - - if (custom_sort) { - custom_sort(extab_image, shdr_size(extab_sec)); - } else { - int num_entries = shdr_size(extab_sec) / extable_ent_size; - qsort(extab_image, num_entries, - extable_ent_size, compare_extable); - } - - /* find the flag main_extable_sort_needed */ - sym_start = (void *)ehdr + shdr_offset(symtab_sec); - sym_end = sym_start + shdr_size(symtab_sec); - symentsize = shdr_entsize(symtab_sec); - - for (sym = sym_start; (void *)sym + symentsize < sym_end; - sym = (void *)sym + symentsize) { - if (sym_type(sym) != STT_OBJECT) - continue; - if (!strcmp(strtab + sym_name(sym), - "main_extable_sort_needed")) { - sort_needed_sym = sym; - break; - } - } - - if (!sort_needed_sym) { - fprintf(stderr, - "no main_extable_sort_needed symbol in file: %s\n", - fname); - goto out; - } - - sort_need_index = get_secindex(sym_shndx(sym), - ((void *)sort_needed_sym - (void *)symtab) / symentsize, - symtab_shndx); - sort_needed_sec = get_index(shdr_start, shentsize, sort_need_index); - sort_needed_loc = (void *)ehdr + - shdr_offset(sort_needed_sec) + - sym_value(sort_needed_sym) - shdr_addr(sort_needed_sec); - - /* extable has been sorted, clear the flag */ - w(0, sort_needed_loc); - rc = 0; - -out: -#if defined(SORTTABLE_64) && defined(UNWINDER_ORC_ENABLED) - if (orc_sort_thread) { - void *retval = NULL; - /* wait for ORC tables sort done */ - rc = pthread_join(orc_sort_thread, &retval); - if (rc) { - fprintf(stderr, - "pthread_join failed '%s': %s\n", - strerror(errno), fname); - } else if (retval) { - rc = -1; - fprintf(stderr, - "failed to sort ORC tables '%s': %s\n", - (char *)retval, fname); - } - } -#endif - -#ifdef MCOUNT_SORT_ENABLED - if (mcount_sort_thread) { - void *retval = NULL; - /* wait for mcount sort done */ - rc = pthread_join(mcount_sort_thread, &retval); - if (rc) { - fprintf(stderr, - "pthread_join failed '%s': %s\n", - strerror(errno), fname); - } else if (retval) { - rc = -1; - fprintf(stderr, - "failed to sort mcount '%s': %s\n", - (char *)retval, fname); - } - } -#endif - return rc; -} From 4acda8edefa1ce66d3de845f1c12745721cd14c3 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sun, 5 Jan 2025 11:22:25 -0500 Subject: [PATCH 56/56] scripts/sorttable: Get start/stop_mcount_loc from ELF file directly The get_mcount_loc() does a cheesy trick to find the start_mcount_loc and stop_mcount_loc values. That trick is: file_start = popen(" grep start_mcount System.map | awk '{print $1}' ", "r"); and file_stop = popen(" grep stop_mcount System.map | awk '{print $1}' ", "r"); Those values are stored in the Elf symbol table. Use that to capture those values. Using the symbol table is more efficient and more robust. The above could fail if another variable had "start_mcount" or "stop_mcount" as part of its name. 
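For illustration, here is a minimal standalone sketch of the exact-name symbol-table scan this patch switches to, assuming a 64-bit ELF image already mapped at ehdr (simplified: the real code walks the table by its sh_entsize through the 32/64-bit helper function pointers rather than by sizeof(Elf64_Sym)):

	#include <elf.h>
	#include <stdint.h>
	#include <string.h>

	/* Return st_value of the symbol with exactly this name, or 0 if absent.
	 * An exact strcmp() cannot be fooled by a longer symbol that merely
	 * contains "start_mcount" as a substring, unlike the grep pipeline. */
	static uint64_t find_sym_value(const void *ehdr, const Elf64_Shdr *symtab_sec,
				       const char *strtab, const char *name)
	{
		const Elf64_Sym *sym = (const Elf64_Sym *)((const char *)ehdr +
							   symtab_sec->sh_offset);
		const Elf64_Sym *end = (const Elf64_Sym *)((const char *)sym +
							   symtab_sec->sh_size);

		for (; sym < end; sym++) {
			if (!strcmp(strtab + sym->st_name, name))
				return sym->st_value;
		}
		return 0;
	}

In this sketch one would call find_sym_value() once for "__start_mcount_loc" and once for "__stop_mcount_loc"; the actual patch below resolves both in a single pass over the table.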
Cc: bpf Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Masahiro Yamada Cc: Nathan Chancellor Cc: Nicolas Schier Cc: Zheng Yejian Cc: Martin Kelly Cc: Christophe Leroy Cc: Josh Poimboeuf Link: https://lore.kernel.org/20250105162346.817157047@goodmis.org Signed-off-by: Steven Rostedt (Google) --- scripts/sorttable.c | 95 +++++++++++++++++++++------------------------ 1 file changed, 45 insertions(+), 50 deletions(-) diff --git a/scripts/sorttable.c b/scripts/sorttable.c index ff9b60fc0dd8..656c1e9b5ad9 100644 --- a/scripts/sorttable.c +++ b/scripts/sorttable.c @@ -472,42 +472,41 @@ static void *sort_mcount_loc(void *arg) } /* Get the address of __start_mcount_loc and __stop_mcount_loc in System.map */ -static void get_mcount_loc(uint64_t *_start, uint64_t *_stop) +static void get_mcount_loc(struct elf_mcount_loc *emloc, Elf_Shdr *symtab_sec, + const char *strtab) { - FILE *file_start, *file_stop; - char start_buff[20]; - char stop_buff[20]; - int len = 0; + Elf_Sym *sym, *end_sym; + int symentsize = shdr_entsize(symtab_sec); + int found = 0; - file_start = popen(" grep start_mcount System.map | awk '{print $1}' ", "r"); - if (!file_start) { + sym = (void *)emloc->ehdr + shdr_offset(symtab_sec); + end_sym = (void *)sym + shdr_size(symtab_sec); + + while (sym < end_sym) { + if (!strcmp(strtab + sym_name(sym), "__start_mcount_loc")) { + emloc->start_mcount_loc = sym_value(sym); + if (++found == 2) + break; + } else if (!strcmp(strtab + sym_name(sym), "__stop_mcount_loc")) { + emloc->stop_mcount_loc = sym_value(sym); + if (++found == 2) + break; + } + sym = (void *)sym + symentsize; + } + + if (!emloc->start_mcount_loc) { fprintf(stderr, "get start_mcount_loc error!"); return; } - file_stop = popen(" grep stop_mcount System.map | awk '{print $1}' ", "r"); - if (!file_stop) { + if (!emloc->stop_mcount_loc) { fprintf(stderr, "get stop_mcount_loc error!"); - pclose(file_start); return; } - - while (fgets(start_buff, sizeof(start_buff), file_start) != NULL) { - len = strlen(start_buff); - start_buff[len - 1] = '\0'; - } - *_start = strtoul(start_buff, NULL, 16); - - while (fgets(stop_buff, sizeof(stop_buff), file_stop) != NULL) { - len = strlen(stop_buff); - stop_buff[len - 1] = '\0'; - } - *_stop = strtoul(stop_buff, NULL, 16); - - pclose(file_start); - pclose(file_stop); } #endif + static int do_sort(Elf_Ehdr *ehdr, char const *const fname, table_sort_t custom_sort) @@ -538,8 +537,6 @@ static int do_sort(Elf_Ehdr *ehdr, unsigned int shstrndx; #ifdef MCOUNT_SORT_ENABLED struct elf_mcount_loc mstruct = {0}; - uint64_t _start_mcount_loc = 0; - uint64_t _stop_mcount_loc = 0; #endif #ifdef UNWINDER_ORC_ENABLED unsigned int orc_ip_size = 0; @@ -577,13 +574,8 @@ static int do_sort(Elf_Ehdr *ehdr, #ifdef MCOUNT_SORT_ENABLED /* locate the .init.data section in vmlinux */ - if (!strcmp(secstrings + idx, ".init.data")) { - get_mcount_loc(&_start_mcount_loc, &_stop_mcount_loc); - mstruct.ehdr = ehdr; + if (!strcmp(secstrings + idx, ".init.data")) mstruct.init_data_sec = shdr; - mstruct.start_mcount_loc = _start_mcount_loc; - mstruct.stop_mcount_loc = _stop_mcount_loc; - } #endif #ifdef UNWINDER_ORC_ENABLED @@ -627,23 +619,6 @@ static int do_sort(Elf_Ehdr *ehdr, goto out; } #endif - -#ifdef MCOUNT_SORT_ENABLED - if (!mstruct.init_data_sec || !_start_mcount_loc || !_stop_mcount_loc) { - fprintf(stderr, - "incomplete mcount's sort in file: %s\n", - fname); - goto out; - } - - /* create thread to sort mcount_loc concurrently */ - if 
(pthread_create(&mcount_sort_thread, NULL, &sort_mcount_loc, &mstruct)) { - fprintf(stderr, - "pthread_create mcount_sort_thread failed '%s': %s\n", - strerror(errno), fname); - goto out; - } -#endif if (!extab_sec) { fprintf(stderr, "no __ex_table in file: %s\n", fname); goto out; @@ -663,6 +638,26 @@ static int do_sort(Elf_Ehdr *ehdr, strtab = (const char *)ehdr + shdr_offset(strtab_sec); symtab = (const Elf_Sym *)((const char *)ehdr + shdr_offset(symtab_sec)); +#ifdef MCOUNT_SORT_ENABLED + mstruct.ehdr = ehdr; + get_mcount_loc(&mstruct, symtab_sec, strtab); + + if (!mstruct.init_data_sec || !mstruct.start_mcount_loc || !mstruct.stop_mcount_loc) { + fprintf(stderr, + "incomplete mcount's sort in file: %s\n", + fname); + goto out; + } + + /* create thread to sort mcount_loc concurrently */ + if (pthread_create(&mcount_sort_thread, NULL, &sort_mcount_loc, &mstruct)) { + fprintf(stderr, + "pthread_create mcount_sort_thread failed '%s': %s\n", + strerror(errno), fname); + goto out; + } +#endif + if (custom_sort) { custom_sort(extab_image, shdr_size(extab_sec)); } else {