Merge branch 'perf/core' into tip/master

# New commits in perf/core:
    b709eb872e19 ("perf: map pages in advance")
    6d642735cdb6 ("perf/x86/intel/uncore: Support more units on Granite Rapids")
    3f710be02ea6 ("perf/x86/intel/uncore: Clean up func_id")
    0e45818ec189 ("perf/x86/intel: Support RDPMC metrics clear mode")
    02c56362a7d3 ("uprobes: Guard against kmemdup() failing in dup_return_instance()")
    d29e744c7167 ("perf/x86: Relax privilege filter restriction on AMD IBS")
    6057b90ecc84 ("perf/core: Export perf_exclude_event()")
    8622e45b5da1 ("uprobes: Reuse return_instances between multiple uretprobes within task")
    0cf981de7687 ("uprobes: Ensure return_instance is detached from the list before freeing")
    636666a1c733 ("uprobes: Decouple return_instance list traversal and freeing")
    2ff913ab3f47 ("uprobes: Simplify session consumer tracking")
    e0925f2dc4de ("uprobes: add speculative lockless VMA-to-inode-to-uprobe resolution")
    83e3dc9a5d4d ("uprobes: simplify find_active_uprobe_rcu() VMA checks")
    03a001b156d2 ("mm: introduce mmap_lock_speculate_{try_begin|retry}")
    eb449bd96954 ("mm: convert mm_lock_seq to a proper seqcount")
    7528585290a1 ("mm/gup: Use raw_seqcount_try_begin()")
    96450ead1652 ("seqlock: add raw_seqcount_try_begin")
    b4943b8bfc41 ("perf/x86/rapl: Add core energy counter support for AMD CPUs")
    54d2759778c1 ("perf/x86/rapl: Move the cntr_mask to rapl_pmus struct")
    bdc57ec70548 ("perf/x86/rapl: Remove the global variable rapl_msrs")
    abf03d9bd20c ("perf/x86/rapl: Modify the generic variable names to *_pkg*")
    eeca4c6b2529 ("perf/x86/rapl: Add arguments to the init and cleanup functions")
    cd29d83a6d81 ("perf/x86/rapl: Make rapl_model struct global")
    8bf1c86e5ac8 ("perf/x86/rapl: Rename rapl_pmu variables")
    1d5e2f637a94 ("perf/x86/rapl: Remove the cpu_to_rapl_pmu() function")
    e4b444347795 ("x86/topology: Introduce topology_logical_core_id()")
    2f2db347071a ("perf/x86/rapl: Remove the unused get_rapl_pmu_cpumask() function")
    ae55e308bde2 ("perf/x86/intel/ds: Simplify the PEBS records processing for adaptive PEBS")
    3c00ed344cef ("perf/x86/intel/ds: Factor out functions for PEBS records processing")
    7087bfb0adc9 ("perf/x86/intel/ds: Clarify adaptive PEBS processing")
    faac6f105ef1 ("perf/core: Check sample_type in perf_sample_save_brstack")
    f226805bc5f6 ("perf/core: Check sample_type in perf_sample_save_callchain")
    b9c44b91476b ("perf/core: Save raw sample data conditionally based on sample type")

Signed-off-by: Ingo Molnar <mingo@kernel.org>
This commit is contained in:
Ingo Molnar 2025-01-11 17:05:00 +01:00
commit 40c78513bc
36 changed files with 958 additions and 499 deletions

View File

@ -135,6 +135,10 @@ Thread-related topology information in the kernel:
The ID of the core to which a thread belongs. It is also printed in /proc/cpuinfo
"core_id."
- topology_logical_core_id();
The logical core ID to which a thread belongs.
System topology examples

View File

@ -981,7 +981,7 @@ static int cfdiag_push_sample(struct perf_event *event,
if (event->attr.sample_type & PERF_SAMPLE_RAW) {
raw.frag.size = cpuhw->usedss;
raw.frag.data = cpuhw->stop;
perf_sample_save_raw_data(&data, &raw);
perf_sample_save_raw_data(&data, event, &raw);
}
overflow = perf_event_overflow(event, &data, &regs);

View File

@ -981,7 +981,7 @@ static void cpumsf_pmu_disable(struct pmu *pmu)
cpuhw->flags &= ~PMU_F_ENABLED;
}
/* perf_exclude_event() - Filter event
/* perf_event_exclude() - Filter event
* @event: The perf event
* @regs: pt_regs structure
* @sde_regs: Sample-data-entry (sde) regs structure
@ -990,7 +990,7 @@ static void cpumsf_pmu_disable(struct pmu *pmu)
*
* Return non-zero if the event shall be excluded.
*/
static int perf_exclude_event(struct perf_event *event, struct pt_regs *regs,
static int perf_event_exclude(struct perf_event *event, struct pt_regs *regs,
struct perf_sf_sde_regs *sde_regs)
{
if (event->attr.exclude_user && user_mode(regs))
@ -1073,7 +1073,7 @@ static int perf_push_sample(struct perf_event *event,
data.tid_entry.pid = basic->hpp & LPP_PID_MASK;
overflow = 0;
if (perf_exclude_event(event, &regs, sde_regs))
if (perf_event_exclude(event, &regs, sde_regs))
goto out;
if (perf_event_overflow(event, &data, &regs)) {
overflow = 1;

View File

@ -478,7 +478,7 @@ static int paicrypt_push_sample(size_t rawsize, struct paicrypt_map *cpump,
if (event->attr.sample_type & PERF_SAMPLE_RAW) {
raw.frag.size = rawsize;
raw.frag.data = cpump->save;
perf_sample_save_raw_data(&data, &raw);
perf_sample_save_raw_data(&data, event, &raw);
}
overflow = perf_event_overflow(event, &data, &regs);

View File

@ -503,7 +503,7 @@ static int paiext_push_sample(size_t rawsize, struct paiext_map *cpump,
if (event->attr.sample_type & PERF_SAMPLE_RAW) {
raw.frag.size = rawsize;
raw.frag.data = cpump->save;
perf_sample_save_raw_data(&data, &raw);
perf_sample_save_raw_data(&data, event, &raw);
}
overflow = perf_event_overflow(event, &data, &regs);

View File

@ -1001,8 +1001,7 @@ static int amd_pmu_v2_handle_irq(struct pt_regs *regs)
if (!x86_perf_event_set_period(event))
continue;
if (has_branch_stack(event))
perf_sample_save_brstack(&data, event, &cpuc->lbr_stack, NULL);
perf_sample_save_brstack(&data, event, &cpuc->lbr_stack, NULL);
if (perf_event_overflow(event, &data, regs))
x86_pmu_stop(event, 0);

View File

@ -31,6 +31,8 @@ static u32 ibs_caps;
#define IBS_FETCH_CONFIG_MASK (IBS_FETCH_RAND_EN | IBS_FETCH_MAX_CNT)
#define IBS_OP_CONFIG_MASK IBS_OP_MAX_CNT
/* attr.config2 */
#define IBS_SW_FILTER_MASK 1
/*
* IBS states:
@ -290,6 +292,16 @@ static int perf_ibs_init(struct perf_event *event)
if (has_branch_stack(event))
return -EOPNOTSUPP;
/* handle exclude_{user,kernel} in the IRQ handler */
if (event->attr.exclude_host || event->attr.exclude_guest ||
event->attr.exclude_idle)
return -EINVAL;
if (!(event->attr.config2 & IBS_SW_FILTER_MASK) &&
(event->attr.exclude_kernel || event->attr.exclude_user ||
event->attr.exclude_hv))
return -EINVAL;
ret = validate_group(event);
if (ret)
return ret;
@ -550,24 +562,14 @@ static struct attribute *attrs_empty[] = {
NULL,
};
static struct attribute_group empty_format_group = {
.name = "format",
.attrs = attrs_empty,
};
static struct attribute_group empty_caps_group = {
.name = "caps",
.attrs = attrs_empty,
};
static const struct attribute_group *empty_attr_groups[] = {
&empty_format_group,
&empty_caps_group,
NULL,
};
PMU_FORMAT_ATTR(rand_en, "config:57");
PMU_FORMAT_ATTR(cnt_ctl, "config:19");
PMU_FORMAT_ATTR(swfilt, "config2:0");
PMU_EVENT_ATTR_STRING(l3missonly, fetch_l3missonly, "config:59");
PMU_EVENT_ATTR_STRING(l3missonly, op_l3missonly, "config:16");
PMU_EVENT_ATTR_STRING(zen4_ibs_extensions, zen4_ibs_extensions, "1");
@ -578,8 +580,9 @@ zen4_ibs_extensions_is_visible(struct kobject *kobj, struct attribute *attr, int
return ibs_caps & IBS_CAPS_ZEN4 ? attr->mode : 0;
}
static struct attribute *rand_en_attrs[] = {
static struct attribute *fetch_attrs[] = {
&format_attr_rand_en.attr,
&format_attr_swfilt.attr,
NULL,
};
@ -593,9 +596,9 @@ static struct attribute *zen4_ibs_extensions_attrs[] = {
NULL,
};
static struct attribute_group group_rand_en = {
static struct attribute_group group_fetch_formats = {
.name = "format",
.attrs = rand_en_attrs,
.attrs = fetch_attrs,
};
static struct attribute_group group_fetch_l3missonly = {
@ -611,7 +614,7 @@ static struct attribute_group group_zen4_ibs_extensions = {
};
static const struct attribute_group *fetch_attr_groups[] = {
&group_rand_en,
&group_fetch_formats,
&empty_caps_group,
NULL,
};
@ -628,6 +631,11 @@ cnt_ctl_is_visible(struct kobject *kobj, struct attribute *attr, int i)
return ibs_caps & IBS_CAPS_OPCNT ? attr->mode : 0;
}
static struct attribute *op_attrs[] = {
&format_attr_swfilt.attr,
NULL,
};
static struct attribute *cnt_ctl_attrs[] = {
&format_attr_cnt_ctl.attr,
NULL,
@ -638,6 +646,11 @@ static struct attribute *op_l3missonly_attrs[] = {
NULL,
};
static struct attribute_group group_op_formats = {
.name = "format",
.attrs = op_attrs,
};
static struct attribute_group group_cnt_ctl = {
.name = "format",
.attrs = cnt_ctl_attrs,
@ -650,6 +663,12 @@ static struct attribute_group group_op_l3missonly = {
.is_visible = zen4_ibs_extensions_is_visible,
};
static const struct attribute_group *op_attr_groups[] = {
&group_op_formats,
&empty_caps_group,
NULL,
};
static const struct attribute_group *op_attr_update[] = {
&group_cnt_ctl,
&group_op_l3missonly,
@ -667,7 +686,6 @@ static struct perf_ibs perf_ibs_fetch = {
.start = perf_ibs_start,
.stop = perf_ibs_stop,
.read = perf_ibs_read,
.capabilities = PERF_PMU_CAP_NO_EXCLUDE,
},
.msr = MSR_AMD64_IBSFETCHCTL,
.config_mask = IBS_FETCH_CONFIG_MASK,
@ -691,7 +709,6 @@ static struct perf_ibs perf_ibs_op = {
.start = perf_ibs_start,
.stop = perf_ibs_stop,
.read = perf_ibs_read,
.capabilities = PERF_PMU_CAP_NO_EXCLUDE,
},
.msr = MSR_AMD64_IBSOPCTL,
.config_mask = IBS_OP_CONFIG_MASK,
@ -1111,6 +1128,12 @@ fail:
regs.flags |= PERF_EFLAGS_EXACT;
}
if ((event->attr.config2 & IBS_SW_FILTER_MASK) &&
perf_exclude_event(event, &regs)) {
throttle = perf_event_account_interrupt(event);
goto out;
}
if (event->attr.sample_type & PERF_SAMPLE_RAW) {
raw = (struct perf_raw_record){
.frag = {
@ -1118,7 +1141,7 @@ fail:
.data = ibs_data.data,
},
};
perf_sample_save_raw_data(&data, &raw);
perf_sample_save_raw_data(&data, event, &raw);
}
if (perf_ibs == &perf_ibs_op)
@ -1129,8 +1152,7 @@ fail:
* recorded as part of interrupt regs. Thus we need to use rip from
* interrupt regs while unwinding call stack.
*/
if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
perf_sample_save_callchain(&data, event, iregs);
perf_sample_save_callchain(&data, event, iregs);
throttle = perf_event_overflow(event, &data, &regs);
out:
@ -1228,7 +1250,7 @@ static __init int perf_ibs_op_init(void)
if (ibs_caps & IBS_CAPS_ZEN4)
perf_ibs_op.config_mask |= IBS_OP_L3MISSONLY;
perf_ibs_op.pmu.attr_groups = empty_attr_groups;
perf_ibs_op.pmu.attr_groups = op_attr_groups;
perf_ibs_op.pmu.attr_update = op_attr_update;
return perf_ibs_pmu_init(&perf_ibs_op, "ibs_op");

View File

@ -1707,8 +1707,7 @@ int x86_pmu_handle_irq(struct pt_regs *regs)
perf_sample_data_init(&data, 0, event->hw.last_period);
if (has_branch_stack(event))
perf_sample_save_brstack(&data, event, &cpuc->lbr_stack, NULL);
perf_sample_save_brstack(&data, event, &cpuc->lbr_stack, NULL);
if (perf_event_overflow(event, &data, regs))
x86_pmu_stop(event, 0);

View File

@ -2826,6 +2826,9 @@ static void intel_pmu_enable_fixed(struct perf_event *event)
return;
idx = INTEL_PMC_IDX_FIXED_SLOTS;
if (event->attr.config1 & INTEL_TD_CFG_METRIC_CLEAR)
bits |= INTEL_FIXED_3_METRICS_CLEAR;
}
intel_set_masks(event, idx);
@ -4081,7 +4084,12 @@ static int intel_pmu_hw_config(struct perf_event *event)
* is used in a metrics group, it too cannot support sampling.
*/
if (intel_pmu_has_cap(event, PERF_CAP_METRICS_IDX) && is_topdown_event(event)) {
if (event->attr.config1 || event->attr.config2)
/* The metrics_clear can only be set for the slots event */
if (event->attr.config1 &&
(!is_slots_event(event) || (event->attr.config1 & ~INTEL_TD_CFG_METRIC_CLEAR)))
return -EINVAL;
if (event->attr.config2)
return -EINVAL;
/*
@ -4690,6 +4698,8 @@ PMU_FORMAT_ATTR(in_tx, "config:32" );
PMU_FORMAT_ATTR(in_tx_cp, "config:33" );
PMU_FORMAT_ATTR(eq, "config:36" ); /* v6 + */
PMU_FORMAT_ATTR(metrics_clear, "config1:0"); /* PERF_CAPABILITIES.RDPMC_METRICS_CLEAR */
static ssize_t umask2_show(struct device *dev,
struct device_attribute *attr,
char *page)
@ -4709,6 +4719,7 @@ static struct device_attribute format_attr_umask2 =
static struct attribute *format_evtsel_ext_attrs[] = {
&format_attr_umask2.attr,
&format_attr_eq.attr,
&format_attr_metrics_clear.attr,
NULL
};
@ -4733,6 +4744,13 @@ evtsel_ext_is_visible(struct kobject *kobj, struct attribute *attr, int i)
if (i == 1)
return (mask & ARCH_PERFMON_EVENTSEL_EQ) ? attr->mode : 0;
/* PERF_CAPABILITIES.RDPMC_METRICS_CLEAR */
if (i == 2) {
union perf_capabilities intel_cap = hybrid(dev_get_drvdata(dev), intel_cap);
return intel_cap.rdpmc_metrics_clear ? attr->mode : 0;
}
return 0;
}

View File

@ -1789,8 +1789,7 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event,
* previous PMI context or an (I)RET happened between the record and
* PMI.
*/
if (sample_type & PERF_SAMPLE_CALLCHAIN)
perf_sample_save_callchain(data, event, iregs);
perf_sample_save_callchain(data, event, iregs);
/*
* We use the interrupt regs as a base because the PEBS record does not
@ -1889,8 +1888,7 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event,
if (x86_pmu.intel_cap.pebs_format >= 3)
setup_pebs_time(event, data, pebs->tsc);
if (has_branch_stack(event))
perf_sample_save_brstack(data, event, &cpuc->lbr_stack, NULL);
perf_sample_save_brstack(data, event, &cpuc->lbr_stack, NULL);
}
static void adaptive_pebs_save_regs(struct pt_regs *regs,
@ -1917,8 +1915,6 @@ static void adaptive_pebs_save_regs(struct pt_regs *regs,
}
#define PEBS_LATENCY_MASK 0xffff
#define PEBS_CACHE_LATENCY_OFFSET 32
#define PEBS_RETIRE_LATENCY_OFFSET 32
/*
* With adaptive PEBS the layout depends on what fields are configured.
@ -1932,8 +1928,7 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct pebs_basic *basic = __pebs;
void *next_record = basic + 1;
u64 sample_type;
u64 format_size;
u64 sample_type, format_group;
struct pebs_meminfo *meminfo = NULL;
struct pebs_gprs *gprs = NULL;
struct x86_perf_regs *perf_regs;
@ -1945,7 +1940,7 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
perf_regs->xmm_regs = NULL;
sample_type = event->attr.sample_type;
format_size = basic->format_size;
format_group = basic->format_group;
perf_sample_data_init(data, 0, event->hw.last_period);
data->period = event->hw.last_period;
@ -1957,8 +1952,7 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
* previous PMI context or an (I)RET happened between the record and
* PMI.
*/
if (sample_type & PERF_SAMPLE_CALLCHAIN)
perf_sample_save_callchain(data, event, iregs);
perf_sample_save_callchain(data, event, iregs);
*regs = *iregs;
/* The ip in basic is EventingIP */
@ -1967,7 +1961,7 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
if (sample_type & PERF_SAMPLE_WEIGHT_STRUCT) {
if (x86_pmu.flags & PMU_FL_RETIRE_LATENCY)
data->weight.var3_w = format_size >> PEBS_RETIRE_LATENCY_OFFSET & PEBS_LATENCY_MASK;
data->weight.var3_w = basic->retire_latency;
else
data->weight.var3_w = 0;
}
@ -1977,12 +1971,12 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
* But PERF_SAMPLE_TRANSACTION needs gprs->ax.
* Save the pointer here but process later.
*/
if (format_size & PEBS_DATACFG_MEMINFO) {
if (format_group & PEBS_DATACFG_MEMINFO) {
meminfo = next_record;
next_record = meminfo + 1;
}
if (format_size & PEBS_DATACFG_GP) {
if (format_group & PEBS_DATACFG_GP) {
gprs = next_record;
next_record = gprs + 1;
@ -1995,14 +1989,13 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
adaptive_pebs_save_regs(regs, gprs);
}
if (format_size & PEBS_DATACFG_MEMINFO) {
if (format_group & PEBS_DATACFG_MEMINFO) {
if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) {
u64 weight = meminfo->latency;
u64 latency = x86_pmu.flags & PMU_FL_INSTR_LATENCY ?
meminfo->cache_latency : meminfo->mem_latency;
if (x86_pmu.flags & PMU_FL_INSTR_LATENCY) {
data->weight.var2_w = weight & PEBS_LATENCY_MASK;
weight >>= PEBS_CACHE_LATENCY_OFFSET;
}
if (x86_pmu.flags & PMU_FL_INSTR_LATENCY)
data->weight.var2_w = meminfo->instr_latency;
/*
* Although meminfo::latency is defined as a u64,
@ -2010,12 +2003,13 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
* in practice on Ice Lake and earlier platforms.
*/
if (sample_type & PERF_SAMPLE_WEIGHT) {
data->weight.full = weight ?:
data->weight.full = latency ?:
intel_get_tsx_weight(meminfo->tsx_tuning);
} else {
data->weight.var1_dw = (u32)(weight & PEBS_LATENCY_MASK) ?:
data->weight.var1_dw = (u32)latency ?:
intel_get_tsx_weight(meminfo->tsx_tuning);
}
data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE;
}
@ -2036,16 +2030,16 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
}
}
if (format_size & PEBS_DATACFG_XMMS) {
if (format_group & PEBS_DATACFG_XMMS) {
struct pebs_xmm *xmm = next_record;
next_record = xmm + 1;
perf_regs->xmm_regs = xmm->xmm;
}
if (format_size & PEBS_DATACFG_LBRS) {
if (format_group & PEBS_DATACFG_LBRS) {
struct lbr_entry *lbr = next_record;
int num_lbr = ((format_size >> PEBS_DATACFG_LBR_SHIFT)
int num_lbr = ((format_group >> PEBS_DATACFG_LBR_SHIFT)
& 0xff) + 1;
next_record = next_record + num_lbr * sizeof(struct lbr_entry);
@ -2055,11 +2049,11 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
}
}
WARN_ONCE(next_record != __pebs + (format_size >> 48),
"PEBS record size %llu, expected %llu, config %llx\n",
format_size >> 48,
WARN_ONCE(next_record != __pebs + basic->format_size,
"PEBS record size %u, expected %llu, config %llx\n",
basic->format_size,
(u64)(next_record - __pebs),
basic->format_size);
format_group);
}
static inline void *
@ -2170,46 +2164,33 @@ intel_pmu_save_and_restart_reload(struct perf_event *event, int count)
return 0;
}
typedef void (*setup_fn)(struct perf_event *, struct pt_regs *, void *,
struct perf_sample_data *, struct pt_regs *);
static struct pt_regs dummy_iregs;
static __always_inline void
__intel_pmu_pebs_event(struct perf_event *event,
struct pt_regs *iregs,
struct pt_regs *regs,
struct perf_sample_data *data,
void *base, void *top,
int bit, int count,
void (*setup_sample)(struct perf_event *,
struct pt_regs *,
void *,
struct perf_sample_data *,
struct pt_regs *))
void *at,
setup_fn setup_sample)
{
setup_sample(event, iregs, at, data, regs);
perf_event_output(event, data, regs);
}
static __always_inline void
__intel_pmu_pebs_last_event(struct perf_event *event,
struct pt_regs *iregs,
struct pt_regs *regs,
struct perf_sample_data *data,
void *at,
int count,
setup_fn setup_sample)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct hw_perf_event *hwc = &event->hw;
struct x86_perf_regs perf_regs;
struct pt_regs *regs = &perf_regs.regs;
void *at = get_next_pebs_record_by_bit(base, top, bit);
static struct pt_regs dummy_iregs;
if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) {
/*
* Now, auto-reload is only enabled in fixed period mode.
* The reload value is always hwc->sample_period.
* May need to change it, if auto-reload is enabled in
* freq mode later.
*/
intel_pmu_save_and_restart_reload(event, count);
} else if (!intel_pmu_save_and_restart(event))
return;
if (!iregs)
iregs = &dummy_iregs;
while (count > 1) {
setup_sample(event, iregs, at, data, regs);
perf_event_output(event, data, regs);
at += cpuc->pebs_record_size;
at = get_next_pebs_record_by_bit(at, top, bit);
count--;
}
setup_sample(event, iregs, at, data, regs);
if (iregs == &dummy_iregs) {
@ -2228,6 +2209,44 @@ __intel_pmu_pebs_event(struct perf_event *event,
if (perf_event_overflow(event, data, regs))
x86_pmu_stop(event, 0);
}
if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) {
/*
* Now, auto-reload is only enabled in fixed period mode.
* The reload value is always hwc->sample_period.
* May need to change it, if auto-reload is enabled in
* freq mode later.
*/
intel_pmu_save_and_restart_reload(event, count);
} else
intel_pmu_save_and_restart(event);
}
static __always_inline void
__intel_pmu_pebs_events(struct perf_event *event,
struct pt_regs *iregs,
struct perf_sample_data *data,
void *base, void *top,
int bit, int count,
setup_fn setup_sample)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct x86_perf_regs perf_regs;
struct pt_regs *regs = &perf_regs.regs;
void *at = get_next_pebs_record_by_bit(base, top, bit);
int cnt = count;
if (!iregs)
iregs = &dummy_iregs;
while (cnt > 1) {
__intel_pmu_pebs_event(event, iregs, regs, data, at, setup_sample);
at += cpuc->pebs_record_size;
at = get_next_pebs_record_by_bit(at, top, bit);
cnt--;
}
__intel_pmu_pebs_last_event(event, iregs, regs, data, at, count, setup_sample);
}
static void intel_pmu_drain_pebs_core(struct pt_regs *iregs, struct perf_sample_data *data)
@ -2264,8 +2283,8 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs, struct perf_sample_
return;
}
__intel_pmu_pebs_event(event, iregs, data, at, top, 0, n,
setup_pebs_fixed_sample_data);
__intel_pmu_pebs_events(event, iregs, data, at, top, 0, n,
setup_pebs_fixed_sample_data);
}
static void intel_pmu_pebs_event_update_no_drain(struct cpu_hw_events *cpuc, int size)
@ -2396,9 +2415,9 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, struct perf_sample_d
}
if (counts[bit]) {
__intel_pmu_pebs_event(event, iregs, data, base,
top, bit, counts[bit],
setup_pebs_fixed_sample_data);
__intel_pmu_pebs_events(event, iregs, data, base,
top, bit, counts[bit],
setup_pebs_fixed_sample_data);
}
}
}
@ -2406,8 +2425,12 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, struct perf_sample_d
static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_data *data)
{
short counts[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS] = {};
void *last[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS];
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct debug_store *ds = cpuc->ds;
struct x86_perf_regs perf_regs;
struct pt_regs *regs = &perf_regs.regs;
struct pebs_basic *basic;
struct perf_event *event;
void *base, *at, *top;
int bit;
@ -2429,30 +2452,41 @@ static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_d
return;
}
for (at = base; at < top; at += cpuc->pebs_record_size) {
if (!iregs)
iregs = &dummy_iregs;
/* Process all but the last event for each counter. */
for (at = base; at < top; at += basic->format_size) {
u64 pebs_status;
pebs_status = get_pebs_status(at) & cpuc->pebs_enabled;
pebs_status &= mask;
basic = at;
if (basic->format_size != cpuc->pebs_record_size)
continue;
for_each_set_bit(bit, (unsigned long *)&pebs_status, X86_PMC_IDX_MAX)
counts[bit]++;
pebs_status = basic->applicable_counters & cpuc->pebs_enabled & mask;
for_each_set_bit(bit, (unsigned long *)&pebs_status, X86_PMC_IDX_MAX) {
event = cpuc->events[bit];
if (WARN_ON_ONCE(!event) ||
WARN_ON_ONCE(!event->attr.precise_ip))
continue;
if (counts[bit]++) {
__intel_pmu_pebs_event(event, iregs, regs, data, last[bit],
setup_pebs_adaptive_sample_data);
}
last[bit] = at;
}
}
for_each_set_bit(bit, (unsigned long *)&mask, X86_PMC_IDX_MAX) {
if (counts[bit] == 0)
if (!counts[bit])
continue;
event = cpuc->events[bit];
if (WARN_ON_ONCE(!event))
continue;
if (WARN_ON_ONCE(!event->attr.precise_ip))
continue;
__intel_pmu_pebs_event(event, iregs, data, base,
top, bit, counts[bit],
setup_pebs_adaptive_sample_data);
__intel_pmu_pebs_last_event(event, iregs, regs, data, last[bit],
counts[bit], setup_pebs_adaptive_sample_data);
}
}

View File

@ -745,7 +745,7 @@ static int uncore_pmu_event_init(struct perf_event *event)
pmu = uncore_event_to_pmu(event);
/* no device found for this pmu */
if (pmu->func_id < 0)
if (!pmu->registered)
return -ENOENT;
/* Sampling not supported yet */
@ -992,7 +992,7 @@ static void uncore_types_exit(struct intel_uncore_type **types)
uncore_type_exit(*types);
}
static int __init uncore_type_init(struct intel_uncore_type *type, bool setid)
static int __init uncore_type_init(struct intel_uncore_type *type)
{
struct intel_uncore_pmu *pmus;
size_t size;
@ -1005,7 +1005,6 @@ static int __init uncore_type_init(struct intel_uncore_type *type, bool setid)
size = uncore_max_dies() * sizeof(struct intel_uncore_box *);
for (i = 0; i < type->num_boxes; i++) {
pmus[i].func_id = setid ? i : -1;
pmus[i].pmu_idx = i;
pmus[i].type = type;
pmus[i].boxes = kzalloc(size, GFP_KERNEL);
@ -1055,12 +1054,12 @@ err:
}
static int __init
uncore_types_init(struct intel_uncore_type **types, bool setid)
uncore_types_init(struct intel_uncore_type **types)
{
int ret;
for (; *types; types++) {
ret = uncore_type_init(*types, setid);
ret = uncore_type_init(*types);
if (ret)
return ret;
}
@ -1160,11 +1159,6 @@ static int uncore_pci_pmu_register(struct pci_dev *pdev,
if (!box)
return -ENOMEM;
if (pmu->func_id < 0)
pmu->func_id = pdev->devfn;
else
WARN_ON_ONCE(pmu->func_id != pdev->devfn);
atomic_inc(&box->refcnt);
box->dieid = die;
box->pci_dev = pdev;
@ -1410,7 +1404,7 @@ static int __init uncore_pci_init(void)
goto err;
}
ret = uncore_types_init(uncore_pci_uncores, false);
ret = uncore_types_init(uncore_pci_uncores);
if (ret)
goto errtype;
@ -1678,7 +1672,7 @@ static int __init uncore_cpu_init(void)
{
int ret;
ret = uncore_types_init(uncore_msr_uncores, true);
ret = uncore_types_init(uncore_msr_uncores);
if (ret)
goto err;
@ -1697,7 +1691,7 @@ static int __init uncore_mmio_init(void)
struct intel_uncore_type **types = uncore_mmio_uncores;
int ret;
ret = uncore_types_init(types, true);
ret = uncore_types_init(types);
if (ret)
goto err;

View File

@ -125,7 +125,6 @@ struct intel_uncore_pmu {
struct pmu pmu;
char name[UNCORE_PMU_NAME_LEN];
int pmu_idx;
int func_id;
bool registered;
atomic_t activeboxes;
cpumask_t cpu_mask;

View File

@ -910,7 +910,7 @@ static int snb_uncore_imc_event_init(struct perf_event *event)
pmu = uncore_event_to_pmu(event);
/* no device found for this pmu */
if (pmu->func_id < 0)
if (!pmu->registered)
return -ENOENT;
/* Sampling not supported yet */

View File

@ -6684,17 +6684,8 @@ void spr_uncore_mmio_init(void)
/* GNR uncore support */
#define UNCORE_GNR_NUM_UNCORE_TYPES 23
#define UNCORE_GNR_TYPE_15 15
#define UNCORE_GNR_B2UPI 18
#define UNCORE_GNR_TYPE_21 21
#define UNCORE_GNR_TYPE_22 22
int gnr_uncore_units_ignore[] = {
UNCORE_SPR_UPI,
UNCORE_GNR_TYPE_15,
UNCORE_GNR_B2UPI,
UNCORE_GNR_TYPE_21,
UNCORE_GNR_TYPE_22,
UNCORE_IGNORE_END
};
@ -6703,6 +6694,31 @@ static struct intel_uncore_type gnr_uncore_ubox = {
.attr_update = uncore_alias_groups,
};
static struct intel_uncore_type gnr_uncore_pciex8 = {
SPR_UNCORE_PCI_COMMON_FORMAT(),
.name = "pciex8",
};
static struct intel_uncore_type gnr_uncore_pciex16 = {
SPR_UNCORE_PCI_COMMON_FORMAT(),
.name = "pciex16",
};
static struct intel_uncore_type gnr_uncore_upi = {
SPR_UNCORE_PCI_COMMON_FORMAT(),
.name = "upi",
};
static struct intel_uncore_type gnr_uncore_b2upi = {
SPR_UNCORE_PCI_COMMON_FORMAT(),
.name = "b2upi",
};
static struct intel_uncore_type gnr_uncore_b2hot = {
.name = "b2hot",
.attr_update = uncore_alias_groups,
};
static struct intel_uncore_type gnr_uncore_b2cmi = {
SPR_UNCORE_PCI_COMMON_FORMAT(),
.name = "b2cmi",
@ -6727,21 +6743,21 @@ static struct intel_uncore_type *gnr_uncores[UNCORE_GNR_NUM_UNCORE_TYPES] = {
&gnr_uncore_ubox,
&spr_uncore_imc,
NULL,
&gnr_uncore_upi,
NULL,
NULL,
NULL,
&spr_uncore_cxlcm,
&spr_uncore_cxldp,
NULL,
NULL,
NULL,
NULL,
NULL,
&gnr_uncore_b2hot,
&gnr_uncore_b2cmi,
&gnr_uncore_b2cxl,
NULL,
&gnr_uncore_b2upi,
NULL,
&gnr_uncore_mdf_sbo,
NULL,
NULL,
&gnr_uncore_pciex16,
&gnr_uncore_pciex8,
};
static struct freerunning_counters gnr_iio_freerunning[] = {

View File

@ -624,6 +624,7 @@ union perf_capabilities {
u64 pebs_output_pt_available:1;
u64 pebs_timing_info:1;
u64 anythread_deprecated:1;
u64 rdpmc_metrics_clear:1;
};
u64 capabilities;
};

View File

@ -39,6 +39,10 @@
* event: rapl_energy_psys
* perf code: 0x5
*
* core counter: consumption of a single physical core
* event: rapl_energy_core (power_core PMU)
* perf code: 0x1
*
* We manage those counters as free running (read-only). They may be
* use simultaneously by other tools, such as turbostat.
*
@ -70,18 +74,22 @@ MODULE_LICENSE("GPL");
/*
* RAPL energy status counters
*/
enum perf_rapl_events {
enum perf_rapl_pkg_events {
PERF_RAPL_PP0 = 0, /* all cores */
PERF_RAPL_PKG, /* entire package */
PERF_RAPL_RAM, /* DRAM */
PERF_RAPL_PP1, /* gpu */
PERF_RAPL_PSYS, /* psys */
PERF_RAPL_MAX,
NR_RAPL_DOMAINS = PERF_RAPL_MAX,
PERF_RAPL_PKG_EVENTS_MAX,
NR_RAPL_PKG_DOMAINS = PERF_RAPL_PKG_EVENTS_MAX,
};
static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = {
#define PERF_RAPL_CORE 0 /* single core */
#define PERF_RAPL_CORE_EVENTS_MAX 1
#define NR_RAPL_CORE_DOMAINS PERF_RAPL_CORE_EVENTS_MAX
static const char *const rapl_pkg_domain_names[NR_RAPL_PKG_DOMAINS] __initconst = {
"pp0-core",
"package",
"dram",
@ -89,6 +97,8 @@ static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = {
"psys",
};
static const char *const rapl_core_domain_name __initconst = "core";
/*
* event code: LSB 8 bits, passed in attr->config
* any other bit is reserved
@ -112,7 +122,7 @@ static struct perf_pmu_events_attr event_attr_##v = { \
* considered as either pkg-scope or die-scope, and we are considering
* them as die-scope.
*/
#define rapl_pmu_is_pkg_scope() \
#define rapl_pkg_pmu_is_pkg_scope() \
(boot_cpu_data.x86_vendor == X86_VENDOR_AMD || \
boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)
@ -129,7 +139,8 @@ struct rapl_pmu {
struct rapl_pmus {
struct pmu pmu;
unsigned int nr_rapl_pmu;
struct rapl_pmu *pmus[] __counted_by(nr_rapl_pmu);
unsigned int cntr_mask;
struct rapl_pmu *rapl_pmu[] __counted_by(nr_rapl_pmu);
};
enum rapl_unit_quirk {
@ -139,44 +150,43 @@ enum rapl_unit_quirk {
};
struct rapl_model {
struct perf_msr *rapl_msrs;
unsigned long events;
struct perf_msr *rapl_pkg_msrs;
struct perf_msr *rapl_core_msrs;
unsigned long pkg_events;
unsigned long core_events;
unsigned int msr_power_unit;
enum rapl_unit_quirk unit_quirk;
};
/* 1/2^hw_unit Joule */
static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly;
static struct rapl_pmus *rapl_pmus;
static unsigned int rapl_cntr_mask;
static int rapl_pkg_hw_unit[NR_RAPL_PKG_DOMAINS] __read_mostly;
static int rapl_core_hw_unit __read_mostly;
static struct rapl_pmus *rapl_pmus_pkg;
static struct rapl_pmus *rapl_pmus_core;
static u64 rapl_timer_ms;
static struct perf_msr *rapl_msrs;
static struct rapl_model *rapl_model;
/*
* Helper functions to get the correct topology macros according to the
* Helper function to get the correct topology id according to the
* RAPL PMU scope.
*/
static inline unsigned int get_rapl_pmu_idx(int cpu)
static inline unsigned int get_rapl_pmu_idx(int cpu, int scope)
{
return rapl_pmu_is_pkg_scope() ? topology_logical_package_id(cpu) :
topology_logical_die_id(cpu);
}
static inline const struct cpumask *get_rapl_pmu_cpumask(int cpu)
{
return rapl_pmu_is_pkg_scope() ? topology_core_cpumask(cpu) :
topology_die_cpumask(cpu);
}
static inline struct rapl_pmu *cpu_to_rapl_pmu(unsigned int cpu)
{
unsigned int rapl_pmu_idx = get_rapl_pmu_idx(cpu);
/*
* The unsigned check also catches the '-1' return value for non
* existent mappings in the topology map.
* Returns unsigned int, which converts the '-1' return value
* (for non-existent mappings in topology map) to UINT_MAX, so
* the error check in the caller is simplified.
*/
return rapl_pmu_idx < rapl_pmus->nr_rapl_pmu ? rapl_pmus->pmus[rapl_pmu_idx] : NULL;
switch (scope) {
case PERF_PMU_SCOPE_PKG:
return topology_logical_package_id(cpu);
case PERF_PMU_SCOPE_DIE:
return topology_logical_die_id(cpu);
case PERF_PMU_SCOPE_CORE:
return topology_logical_core_id(cpu);
default:
return -EINVAL;
}
}
static inline u64 rapl_read_counter(struct perf_event *event)
@ -186,19 +196,20 @@ static inline u64 rapl_read_counter(struct perf_event *event)
return raw;
}
static inline u64 rapl_scale(u64 v, int cfg)
static inline u64 rapl_scale(u64 v, struct perf_event *event)
{
if (cfg > NR_RAPL_DOMAINS) {
pr_warn("Invalid domain %d, failed to scale data\n", cfg);
return v;
}
int hw_unit = rapl_pkg_hw_unit[event->hw.config - 1];
if (event->pmu->scope == PERF_PMU_SCOPE_CORE)
hw_unit = rapl_core_hw_unit;
/*
* scale delta to smallest unit (1/2^32)
* users must then scale back: count * 1/(1e9*2^32) to get Joules
* or use ldexp(count, -32).
* Watts = Joules/Time delta
*/
return v << (32 - rapl_hw_unit[cfg - 1]);
return v << (32 - hw_unit);
}
static u64 rapl_event_update(struct perf_event *event)
@ -225,7 +236,7 @@ static u64 rapl_event_update(struct perf_event *event)
delta = (new_raw_count << shift) - (prev_raw_count << shift);
delta >>= shift;
sdelta = rapl_scale(delta, event->hw.config);
sdelta = rapl_scale(delta, event);
local64_add(sdelta, &event->count);
@ -240,34 +251,34 @@ static void rapl_start_hrtimer(struct rapl_pmu *pmu)
static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer)
{
struct rapl_pmu *pmu = container_of(hrtimer, struct rapl_pmu, hrtimer);
struct rapl_pmu *rapl_pmu = container_of(hrtimer, struct rapl_pmu, hrtimer);
struct perf_event *event;
unsigned long flags;
if (!pmu->n_active)
if (!rapl_pmu->n_active)
return HRTIMER_NORESTART;
raw_spin_lock_irqsave(&pmu->lock, flags);
raw_spin_lock_irqsave(&rapl_pmu->lock, flags);
list_for_each_entry(event, &pmu->active_list, active_entry)
list_for_each_entry(event, &rapl_pmu->active_list, active_entry)
rapl_event_update(event);
raw_spin_unlock_irqrestore(&pmu->lock, flags);
raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags);
hrtimer_forward_now(hrtimer, pmu->timer_interval);
hrtimer_forward_now(hrtimer, rapl_pmu->timer_interval);
return HRTIMER_RESTART;
}
static void rapl_hrtimer_init(struct rapl_pmu *pmu)
static void rapl_hrtimer_init(struct rapl_pmu *rapl_pmu)
{
struct hrtimer *hr = &pmu->hrtimer;
struct hrtimer *hr = &rapl_pmu->hrtimer;
hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
hr->function = rapl_hrtimer_handle;
}
static void __rapl_pmu_event_start(struct rapl_pmu *pmu,
static void __rapl_pmu_event_start(struct rapl_pmu *rapl_pmu,
struct perf_event *event)
{
if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
@ -275,39 +286,39 @@ static void __rapl_pmu_event_start(struct rapl_pmu *pmu,
event->hw.state = 0;
list_add_tail(&event->active_entry, &pmu->active_list);
list_add_tail(&event->active_entry, &rapl_pmu->active_list);
local64_set(&event->hw.prev_count, rapl_read_counter(event));
pmu->n_active++;
if (pmu->n_active == 1)
rapl_start_hrtimer(pmu);
rapl_pmu->n_active++;
if (rapl_pmu->n_active == 1)
rapl_start_hrtimer(rapl_pmu);
}
static void rapl_pmu_event_start(struct perf_event *event, int mode)
{
struct rapl_pmu *pmu = event->pmu_private;
struct rapl_pmu *rapl_pmu = event->pmu_private;
unsigned long flags;
raw_spin_lock_irqsave(&pmu->lock, flags);
__rapl_pmu_event_start(pmu, event);
raw_spin_unlock_irqrestore(&pmu->lock, flags);
raw_spin_lock_irqsave(&rapl_pmu->lock, flags);
__rapl_pmu_event_start(rapl_pmu, event);
raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags);
}
static void rapl_pmu_event_stop(struct perf_event *event, int mode)
{
struct rapl_pmu *pmu = event->pmu_private;
struct rapl_pmu *rapl_pmu = event->pmu_private;
struct hw_perf_event *hwc = &event->hw;
unsigned long flags;
raw_spin_lock_irqsave(&pmu->lock, flags);
raw_spin_lock_irqsave(&rapl_pmu->lock, flags);
/* mark event as deactivated and stopped */
if (!(hwc->state & PERF_HES_STOPPED)) {
WARN_ON_ONCE(pmu->n_active <= 0);
pmu->n_active--;
if (pmu->n_active == 0)
hrtimer_cancel(&pmu->hrtimer);
WARN_ON_ONCE(rapl_pmu->n_active <= 0);
rapl_pmu->n_active--;
if (rapl_pmu->n_active == 0)
hrtimer_cancel(&rapl_pmu->hrtimer);
list_del(&event->active_entry);
@ -325,23 +336,23 @@ static void rapl_pmu_event_stop(struct perf_event *event, int mode)
hwc->state |= PERF_HES_UPTODATE;
}
raw_spin_unlock_irqrestore(&pmu->lock, flags);
raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags);
}
static int rapl_pmu_event_add(struct perf_event *event, int mode)
{
struct rapl_pmu *pmu = event->pmu_private;
struct rapl_pmu *rapl_pmu = event->pmu_private;
struct hw_perf_event *hwc = &event->hw;
unsigned long flags;
raw_spin_lock_irqsave(&pmu->lock, flags);
raw_spin_lock_irqsave(&rapl_pmu->lock, flags);
hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
if (mode & PERF_EF_START)
__rapl_pmu_event_start(pmu, event);
__rapl_pmu_event_start(rapl_pmu, event);
raw_spin_unlock_irqrestore(&pmu->lock, flags);
raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags);
return 0;
}
@ -354,12 +365,14 @@ static void rapl_pmu_event_del(struct perf_event *event, int flags)
static int rapl_pmu_event_init(struct perf_event *event)
{
u64 cfg = event->attr.config & RAPL_EVENT_MASK;
int bit, ret = 0;
struct rapl_pmu *pmu;
int bit, rapl_pmus_scope, ret = 0;
struct rapl_pmu *rapl_pmu;
unsigned int rapl_pmu_idx;
struct rapl_pmus *rapl_pmus;
/* only look at RAPL events */
if (event->attr.type != rapl_pmus->pmu.type)
return -ENOENT;
/* unsupported modes and filters */
if (event->attr.sample_period) /* no sampling */
return -EINVAL;
/* check only supported bits are set */
if (event->attr.config & ~RAPL_EVENT_MASK)
@ -368,26 +381,49 @@ static int rapl_pmu_event_init(struct perf_event *event)
if (event->cpu < 0)
return -EINVAL;
if (!cfg || cfg >= NR_RAPL_DOMAINS + 1)
rapl_pmus = container_of(event->pmu, struct rapl_pmus, pmu);
if (!rapl_pmus)
return -EINVAL;
rapl_pmus_scope = rapl_pmus->pmu.scope;
cfg = array_index_nospec((long)cfg, NR_RAPL_DOMAINS + 1);
bit = cfg - 1;
if (rapl_pmus_scope == PERF_PMU_SCOPE_PKG || rapl_pmus_scope == PERF_PMU_SCOPE_DIE) {
/* only look at RAPL package events */
if (event->attr.type != rapl_pmus_pkg->pmu.type)
return -ENOENT;
cfg = array_index_nospec((long)cfg, NR_RAPL_PKG_DOMAINS + 1);
if (!cfg || cfg >= NR_RAPL_PKG_DOMAINS + 1)
return -EINVAL;
bit = cfg - 1;
event->hw.event_base = rapl_model->rapl_pkg_msrs[bit].msr;
} else if (rapl_pmus_scope == PERF_PMU_SCOPE_CORE) {
/* only look at RAPL core events */
if (event->attr.type != rapl_pmus_core->pmu.type)
return -ENOENT;
cfg = array_index_nospec((long)cfg, NR_RAPL_CORE_DOMAINS + 1);
if (!cfg || cfg >= NR_RAPL_PKG_DOMAINS + 1)
return -EINVAL;
bit = cfg - 1;
event->hw.event_base = rapl_model->rapl_core_msrs[bit].msr;
} else
return -EINVAL;
/* check event supported */
if (!(rapl_cntr_mask & (1 << bit)))
if (!(rapl_pmus->cntr_mask & (1 << bit)))
return -EINVAL;
/* unsupported modes and filters */
if (event->attr.sample_period) /* no sampling */
rapl_pmu_idx = get_rapl_pmu_idx(event->cpu, rapl_pmus_scope);
if (rapl_pmu_idx >= rapl_pmus->nr_rapl_pmu)
return -EINVAL;
/* must be done before validate_group */
pmu = cpu_to_rapl_pmu(event->cpu);
if (!pmu)
rapl_pmu = rapl_pmus->rapl_pmu[rapl_pmu_idx];
if (!rapl_pmu)
return -EINVAL;
event->pmu_private = pmu;
event->hw.event_base = rapl_msrs[bit].msr;
event->pmu_private = rapl_pmu;
event->hw.config = cfg;
event->hw.idx = bit;
@ -404,12 +440,14 @@ RAPL_EVENT_ATTR_STR(energy-pkg , rapl_pkg, "event=0x02");
RAPL_EVENT_ATTR_STR(energy-ram , rapl_ram, "event=0x03");
RAPL_EVENT_ATTR_STR(energy-gpu , rapl_gpu, "event=0x04");
RAPL_EVENT_ATTR_STR(energy-psys, rapl_psys, "event=0x05");
RAPL_EVENT_ATTR_STR(energy-core, rapl_core, "event=0x01");
RAPL_EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-pkg.unit , rapl_pkg_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-ram.unit , rapl_ram_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-gpu.unit , rapl_gpu_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-psys.unit, rapl_psys_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-core.unit, rapl_core_unit, "Joules");
/*
* we compute in 0.23 nJ increments regardless of MSR
@ -419,6 +457,7 @@ RAPL_EVENT_ATTR_STR(energy-pkg.scale, rapl_pkg_scale, "2.3283064365386962890
RAPL_EVENT_ATTR_STR(energy-ram.scale, rapl_ram_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-gpu.scale, rapl_gpu_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-psys.scale, rapl_psys_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-core.scale, rapl_core_scale, "2.3283064365386962890625e-10");
/*
* There are no default events, but we need to create
@ -451,6 +490,12 @@ static const struct attribute_group *rapl_attr_groups[] = {
NULL,
};
static const struct attribute_group *rapl_core_attr_groups[] = {
&rapl_pmu_format_group,
&rapl_pmu_events_group,
NULL,
};
static struct attribute *rapl_events_cores[] = {
EVENT_PTR(rapl_cores),
EVENT_PTR(rapl_cores_unit),
@ -511,6 +556,18 @@ static struct attribute_group rapl_events_psys_group = {
.attrs = rapl_events_psys,
};
static struct attribute *rapl_events_core[] = {
EVENT_PTR(rapl_core),
EVENT_PTR(rapl_core_unit),
EVENT_PTR(rapl_core_scale),
NULL,
};
static struct attribute_group rapl_events_core_group = {
.name = "events",
.attrs = rapl_events_core,
};
static bool test_msr(int idx, void *data)
{
return test_bit(idx, (unsigned long *) data);
@ -536,11 +593,11 @@ static struct perf_msr intel_rapl_spr_msrs[] = {
};
/*
* Force to PERF_RAPL_MAX size due to:
* - perf_msr_probe(PERF_RAPL_MAX)
* Force to PERF_RAPL_PKG_EVENTS_MAX size due to:
* - perf_msr_probe(PERF_RAPL_PKG_EVENTS_MAX)
* - want to use same event codes across both architectures
*/
static struct perf_msr amd_rapl_msrs[] = {
static struct perf_msr amd_rapl_pkg_msrs[] = {
[PERF_RAPL_PP0] = { 0, &rapl_events_cores_group, NULL, false, 0 },
[PERF_RAPL_PKG] = { MSR_AMD_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr, false, RAPL_MSR_MASK },
[PERF_RAPL_RAM] = { 0, &rapl_events_ram_group, NULL, false, 0 },
@ -548,18 +605,25 @@ static struct perf_msr amd_rapl_msrs[] = {
[PERF_RAPL_PSYS] = { 0, &rapl_events_psys_group, NULL, false, 0 },
};
static int rapl_check_hw_unit(struct rapl_model *rm)
static struct perf_msr amd_rapl_core_msrs[] = {
[PERF_RAPL_CORE] = { MSR_AMD_CORE_ENERGY_STATUS, &rapl_events_core_group,
test_msr, false, RAPL_MSR_MASK },
};
static int rapl_check_hw_unit(void)
{
u64 msr_rapl_power_unit_bits;
int i;
/* protect rdmsrl() to handle virtualization */
if (rdmsrl_safe(rm->msr_power_unit, &msr_rapl_power_unit_bits))
if (rdmsrl_safe(rapl_model->msr_power_unit, &msr_rapl_power_unit_bits))
return -1;
for (i = 0; i < NR_RAPL_DOMAINS; i++)
rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;
for (i = 0; i < NR_RAPL_PKG_DOMAINS; i++)
rapl_pkg_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;
switch (rm->unit_quirk) {
rapl_core_hw_unit = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;
switch (rapl_model->unit_quirk) {
/*
* DRAM domain on HSW server and KNL has fixed energy unit which can be
* different than the unit from power unit MSR. See
@ -567,17 +631,16 @@ static int rapl_check_hw_unit(struct rapl_model *rm)
* of 2. Datasheet, September 2014, Reference Number: 330784-001 "
*/
case RAPL_UNIT_QUIRK_INTEL_HSW:
rapl_hw_unit[PERF_RAPL_RAM] = 16;
rapl_pkg_hw_unit[PERF_RAPL_RAM] = 16;
break;
/* SPR uses a fixed energy unit for Psys domain. */
case RAPL_UNIT_QUIRK_INTEL_SPR:
rapl_hw_unit[PERF_RAPL_PSYS] = 0;
rapl_pkg_hw_unit[PERF_RAPL_PSYS] = 0;
break;
default:
break;
}
/*
* Calculate the timer rate:
* Use reference of 200W for scaling the timeout to avoid counter
@ -586,9 +649,9 @@ static int rapl_check_hw_unit(struct rapl_model *rm)
* if hw unit is 32, then we use 2 ms 1/200/2
*/
rapl_timer_ms = 2;
if (rapl_hw_unit[0] < 32) {
if (rapl_pkg_hw_unit[0] < 32) {
rapl_timer_ms = (1000 / (2 * 100));
rapl_timer_ms *= (1ULL << (32 - rapl_hw_unit[0] - 1));
rapl_timer_ms *= (1ULL << (32 - rapl_pkg_hw_unit[0] - 1));
}
return 0;
}
@ -596,24 +659,32 @@ static int rapl_check_hw_unit(struct rapl_model *rm)
static void __init rapl_advertise(void)
{
int i;
int num_counters = hweight32(rapl_pmus_pkg->cntr_mask);
if (rapl_pmus_core)
num_counters += hweight32(rapl_pmus_core->cntr_mask);
pr_info("API unit is 2^-32 Joules, %d fixed counters, %llu ms ovfl timer\n",
hweight32(rapl_cntr_mask), rapl_timer_ms);
num_counters, rapl_timer_ms);
for (i = 0; i < NR_RAPL_DOMAINS; i++) {
if (rapl_cntr_mask & (1 << i)) {
for (i = 0; i < NR_RAPL_PKG_DOMAINS; i++) {
if (rapl_pmus_pkg->cntr_mask & (1 << i)) {
pr_info("hw unit of domain %s 2^-%d Joules\n",
rapl_domain_names[i], rapl_hw_unit[i]);
rapl_pkg_domain_names[i], rapl_pkg_hw_unit[i]);
}
}
if (rapl_pmus_core && (rapl_pmus_core->cntr_mask & (1 << PERF_RAPL_CORE)))
pr_info("hw unit of domain %s 2^-%d Joules\n",
rapl_core_domain_name, rapl_core_hw_unit);
}
static void cleanup_rapl_pmus(void)
static void cleanup_rapl_pmus(struct rapl_pmus *rapl_pmus)
{
int i;
for (i = 0; i < rapl_pmus->nr_rapl_pmu; i++)
kfree(rapl_pmus->pmus[i]);
kfree(rapl_pmus->rapl_pmu[i]);
kfree(rapl_pmus);
}
@ -626,46 +697,60 @@ static const struct attribute_group *rapl_attr_update[] = {
NULL,
};
static int __init init_rapl_pmu(void)
static const struct attribute_group *rapl_core_attr_update[] = {
&rapl_events_core_group,
NULL,
};
static int __init init_rapl_pmu(struct rapl_pmus *rapl_pmus)
{
struct rapl_pmu *pmu;
struct rapl_pmu *rapl_pmu;
int idx;
for (idx = 0; idx < rapl_pmus->nr_rapl_pmu; idx++) {
pmu = kzalloc(sizeof(*pmu), GFP_KERNEL);
if (!pmu)
rapl_pmu = kzalloc(sizeof(*rapl_pmu), GFP_KERNEL);
if (!rapl_pmu)
goto free;
raw_spin_lock_init(&pmu->lock);
INIT_LIST_HEAD(&pmu->active_list);
pmu->pmu = &rapl_pmus->pmu;
pmu->timer_interval = ms_to_ktime(rapl_timer_ms);
rapl_hrtimer_init(pmu);
raw_spin_lock_init(&rapl_pmu->lock);
INIT_LIST_HEAD(&rapl_pmu->active_list);
rapl_pmu->pmu = &rapl_pmus->pmu;
rapl_pmu->timer_interval = ms_to_ktime(rapl_timer_ms);
rapl_hrtimer_init(rapl_pmu);
rapl_pmus->pmus[idx] = pmu;
rapl_pmus->rapl_pmu[idx] = rapl_pmu;
}
return 0;
free:
for (; idx > 0; idx--)
kfree(rapl_pmus->pmus[idx - 1]);
kfree(rapl_pmus->rapl_pmu[idx - 1]);
return -ENOMEM;
}
static int __init init_rapl_pmus(void)
static int __init init_rapl_pmus(struct rapl_pmus **rapl_pmus_ptr, int rapl_pmu_scope,
const struct attribute_group **rapl_attr_groups,
const struct attribute_group **rapl_attr_update)
{
int nr_rapl_pmu = topology_max_packages();
int rapl_pmu_scope = PERF_PMU_SCOPE_PKG;
struct rapl_pmus *rapl_pmus;
if (!rapl_pmu_is_pkg_scope()) {
nr_rapl_pmu *= topology_max_dies_per_package();
rapl_pmu_scope = PERF_PMU_SCOPE_DIE;
}
/*
* rapl_pmu_scope must be either PKG, DIE or CORE
*/
if (rapl_pmu_scope == PERF_PMU_SCOPE_DIE)
nr_rapl_pmu *= topology_max_dies_per_package();
else if (rapl_pmu_scope == PERF_PMU_SCOPE_CORE)
nr_rapl_pmu *= topology_num_cores_per_package();
else if (rapl_pmu_scope != PERF_PMU_SCOPE_PKG)
return -EINVAL;
rapl_pmus = kzalloc(struct_size(rapl_pmus, pmus, nr_rapl_pmu), GFP_KERNEL);
rapl_pmus = kzalloc(struct_size(rapl_pmus, rapl_pmu, nr_rapl_pmu), GFP_KERNEL);
if (!rapl_pmus)
return -ENOMEM;
*rapl_pmus_ptr = rapl_pmus;
rapl_pmus->nr_rapl_pmu = nr_rapl_pmu;
rapl_pmus->pmu.attr_groups = rapl_attr_groups;
rapl_pmus->pmu.attr_update = rapl_attr_update;
@ -680,75 +765,77 @@ static int __init init_rapl_pmus(void)
rapl_pmus->pmu.module = THIS_MODULE;
rapl_pmus->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE;
return init_rapl_pmu();
return init_rapl_pmu(rapl_pmus);
}
static struct rapl_model model_snb = {
.events = BIT(PERF_RAPL_PP0) |
.pkg_events = BIT(PERF_RAPL_PP0) |
BIT(PERF_RAPL_PKG) |
BIT(PERF_RAPL_PP1),
.msr_power_unit = MSR_RAPL_POWER_UNIT,
.rapl_msrs = intel_rapl_msrs,
.rapl_pkg_msrs = intel_rapl_msrs,
};
static struct rapl_model model_snbep = {
.events = BIT(PERF_RAPL_PP0) |
.pkg_events = BIT(PERF_RAPL_PP0) |
BIT(PERF_RAPL_PKG) |
BIT(PERF_RAPL_RAM),
.msr_power_unit = MSR_RAPL_POWER_UNIT,
.rapl_msrs = intel_rapl_msrs,
.rapl_pkg_msrs = intel_rapl_msrs,
};
static struct rapl_model model_hsw = {
.events = BIT(PERF_RAPL_PP0) |
.pkg_events = BIT(PERF_RAPL_PP0) |
BIT(PERF_RAPL_PKG) |
BIT(PERF_RAPL_RAM) |
BIT(PERF_RAPL_PP1),
.msr_power_unit = MSR_RAPL_POWER_UNIT,
.rapl_msrs = intel_rapl_msrs,
.rapl_pkg_msrs = intel_rapl_msrs,
};
static struct rapl_model model_hsx = {
.events = BIT(PERF_RAPL_PP0) |
.pkg_events = BIT(PERF_RAPL_PP0) |
BIT(PERF_RAPL_PKG) |
BIT(PERF_RAPL_RAM),
.unit_quirk = RAPL_UNIT_QUIRK_INTEL_HSW,
.msr_power_unit = MSR_RAPL_POWER_UNIT,
.rapl_msrs = intel_rapl_msrs,
.rapl_pkg_msrs = intel_rapl_msrs,
};
static struct rapl_model model_knl = {
.events = BIT(PERF_RAPL_PKG) |
.pkg_events = BIT(PERF_RAPL_PKG) |
BIT(PERF_RAPL_RAM),
.unit_quirk = RAPL_UNIT_QUIRK_INTEL_HSW,
.msr_power_unit = MSR_RAPL_POWER_UNIT,
.rapl_msrs = intel_rapl_msrs,
.rapl_pkg_msrs = intel_rapl_msrs,
};
static struct rapl_model model_skl = {
.events = BIT(PERF_RAPL_PP0) |
.pkg_events = BIT(PERF_RAPL_PP0) |
BIT(PERF_RAPL_PKG) |
BIT(PERF_RAPL_RAM) |
BIT(PERF_RAPL_PP1) |
BIT(PERF_RAPL_PSYS),
.msr_power_unit = MSR_RAPL_POWER_UNIT,
.rapl_msrs = intel_rapl_msrs,
.rapl_pkg_msrs = intel_rapl_msrs,
};
static struct rapl_model model_spr = {
.events = BIT(PERF_RAPL_PP0) |
.pkg_events = BIT(PERF_RAPL_PP0) |
BIT(PERF_RAPL_PKG) |
BIT(PERF_RAPL_RAM) |
BIT(PERF_RAPL_PSYS),
.unit_quirk = RAPL_UNIT_QUIRK_INTEL_SPR,
.msr_power_unit = MSR_RAPL_POWER_UNIT,
.rapl_msrs = intel_rapl_spr_msrs,
.rapl_pkg_msrs = intel_rapl_spr_msrs,
};
static struct rapl_model model_amd_hygon = {
.events = BIT(PERF_RAPL_PKG),
.pkg_events = BIT(PERF_RAPL_PKG),
.core_events = BIT(PERF_RAPL_CORE),
.msr_power_unit = MSR_AMD_RAPL_POWER_UNIT,
.rapl_msrs = amd_rapl_msrs,
.rapl_pkg_msrs = amd_rapl_pkg_msrs,
.rapl_core_msrs = amd_rapl_core_msrs,
};
static const struct x86_cpu_id rapl_model_match[] __initconst = {
@ -804,45 +891,73 @@ MODULE_DEVICE_TABLE(x86cpu, rapl_model_match);
static int __init rapl_pmu_init(void)
{
const struct x86_cpu_id *id;
struct rapl_model *rm;
int rapl_pkg_pmu_scope = PERF_PMU_SCOPE_DIE;
int ret;
if (rapl_pkg_pmu_is_pkg_scope())
rapl_pkg_pmu_scope = PERF_PMU_SCOPE_PKG;
id = x86_match_cpu(rapl_model_match);
if (!id)
return -ENODEV;
rm = (struct rapl_model *) id->driver_data;
rapl_model = (struct rapl_model *) id->driver_data;
rapl_msrs = rm->rapl_msrs;
rapl_cntr_mask = perf_msr_probe(rapl_msrs, PERF_RAPL_MAX,
false, (void *) &rm->events);
ret = rapl_check_hw_unit(rm);
ret = rapl_check_hw_unit();
if (ret)
return ret;
ret = init_rapl_pmus();
ret = init_rapl_pmus(&rapl_pmus_pkg, rapl_pkg_pmu_scope, rapl_attr_groups,
rapl_attr_update);
if (ret)
return ret;
ret = perf_pmu_register(&rapl_pmus->pmu, "power", -1);
rapl_pmus_pkg->cntr_mask = perf_msr_probe(rapl_model->rapl_pkg_msrs,
PERF_RAPL_PKG_EVENTS_MAX, false,
(void *) &rapl_model->pkg_events);
ret = perf_pmu_register(&rapl_pmus_pkg->pmu, "power", -1);
if (ret)
goto out;
if (rapl_model->core_events) {
ret = init_rapl_pmus(&rapl_pmus_core, PERF_PMU_SCOPE_CORE,
rapl_core_attr_groups,
rapl_core_attr_update);
if (ret) {
pr_warn("power-core PMU initialization failed (%d)\n", ret);
goto core_init_failed;
}
rapl_pmus_core->cntr_mask = perf_msr_probe(rapl_model->rapl_core_msrs,
PERF_RAPL_CORE_EVENTS_MAX, false,
(void *) &rapl_model->core_events);
ret = perf_pmu_register(&rapl_pmus_core->pmu, "power_core", -1);
if (ret) {
pr_warn("power-core PMU registration failed (%d)\n", ret);
cleanup_rapl_pmus(rapl_pmus_core);
}
}
core_init_failed:
rapl_advertise();
return 0;
out:
pr_warn("Initialization failed (%d), disabled\n", ret);
cleanup_rapl_pmus();
cleanup_rapl_pmus(rapl_pmus_pkg);
return ret;
}
module_init(rapl_pmu_init);
static void __exit intel_rapl_exit(void)
{
perf_pmu_unregister(&rapl_pmus->pmu);
cleanup_rapl_pmus();
if (rapl_pmus_core) {
perf_pmu_unregister(&rapl_pmus_core->pmu);
cleanup_rapl_pmus(rapl_pmus_core);
}
perf_pmu_unregister(&rapl_pmus_pkg->pmu);
cleanup_rapl_pmus(rapl_pmus_pkg);
}
module_exit(intel_rapl_exit);

View File

@ -41,6 +41,7 @@
#define INTEL_FIXED_0_USER (1ULL << 1)
#define INTEL_FIXED_0_ANYTHREAD (1ULL << 2)
#define INTEL_FIXED_0_ENABLE_PMI (1ULL << 3)
#define INTEL_FIXED_3_METRICS_CLEAR (1ULL << 2)
#define HSW_IN_TX (1ULL << 32)
#define HSW_IN_TX_CHECKPOINTED (1ULL << 33)
@ -372,6 +373,9 @@ static inline bool use_fixed_pseudo_encoding(u64 code)
#define INTEL_TD_METRIC_MAX INTEL_TD_METRIC_MEM_BOUND
#define INTEL_TD_METRIC_NUM 8
#define INTEL_TD_CFG_METRIC_CLEAR_BIT 0
#define INTEL_TD_CFG_METRIC_CLEAR BIT_ULL(INTEL_TD_CFG_METRIC_CLEAR_BIT)
static inline bool is_metric_idx(int idx)
{
return (unsigned)(idx - INTEL_PMC_IDX_METRIC_BASE) < INTEL_TD_METRIC_NUM;
@ -422,7 +426,9 @@ static inline bool is_topdown_idx(int idx)
*/
struct pebs_basic {
u64 format_size;
u64 format_group:32,
retire_latency:16,
format_size:16;
u64 ip;
u64 applicable_counters;
u64 tsc;
@ -431,7 +437,17 @@ struct pebs_basic {
struct pebs_meminfo {
u64 address;
u64 aux;
u64 latency;
union {
/* pre Alder Lake */
u64 mem_latency;
/* Alder Lake and later */
struct {
u64 instr_latency:16;
u64 pad2:16;
u64 cache_latency:16;
u64 pad3:16;
};
};
u64 tsx_tuning;
};

View File

@ -98,6 +98,7 @@ struct cpuinfo_topology {
// Logical ID mappings
u32 logical_pkg_id;
u32 logical_die_id;
u32 logical_core_id;
// AMD Node ID and Nodes per Package info
u32 amd_node_id;

View File

@ -143,6 +143,7 @@ extern const struct cpumask *cpu_clustergroup_mask(int cpu);
#define topology_logical_package_id(cpu) (cpu_data(cpu).topo.logical_pkg_id)
#define topology_physical_package_id(cpu) (cpu_data(cpu).topo.pkg_id)
#define topology_logical_die_id(cpu) (cpu_data(cpu).topo.logical_die_id)
#define topology_logical_core_id(cpu) (cpu_data(cpu).topo.logical_core_id)
#define topology_die_id(cpu) (cpu_data(cpu).topo.die_id)
#define topology_core_id(cpu) (cpu_data(cpu).topo.core_id)
#define topology_ppin(cpu) (cpu_data(cpu).ppin)

View File

@ -25,6 +25,7 @@ static int cpu_debug_show(struct seq_file *m, void *p)
seq_printf(m, "cpu_type: %s\n", get_topology_cpu_type_name(c));
seq_printf(m, "logical_pkg_id: %u\n", c->topo.logical_pkg_id);
seq_printf(m, "logical_die_id: %u\n", c->topo.logical_die_id);
seq_printf(m, "logical_core_id: %u\n", c->topo.logical_core_id);
seq_printf(m, "llc_id: %u\n", c->topo.llc_id);
seq_printf(m, "l2c_id: %u\n", c->topo.l2c_id);
seq_printf(m, "amd_node_id: %u\n", c->topo.amd_node_id);

View File

@ -185,6 +185,7 @@ static void topo_set_ids(struct topo_scan *tscan, bool early)
if (!early) {
c->topo.logical_pkg_id = topology_get_logical_id(apicid, TOPO_PKG_DOMAIN);
c->topo.logical_die_id = topology_get_logical_id(apicid, TOPO_DIE_DOMAIN);
c->topo.logical_core_id = topology_get_logical_id(apicid, TOPO_CORE_DOMAIN);
}
/* Package relative core ID */

View File

@ -711,7 +711,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
* we don't rely on for anything - the mm_lock_seq read against which we
* need ordering is below.
*/
if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq))
if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq.sequence))
return false;
if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0))
@ -728,7 +728,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
* after it has been unlocked.
* This pairs with RELEASE semantics in vma_end_write_all().
*/
if (unlikely(vma->vm_lock_seq == smp_load_acquire(&vma->vm_mm->mm_lock_seq))) {
if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&vma->vm_mm->mm_lock_seq))) {
up_read(&vma->vm_lock->lock);
return false;
}
@ -743,7 +743,7 @@ static inline void vma_end_read(struct vm_area_struct *vma)
}
/* WARNING! Can only be used if mmap_lock is expected to be write-locked */
static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq)
static bool __is_vma_write_locked(struct vm_area_struct *vma, unsigned int *mm_lock_seq)
{
mmap_assert_write_locked(vma->vm_mm);
@ -751,7 +751,7 @@ static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq)
* current task is holding mmap_write_lock, both vma->vm_lock_seq and
* mm->mm_lock_seq can't be concurrently modified.
*/
*mm_lock_seq = vma->vm_mm->mm_lock_seq;
*mm_lock_seq = vma->vm_mm->mm_lock_seq.sequence;
return (vma->vm_lock_seq == *mm_lock_seq);
}
@ -762,7 +762,7 @@ static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq)
*/
static inline void vma_start_write(struct vm_area_struct *vma)
{
int mm_lock_seq;
unsigned int mm_lock_seq;
if (__is_vma_write_locked(vma, &mm_lock_seq))
return;
@ -780,7 +780,7 @@ static inline void vma_start_write(struct vm_area_struct *vma)
static inline void vma_assert_write_locked(struct vm_area_struct *vma)
{
int mm_lock_seq;
unsigned int mm_lock_seq;
VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma);
}

View File

@ -727,7 +727,7 @@ struct vm_area_struct {
* counter reuse can only lead to occasional unnecessary use of the
* slowpath.
*/
int vm_lock_seq;
unsigned int vm_lock_seq;
/* Unstable RCU readers are allowed to read this. */
struct vma_lock *vm_lock;
#endif
@ -921,6 +921,9 @@ struct mm_struct {
* Roughly speaking, incrementing the sequence number is
* equivalent to releasing locks on VMAs; reading the sequence
* number can be part of taking a read lock on a VMA.
* Incremented every time mmap_lock is write-locked/unlocked.
* Initialized to 0, therefore odd values indicate mmap_lock
* is write-locked and even values that it's released.
*
* Can be modified under write mmap_lock using RELEASE
* semantics.
@ -929,7 +932,7 @@ struct mm_struct {
* Can be read with ACQUIRE semantics if not holding write
* mmap_lock.
*/
int mm_lock_seq;
seqcount_t mm_lock_seq;
#endif

View File

@ -71,6 +71,91 @@ static inline void mmap_assert_write_locked(const struct mm_struct *mm)
}
#ifdef CONFIG_PER_VMA_LOCK
static inline void mm_lock_seqcount_init(struct mm_struct *mm)
{
seqcount_init(&mm->mm_lock_seq);
}
static inline void mm_lock_seqcount_begin(struct mm_struct *mm)
{
do_raw_write_seqcount_begin(&mm->mm_lock_seq);
}
static inline void mm_lock_seqcount_end(struct mm_struct *mm)
{
ASSERT_EXCLUSIVE_WRITER(mm->mm_lock_seq);
do_raw_write_seqcount_end(&mm->mm_lock_seq);
}
static inline bool mmap_lock_speculate_try_begin(struct mm_struct *mm, unsigned int *seq)
{
/*
* Since mmap_lock is a sleeping lock, and waiting for it to become
* unlocked is more or less equivalent with taking it ourselves, don't
* bother with the speculative path if mmap_lock is already write-locked
* and take the slow path, which takes the lock.
*/
return raw_seqcount_try_begin(&mm->mm_lock_seq, *seq);
}
static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int seq)
{
return read_seqcount_retry(&mm->mm_lock_seq, seq);
}
#else /* CONFIG_PER_VMA_LOCK */
static inline void mm_lock_seqcount_init(struct mm_struct *mm) {}
static inline void mm_lock_seqcount_begin(struct mm_struct *mm) {}
static inline void mm_lock_seqcount_end(struct mm_struct *mm) {}
static inline bool mmap_lock_speculate_try_begin(struct mm_struct *mm, unsigned int *seq)
{
return false;
}
static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int seq)
{
return true;
}
#endif /* CONFIG_PER_VMA_LOCK */
static inline void mmap_init_lock(struct mm_struct *mm)
{
init_rwsem(&mm->mmap_lock);
mm_lock_seqcount_init(mm);
}
static inline void mmap_write_lock(struct mm_struct *mm)
{
__mmap_lock_trace_start_locking(mm, true);
down_write(&mm->mmap_lock);
mm_lock_seqcount_begin(mm);
__mmap_lock_trace_acquire_returned(mm, true, true);
}
static inline void mmap_write_lock_nested(struct mm_struct *mm, int subclass)
{
__mmap_lock_trace_start_locking(mm, true);
down_write_nested(&mm->mmap_lock, subclass);
mm_lock_seqcount_begin(mm);
__mmap_lock_trace_acquire_returned(mm, true, true);
}
static inline int mmap_write_lock_killable(struct mm_struct *mm)
{
int ret;
__mmap_lock_trace_start_locking(mm, true);
ret = down_write_killable(&mm->mmap_lock);
if (!ret)
mm_lock_seqcount_begin(mm);
__mmap_lock_trace_acquire_returned(mm, true, ret == 0);
return ret;
}
/*
* Drop all currently-held per-VMA locks.
* This is called from the mmap_lock implementation directly before releasing
@ -82,46 +167,7 @@ static inline void mmap_assert_write_locked(const struct mm_struct *mm)
static inline void vma_end_write_all(struct mm_struct *mm)
{
mmap_assert_write_locked(mm);
/*
* Nobody can concurrently modify mm->mm_lock_seq due to exclusive
* mmap_lock being held.
* We need RELEASE semantics here to ensure that preceding stores into
* the VMA take effect before we unlock it with this store.
* Pairs with ACQUIRE semantics in vma_start_read().
*/
smp_store_release(&mm->mm_lock_seq, mm->mm_lock_seq + 1);
}
#else
static inline void vma_end_write_all(struct mm_struct *mm) {}
#endif
static inline void mmap_init_lock(struct mm_struct *mm)
{
init_rwsem(&mm->mmap_lock);
}
static inline void mmap_write_lock(struct mm_struct *mm)
{
__mmap_lock_trace_start_locking(mm, true);
down_write(&mm->mmap_lock);
__mmap_lock_trace_acquire_returned(mm, true, true);
}
static inline void mmap_write_lock_nested(struct mm_struct *mm, int subclass)
{
__mmap_lock_trace_start_locking(mm, true);
down_write_nested(&mm->mmap_lock, subclass);
__mmap_lock_trace_acquire_returned(mm, true, true);
}
static inline int mmap_write_lock_killable(struct mm_struct *mm)
{
int ret;
__mmap_lock_trace_start_locking(mm, true);
ret = down_write_killable(&mm->mmap_lock);
__mmap_lock_trace_acquire_returned(mm, true, ret == 0);
return ret;
mm_lock_seqcount_end(mm);
}
static inline void mmap_write_unlock(struct mm_struct *mm)

View File

@ -1279,6 +1279,11 @@ static inline void perf_sample_save_callchain(struct perf_sample_data *data,
{
int size = 1;
if (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN))
return;
if (WARN_ON_ONCE(data->sample_flags & PERF_SAMPLE_CALLCHAIN))
return;
data->callchain = perf_callchain(event, regs);
size += data->callchain->nr;
@ -1287,12 +1292,18 @@ static inline void perf_sample_save_callchain(struct perf_sample_data *data,
}
static inline void perf_sample_save_raw_data(struct perf_sample_data *data,
struct perf_event *event,
struct perf_raw_record *raw)
{
struct perf_raw_frag *frag = &raw->frag;
u32 sum = 0;
int size;
if (!(event->attr.sample_type & PERF_SAMPLE_RAW))
return;
if (WARN_ON_ONCE(data->sample_flags & PERF_SAMPLE_RAW))
return;
do {
sum += frag->size;
if (perf_raw_frag_last(frag))
@ -1309,6 +1320,11 @@ static inline void perf_sample_save_raw_data(struct perf_sample_data *data,
data->sample_flags |= PERF_SAMPLE_RAW;
}
static inline bool has_branch_stack(struct perf_event *event)
{
return event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK;
}
static inline void perf_sample_save_brstack(struct perf_sample_data *data,
struct perf_event *event,
struct perf_branch_stack *brs,
@ -1316,6 +1332,11 @@ static inline void perf_sample_save_brstack(struct perf_sample_data *data,
{
int size = sizeof(u64); /* nr */
if (!has_branch_stack(event))
return;
if (WARN_ON_ONCE(data->sample_flags & PERF_SAMPLE_BRANCH_STACK))
return;
if (branch_sample_hw_index(event))
size += sizeof(u64);
size += brs->nr * sizeof(struct perf_branch_entry);
@ -1669,6 +1690,8 @@ static inline int perf_allow_tracepoint(struct perf_event_attr *attr)
return security_perf_event_open(attr, PERF_SECURITY_TRACEPOINT);
}
extern int perf_exclude_event(struct perf_event *event, struct pt_regs *regs);
extern void perf_event_init(void);
extern void perf_tp_event(u16 event_type, u64 count, void *record,
int entry_size, struct pt_regs *regs,
@ -1705,11 +1728,6 @@ static inline unsigned long perf_arch_guest_misc_flags(struct pt_regs *regs)
# define perf_arch_guest_misc_flags(regs) perf_arch_guest_misc_flags(regs)
#endif
static inline bool has_branch_stack(struct perf_event *event)
{
return event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK;
}
static inline bool needs_branch_stack(struct perf_event *event)
{
return event->attr.branch_sample_type != 0;
@ -1879,6 +1897,10 @@ static inline u64 perf_event_pause(struct perf_event *event, bool reset)
{
return 0;
}
static inline int perf_exclude_event(struct perf_event *event, struct pt_regs *regs)
{
return 0;
}
#endif
#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL)

View File

@ -318,6 +318,28 @@ SEQCOUNT_LOCKNAME(mutex, struct mutex, true, mutex)
__seq; \
})
/**
* raw_seqcount_try_begin() - begin a seqcount_t read critical section
* w/o lockdep and w/o counter stabilization
* @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
*
* Similar to raw_seqcount_begin(), except it enables eliding the critical
* section entirely if odd, instead of doing the speculation knowing it will
* fail.
*
* Useful when counter stabilization is more or less equivalent to taking
* the lock and there is a slowpath that does that.
*
* If true, start will be set to the (even) sequence count read.
*
* Return: true when a read critical section is started.
*/
#define raw_seqcount_try_begin(s, start) \
({ \
start = raw_read_seqcount(s); \
!(start & 1); \
})
/**
* raw_seqcount_begin() - begin a seqcount_t read critical section w/o
* lockdep and w/o counter stabilization

View File

@ -16,6 +16,7 @@
#include <linux/types.h>
#include <linux/wait.h>
#include <linux/timer.h>
#include <linux/seqlock.h>
struct uprobe;
struct vm_area_struct;
@ -124,6 +125,10 @@ struct uprobe_task {
unsigned int depth;
struct return_instance *return_instances;
struct return_instance *ri_pool;
struct timer_list ri_timer;
seqcount_t ri_seqcount;
union {
struct {
struct arch_uprobe_task autask;
@ -137,7 +142,6 @@ struct uprobe_task {
};
struct uprobe *active_uprobe;
struct timer_list ri_timer;
unsigned long xol_vaddr;
struct arch_uprobe *auprobe;
@ -154,12 +158,18 @@ struct return_instance {
unsigned long stack; /* stack pointer */
unsigned long orig_ret_vaddr; /* original return address */
bool chained; /* true, if instance is nested */
int consumers_cnt;
int cons_cnt; /* total number of session consumers */
struct return_instance *next; /* keep as stack */
struct rcu_head rcu;
struct return_consumer consumers[] __counted_by(consumers_cnt);
/* singular pre-allocated return_consumer instance for common case */
struct return_consumer consumer;
/*
* extra return_consumer instances for rare cases of multiple session consumers,
* contains (cons_cnt - 1) elements
*/
struct return_consumer *extra_consumers;
} ____cacheline_aligned;
enum rp_check {

View File

@ -6277,41 +6277,6 @@ unlock:
}
EXPORT_SYMBOL_GPL(perf_event_update_userpage);
/*
 * Fault handler for the perf ring-buffer mapping. The first page (pgoff 0)
 * is the user control page and is the only one that may be made writable;
 * all other pages are served read-only from the buffer.
 */
static vm_fault_t perf_mmap_fault(struct vm_fault *vmf)
{
	struct perf_event *event = vmf->vma->vm_file->private_data;
	struct perf_buffer *rb;
	vm_fault_t ret = VM_FAULT_SIGBUS;

	if (vmf->flags & FAULT_FLAG_MKWRITE) {
		/* Write access is allowed only on the control page. */
		if (vmf->pgoff == 0)
			ret = 0;
		return ret;
	}

	/* RCU protects the buffer from being freed while we look it up. */
	rcu_read_lock();
	rb = rcu_dereference(event->rb);
	if (!rb)
		goto unlock;

	/* Refuse write faults on anything but the control page. */
	if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
		goto unlock;

	vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
	if (!vmf->page)
		goto unlock;

	/* Hand the page to the fault core with mapping/index filled in. */
	get_page(vmf->page);
	vmf->page->mapping = vmf->vma->vm_file->f_mapping;
	vmf->page->index = vmf->pgoff;

	ret = 0;
unlock:
	rcu_read_unlock();

	return ret;
}
static void ring_buffer_attach(struct perf_event *event,
struct perf_buffer *rb)
{
@ -6551,13 +6516,87 @@ out_put:
ring_buffer_put(rb); /* could be last */
}
static vm_fault_t perf_mmap_pfn_mkwrite(struct vm_fault *vmf)
{
	/* Only the user control page (pgoff 0) may be made writable. */
	if (vmf->pgoff != 0)
		return VM_FAULT_SIGBUS;

	return 0;
}
static const struct vm_operations_struct perf_mmap_vmops = {
.open = perf_mmap_open,
.close = perf_mmap_close, /* non mergeable */
.fault = perf_mmap_fault,
.page_mkwrite = perf_mmap_fault,
.pfn_mkwrite = perf_mmap_pfn_mkwrite,
};
static int map_range(struct perf_buffer *rb, struct vm_area_struct *vma)
{
unsigned long nr_pages = vma_pages(vma);
int err = 0;
unsigned long pagenum;
/*
* We map this as a VM_PFNMAP VMA.
*
* This is not ideal as this is designed broadly for mappings of PFNs
* referencing memory-mapped I/O ranges or non-system RAM i.e. for which
* !pfn_valid(pfn).
*
* We are mapping kernel-allocated memory (memory we manage ourselves)
* which would more ideally be mapped using vm_insert_page() or a
* similar mechanism, that is as a VM_MIXEDMAP mapping.
*
* However this won't work here, because:
*
* 1. It uses vma->vm_page_prot, but this field has not been completely
* setup at the point of the f_op->mmp() hook, so we are unable to
* indicate that this should be mapped CoW in order that the
* mkwrite() hook can be invoked to make the first page R/W and the
* rest R/O as desired.
*
* 2. Anything other than a VM_PFNMAP of valid PFNs will result in
* vm_normal_page() returning a struct page * pointer, which means
* vm_ops->page_mkwrite() will be invoked rather than
* vm_ops->pfn_mkwrite(), and this means we have to set page->mapping
* to work around retry logic in the fault handler, however this
* field is no longer allowed to be used within struct page.
*
* 3. Having a struct page * made available in the fault logic also
* means that the page gets put on the rmap and becomes
* inappropriately accessible and subject to map and ref counting.
*
* Ideally we would have a mechanism that could explicitly express our
* desires, but this is not currently the case, so we instead use
* VM_PFNMAP.
*
* We manage the lifetime of these mappings with internal refcounts (see
* perf_mmap_open() and perf_mmap_close()) so we ensure the lifetime of
* this mapping is maintained correctly.
*/
for (pagenum = 0; pagenum < nr_pages; pagenum++) {
unsigned long va = vma->vm_start + PAGE_SIZE * pagenum;
struct page *page = perf_mmap_to_page(rb, vma->vm_pgoff + pagenum);
if (page == NULL) {
err = -EINVAL;
break;
}
/* Map readonly, perf_mmap_pfn_mkwrite() called on write fault. */
err = remap_pfn_range(vma, va, page_to_pfn(page), PAGE_SIZE,
vm_get_page_prot(vma->vm_flags & ~VM_SHARED));
if (err)
break;
}
#ifdef CONFIG_MMU
/* Clear any partial mappings on error. */
if (err)
zap_page_range_single(vma, vma->vm_start, nr_pages * PAGE_SIZE, NULL);
#endif
return err;
}
static int perf_mmap(struct file *file, struct vm_area_struct *vma)
{
struct perf_event *event = file->private_data;
@ -6682,6 +6721,8 @@ again:
goto again;
}
/* We need the rb to map pages. */
rb = event->rb;
goto unlock;
}
@ -6776,6 +6817,9 @@ aux_unlock:
vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP);
vma->vm_ops = &perf_mmap_vmops;
if (!ret)
ret = map_range(rb, vma);
if (event->pmu->event_mapped)
event->pmu->event_mapped(event, vma->vm_mm);
@ -10039,8 +10083,7 @@ static void perf_swevent_event(struct perf_event *event, u64 nr,
perf_swevent_overflow(event, 0, data, regs);
}
static int perf_exclude_event(struct perf_event *event,
struct pt_regs *regs)
int perf_exclude_event(struct perf_event *event, struct pt_regs *regs)
{
if (event->hw.state & PERF_HES_STOPPED)
return 1;
@ -10425,9 +10468,9 @@ static struct pmu perf_tracepoint = {
};
static int perf_tp_filter_match(struct perf_event *event,
struct perf_sample_data *data)
struct perf_raw_record *raw)
{
void *record = data->raw->frag.data;
void *record = raw->frag.data;
/* only top level events have filters set */
if (event->parent)
@ -10439,7 +10482,7 @@ static int perf_tp_filter_match(struct perf_event *event,
}
static int perf_tp_event_match(struct perf_event *event,
struct perf_sample_data *data,
struct perf_raw_record *raw,
struct pt_regs *regs)
{
if (event->hw.state & PERF_HES_STOPPED)
@ -10450,7 +10493,7 @@ static int perf_tp_event_match(struct perf_event *event,
if (event->attr.exclude_kernel && !user_mode(regs))
return 0;
if (!perf_tp_filter_match(event, data))
if (!perf_tp_filter_match(event, raw))
return 0;
return 1;
@ -10476,6 +10519,7 @@ EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit);
static void __perf_tp_event_target_task(u64 count, void *record,
struct pt_regs *regs,
struct perf_sample_data *data,
struct perf_raw_record *raw,
struct perf_event *event)
{
struct trace_entry *entry = record;
@ -10485,13 +10529,17 @@ static void __perf_tp_event_target_task(u64 count, void *record,
/* Cannot deliver synchronous signal to other task. */
if (event->attr.sigtrap)
return;
if (perf_tp_event_match(event, data, regs))
if (perf_tp_event_match(event, raw, regs)) {
perf_sample_data_init(data, 0, 0);
perf_sample_save_raw_data(data, event, raw);
perf_swevent_event(event, count, data, regs);
}
}
static void perf_tp_event_target_task(u64 count, void *record,
struct pt_regs *regs,
struct perf_sample_data *data,
struct perf_raw_record *raw,
struct perf_event_context *ctx)
{
unsigned int cpu = smp_processor_id();
@ -10499,15 +10547,15 @@ static void perf_tp_event_target_task(u64 count, void *record,
struct perf_event *event, *sibling;
perf_event_groups_for_cpu_pmu(event, &ctx->pinned_groups, cpu, pmu) {
__perf_tp_event_target_task(count, record, regs, data, event);
__perf_tp_event_target_task(count, record, regs, data, raw, event);
for_each_sibling_event(sibling, event)
__perf_tp_event_target_task(count, record, regs, data, sibling);
__perf_tp_event_target_task(count, record, regs, data, raw, sibling);
}
perf_event_groups_for_cpu_pmu(event, &ctx->flexible_groups, cpu, pmu) {
__perf_tp_event_target_task(count, record, regs, data, event);
__perf_tp_event_target_task(count, record, regs, data, raw, event);
for_each_sibling_event(sibling, event)
__perf_tp_event_target_task(count, record, regs, data, sibling);
__perf_tp_event_target_task(count, record, regs, data, raw, sibling);
}
}
@ -10525,15 +10573,10 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
},
};
perf_sample_data_init(&data, 0, 0);
perf_sample_save_raw_data(&data, &raw);
perf_trace_buf_update(record, event_type);
hlist_for_each_entry_rcu(event, head, hlist_entry) {
if (perf_tp_event_match(event, &data, regs)) {
perf_swevent_event(event, count, &data, regs);
if (perf_tp_event_match(event, &raw, regs)) {
/*
* Here use the same on-stack perf_sample_data,
* some members in data are event-specific and
@ -10543,7 +10586,8 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
* because data->sample_flags is set.
*/
perf_sample_data_init(&data, 0, 0);
perf_sample_save_raw_data(&data, &raw);
perf_sample_save_raw_data(&data, event, &raw);
perf_swevent_event(event, count, &data, regs);
}
}
@ -10560,7 +10604,7 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
goto unlock;
raw_spin_lock(&ctx->lock);
perf_tp_event_target_task(count, record, regs, &data, ctx);
perf_tp_event_target_task(count, record, regs, &data, &raw, ctx);
raw_spin_unlock(&ctx->lock);
unlock:
rcu_read_unlock();

View File

@ -643,7 +643,6 @@ static void rb_free_aux_page(struct perf_buffer *rb, int idx)
struct page *page = virt_to_page(rb->aux_pages[idx]);
ClearPagePrivate(page);
page->mapping = NULL;
__free_page(page);
}
@ -819,7 +818,6 @@ static void perf_mmap_free_page(void *addr)
{
struct page *page = virt_to_page(addr);
page->mapping = NULL;
__free_page(page);
}
@ -890,28 +888,13 @@ __perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff)
return vmalloc_to_page((void *)rb->user_page + pgoff * PAGE_SIZE);
}
static void perf_mmap_unmark_page(void *addr)
{
	/* Drop the page->mapping backlink that was set up at fault time. */
	struct page *pg = vmalloc_to_page(addr);

	pg->mapping = NULL;
}
static void rb_free_work(struct work_struct *work)
{
struct perf_buffer *rb;
void *base;
int i, nr;
rb = container_of(work, struct perf_buffer, work);
nr = data_page_nr(rb);
base = rb->user_page;
/* The '<=' counts in the user page. */
for (i = 0; i <= nr; i++)
perf_mmap_unmark_page(base + (i * PAGE_SIZE));
vfree(base);
vfree(rb->user_page);
kfree(rb);
}

View File

@ -1888,9 +1888,33 @@ unsigned long uprobe_get_trap_addr(struct pt_regs *regs)
return instruction_pointer(regs);
}
static struct return_instance *free_ret_instance(struct return_instance *ri, bool cleanup_hprobe)
/*
 * Stack @ri onto the task's reuse pool so a future uretprobe can grab it
 * without a fresh allocation.
 *
 * NOTE(review): ri->extra_consumers is intentionally left allocated for
 * reuse by push_consumer(); confirm no path duplicates an instance while
 * its extra_consumers buffer is stale from a previous use.
 */
static void ri_pool_push(struct uprobe_task *utask, struct return_instance *ri)
{
	ri->cons_cnt = 0;
	ri->next = utask->ri_pool;
	utask->ri_pool = ri;
}
/* Detach and return the head of the task's reuse pool, or NULL if empty. */
static struct return_instance *ri_pool_pop(struct uprobe_task *utask)
{
	struct return_instance *head = utask->ri_pool;

	if (likely(head))
		utask->ri_pool = head->next;

	return head;
}
/*
 * Release a return_instance: the overflow consumer array is freed
 * immediately, the instance itself only after an RCU grace period,
 * since lockless walkers (e.g. ri_timer()) may still be traversing
 * the return_instances list.
 */
static void ri_free(struct return_instance *ri)
{
	kfree(ri->extra_consumers);
	kfree_rcu(ri, rcu);
}
static void free_ret_instance(struct uprobe_task *utask,
struct return_instance *ri, bool cleanup_hprobe)
{
unsigned seq;
if (cleanup_hprobe) {
enum hprobe_state hstate;
@ -1899,8 +1923,22 @@ static struct return_instance *free_ret_instance(struct return_instance *ri, boo
hprobe_finalize(&ri->hprobe, hstate);
}
kfree_rcu(ri, rcu);
return next;
/*
* At this point return_instance is unlinked from utask's
* return_instances list and this has become visible to ri_timer().
* If seqcount now indicates that ri_timer's return instance
* processing loop isn't active, we can return ri into the pool of
* to-be-reused return instances for future uretprobes. If ri_timer()
* happens to be running right now, though, we fallback to safety and
* just perform RCU-delated freeing of ri.
*/
if (raw_seqcount_try_begin(&utask->ri_seqcount, seq)) {
/* immediate reuse of ri without RCU GP is OK */
ri_pool_push(utask, ri);
} else {
/* we might be racing with ri_timer(), so play it safe */
ri_free(ri);
}
}
/*
@ -1910,7 +1948,7 @@ static struct return_instance *free_ret_instance(struct return_instance *ri, boo
void uprobe_free_utask(struct task_struct *t)
{
struct uprobe_task *utask = t->utask;
struct return_instance *ri;
struct return_instance *ri, *ri_next;
if (!utask)
return;
@ -1921,8 +1959,19 @@ void uprobe_free_utask(struct task_struct *t)
timer_delete_sync(&utask->ri_timer);
ri = utask->return_instances;
while (ri)
ri = free_ret_instance(ri, true /* cleanup_hprobe */);
while (ri) {
ri_next = ri->next;
free_ret_instance(utask, ri, true /* cleanup_hprobe */);
ri = ri_next;
}
/* free_ret_instance() above might add to ri_pool, so this loop should come last */
ri = utask->ri_pool;
while (ri) {
ri_next = ri->next;
ri_free(ri);
ri = ri_next;
}
kfree(utask);
}
@ -1942,8 +1991,12 @@ static void ri_timer(struct timer_list *timer)
/* RCU protects return_instance from freeing. */
guard(rcu)();
write_seqcount_begin(&utask->ri_seqcount);
for_each_ret_instance_rcu(ri, utask->return_instances)
hprobe_expire(&ri->hprobe, false);
write_seqcount_end(&utask->ri_seqcount);
}
static struct uprobe_task *alloc_utask(void)
@ -1955,6 +2008,7 @@ static struct uprobe_task *alloc_utask(void)
return NULL;
timer_setup(&utask->ri_timer, ri_timer, 0);
seqcount_init(&utask->ri_seqcount);
return utask;
}
@ -1974,32 +2028,40 @@ static struct uprobe_task *get_utask(void)
return current->utask;
}
static size_t ri_size(int consumers_cnt)
static struct return_instance *alloc_return_instance(struct uprobe_task *utask)
{
struct return_instance *ri;
return sizeof(*ri) + sizeof(ri->consumers[0]) * consumers_cnt;
}
ri = ri_pool_pop(utask);
if (ri)
return ri;
#define DEF_CNT 4
static struct return_instance *alloc_return_instance(void)
{
struct return_instance *ri;
ri = kzalloc(ri_size(DEF_CNT), GFP_KERNEL);
ri = kzalloc(sizeof(*ri), GFP_KERNEL);
if (!ri)
return ZERO_SIZE_PTR;
ri->consumers_cnt = DEF_CNT;
return ri;
}
static struct return_instance *dup_return_instance(struct return_instance *old)
{
size_t size = ri_size(old->consumers_cnt);
struct return_instance *ri;
return kmemdup(old, size, GFP_KERNEL);
ri = kmemdup(old, sizeof(*ri), GFP_KERNEL);
if (!ri)
return NULL;
if (unlikely(old->cons_cnt > 1)) {
ri->extra_consumers = kmemdup(old->extra_consumers,
sizeof(ri->extra_consumers[0]) * (old->cons_cnt - 1),
GFP_KERNEL);
if (!ri->extra_consumers) {
kfree(ri);
return NULL;
}
}
return ri;
}
static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask)
@ -2108,14 +2170,17 @@ unsigned long uprobe_get_trampoline_vaddr(void)
static void cleanup_return_instances(struct uprobe_task *utask, bool chained,
struct pt_regs *regs)
{
struct return_instance *ri = utask->return_instances;
struct return_instance *ri = utask->return_instances, *ri_next;
enum rp_check ctx = chained ? RP_CHECK_CHAIN_CALL : RP_CHECK_CALL;
while (ri && !arch_uretprobe_is_alive(ri, ctx, regs)) {
ri = free_ret_instance(ri, true /* cleanup_hprobe */);
ri_next = ri->next;
rcu_assign_pointer(utask->return_instances, ri_next);
utask->depth--;
free_ret_instance(utask, ri, true /* cleanup_hprobe */);
ri = ri_next;
}
rcu_assign_pointer(utask->return_instances, ri);
}
static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs,
@ -2180,7 +2245,7 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs,
return;
free:
kfree(ri);
ri_free(ri);
}
/* Prepare to single-step probed instruction out of line. */
@ -2294,6 +2359,47 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
return is_trap_insn(&opcode);
}
/*
 * Speculative, lockless VMA -> inode -> uprobe resolution: attempt the
 * lookup without taking mmap_lock, validating afterwards that the mm's
 * sequence count did not change. Returns the uprobe on success, or NULL
 * when speculation was not possible or failed (caller falls back to the
 * locked path).
 */
static struct uprobe *find_active_uprobe_speculative(unsigned long bp_vaddr)
{
	struct mm_struct *mm = current->mm;
	struct uprobe *uprobe = NULL;
	struct vm_area_struct *vma;
	struct file *vm_file;
	loff_t offset;
	unsigned int seq;

	guard(rcu)();

	/* Bail out immediately if an mmap_lock writer is (or was) active. */
	if (!mmap_lock_speculate_try_begin(mm, &seq))
		return NULL;

	vma = vma_lookup(mm, bp_vaddr);
	if (!vma)
		return NULL;

	/*
	 * vm_file memory can be reused for another instance of struct file,
	 * but can't be freed from under us, so it's safe to read fields from
	 * it, even if the values are some garbage values; ultimately
	 * find_uprobe_rcu() + the mmap_lock_speculate_retry() check below
	 * will ensure that whatever we speculatively found is correct.
	 */
	vm_file = READ_ONCE(vma->vm_file);
	if (!vm_file)
		return NULL;

	offset = (loff_t)(vma->vm_pgoff << PAGE_SHIFT) + (bp_vaddr - vma->vm_start);
	uprobe = find_uprobe_rcu(vm_file->f_inode, offset);
	if (!uprobe)
		return NULL;

	/* now double check that nothing about MM changed */
	if (mmap_lock_speculate_retry(mm, seq))
		return NULL;

	return uprobe;
}
/* assumes being inside RCU protected region */
static struct uprobe *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swbp)
{
@ -2301,10 +2407,14 @@ static struct uprobe *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swb
struct uprobe *uprobe = NULL;
struct vm_area_struct *vma;
uprobe = find_active_uprobe_speculative(bp_vaddr);
if (uprobe)
return uprobe;
mmap_read_lock(mm);
vma = vma_lookup(mm, bp_vaddr);
if (vma) {
if (valid_vma(vma, false)) {
if (vma->vm_file) {
struct inode *inode = file_inode(vma->vm_file);
loff_t offset = vaddr_to_offset(vma, bp_vaddr);
@ -2324,25 +2434,27 @@ static struct uprobe *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swb
return uprobe;
}
static struct return_instance*
push_consumer(struct return_instance *ri, int idx, __u64 id, __u64 cookie)
static struct return_instance *push_consumer(struct return_instance *ri, __u64 id, __u64 cookie)
{
struct return_consumer *ric;
if (unlikely(ri == ZERO_SIZE_PTR))
return ri;
if (unlikely(idx >= ri->consumers_cnt)) {
struct return_instance *old_ri = ri;
ri->consumers_cnt += DEF_CNT;
ri = krealloc(old_ri, ri_size(old_ri->consumers_cnt), GFP_KERNEL);
if (!ri) {
kfree(old_ri);
if (unlikely(ri->cons_cnt > 0)) {
ric = krealloc(ri->extra_consumers, sizeof(*ric) * ri->cons_cnt, GFP_KERNEL);
if (!ric) {
ri_free(ri);
return ZERO_SIZE_PTR;
}
ri->extra_consumers = ric;
}
ri->consumers[idx].id = id;
ri->consumers[idx].cookie = cookie;
ric = likely(ri->cons_cnt == 0) ? &ri->consumer : &ri->extra_consumers[ri->cons_cnt - 1];
ric->id = id;
ric->cookie = cookie;
ri->cons_cnt++;
return ri;
}
@ -2350,14 +2462,17 @@ static struct return_consumer *
return_consumer_find(struct return_instance *ri, int *iter, int id)
{
struct return_consumer *ric;
int idx = *iter;
int idx;
for (ric = &ri->consumers[idx]; idx < ri->consumers_cnt; idx++, ric++) {
for (idx = *iter; idx < ri->cons_cnt; idx++)
{
ric = likely(idx == 0) ? &ri->consumer : &ri->extra_consumers[idx - 1];
if (ric->id == id) {
*iter = idx + 1;
return ric;
}
}
return NULL;
}
@ -2371,9 +2486,9 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
struct uprobe_consumer *uc;
bool has_consumers = false, remove = true;
struct return_instance *ri = NULL;
int push_idx = 0;
struct uprobe_task *utask = current->utask;
current->utask->auprobe = &uprobe->arch;
utask->auprobe = &uprobe->arch;
list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) {
bool session = uc->handler && uc->ret_handler;
@ -2393,21 +2508,15 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
continue;
if (!ri)
ri = alloc_return_instance();
ri = alloc_return_instance(utask);
if (session)
ri = push_consumer(ri, push_idx++, uc->id, cookie);
ri = push_consumer(ri, uc->id, cookie);
}
current->utask->auprobe = NULL;
utask->auprobe = NULL;
if (!ZERO_OR_NULL_PTR(ri)) {
/*
* The push_idx value has the final number of return consumers,
* and ri->consumers_cnt has number of allocated consumers.
*/
ri->consumers_cnt = push_idx;
if (!ZERO_OR_NULL_PTR(ri))
prepare_uretprobe(uprobe, regs, ri);
}
if (remove && has_consumers) {
down_read(&uprobe->register_rwsem);
@ -2461,7 +2570,7 @@ static struct return_instance *find_next_ret_chain(struct return_instance *ri)
void uprobe_handle_trampoline(struct pt_regs *regs)
{
struct uprobe_task *utask;
struct return_instance *ri, *next;
struct return_instance *ri, *ri_next, *next_chain;
struct uprobe *uprobe;
enum hprobe_state hstate;
bool valid;
@ -2481,8 +2590,8 @@ void uprobe_handle_trampoline(struct pt_regs *regs)
* or NULL; the latter case means that nobody but ri->func
* could hit this trampoline on return. TODO: sigaltstack().
*/
next = find_next_ret_chain(ri);
valid = !next || arch_uretprobe_is_alive(next, RP_CHECK_RET, regs);
next_chain = find_next_ret_chain(ri);
valid = !next_chain || arch_uretprobe_is_alive(next_chain, RP_CHECK_RET, regs);
instruction_pointer_set(regs, ri->orig_ret_vaddr);
do {
@ -2494,7 +2603,9 @@ void uprobe_handle_trampoline(struct pt_regs *regs)
* trampoline addresses on the stack are replaced with correct
* original return addresses
*/
rcu_assign_pointer(utask->return_instances, ri->next);
ri_next = ri->next;
rcu_assign_pointer(utask->return_instances, ri_next);
utask->depth--;
uprobe = hprobe_consume(&ri->hprobe, &hstate);
if (valid)
@ -2502,9 +2613,9 @@ void uprobe_handle_trampoline(struct pt_regs *regs)
hprobe_finalize(&ri->hprobe, hstate);
/* We already took care of hprobe, no need to waste more time on that. */
ri = free_ret_instance(ri, false /* !cleanup_hprobe */);
utask->depth--;
} while (ri != next);
free_ret_instance(utask, ri, false /* !cleanup_hprobe */);
ri = ri_next;
} while (ri != next_chain);
} while (!valid);
return;

View File

@ -448,7 +448,7 @@ static bool vma_lock_alloc(struct vm_area_struct *vma)
return false;
init_rwsem(&vma->vm_lock->lock);
vma->vm_lock_seq = -1;
vma->vm_lock_seq = UINT_MAX;
return true;
}
@ -1262,9 +1262,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
seqcount_init(&mm->write_protect_seq);
mmap_init_lock(mm);
INIT_LIST_HEAD(&mm->mmlist);
#ifdef CONFIG_PER_VMA_LOCK
mm->mm_lock_seq = 0;
#endif
mm_pgtables_bytes_init(mm);
mm->map_count = 0;
mm->locked_vm = 0;

View File

@ -619,7 +619,8 @@ static const struct bpf_func_proto bpf_perf_event_read_value_proto = {
static __always_inline u64
__bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
u64 flags, struct perf_sample_data *sd)
u64 flags, struct perf_raw_record *raw,
struct perf_sample_data *sd)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
unsigned int cpu = smp_processor_id();
@ -644,6 +645,8 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
if (unlikely(event->oncpu != cpu))
return -EOPNOTSUPP;
perf_sample_save_raw_data(sd, event, raw);
return perf_event_output(event, sd, regs);
}
@ -687,9 +690,8 @@ BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map,
}
perf_sample_data_init(sd, 0, 0);
perf_sample_save_raw_data(sd, &raw);
err = __bpf_perf_event_output(regs, map, flags, sd);
err = __bpf_perf_event_output(regs, map, flags, &raw, sd);
out:
this_cpu_dec(bpf_trace_nest_level);
preempt_enable();
@ -748,9 +750,8 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
perf_fetch_caller_regs(regs);
perf_sample_data_init(sd, 0, 0);
perf_sample_save_raw_data(sd, &raw);
ret = __bpf_perf_event_output(regs, map, flags, sd);
ret = __bpf_perf_event_output(regs, map, flags, &raw, sd);
out:
this_cpu_dec(bpf_event_output_nest_level);
preempt_enable();

View File

@ -3360,8 +3360,7 @@ static unsigned long gup_fast(unsigned long start, unsigned long end,
return 0;
if (gup_flags & FOLL_PIN) {
seq = raw_read_seqcount(&current->mm->write_protect_seq);
if (seq & 1)
if (!raw_seqcount_try_begin(&current->mm->write_protect_seq, seq))
return 0;
}

View File

@ -40,7 +40,7 @@ struct mm_struct init_mm = {
.arg_lock = __SPIN_LOCK_UNLOCKED(init_mm.arg_lock),
.mmlist = LIST_HEAD_INIT(init_mm.mmlist),
#ifdef CONFIG_PER_VMA_LOCK
.mm_lock_seq = 0,
.mm_lock_seq = SEQCNT_ZERO(init_mm.mm_lock_seq),
#endif
.user_ns = &init_user_ns,
.cpu_bitmap = CPU_BITS_NONE,

View File

@ -89,7 +89,7 @@ static struct vm_area_struct *alloc_and_link_vma(struct mm_struct *mm,
* begun. Linking to the tree will have caused this to be incremented,
* which means we will get a false positive otherwise.
*/
vma->vm_lock_seq = -1;
vma->vm_lock_seq = UINT_MAX;
return vma;
}
@ -214,7 +214,7 @@ static bool vma_write_started(struct vm_area_struct *vma)
int seq = vma->vm_lock_seq;
/* We reset after each check. */
vma->vm_lock_seq = -1;
vma->vm_lock_seq = UINT_MAX;
/* The vma_start_write() stub simply increments this value. */
return seq > -1;

View File

@ -241,7 +241,7 @@ struct vm_area_struct {
* counter reuse can only lead to occasional unnecessary use of the
* slowpath.
*/
int vm_lock_seq;
unsigned int vm_lock_seq;
struct vma_lock *vm_lock;
#endif
@ -416,7 +416,7 @@ static inline bool vma_lock_alloc(struct vm_area_struct *vma)
return false;
init_rwsem(&vma->vm_lock->lock);
vma->vm_lock_seq = -1;
vma->vm_lock_seq = UINT_MAX;
return true;
}