Merge branch 'perf/core' into tip/master

# New commits in perf/core:
    b709eb872e19 ("perf: map pages in advance")
    6d642735cdb6 ("perf/x86/intel/uncore: Support more units on Granite Rapids")
    3f710be02ea6 ("perf/x86/intel/uncore: Clean up func_id")
    0e45818ec189 ("perf/x86/intel: Support RDPMC metrics clear mode")
    02c56362a7d3 ("uprobes: Guard against kmemdup() failing in dup_return_instance()")
    d29e744c7167 ("perf/x86: Relax privilege filter restriction on AMD IBS")
    6057b90ecc84 ("perf/core: Export perf_exclude_event()")
    8622e45b5da1 ("uprobes: Reuse return_instances between multiple uretprobes within task")
    0cf981de7687 ("uprobes: Ensure return_instance is detached from the list before freeing")
    636666a1c733 ("uprobes: Decouple return_instance list traversal and freeing")
    2ff913ab3f47 ("uprobes: Simplify session consumer tracking")
    e0925f2dc4de ("uprobes: add speculative lockless VMA-to-inode-to-uprobe resolution")
    83e3dc9a5d4d ("uprobes: simplify find_active_uprobe_rcu() VMA checks")
    03a001b156d2 ("mm: introduce mmap_lock_speculate_{try_begin|retry}")
    eb449bd96954 ("mm: convert mm_lock_seq to a proper seqcount")
    7528585290a1 ("mm/gup: Use raw_seqcount_try_begin()")
    96450ead1652 ("seqlock: add raw_seqcount_try_begin")
    b4943b8bfc41 ("perf/x86/rapl: Add core energy counter support for AMD CPUs")
    54d2759778c1 ("perf/x86/rapl: Move the cntr_mask to rapl_pmus struct")
    bdc57ec70548 ("perf/x86/rapl: Remove the global variable rapl_msrs")
    abf03d9bd20c ("perf/x86/rapl: Modify the generic variable names to *_pkg*")
    eeca4c6b2529 ("perf/x86/rapl: Add arguments to the init and cleanup functions")
    cd29d83a6d81 ("perf/x86/rapl: Make rapl_model struct global")
    8bf1c86e5ac8 ("perf/x86/rapl: Rename rapl_pmu variables")
    1d5e2f637a94 ("perf/x86/rapl: Remove the cpu_to_rapl_pmu() function")
    e4b444347795 ("x86/topology: Introduce topology_logical_core_id()")
    2f2db347071a ("perf/x86/rapl: Remove the unused get_rapl_pmu_cpumask() function")
    ae55e308bde2 ("perf/x86/intel/ds: Simplify the PEBS records processing for adaptive PEBS")
    3c00ed344cef ("perf/x86/intel/ds: Factor out functions for PEBS records processing")
    7087bfb0adc9 ("perf/x86/intel/ds: Clarify adaptive PEBS processing")
    faac6f105ef1 ("perf/core: Check sample_type in perf_sample_save_brstack")
    f226805bc5f6 ("perf/core: Check sample_type in perf_sample_save_callchain")
    b9c44b91476b ("perf/core: Save raw sample data conditionally based on sample type")

Signed-off-by: Ingo Molnar <mingo@kernel.org>
This commit is contained in:
Ingo Molnar 2025-01-11 17:05:00 +01:00
commit 40c78513bc
36 changed files with 958 additions and 499 deletions

View File

@ -135,6 +135,10 @@ Thread-related topology information in the kernel:
The ID of the core to which a thread belongs. It is also printed in /proc/cpuinfo The ID of the core to which a thread belongs. It is also printed in /proc/cpuinfo
"core_id." "core_id."
- topology_logical_core_id();
The logical core ID to which a thread belongs.
System topology examples System topology examples

View File

@ -981,7 +981,7 @@ static int cfdiag_push_sample(struct perf_event *event,
if (event->attr.sample_type & PERF_SAMPLE_RAW) { if (event->attr.sample_type & PERF_SAMPLE_RAW) {
raw.frag.size = cpuhw->usedss; raw.frag.size = cpuhw->usedss;
raw.frag.data = cpuhw->stop; raw.frag.data = cpuhw->stop;
perf_sample_save_raw_data(&data, &raw); perf_sample_save_raw_data(&data, event, &raw);
} }
overflow = perf_event_overflow(event, &data, &regs); overflow = perf_event_overflow(event, &data, &regs);

View File

@ -981,7 +981,7 @@ static void cpumsf_pmu_disable(struct pmu *pmu)
cpuhw->flags &= ~PMU_F_ENABLED; cpuhw->flags &= ~PMU_F_ENABLED;
} }
/* perf_exclude_event() - Filter event /* perf_event_exclude() - Filter event
* @event: The perf event * @event: The perf event
* @regs: pt_regs structure * @regs: pt_regs structure
* @sde_regs: Sample-data-entry (sde) regs structure * @sde_regs: Sample-data-entry (sde) regs structure
@ -990,7 +990,7 @@ static void cpumsf_pmu_disable(struct pmu *pmu)
* *
* Return non-zero if the event shall be excluded. * Return non-zero if the event shall be excluded.
*/ */
static int perf_exclude_event(struct perf_event *event, struct pt_regs *regs, static int perf_event_exclude(struct perf_event *event, struct pt_regs *regs,
struct perf_sf_sde_regs *sde_regs) struct perf_sf_sde_regs *sde_regs)
{ {
if (event->attr.exclude_user && user_mode(regs)) if (event->attr.exclude_user && user_mode(regs))
@ -1073,7 +1073,7 @@ static int perf_push_sample(struct perf_event *event,
data.tid_entry.pid = basic->hpp & LPP_PID_MASK; data.tid_entry.pid = basic->hpp & LPP_PID_MASK;
overflow = 0; overflow = 0;
if (perf_exclude_event(event, &regs, sde_regs)) if (perf_event_exclude(event, &regs, sde_regs))
goto out; goto out;
if (perf_event_overflow(event, &data, &regs)) { if (perf_event_overflow(event, &data, &regs)) {
overflow = 1; overflow = 1;

View File

@ -478,7 +478,7 @@ static int paicrypt_push_sample(size_t rawsize, struct paicrypt_map *cpump,
if (event->attr.sample_type & PERF_SAMPLE_RAW) { if (event->attr.sample_type & PERF_SAMPLE_RAW) {
raw.frag.size = rawsize; raw.frag.size = rawsize;
raw.frag.data = cpump->save; raw.frag.data = cpump->save;
perf_sample_save_raw_data(&data, &raw); perf_sample_save_raw_data(&data, event, &raw);
} }
overflow = perf_event_overflow(event, &data, &regs); overflow = perf_event_overflow(event, &data, &regs);

View File

@ -503,7 +503,7 @@ static int paiext_push_sample(size_t rawsize, struct paiext_map *cpump,
if (event->attr.sample_type & PERF_SAMPLE_RAW) { if (event->attr.sample_type & PERF_SAMPLE_RAW) {
raw.frag.size = rawsize; raw.frag.size = rawsize;
raw.frag.data = cpump->save; raw.frag.data = cpump->save;
perf_sample_save_raw_data(&data, &raw); perf_sample_save_raw_data(&data, event, &raw);
} }
overflow = perf_event_overflow(event, &data, &regs); overflow = perf_event_overflow(event, &data, &regs);

View File

@ -1001,8 +1001,7 @@ static int amd_pmu_v2_handle_irq(struct pt_regs *regs)
if (!x86_perf_event_set_period(event)) if (!x86_perf_event_set_period(event))
continue; continue;
if (has_branch_stack(event)) perf_sample_save_brstack(&data, event, &cpuc->lbr_stack, NULL);
perf_sample_save_brstack(&data, event, &cpuc->lbr_stack, NULL);
if (perf_event_overflow(event, &data, regs)) if (perf_event_overflow(event, &data, regs))
x86_pmu_stop(event, 0); x86_pmu_stop(event, 0);

View File

@ -31,6 +31,8 @@ static u32 ibs_caps;
#define IBS_FETCH_CONFIG_MASK (IBS_FETCH_RAND_EN | IBS_FETCH_MAX_CNT) #define IBS_FETCH_CONFIG_MASK (IBS_FETCH_RAND_EN | IBS_FETCH_MAX_CNT)
#define IBS_OP_CONFIG_MASK IBS_OP_MAX_CNT #define IBS_OP_CONFIG_MASK IBS_OP_MAX_CNT
/* attr.config2 */
#define IBS_SW_FILTER_MASK 1
/* /*
* IBS states: * IBS states:
@ -290,6 +292,16 @@ static int perf_ibs_init(struct perf_event *event)
if (has_branch_stack(event)) if (has_branch_stack(event))
return -EOPNOTSUPP; return -EOPNOTSUPP;
/* handle exclude_{user,kernel} in the IRQ handler */
if (event->attr.exclude_host || event->attr.exclude_guest ||
event->attr.exclude_idle)
return -EINVAL;
if (!(event->attr.config2 & IBS_SW_FILTER_MASK) &&
(event->attr.exclude_kernel || event->attr.exclude_user ||
event->attr.exclude_hv))
return -EINVAL;
ret = validate_group(event); ret = validate_group(event);
if (ret) if (ret)
return ret; return ret;
@ -550,24 +562,14 @@ static struct attribute *attrs_empty[] = {
NULL, NULL,
}; };
static struct attribute_group empty_format_group = {
.name = "format",
.attrs = attrs_empty,
};
static struct attribute_group empty_caps_group = { static struct attribute_group empty_caps_group = {
.name = "caps", .name = "caps",
.attrs = attrs_empty, .attrs = attrs_empty,
}; };
static const struct attribute_group *empty_attr_groups[] = {
&empty_format_group,
&empty_caps_group,
NULL,
};
PMU_FORMAT_ATTR(rand_en, "config:57"); PMU_FORMAT_ATTR(rand_en, "config:57");
PMU_FORMAT_ATTR(cnt_ctl, "config:19"); PMU_FORMAT_ATTR(cnt_ctl, "config:19");
PMU_FORMAT_ATTR(swfilt, "config2:0");
PMU_EVENT_ATTR_STRING(l3missonly, fetch_l3missonly, "config:59"); PMU_EVENT_ATTR_STRING(l3missonly, fetch_l3missonly, "config:59");
PMU_EVENT_ATTR_STRING(l3missonly, op_l3missonly, "config:16"); PMU_EVENT_ATTR_STRING(l3missonly, op_l3missonly, "config:16");
PMU_EVENT_ATTR_STRING(zen4_ibs_extensions, zen4_ibs_extensions, "1"); PMU_EVENT_ATTR_STRING(zen4_ibs_extensions, zen4_ibs_extensions, "1");
@ -578,8 +580,9 @@ zen4_ibs_extensions_is_visible(struct kobject *kobj, struct attribute *attr, int
return ibs_caps & IBS_CAPS_ZEN4 ? attr->mode : 0; return ibs_caps & IBS_CAPS_ZEN4 ? attr->mode : 0;
} }
static struct attribute *rand_en_attrs[] = { static struct attribute *fetch_attrs[] = {
&format_attr_rand_en.attr, &format_attr_rand_en.attr,
&format_attr_swfilt.attr,
NULL, NULL,
}; };
@ -593,9 +596,9 @@ static struct attribute *zen4_ibs_extensions_attrs[] = {
NULL, NULL,
}; };
static struct attribute_group group_rand_en = { static struct attribute_group group_fetch_formats = {
.name = "format", .name = "format",
.attrs = rand_en_attrs, .attrs = fetch_attrs,
}; };
static struct attribute_group group_fetch_l3missonly = { static struct attribute_group group_fetch_l3missonly = {
@ -611,7 +614,7 @@ static struct attribute_group group_zen4_ibs_extensions = {
}; };
static const struct attribute_group *fetch_attr_groups[] = { static const struct attribute_group *fetch_attr_groups[] = {
&group_rand_en, &group_fetch_formats,
&empty_caps_group, &empty_caps_group,
NULL, NULL,
}; };
@ -628,6 +631,11 @@ cnt_ctl_is_visible(struct kobject *kobj, struct attribute *attr, int i)
return ibs_caps & IBS_CAPS_OPCNT ? attr->mode : 0; return ibs_caps & IBS_CAPS_OPCNT ? attr->mode : 0;
} }
static struct attribute *op_attrs[] = {
&format_attr_swfilt.attr,
NULL,
};
static struct attribute *cnt_ctl_attrs[] = { static struct attribute *cnt_ctl_attrs[] = {
&format_attr_cnt_ctl.attr, &format_attr_cnt_ctl.attr,
NULL, NULL,
@ -638,6 +646,11 @@ static struct attribute *op_l3missonly_attrs[] = {
NULL, NULL,
}; };
static struct attribute_group group_op_formats = {
.name = "format",
.attrs = op_attrs,
};
static struct attribute_group group_cnt_ctl = { static struct attribute_group group_cnt_ctl = {
.name = "format", .name = "format",
.attrs = cnt_ctl_attrs, .attrs = cnt_ctl_attrs,
@ -650,6 +663,12 @@ static struct attribute_group group_op_l3missonly = {
.is_visible = zen4_ibs_extensions_is_visible, .is_visible = zen4_ibs_extensions_is_visible,
}; };
static const struct attribute_group *op_attr_groups[] = {
&group_op_formats,
&empty_caps_group,
NULL,
};
static const struct attribute_group *op_attr_update[] = { static const struct attribute_group *op_attr_update[] = {
&group_cnt_ctl, &group_cnt_ctl,
&group_op_l3missonly, &group_op_l3missonly,
@ -667,7 +686,6 @@ static struct perf_ibs perf_ibs_fetch = {
.start = perf_ibs_start, .start = perf_ibs_start,
.stop = perf_ibs_stop, .stop = perf_ibs_stop,
.read = perf_ibs_read, .read = perf_ibs_read,
.capabilities = PERF_PMU_CAP_NO_EXCLUDE,
}, },
.msr = MSR_AMD64_IBSFETCHCTL, .msr = MSR_AMD64_IBSFETCHCTL,
.config_mask = IBS_FETCH_CONFIG_MASK, .config_mask = IBS_FETCH_CONFIG_MASK,
@ -691,7 +709,6 @@ static struct perf_ibs perf_ibs_op = {
.start = perf_ibs_start, .start = perf_ibs_start,
.stop = perf_ibs_stop, .stop = perf_ibs_stop,
.read = perf_ibs_read, .read = perf_ibs_read,
.capabilities = PERF_PMU_CAP_NO_EXCLUDE,
}, },
.msr = MSR_AMD64_IBSOPCTL, .msr = MSR_AMD64_IBSOPCTL,
.config_mask = IBS_OP_CONFIG_MASK, .config_mask = IBS_OP_CONFIG_MASK,
@ -1111,6 +1128,12 @@ fail:
regs.flags |= PERF_EFLAGS_EXACT; regs.flags |= PERF_EFLAGS_EXACT;
} }
if ((event->attr.config2 & IBS_SW_FILTER_MASK) &&
perf_exclude_event(event, &regs)) {
throttle = perf_event_account_interrupt(event);
goto out;
}
if (event->attr.sample_type & PERF_SAMPLE_RAW) { if (event->attr.sample_type & PERF_SAMPLE_RAW) {
raw = (struct perf_raw_record){ raw = (struct perf_raw_record){
.frag = { .frag = {
@ -1118,7 +1141,7 @@ fail:
.data = ibs_data.data, .data = ibs_data.data,
}, },
}; };
perf_sample_save_raw_data(&data, &raw); perf_sample_save_raw_data(&data, event, &raw);
} }
if (perf_ibs == &perf_ibs_op) if (perf_ibs == &perf_ibs_op)
@ -1129,8 +1152,7 @@ fail:
* recorded as part of interrupt regs. Thus we need to use rip from * recorded as part of interrupt regs. Thus we need to use rip from
* interrupt regs while unwinding call stack. * interrupt regs while unwinding call stack.
*/ */
if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) perf_sample_save_callchain(&data, event, iregs);
perf_sample_save_callchain(&data, event, iregs);
throttle = perf_event_overflow(event, &data, &regs); throttle = perf_event_overflow(event, &data, &regs);
out: out:
@ -1228,7 +1250,7 @@ static __init int perf_ibs_op_init(void)
if (ibs_caps & IBS_CAPS_ZEN4) if (ibs_caps & IBS_CAPS_ZEN4)
perf_ibs_op.config_mask |= IBS_OP_L3MISSONLY; perf_ibs_op.config_mask |= IBS_OP_L3MISSONLY;
perf_ibs_op.pmu.attr_groups = empty_attr_groups; perf_ibs_op.pmu.attr_groups = op_attr_groups;
perf_ibs_op.pmu.attr_update = op_attr_update; perf_ibs_op.pmu.attr_update = op_attr_update;
return perf_ibs_pmu_init(&perf_ibs_op, "ibs_op"); return perf_ibs_pmu_init(&perf_ibs_op, "ibs_op");

View File

@ -1707,8 +1707,7 @@ int x86_pmu_handle_irq(struct pt_regs *regs)
perf_sample_data_init(&data, 0, event->hw.last_period); perf_sample_data_init(&data, 0, event->hw.last_period);
if (has_branch_stack(event)) perf_sample_save_brstack(&data, event, &cpuc->lbr_stack, NULL);
perf_sample_save_brstack(&data, event, &cpuc->lbr_stack, NULL);
if (perf_event_overflow(event, &data, regs)) if (perf_event_overflow(event, &data, regs))
x86_pmu_stop(event, 0); x86_pmu_stop(event, 0);

View File

@ -2826,6 +2826,9 @@ static void intel_pmu_enable_fixed(struct perf_event *event)
return; return;
idx = INTEL_PMC_IDX_FIXED_SLOTS; idx = INTEL_PMC_IDX_FIXED_SLOTS;
if (event->attr.config1 & INTEL_TD_CFG_METRIC_CLEAR)
bits |= INTEL_FIXED_3_METRICS_CLEAR;
} }
intel_set_masks(event, idx); intel_set_masks(event, idx);
@ -4081,7 +4084,12 @@ static int intel_pmu_hw_config(struct perf_event *event)
* is used in a metrics group, it too cannot support sampling. * is used in a metrics group, it too cannot support sampling.
*/ */
if (intel_pmu_has_cap(event, PERF_CAP_METRICS_IDX) && is_topdown_event(event)) { if (intel_pmu_has_cap(event, PERF_CAP_METRICS_IDX) && is_topdown_event(event)) {
if (event->attr.config1 || event->attr.config2) /* The metrics_clear can only be set for the slots event */
if (event->attr.config1 &&
(!is_slots_event(event) || (event->attr.config1 & ~INTEL_TD_CFG_METRIC_CLEAR)))
return -EINVAL;
if (event->attr.config2)
return -EINVAL; return -EINVAL;
/* /*
@ -4690,6 +4698,8 @@ PMU_FORMAT_ATTR(in_tx, "config:32" );
PMU_FORMAT_ATTR(in_tx_cp, "config:33" ); PMU_FORMAT_ATTR(in_tx_cp, "config:33" );
PMU_FORMAT_ATTR(eq, "config:36" ); /* v6 + */ PMU_FORMAT_ATTR(eq, "config:36" ); /* v6 + */
PMU_FORMAT_ATTR(metrics_clear, "config1:0"); /* PERF_CAPABILITIES.RDPMC_METRICS_CLEAR */
static ssize_t umask2_show(struct device *dev, static ssize_t umask2_show(struct device *dev,
struct device_attribute *attr, struct device_attribute *attr,
char *page) char *page)
@ -4709,6 +4719,7 @@ static struct device_attribute format_attr_umask2 =
static struct attribute *format_evtsel_ext_attrs[] = { static struct attribute *format_evtsel_ext_attrs[] = {
&format_attr_umask2.attr, &format_attr_umask2.attr,
&format_attr_eq.attr, &format_attr_eq.attr,
&format_attr_metrics_clear.attr,
NULL NULL
}; };
@ -4733,6 +4744,13 @@ evtsel_ext_is_visible(struct kobject *kobj, struct attribute *attr, int i)
if (i == 1) if (i == 1)
return (mask & ARCH_PERFMON_EVENTSEL_EQ) ? attr->mode : 0; return (mask & ARCH_PERFMON_EVENTSEL_EQ) ? attr->mode : 0;
/* PERF_CAPABILITIES.RDPMC_METRICS_CLEAR */
if (i == 2) {
union perf_capabilities intel_cap = hybrid(dev_get_drvdata(dev), intel_cap);
return intel_cap.rdpmc_metrics_clear ? attr->mode : 0;
}
return 0; return 0;
} }

View File

@ -1789,8 +1789,7 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event,
* previous PMI context or an (I)RET happened between the record and * previous PMI context or an (I)RET happened between the record and
* PMI. * PMI.
*/ */
if (sample_type & PERF_SAMPLE_CALLCHAIN) perf_sample_save_callchain(data, event, iregs);
perf_sample_save_callchain(data, event, iregs);
/* /*
* We use the interrupt regs as a base because the PEBS record does not * We use the interrupt regs as a base because the PEBS record does not
@ -1889,8 +1888,7 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event,
if (x86_pmu.intel_cap.pebs_format >= 3) if (x86_pmu.intel_cap.pebs_format >= 3)
setup_pebs_time(event, data, pebs->tsc); setup_pebs_time(event, data, pebs->tsc);
if (has_branch_stack(event)) perf_sample_save_brstack(data, event, &cpuc->lbr_stack, NULL);
perf_sample_save_brstack(data, event, &cpuc->lbr_stack, NULL);
} }
static void adaptive_pebs_save_regs(struct pt_regs *regs, static void adaptive_pebs_save_regs(struct pt_regs *regs,
@ -1917,8 +1915,6 @@ static void adaptive_pebs_save_regs(struct pt_regs *regs,
} }
#define PEBS_LATENCY_MASK 0xffff #define PEBS_LATENCY_MASK 0xffff
#define PEBS_CACHE_LATENCY_OFFSET 32
#define PEBS_RETIRE_LATENCY_OFFSET 32
/* /*
* With adaptive PEBS the layout depends on what fields are configured. * With adaptive PEBS the layout depends on what fields are configured.
@ -1932,8 +1928,7 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct pebs_basic *basic = __pebs; struct pebs_basic *basic = __pebs;
void *next_record = basic + 1; void *next_record = basic + 1;
u64 sample_type; u64 sample_type, format_group;
u64 format_size;
struct pebs_meminfo *meminfo = NULL; struct pebs_meminfo *meminfo = NULL;
struct pebs_gprs *gprs = NULL; struct pebs_gprs *gprs = NULL;
struct x86_perf_regs *perf_regs; struct x86_perf_regs *perf_regs;
@ -1945,7 +1940,7 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
perf_regs->xmm_regs = NULL; perf_regs->xmm_regs = NULL;
sample_type = event->attr.sample_type; sample_type = event->attr.sample_type;
format_size = basic->format_size; format_group = basic->format_group;
perf_sample_data_init(data, 0, event->hw.last_period); perf_sample_data_init(data, 0, event->hw.last_period);
data->period = event->hw.last_period; data->period = event->hw.last_period;
@ -1957,8 +1952,7 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
* previous PMI context or an (I)RET happened between the record and * previous PMI context or an (I)RET happened between the record and
* PMI. * PMI.
*/ */
if (sample_type & PERF_SAMPLE_CALLCHAIN) perf_sample_save_callchain(data, event, iregs);
perf_sample_save_callchain(data, event, iregs);
*regs = *iregs; *regs = *iregs;
/* The ip in basic is EventingIP */ /* The ip in basic is EventingIP */
@ -1967,7 +1961,7 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
if (sample_type & PERF_SAMPLE_WEIGHT_STRUCT) { if (sample_type & PERF_SAMPLE_WEIGHT_STRUCT) {
if (x86_pmu.flags & PMU_FL_RETIRE_LATENCY) if (x86_pmu.flags & PMU_FL_RETIRE_LATENCY)
data->weight.var3_w = format_size >> PEBS_RETIRE_LATENCY_OFFSET & PEBS_LATENCY_MASK; data->weight.var3_w = basic->retire_latency;
else else
data->weight.var3_w = 0; data->weight.var3_w = 0;
} }
@ -1977,12 +1971,12 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
* But PERF_SAMPLE_TRANSACTION needs gprs->ax. * But PERF_SAMPLE_TRANSACTION needs gprs->ax.
* Save the pointer here but process later. * Save the pointer here but process later.
*/ */
if (format_size & PEBS_DATACFG_MEMINFO) { if (format_group & PEBS_DATACFG_MEMINFO) {
meminfo = next_record; meminfo = next_record;
next_record = meminfo + 1; next_record = meminfo + 1;
} }
if (format_size & PEBS_DATACFG_GP) { if (format_group & PEBS_DATACFG_GP) {
gprs = next_record; gprs = next_record;
next_record = gprs + 1; next_record = gprs + 1;
@ -1995,14 +1989,13 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
adaptive_pebs_save_regs(regs, gprs); adaptive_pebs_save_regs(regs, gprs);
} }
if (format_size & PEBS_DATACFG_MEMINFO) { if (format_group & PEBS_DATACFG_MEMINFO) {
if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) { if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) {
u64 weight = meminfo->latency; u64 latency = x86_pmu.flags & PMU_FL_INSTR_LATENCY ?
meminfo->cache_latency : meminfo->mem_latency;
if (x86_pmu.flags & PMU_FL_INSTR_LATENCY) { if (x86_pmu.flags & PMU_FL_INSTR_LATENCY)
data->weight.var2_w = weight & PEBS_LATENCY_MASK; data->weight.var2_w = meminfo->instr_latency;
weight >>= PEBS_CACHE_LATENCY_OFFSET;
}
/* /*
* Although meminfo::latency is defined as a u64, * Although meminfo::latency is defined as a u64,
@ -2010,12 +2003,13 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
* in practice on Ice Lake and earlier platforms. * in practice on Ice Lake and earlier platforms.
*/ */
if (sample_type & PERF_SAMPLE_WEIGHT) { if (sample_type & PERF_SAMPLE_WEIGHT) {
data->weight.full = weight ?: data->weight.full = latency ?:
intel_get_tsx_weight(meminfo->tsx_tuning); intel_get_tsx_weight(meminfo->tsx_tuning);
} else { } else {
data->weight.var1_dw = (u32)(weight & PEBS_LATENCY_MASK) ?: data->weight.var1_dw = (u32)latency ?:
intel_get_tsx_weight(meminfo->tsx_tuning); intel_get_tsx_weight(meminfo->tsx_tuning);
} }
data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE; data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE;
} }
@ -2036,16 +2030,16 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
} }
} }
if (format_size & PEBS_DATACFG_XMMS) { if (format_group & PEBS_DATACFG_XMMS) {
struct pebs_xmm *xmm = next_record; struct pebs_xmm *xmm = next_record;
next_record = xmm + 1; next_record = xmm + 1;
perf_regs->xmm_regs = xmm->xmm; perf_regs->xmm_regs = xmm->xmm;
} }
if (format_size & PEBS_DATACFG_LBRS) { if (format_group & PEBS_DATACFG_LBRS) {
struct lbr_entry *lbr = next_record; struct lbr_entry *lbr = next_record;
int num_lbr = ((format_size >> PEBS_DATACFG_LBR_SHIFT) int num_lbr = ((format_group >> PEBS_DATACFG_LBR_SHIFT)
& 0xff) + 1; & 0xff) + 1;
next_record = next_record + num_lbr * sizeof(struct lbr_entry); next_record = next_record + num_lbr * sizeof(struct lbr_entry);
@ -2055,11 +2049,11 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
} }
} }
WARN_ONCE(next_record != __pebs + (format_size >> 48), WARN_ONCE(next_record != __pebs + basic->format_size,
"PEBS record size %llu, expected %llu, config %llx\n", "PEBS record size %u, expected %llu, config %llx\n",
format_size >> 48, basic->format_size,
(u64)(next_record - __pebs), (u64)(next_record - __pebs),
basic->format_size); format_group);
} }
static inline void * static inline void *
@ -2170,46 +2164,33 @@ intel_pmu_save_and_restart_reload(struct perf_event *event, int count)
return 0; return 0;
} }
typedef void (*setup_fn)(struct perf_event *, struct pt_regs *, void *,
struct perf_sample_data *, struct pt_regs *);
static struct pt_regs dummy_iregs;
static __always_inline void static __always_inline void
__intel_pmu_pebs_event(struct perf_event *event, __intel_pmu_pebs_event(struct perf_event *event,
struct pt_regs *iregs, struct pt_regs *iregs,
struct pt_regs *regs,
struct perf_sample_data *data, struct perf_sample_data *data,
void *base, void *top, void *at,
int bit, int count, setup_fn setup_sample)
void (*setup_sample)(struct perf_event *, {
struct pt_regs *, setup_sample(event, iregs, at, data, regs);
void *, perf_event_output(event, data, regs);
struct perf_sample_data *, }
struct pt_regs *))
static __always_inline void
__intel_pmu_pebs_last_event(struct perf_event *event,
struct pt_regs *iregs,
struct pt_regs *regs,
struct perf_sample_data *data,
void *at,
int count,
setup_fn setup_sample)
{ {
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct hw_perf_event *hwc = &event->hw; struct hw_perf_event *hwc = &event->hw;
struct x86_perf_regs perf_regs;
struct pt_regs *regs = &perf_regs.regs;
void *at = get_next_pebs_record_by_bit(base, top, bit);
static struct pt_regs dummy_iregs;
if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) {
/*
* Now, auto-reload is only enabled in fixed period mode.
* The reload value is always hwc->sample_period.
* May need to change it, if auto-reload is enabled in
* freq mode later.
*/
intel_pmu_save_and_restart_reload(event, count);
} else if (!intel_pmu_save_and_restart(event))
return;
if (!iregs)
iregs = &dummy_iregs;
while (count > 1) {
setup_sample(event, iregs, at, data, regs);
perf_event_output(event, data, regs);
at += cpuc->pebs_record_size;
at = get_next_pebs_record_by_bit(at, top, bit);
count--;
}
setup_sample(event, iregs, at, data, regs); setup_sample(event, iregs, at, data, regs);
if (iregs == &dummy_iregs) { if (iregs == &dummy_iregs) {
@ -2228,6 +2209,44 @@ __intel_pmu_pebs_event(struct perf_event *event,
if (perf_event_overflow(event, data, regs)) if (perf_event_overflow(event, data, regs))
x86_pmu_stop(event, 0); x86_pmu_stop(event, 0);
} }
if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) {
/*
* Now, auto-reload is only enabled in fixed period mode.
* The reload value is always hwc->sample_period.
* May need to change it, if auto-reload is enabled in
* freq mode later.
*/
intel_pmu_save_and_restart_reload(event, count);
} else
intel_pmu_save_and_restart(event);
}
static __always_inline void
__intel_pmu_pebs_events(struct perf_event *event,
struct pt_regs *iregs,
struct perf_sample_data *data,
void *base, void *top,
int bit, int count,
setup_fn setup_sample)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct x86_perf_regs perf_regs;
struct pt_regs *regs = &perf_regs.regs;
void *at = get_next_pebs_record_by_bit(base, top, bit);
int cnt = count;
if (!iregs)
iregs = &dummy_iregs;
while (cnt > 1) {
__intel_pmu_pebs_event(event, iregs, regs, data, at, setup_sample);
at += cpuc->pebs_record_size;
at = get_next_pebs_record_by_bit(at, top, bit);
cnt--;
}
__intel_pmu_pebs_last_event(event, iregs, regs, data, at, count, setup_sample);
} }
static void intel_pmu_drain_pebs_core(struct pt_regs *iregs, struct perf_sample_data *data) static void intel_pmu_drain_pebs_core(struct pt_regs *iregs, struct perf_sample_data *data)
@ -2264,8 +2283,8 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs, struct perf_sample_
return; return;
} }
__intel_pmu_pebs_event(event, iregs, data, at, top, 0, n, __intel_pmu_pebs_events(event, iregs, data, at, top, 0, n,
setup_pebs_fixed_sample_data); setup_pebs_fixed_sample_data);
} }
static void intel_pmu_pebs_event_update_no_drain(struct cpu_hw_events *cpuc, int size) static void intel_pmu_pebs_event_update_no_drain(struct cpu_hw_events *cpuc, int size)
@ -2396,9 +2415,9 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, struct perf_sample_d
} }
if (counts[bit]) { if (counts[bit]) {
__intel_pmu_pebs_event(event, iregs, data, base, __intel_pmu_pebs_events(event, iregs, data, base,
top, bit, counts[bit], top, bit, counts[bit],
setup_pebs_fixed_sample_data); setup_pebs_fixed_sample_data);
} }
} }
} }
@ -2406,8 +2425,12 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, struct perf_sample_d
static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_data *data) static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_data *data)
{ {
short counts[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS] = {}; short counts[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS] = {};
void *last[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS];
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct debug_store *ds = cpuc->ds; struct debug_store *ds = cpuc->ds;
struct x86_perf_regs perf_regs;
struct pt_regs *regs = &perf_regs.regs;
struct pebs_basic *basic;
struct perf_event *event; struct perf_event *event;
void *base, *at, *top; void *base, *at, *top;
int bit; int bit;
@ -2429,30 +2452,41 @@ static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_d
return; return;
} }
for (at = base; at < top; at += cpuc->pebs_record_size) { if (!iregs)
iregs = &dummy_iregs;
/* Process all but the last event for each counter. */
for (at = base; at < top; at += basic->format_size) {
u64 pebs_status; u64 pebs_status;
pebs_status = get_pebs_status(at) & cpuc->pebs_enabled; basic = at;
pebs_status &= mask; if (basic->format_size != cpuc->pebs_record_size)
continue;
for_each_set_bit(bit, (unsigned long *)&pebs_status, X86_PMC_IDX_MAX) pebs_status = basic->applicable_counters & cpuc->pebs_enabled & mask;
counts[bit]++; for_each_set_bit(bit, (unsigned long *)&pebs_status, X86_PMC_IDX_MAX) {
event = cpuc->events[bit];
if (WARN_ON_ONCE(!event) ||
WARN_ON_ONCE(!event->attr.precise_ip))
continue;
if (counts[bit]++) {
__intel_pmu_pebs_event(event, iregs, regs, data, last[bit],
setup_pebs_adaptive_sample_data);
}
last[bit] = at;
}
} }
for_each_set_bit(bit, (unsigned long *)&mask, X86_PMC_IDX_MAX) { for_each_set_bit(bit, (unsigned long *)&mask, X86_PMC_IDX_MAX) {
if (counts[bit] == 0) if (!counts[bit])
continue; continue;
event = cpuc->events[bit]; event = cpuc->events[bit];
if (WARN_ON_ONCE(!event))
continue;
if (WARN_ON_ONCE(!event->attr.precise_ip)) __intel_pmu_pebs_last_event(event, iregs, regs, data, last[bit],
continue; counts[bit], setup_pebs_adaptive_sample_data);
__intel_pmu_pebs_event(event, iregs, data, base,
top, bit, counts[bit],
setup_pebs_adaptive_sample_data);
} }
} }

View File

@ -745,7 +745,7 @@ static int uncore_pmu_event_init(struct perf_event *event)
pmu = uncore_event_to_pmu(event); pmu = uncore_event_to_pmu(event);
/* no device found for this pmu */ /* no device found for this pmu */
if (pmu->func_id < 0) if (!pmu->registered)
return -ENOENT; return -ENOENT;
/* Sampling not supported yet */ /* Sampling not supported yet */
@ -992,7 +992,7 @@ static void uncore_types_exit(struct intel_uncore_type **types)
uncore_type_exit(*types); uncore_type_exit(*types);
} }
static int __init uncore_type_init(struct intel_uncore_type *type, bool setid) static int __init uncore_type_init(struct intel_uncore_type *type)
{ {
struct intel_uncore_pmu *pmus; struct intel_uncore_pmu *pmus;
size_t size; size_t size;
@ -1005,7 +1005,6 @@ static int __init uncore_type_init(struct intel_uncore_type *type, bool setid)
size = uncore_max_dies() * sizeof(struct intel_uncore_box *); size = uncore_max_dies() * sizeof(struct intel_uncore_box *);
for (i = 0; i < type->num_boxes; i++) { for (i = 0; i < type->num_boxes; i++) {
pmus[i].func_id = setid ? i : -1;
pmus[i].pmu_idx = i; pmus[i].pmu_idx = i;
pmus[i].type = type; pmus[i].type = type;
pmus[i].boxes = kzalloc(size, GFP_KERNEL); pmus[i].boxes = kzalloc(size, GFP_KERNEL);
@ -1055,12 +1054,12 @@ err:
} }
static int __init static int __init
uncore_types_init(struct intel_uncore_type **types, bool setid) uncore_types_init(struct intel_uncore_type **types)
{ {
int ret; int ret;
for (; *types; types++) { for (; *types; types++) {
ret = uncore_type_init(*types, setid); ret = uncore_type_init(*types);
if (ret) if (ret)
return ret; return ret;
} }
@ -1160,11 +1159,6 @@ static int uncore_pci_pmu_register(struct pci_dev *pdev,
if (!box) if (!box)
return -ENOMEM; return -ENOMEM;
if (pmu->func_id < 0)
pmu->func_id = pdev->devfn;
else
WARN_ON_ONCE(pmu->func_id != pdev->devfn);
atomic_inc(&box->refcnt); atomic_inc(&box->refcnt);
box->dieid = die; box->dieid = die;
box->pci_dev = pdev; box->pci_dev = pdev;
@ -1410,7 +1404,7 @@ static int __init uncore_pci_init(void)
goto err; goto err;
} }
ret = uncore_types_init(uncore_pci_uncores, false); ret = uncore_types_init(uncore_pci_uncores);
if (ret) if (ret)
goto errtype; goto errtype;
@ -1678,7 +1672,7 @@ static int __init uncore_cpu_init(void)
{ {
int ret; int ret;
ret = uncore_types_init(uncore_msr_uncores, true); ret = uncore_types_init(uncore_msr_uncores);
if (ret) if (ret)
goto err; goto err;
@ -1697,7 +1691,7 @@ static int __init uncore_mmio_init(void)
struct intel_uncore_type **types = uncore_mmio_uncores; struct intel_uncore_type **types = uncore_mmio_uncores;
int ret; int ret;
ret = uncore_types_init(types, true); ret = uncore_types_init(types);
if (ret) if (ret)
goto err; goto err;

View File

@ -125,7 +125,6 @@ struct intel_uncore_pmu {
struct pmu pmu; struct pmu pmu;
char name[UNCORE_PMU_NAME_LEN]; char name[UNCORE_PMU_NAME_LEN];
int pmu_idx; int pmu_idx;
int func_id;
bool registered; bool registered;
atomic_t activeboxes; atomic_t activeboxes;
cpumask_t cpu_mask; cpumask_t cpu_mask;

View File

@ -910,7 +910,7 @@ static int snb_uncore_imc_event_init(struct perf_event *event)
pmu = uncore_event_to_pmu(event); pmu = uncore_event_to_pmu(event);
/* no device found for this pmu */ /* no device found for this pmu */
if (pmu->func_id < 0) if (!pmu->registered)
return -ENOENT; return -ENOENT;
/* Sampling not supported yet */ /* Sampling not supported yet */

View File

@ -6684,17 +6684,8 @@ void spr_uncore_mmio_init(void)
/* GNR uncore support */ /* GNR uncore support */
#define UNCORE_GNR_NUM_UNCORE_TYPES 23 #define UNCORE_GNR_NUM_UNCORE_TYPES 23
#define UNCORE_GNR_TYPE_15 15
#define UNCORE_GNR_B2UPI 18
#define UNCORE_GNR_TYPE_21 21
#define UNCORE_GNR_TYPE_22 22
int gnr_uncore_units_ignore[] = { int gnr_uncore_units_ignore[] = {
UNCORE_SPR_UPI,
UNCORE_GNR_TYPE_15,
UNCORE_GNR_B2UPI,
UNCORE_GNR_TYPE_21,
UNCORE_GNR_TYPE_22,
UNCORE_IGNORE_END UNCORE_IGNORE_END
}; };
@ -6703,6 +6694,31 @@ static struct intel_uncore_type gnr_uncore_ubox = {
.attr_update = uncore_alias_groups, .attr_update = uncore_alias_groups,
}; };
static struct intel_uncore_type gnr_uncore_pciex8 = {
SPR_UNCORE_PCI_COMMON_FORMAT(),
.name = "pciex8",
};
static struct intel_uncore_type gnr_uncore_pciex16 = {
SPR_UNCORE_PCI_COMMON_FORMAT(),
.name = "pciex16",
};
static struct intel_uncore_type gnr_uncore_upi = {
SPR_UNCORE_PCI_COMMON_FORMAT(),
.name = "upi",
};
static struct intel_uncore_type gnr_uncore_b2upi = {
SPR_UNCORE_PCI_COMMON_FORMAT(),
.name = "b2upi",
};
static struct intel_uncore_type gnr_uncore_b2hot = {
.name = "b2hot",
.attr_update = uncore_alias_groups,
};
static struct intel_uncore_type gnr_uncore_b2cmi = { static struct intel_uncore_type gnr_uncore_b2cmi = {
SPR_UNCORE_PCI_COMMON_FORMAT(), SPR_UNCORE_PCI_COMMON_FORMAT(),
.name = "b2cmi", .name = "b2cmi",
@ -6727,21 +6743,21 @@ static struct intel_uncore_type *gnr_uncores[UNCORE_GNR_NUM_UNCORE_TYPES] = {
&gnr_uncore_ubox, &gnr_uncore_ubox,
&spr_uncore_imc, &spr_uncore_imc,
NULL, NULL,
&gnr_uncore_upi,
NULL, NULL,
NULL, NULL,
NULL, NULL,
&spr_uncore_cxlcm,
&spr_uncore_cxldp,
NULL, NULL,
NULL, &gnr_uncore_b2hot,
NULL,
NULL,
NULL,
&gnr_uncore_b2cmi, &gnr_uncore_b2cmi,
&gnr_uncore_b2cxl, &gnr_uncore_b2cxl,
NULL, &gnr_uncore_b2upi,
NULL, NULL,
&gnr_uncore_mdf_sbo, &gnr_uncore_mdf_sbo,
NULL, &gnr_uncore_pciex16,
NULL, &gnr_uncore_pciex8,
}; };
static struct freerunning_counters gnr_iio_freerunning[] = { static struct freerunning_counters gnr_iio_freerunning[] = {

View File

@ -624,6 +624,7 @@ union perf_capabilities {
u64 pebs_output_pt_available:1; u64 pebs_output_pt_available:1;
u64 pebs_timing_info:1; u64 pebs_timing_info:1;
u64 anythread_deprecated:1; u64 anythread_deprecated:1;
u64 rdpmc_metrics_clear:1;
}; };
u64 capabilities; u64 capabilities;
}; };

View File

@ -39,6 +39,10 @@
* event: rapl_energy_psys * event: rapl_energy_psys
* perf code: 0x5 * perf code: 0x5
* *
* core counter: consumption of a single physical core
* event: rapl_energy_core (power_core PMU)
* perf code: 0x1
*
* We manage those counters as free running (read-only). They may be * We manage those counters as free running (read-only). They may be
* use simultaneously by other tools, such as turbostat. * use simultaneously by other tools, such as turbostat.
* *
@ -70,18 +74,22 @@ MODULE_LICENSE("GPL");
/* /*
* RAPL energy status counters * RAPL energy status counters
*/ */
enum perf_rapl_events { enum perf_rapl_pkg_events {
PERF_RAPL_PP0 = 0, /* all cores */ PERF_RAPL_PP0 = 0, /* all cores */
PERF_RAPL_PKG, /* entire package */ PERF_RAPL_PKG, /* entire package */
PERF_RAPL_RAM, /* DRAM */ PERF_RAPL_RAM, /* DRAM */
PERF_RAPL_PP1, /* gpu */ PERF_RAPL_PP1, /* gpu */
PERF_RAPL_PSYS, /* psys */ PERF_RAPL_PSYS, /* psys */
PERF_RAPL_MAX, PERF_RAPL_PKG_EVENTS_MAX,
NR_RAPL_DOMAINS = PERF_RAPL_MAX, NR_RAPL_PKG_DOMAINS = PERF_RAPL_PKG_EVENTS_MAX,
}; };
static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = { #define PERF_RAPL_CORE 0 /* single core */
#define PERF_RAPL_CORE_EVENTS_MAX 1
#define NR_RAPL_CORE_DOMAINS PERF_RAPL_CORE_EVENTS_MAX
static const char *const rapl_pkg_domain_names[NR_RAPL_PKG_DOMAINS] __initconst = {
"pp0-core", "pp0-core",
"package", "package",
"dram", "dram",
@ -89,6 +97,8 @@ static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = {
"psys", "psys",
}; };
static const char *const rapl_core_domain_name __initconst = "core";
/* /*
* event code: LSB 8 bits, passed in attr->config * event code: LSB 8 bits, passed in attr->config
* any other bit is reserved * any other bit is reserved
@ -112,7 +122,7 @@ static struct perf_pmu_events_attr event_attr_##v = { \
* considered as either pkg-scope or die-scope, and we are considering * considered as either pkg-scope or die-scope, and we are considering
* them as die-scope. * them as die-scope.
*/ */
#define rapl_pmu_is_pkg_scope() \ #define rapl_pkg_pmu_is_pkg_scope() \
(boot_cpu_data.x86_vendor == X86_VENDOR_AMD || \ (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || \
boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)
@ -129,7 +139,8 @@ struct rapl_pmu {
struct rapl_pmus { struct rapl_pmus {
struct pmu pmu; struct pmu pmu;
unsigned int nr_rapl_pmu; unsigned int nr_rapl_pmu;
struct rapl_pmu *pmus[] __counted_by(nr_rapl_pmu); unsigned int cntr_mask;
struct rapl_pmu *rapl_pmu[] __counted_by(nr_rapl_pmu);
}; };
enum rapl_unit_quirk { enum rapl_unit_quirk {
@ -139,44 +150,43 @@ enum rapl_unit_quirk {
}; };
struct rapl_model { struct rapl_model {
struct perf_msr *rapl_msrs; struct perf_msr *rapl_pkg_msrs;
unsigned long events; struct perf_msr *rapl_core_msrs;
unsigned long pkg_events;
unsigned long core_events;
unsigned int msr_power_unit; unsigned int msr_power_unit;
enum rapl_unit_quirk unit_quirk; enum rapl_unit_quirk unit_quirk;
}; };
/* 1/2^hw_unit Joule */ /* 1/2^hw_unit Joule */
static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly; static int rapl_pkg_hw_unit[NR_RAPL_PKG_DOMAINS] __read_mostly;
static struct rapl_pmus *rapl_pmus; static int rapl_core_hw_unit __read_mostly;
static unsigned int rapl_cntr_mask; static struct rapl_pmus *rapl_pmus_pkg;
static struct rapl_pmus *rapl_pmus_core;
static u64 rapl_timer_ms; static u64 rapl_timer_ms;
static struct perf_msr *rapl_msrs; static struct rapl_model *rapl_model;
/* /*
* Helper functions to get the correct topology macros according to the * Helper function to get the correct topology id according to the
* RAPL PMU scope. * RAPL PMU scope.
*/ */
static inline unsigned int get_rapl_pmu_idx(int cpu) static inline unsigned int get_rapl_pmu_idx(int cpu, int scope)
{ {
return rapl_pmu_is_pkg_scope() ? topology_logical_package_id(cpu) :
topology_logical_die_id(cpu);
}
static inline const struct cpumask *get_rapl_pmu_cpumask(int cpu)
{
return rapl_pmu_is_pkg_scope() ? topology_core_cpumask(cpu) :
topology_die_cpumask(cpu);
}
static inline struct rapl_pmu *cpu_to_rapl_pmu(unsigned int cpu)
{
unsigned int rapl_pmu_idx = get_rapl_pmu_idx(cpu);
/* /*
* The unsigned check also catches the '-1' return value for non * Returns unsigned int, which converts the '-1' return value
* existent mappings in the topology map. * (for non-existent mappings in topology map) to UINT_MAX, so
* the error check in the caller is simplified.
*/ */
return rapl_pmu_idx < rapl_pmus->nr_rapl_pmu ? rapl_pmus->pmus[rapl_pmu_idx] : NULL; switch (scope) {
case PERF_PMU_SCOPE_PKG:
return topology_logical_package_id(cpu);
case PERF_PMU_SCOPE_DIE:
return topology_logical_die_id(cpu);
case PERF_PMU_SCOPE_CORE:
return topology_logical_core_id(cpu);
default:
return -EINVAL;
}
} }
static inline u64 rapl_read_counter(struct perf_event *event) static inline u64 rapl_read_counter(struct perf_event *event)
@ -186,19 +196,20 @@ static inline u64 rapl_read_counter(struct perf_event *event)
return raw; return raw;
} }
static inline u64 rapl_scale(u64 v, int cfg) static inline u64 rapl_scale(u64 v, struct perf_event *event)
{ {
if (cfg > NR_RAPL_DOMAINS) { int hw_unit = rapl_pkg_hw_unit[event->hw.config - 1];
pr_warn("Invalid domain %d, failed to scale data\n", cfg);
return v; if (event->pmu->scope == PERF_PMU_SCOPE_CORE)
} hw_unit = rapl_core_hw_unit;
/* /*
* scale delta to smallest unit (1/2^32) * scale delta to smallest unit (1/2^32)
* users must then scale back: count * 1/(1e9*2^32) to get Joules * users must then scale back: count * 1/(1e9*2^32) to get Joules
* or use ldexp(count, -32). * or use ldexp(count, -32).
* Watts = Joules/Time delta * Watts = Joules/Time delta
*/ */
return v << (32 - rapl_hw_unit[cfg - 1]); return v << (32 - hw_unit);
} }
static u64 rapl_event_update(struct perf_event *event) static u64 rapl_event_update(struct perf_event *event)
@ -225,7 +236,7 @@ static u64 rapl_event_update(struct perf_event *event)
delta = (new_raw_count << shift) - (prev_raw_count << shift); delta = (new_raw_count << shift) - (prev_raw_count << shift);
delta >>= shift; delta >>= shift;
sdelta = rapl_scale(delta, event->hw.config); sdelta = rapl_scale(delta, event);
local64_add(sdelta, &event->count); local64_add(sdelta, &event->count);
@ -240,34 +251,34 @@ static void rapl_start_hrtimer(struct rapl_pmu *pmu)
static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer) static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer)
{ {
struct rapl_pmu *pmu = container_of(hrtimer, struct rapl_pmu, hrtimer); struct rapl_pmu *rapl_pmu = container_of(hrtimer, struct rapl_pmu, hrtimer);
struct perf_event *event; struct perf_event *event;
unsigned long flags; unsigned long flags;
if (!pmu->n_active) if (!rapl_pmu->n_active)
return HRTIMER_NORESTART; return HRTIMER_NORESTART;
raw_spin_lock_irqsave(&pmu->lock, flags); raw_spin_lock_irqsave(&rapl_pmu->lock, flags);
list_for_each_entry(event, &pmu->active_list, active_entry) list_for_each_entry(event, &rapl_pmu->active_list, active_entry)
rapl_event_update(event); rapl_event_update(event);
raw_spin_unlock_irqrestore(&pmu->lock, flags); raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags);
hrtimer_forward_now(hrtimer, pmu->timer_interval); hrtimer_forward_now(hrtimer, rapl_pmu->timer_interval);
return HRTIMER_RESTART; return HRTIMER_RESTART;
} }
static void rapl_hrtimer_init(struct rapl_pmu *pmu) static void rapl_hrtimer_init(struct rapl_pmu *rapl_pmu)
{ {
struct hrtimer *hr = &pmu->hrtimer; struct hrtimer *hr = &rapl_pmu->hrtimer;
hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
hr->function = rapl_hrtimer_handle; hr->function = rapl_hrtimer_handle;
} }
static void __rapl_pmu_event_start(struct rapl_pmu *pmu, static void __rapl_pmu_event_start(struct rapl_pmu *rapl_pmu,
struct perf_event *event) struct perf_event *event)
{ {
if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
@ -275,39 +286,39 @@ static void __rapl_pmu_event_start(struct rapl_pmu *pmu,
event->hw.state = 0; event->hw.state = 0;
list_add_tail(&event->active_entry, &pmu->active_list); list_add_tail(&event->active_entry, &rapl_pmu->active_list);
local64_set(&event->hw.prev_count, rapl_read_counter(event)); local64_set(&event->hw.prev_count, rapl_read_counter(event));
pmu->n_active++; rapl_pmu->n_active++;
if (pmu->n_active == 1) if (rapl_pmu->n_active == 1)
rapl_start_hrtimer(pmu); rapl_start_hrtimer(rapl_pmu);
} }
static void rapl_pmu_event_start(struct perf_event *event, int mode) static void rapl_pmu_event_start(struct perf_event *event, int mode)
{ {
struct rapl_pmu *pmu = event->pmu_private; struct rapl_pmu *rapl_pmu = event->pmu_private;
unsigned long flags; unsigned long flags;
raw_spin_lock_irqsave(&pmu->lock, flags); raw_spin_lock_irqsave(&rapl_pmu->lock, flags);
__rapl_pmu_event_start(pmu, event); __rapl_pmu_event_start(rapl_pmu, event);
raw_spin_unlock_irqrestore(&pmu->lock, flags); raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags);
} }
static void rapl_pmu_event_stop(struct perf_event *event, int mode) static void rapl_pmu_event_stop(struct perf_event *event, int mode)
{ {
struct rapl_pmu *pmu = event->pmu_private; struct rapl_pmu *rapl_pmu = event->pmu_private;
struct hw_perf_event *hwc = &event->hw; struct hw_perf_event *hwc = &event->hw;
unsigned long flags; unsigned long flags;
raw_spin_lock_irqsave(&pmu->lock, flags); raw_spin_lock_irqsave(&rapl_pmu->lock, flags);
/* mark event as deactivated and stopped */ /* mark event as deactivated and stopped */
if (!(hwc->state & PERF_HES_STOPPED)) { if (!(hwc->state & PERF_HES_STOPPED)) {
WARN_ON_ONCE(pmu->n_active <= 0); WARN_ON_ONCE(rapl_pmu->n_active <= 0);
pmu->n_active--; rapl_pmu->n_active--;
if (pmu->n_active == 0) if (rapl_pmu->n_active == 0)
hrtimer_cancel(&pmu->hrtimer); hrtimer_cancel(&rapl_pmu->hrtimer);
list_del(&event->active_entry); list_del(&event->active_entry);
@ -325,23 +336,23 @@ static void rapl_pmu_event_stop(struct perf_event *event, int mode)
hwc->state |= PERF_HES_UPTODATE; hwc->state |= PERF_HES_UPTODATE;
} }
raw_spin_unlock_irqrestore(&pmu->lock, flags); raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags);
} }
static int rapl_pmu_event_add(struct perf_event *event, int mode) static int rapl_pmu_event_add(struct perf_event *event, int mode)
{ {
struct rapl_pmu *pmu = event->pmu_private; struct rapl_pmu *rapl_pmu = event->pmu_private;
struct hw_perf_event *hwc = &event->hw; struct hw_perf_event *hwc = &event->hw;
unsigned long flags; unsigned long flags;
raw_spin_lock_irqsave(&pmu->lock, flags); raw_spin_lock_irqsave(&rapl_pmu->lock, flags);
hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
if (mode & PERF_EF_START) if (mode & PERF_EF_START)
__rapl_pmu_event_start(pmu, event); __rapl_pmu_event_start(rapl_pmu, event);
raw_spin_unlock_irqrestore(&pmu->lock, flags); raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags);
return 0; return 0;
} }
@ -354,12 +365,14 @@ static void rapl_pmu_event_del(struct perf_event *event, int flags)
static int rapl_pmu_event_init(struct perf_event *event) static int rapl_pmu_event_init(struct perf_event *event)
{ {
u64 cfg = event->attr.config & RAPL_EVENT_MASK; u64 cfg = event->attr.config & RAPL_EVENT_MASK;
int bit, ret = 0; int bit, rapl_pmus_scope, ret = 0;
struct rapl_pmu *pmu; struct rapl_pmu *rapl_pmu;
unsigned int rapl_pmu_idx;
struct rapl_pmus *rapl_pmus;
/* only look at RAPL events */ /* unsupported modes and filters */
if (event->attr.type != rapl_pmus->pmu.type) if (event->attr.sample_period) /* no sampling */
return -ENOENT; return -EINVAL;
/* check only supported bits are set */ /* check only supported bits are set */
if (event->attr.config & ~RAPL_EVENT_MASK) if (event->attr.config & ~RAPL_EVENT_MASK)
@ -368,26 +381,49 @@ static int rapl_pmu_event_init(struct perf_event *event)
if (event->cpu < 0) if (event->cpu < 0)
return -EINVAL; return -EINVAL;
if (!cfg || cfg >= NR_RAPL_DOMAINS + 1) rapl_pmus = container_of(event->pmu, struct rapl_pmus, pmu);
if (!rapl_pmus)
return -EINVAL; return -EINVAL;
rapl_pmus_scope = rapl_pmus->pmu.scope;
cfg = array_index_nospec((long)cfg, NR_RAPL_DOMAINS + 1); if (rapl_pmus_scope == PERF_PMU_SCOPE_PKG || rapl_pmus_scope == PERF_PMU_SCOPE_DIE) {
bit = cfg - 1; /* only look at RAPL package events */
if (event->attr.type != rapl_pmus_pkg->pmu.type)
return -ENOENT;
cfg = array_index_nospec((long)cfg, NR_RAPL_PKG_DOMAINS + 1);
if (!cfg || cfg >= NR_RAPL_PKG_DOMAINS + 1)
return -EINVAL;
bit = cfg - 1;
event->hw.event_base = rapl_model->rapl_pkg_msrs[bit].msr;
} else if (rapl_pmus_scope == PERF_PMU_SCOPE_CORE) {
/* only look at RAPL core events */
if (event->attr.type != rapl_pmus_core->pmu.type)
return -ENOENT;
cfg = array_index_nospec((long)cfg, NR_RAPL_CORE_DOMAINS + 1);
if (!cfg || cfg >= NR_RAPL_PKG_DOMAINS + 1)
return -EINVAL;
bit = cfg - 1;
event->hw.event_base = rapl_model->rapl_core_msrs[bit].msr;
} else
return -EINVAL;
/* check event supported */ /* check event supported */
if (!(rapl_cntr_mask & (1 << bit))) if (!(rapl_pmus->cntr_mask & (1 << bit)))
return -EINVAL; return -EINVAL;
/* unsupported modes and filters */ rapl_pmu_idx = get_rapl_pmu_idx(event->cpu, rapl_pmus_scope);
if (event->attr.sample_period) /* no sampling */ if (rapl_pmu_idx >= rapl_pmus->nr_rapl_pmu)
return -EINVAL; return -EINVAL;
/* must be done before validate_group */ /* must be done before validate_group */
pmu = cpu_to_rapl_pmu(event->cpu); rapl_pmu = rapl_pmus->rapl_pmu[rapl_pmu_idx];
if (!pmu) if (!rapl_pmu)
return -EINVAL; return -EINVAL;
event->pmu_private = pmu;
event->hw.event_base = rapl_msrs[bit].msr; event->pmu_private = rapl_pmu;
event->hw.config = cfg; event->hw.config = cfg;
event->hw.idx = bit; event->hw.idx = bit;
@ -404,12 +440,14 @@ RAPL_EVENT_ATTR_STR(energy-pkg , rapl_pkg, "event=0x02");
RAPL_EVENT_ATTR_STR(energy-ram , rapl_ram, "event=0x03"); RAPL_EVENT_ATTR_STR(energy-ram , rapl_ram, "event=0x03");
RAPL_EVENT_ATTR_STR(energy-gpu , rapl_gpu, "event=0x04"); RAPL_EVENT_ATTR_STR(energy-gpu , rapl_gpu, "event=0x04");
RAPL_EVENT_ATTR_STR(energy-psys, rapl_psys, "event=0x05"); RAPL_EVENT_ATTR_STR(energy-psys, rapl_psys, "event=0x05");
RAPL_EVENT_ATTR_STR(energy-core, rapl_core, "event=0x01");
RAPL_EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules"); RAPL_EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-pkg.unit , rapl_pkg_unit, "Joules"); RAPL_EVENT_ATTR_STR(energy-pkg.unit , rapl_pkg_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-ram.unit , rapl_ram_unit, "Joules"); RAPL_EVENT_ATTR_STR(energy-ram.unit , rapl_ram_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-gpu.unit , rapl_gpu_unit, "Joules"); RAPL_EVENT_ATTR_STR(energy-gpu.unit , rapl_gpu_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-psys.unit, rapl_psys_unit, "Joules"); RAPL_EVENT_ATTR_STR(energy-psys.unit, rapl_psys_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-core.unit, rapl_core_unit, "Joules");
/* /*
* we compute in 0.23 nJ increments regardless of MSR * we compute in 0.23 nJ increments regardless of MSR
@ -419,6 +457,7 @@ RAPL_EVENT_ATTR_STR(energy-pkg.scale, rapl_pkg_scale, "2.3283064365386962890
RAPL_EVENT_ATTR_STR(energy-ram.scale, rapl_ram_scale, "2.3283064365386962890625e-10"); RAPL_EVENT_ATTR_STR(energy-ram.scale, rapl_ram_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-gpu.scale, rapl_gpu_scale, "2.3283064365386962890625e-10"); RAPL_EVENT_ATTR_STR(energy-gpu.scale, rapl_gpu_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-psys.scale, rapl_psys_scale, "2.3283064365386962890625e-10"); RAPL_EVENT_ATTR_STR(energy-psys.scale, rapl_psys_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-core.scale, rapl_core_scale, "2.3283064365386962890625e-10");
/* /*
* There are no default events, but we need to create * There are no default events, but we need to create
@ -451,6 +490,12 @@ static const struct attribute_group *rapl_attr_groups[] = {
NULL, NULL,
}; };
static const struct attribute_group *rapl_core_attr_groups[] = {
&rapl_pmu_format_group,
&rapl_pmu_events_group,
NULL,
};
static struct attribute *rapl_events_cores[] = { static struct attribute *rapl_events_cores[] = {
EVENT_PTR(rapl_cores), EVENT_PTR(rapl_cores),
EVENT_PTR(rapl_cores_unit), EVENT_PTR(rapl_cores_unit),
@ -511,6 +556,18 @@ static struct attribute_group rapl_events_psys_group = {
.attrs = rapl_events_psys, .attrs = rapl_events_psys,
}; };
static struct attribute *rapl_events_core[] = {
EVENT_PTR(rapl_core),
EVENT_PTR(rapl_core_unit),
EVENT_PTR(rapl_core_scale),
NULL,
};
static struct attribute_group rapl_events_core_group = {
.name = "events",
.attrs = rapl_events_core,
};
static bool test_msr(int idx, void *data) static bool test_msr(int idx, void *data)
{ {
return test_bit(idx, (unsigned long *) data); return test_bit(idx, (unsigned long *) data);
@ -536,11 +593,11 @@ static struct perf_msr intel_rapl_spr_msrs[] = {
}; };
/* /*
* Force to PERF_RAPL_MAX size due to: * Force to PERF_RAPL_PKG_EVENTS_MAX size due to:
* - perf_msr_probe(PERF_RAPL_MAX) * - perf_msr_probe(PERF_RAPL_PKG_EVENTS_MAX)
* - want to use same event codes across both architectures * - want to use same event codes across both architectures
*/ */
static struct perf_msr amd_rapl_msrs[] = { static struct perf_msr amd_rapl_pkg_msrs[] = {
[PERF_RAPL_PP0] = { 0, &rapl_events_cores_group, NULL, false, 0 }, [PERF_RAPL_PP0] = { 0, &rapl_events_cores_group, NULL, false, 0 },
[PERF_RAPL_PKG] = { MSR_AMD_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr, false, RAPL_MSR_MASK }, [PERF_RAPL_PKG] = { MSR_AMD_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr, false, RAPL_MSR_MASK },
[PERF_RAPL_RAM] = { 0, &rapl_events_ram_group, NULL, false, 0 }, [PERF_RAPL_RAM] = { 0, &rapl_events_ram_group, NULL, false, 0 },
@ -548,18 +605,25 @@ static struct perf_msr amd_rapl_msrs[] = {
[PERF_RAPL_PSYS] = { 0, &rapl_events_psys_group, NULL, false, 0 }, [PERF_RAPL_PSYS] = { 0, &rapl_events_psys_group, NULL, false, 0 },
}; };
static int rapl_check_hw_unit(struct rapl_model *rm) static struct perf_msr amd_rapl_core_msrs[] = {
[PERF_RAPL_CORE] = { MSR_AMD_CORE_ENERGY_STATUS, &rapl_events_core_group,
test_msr, false, RAPL_MSR_MASK },
};
static int rapl_check_hw_unit(void)
{ {
u64 msr_rapl_power_unit_bits; u64 msr_rapl_power_unit_bits;
int i; int i;
/* protect rdmsrl() to handle virtualization */ /* protect rdmsrl() to handle virtualization */
if (rdmsrl_safe(rm->msr_power_unit, &msr_rapl_power_unit_bits)) if (rdmsrl_safe(rapl_model->msr_power_unit, &msr_rapl_power_unit_bits))
return -1; return -1;
for (i = 0; i < NR_RAPL_DOMAINS; i++) for (i = 0; i < NR_RAPL_PKG_DOMAINS; i++)
rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL; rapl_pkg_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;
switch (rm->unit_quirk) { rapl_core_hw_unit = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;
switch (rapl_model->unit_quirk) {
/* /*
* DRAM domain on HSW server and KNL has fixed energy unit which can be * DRAM domain on HSW server and KNL has fixed energy unit which can be
* different than the unit from power unit MSR. See * different than the unit from power unit MSR. See
@ -567,17 +631,16 @@ static int rapl_check_hw_unit(struct rapl_model *rm)
* of 2. Datasheet, September 2014, Reference Number: 330784-001 " * of 2. Datasheet, September 2014, Reference Number: 330784-001 "
*/ */
case RAPL_UNIT_QUIRK_INTEL_HSW: case RAPL_UNIT_QUIRK_INTEL_HSW:
rapl_hw_unit[PERF_RAPL_RAM] = 16; rapl_pkg_hw_unit[PERF_RAPL_RAM] = 16;
break; break;
/* SPR uses a fixed energy unit for Psys domain. */ /* SPR uses a fixed energy unit for Psys domain. */
case RAPL_UNIT_QUIRK_INTEL_SPR: case RAPL_UNIT_QUIRK_INTEL_SPR:
rapl_hw_unit[PERF_RAPL_PSYS] = 0; rapl_pkg_hw_unit[PERF_RAPL_PSYS] = 0;
break; break;
default: default:
break; break;
} }
/* /*
* Calculate the timer rate: * Calculate the timer rate:
* Use reference of 200W for scaling the timeout to avoid counter * Use reference of 200W for scaling the timeout to avoid counter
@ -586,9 +649,9 @@ static int rapl_check_hw_unit(struct rapl_model *rm)
* if hw unit is 32, then we use 2 ms 1/200/2 * if hw unit is 32, then we use 2 ms 1/200/2
*/ */
rapl_timer_ms = 2; rapl_timer_ms = 2;
if (rapl_hw_unit[0] < 32) { if (rapl_pkg_hw_unit[0] < 32) {
rapl_timer_ms = (1000 / (2 * 100)); rapl_timer_ms = (1000 / (2 * 100));
rapl_timer_ms *= (1ULL << (32 - rapl_hw_unit[0] - 1)); rapl_timer_ms *= (1ULL << (32 - rapl_pkg_hw_unit[0] - 1));
} }
return 0; return 0;
} }
@ -596,24 +659,32 @@ static int rapl_check_hw_unit(struct rapl_model *rm)
static void __init rapl_advertise(void) static void __init rapl_advertise(void)
{ {
int i; int i;
int num_counters = hweight32(rapl_pmus_pkg->cntr_mask);
if (rapl_pmus_core)
num_counters += hweight32(rapl_pmus_core->cntr_mask);
pr_info("API unit is 2^-32 Joules, %d fixed counters, %llu ms ovfl timer\n", pr_info("API unit is 2^-32 Joules, %d fixed counters, %llu ms ovfl timer\n",
hweight32(rapl_cntr_mask), rapl_timer_ms); num_counters, rapl_timer_ms);
for (i = 0; i < NR_RAPL_DOMAINS; i++) { for (i = 0; i < NR_RAPL_PKG_DOMAINS; i++) {
if (rapl_cntr_mask & (1 << i)) { if (rapl_pmus_pkg->cntr_mask & (1 << i)) {
pr_info("hw unit of domain %s 2^-%d Joules\n", pr_info("hw unit of domain %s 2^-%d Joules\n",
rapl_domain_names[i], rapl_hw_unit[i]); rapl_pkg_domain_names[i], rapl_pkg_hw_unit[i]);
} }
} }
if (rapl_pmus_core && (rapl_pmus_core->cntr_mask & (1 << PERF_RAPL_CORE)))
pr_info("hw unit of domain %s 2^-%d Joules\n",
rapl_core_domain_name, rapl_core_hw_unit);
} }
static void cleanup_rapl_pmus(void) static void cleanup_rapl_pmus(struct rapl_pmus *rapl_pmus)
{ {
int i; int i;
for (i = 0; i < rapl_pmus->nr_rapl_pmu; i++) for (i = 0; i < rapl_pmus->nr_rapl_pmu; i++)
kfree(rapl_pmus->pmus[i]); kfree(rapl_pmus->rapl_pmu[i]);
kfree(rapl_pmus); kfree(rapl_pmus);
} }
@ -626,46 +697,60 @@ static const struct attribute_group *rapl_attr_update[] = {
NULL, NULL,
}; };
static int __init init_rapl_pmu(void) static const struct attribute_group *rapl_core_attr_update[] = {
&rapl_events_core_group,
NULL,
};
static int __init init_rapl_pmu(struct rapl_pmus *rapl_pmus)
{ {
struct rapl_pmu *pmu; struct rapl_pmu *rapl_pmu;
int idx; int idx;
for (idx = 0; idx < rapl_pmus->nr_rapl_pmu; idx++) { for (idx = 0; idx < rapl_pmus->nr_rapl_pmu; idx++) {
pmu = kzalloc(sizeof(*pmu), GFP_KERNEL); rapl_pmu = kzalloc(sizeof(*rapl_pmu), GFP_KERNEL);
if (!pmu) if (!rapl_pmu)
goto free; goto free;
raw_spin_lock_init(&pmu->lock); raw_spin_lock_init(&rapl_pmu->lock);
INIT_LIST_HEAD(&pmu->active_list); INIT_LIST_HEAD(&rapl_pmu->active_list);
pmu->pmu = &rapl_pmus->pmu; rapl_pmu->pmu = &rapl_pmus->pmu;
pmu->timer_interval = ms_to_ktime(rapl_timer_ms); rapl_pmu->timer_interval = ms_to_ktime(rapl_timer_ms);
rapl_hrtimer_init(pmu); rapl_hrtimer_init(rapl_pmu);
rapl_pmus->pmus[idx] = pmu; rapl_pmus->rapl_pmu[idx] = rapl_pmu;
} }
return 0; return 0;
free: free:
for (; idx > 0; idx--) for (; idx > 0; idx--)
kfree(rapl_pmus->pmus[idx - 1]); kfree(rapl_pmus->rapl_pmu[idx - 1]);
return -ENOMEM; return -ENOMEM;
} }
static int __init init_rapl_pmus(void) static int __init init_rapl_pmus(struct rapl_pmus **rapl_pmus_ptr, int rapl_pmu_scope,
const struct attribute_group **rapl_attr_groups,
const struct attribute_group **rapl_attr_update)
{ {
int nr_rapl_pmu = topology_max_packages(); int nr_rapl_pmu = topology_max_packages();
int rapl_pmu_scope = PERF_PMU_SCOPE_PKG; struct rapl_pmus *rapl_pmus;
if (!rapl_pmu_is_pkg_scope()) { /*
nr_rapl_pmu *= topology_max_dies_per_package(); * rapl_pmu_scope must be either PKG, DIE or CORE
rapl_pmu_scope = PERF_PMU_SCOPE_DIE; */
} if (rapl_pmu_scope == PERF_PMU_SCOPE_DIE)
nr_rapl_pmu *= topology_max_dies_per_package();
else if (rapl_pmu_scope == PERF_PMU_SCOPE_CORE)
nr_rapl_pmu *= topology_num_cores_per_package();
else if (rapl_pmu_scope != PERF_PMU_SCOPE_PKG)
return -EINVAL;
rapl_pmus = kzalloc(struct_size(rapl_pmus, pmus, nr_rapl_pmu), GFP_KERNEL); rapl_pmus = kzalloc(struct_size(rapl_pmus, rapl_pmu, nr_rapl_pmu), GFP_KERNEL);
if (!rapl_pmus) if (!rapl_pmus)
return -ENOMEM; return -ENOMEM;
*rapl_pmus_ptr = rapl_pmus;
rapl_pmus->nr_rapl_pmu = nr_rapl_pmu; rapl_pmus->nr_rapl_pmu = nr_rapl_pmu;
rapl_pmus->pmu.attr_groups = rapl_attr_groups; rapl_pmus->pmu.attr_groups = rapl_attr_groups;
rapl_pmus->pmu.attr_update = rapl_attr_update; rapl_pmus->pmu.attr_update = rapl_attr_update;
@ -680,75 +765,77 @@ static int __init init_rapl_pmus(void)
rapl_pmus->pmu.module = THIS_MODULE; rapl_pmus->pmu.module = THIS_MODULE;
rapl_pmus->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE; rapl_pmus->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE;
return init_rapl_pmu(); return init_rapl_pmu(rapl_pmus);
} }
static struct rapl_model model_snb = { static struct rapl_model model_snb = {
.events = BIT(PERF_RAPL_PP0) | .pkg_events = BIT(PERF_RAPL_PP0) |
BIT(PERF_RAPL_PKG) | BIT(PERF_RAPL_PKG) |
BIT(PERF_RAPL_PP1), BIT(PERF_RAPL_PP1),
.msr_power_unit = MSR_RAPL_POWER_UNIT, .msr_power_unit = MSR_RAPL_POWER_UNIT,
.rapl_msrs = intel_rapl_msrs, .rapl_pkg_msrs = intel_rapl_msrs,
}; };
static struct rapl_model model_snbep = { static struct rapl_model model_snbep = {
.events = BIT(PERF_RAPL_PP0) | .pkg_events = BIT(PERF_RAPL_PP0) |
BIT(PERF_RAPL_PKG) | BIT(PERF_RAPL_PKG) |
BIT(PERF_RAPL_RAM), BIT(PERF_RAPL_RAM),
.msr_power_unit = MSR_RAPL_POWER_UNIT, .msr_power_unit = MSR_RAPL_POWER_UNIT,
.rapl_msrs = intel_rapl_msrs, .rapl_pkg_msrs = intel_rapl_msrs,
}; };
static struct rapl_model model_hsw = { static struct rapl_model model_hsw = {
.events = BIT(PERF_RAPL_PP0) | .pkg_events = BIT(PERF_RAPL_PP0) |
BIT(PERF_RAPL_PKG) | BIT(PERF_RAPL_PKG) |
BIT(PERF_RAPL_RAM) | BIT(PERF_RAPL_RAM) |
BIT(PERF_RAPL_PP1), BIT(PERF_RAPL_PP1),
.msr_power_unit = MSR_RAPL_POWER_UNIT, .msr_power_unit = MSR_RAPL_POWER_UNIT,
.rapl_msrs = intel_rapl_msrs, .rapl_pkg_msrs = intel_rapl_msrs,
}; };
static struct rapl_model model_hsx = { static struct rapl_model model_hsx = {
.events = BIT(PERF_RAPL_PP0) | .pkg_events = BIT(PERF_RAPL_PP0) |
BIT(PERF_RAPL_PKG) | BIT(PERF_RAPL_PKG) |
BIT(PERF_RAPL_RAM), BIT(PERF_RAPL_RAM),
.unit_quirk = RAPL_UNIT_QUIRK_INTEL_HSW, .unit_quirk = RAPL_UNIT_QUIRK_INTEL_HSW,
.msr_power_unit = MSR_RAPL_POWER_UNIT, .msr_power_unit = MSR_RAPL_POWER_UNIT,
.rapl_msrs = intel_rapl_msrs, .rapl_pkg_msrs = intel_rapl_msrs,
}; };
static struct rapl_model model_knl = { static struct rapl_model model_knl = {
.events = BIT(PERF_RAPL_PKG) | .pkg_events = BIT(PERF_RAPL_PKG) |
BIT(PERF_RAPL_RAM), BIT(PERF_RAPL_RAM),
.unit_quirk = RAPL_UNIT_QUIRK_INTEL_HSW, .unit_quirk = RAPL_UNIT_QUIRK_INTEL_HSW,
.msr_power_unit = MSR_RAPL_POWER_UNIT, .msr_power_unit = MSR_RAPL_POWER_UNIT,
.rapl_msrs = intel_rapl_msrs, .rapl_pkg_msrs = intel_rapl_msrs,
}; };
static struct rapl_model model_skl = { static struct rapl_model model_skl = {
.events = BIT(PERF_RAPL_PP0) | .pkg_events = BIT(PERF_RAPL_PP0) |
BIT(PERF_RAPL_PKG) | BIT(PERF_RAPL_PKG) |
BIT(PERF_RAPL_RAM) | BIT(PERF_RAPL_RAM) |
BIT(PERF_RAPL_PP1) | BIT(PERF_RAPL_PP1) |
BIT(PERF_RAPL_PSYS), BIT(PERF_RAPL_PSYS),
.msr_power_unit = MSR_RAPL_POWER_UNIT, .msr_power_unit = MSR_RAPL_POWER_UNIT,
.rapl_msrs = intel_rapl_msrs, .rapl_pkg_msrs = intel_rapl_msrs,
}; };
static struct rapl_model model_spr = { static struct rapl_model model_spr = {
.events = BIT(PERF_RAPL_PP0) | .pkg_events = BIT(PERF_RAPL_PP0) |
BIT(PERF_RAPL_PKG) | BIT(PERF_RAPL_PKG) |
BIT(PERF_RAPL_RAM) | BIT(PERF_RAPL_RAM) |
BIT(PERF_RAPL_PSYS), BIT(PERF_RAPL_PSYS),
.unit_quirk = RAPL_UNIT_QUIRK_INTEL_SPR, .unit_quirk = RAPL_UNIT_QUIRK_INTEL_SPR,
.msr_power_unit = MSR_RAPL_POWER_UNIT, .msr_power_unit = MSR_RAPL_POWER_UNIT,
.rapl_msrs = intel_rapl_spr_msrs, .rapl_pkg_msrs = intel_rapl_spr_msrs,
}; };
static struct rapl_model model_amd_hygon = { static struct rapl_model model_amd_hygon = {
.events = BIT(PERF_RAPL_PKG), .pkg_events = BIT(PERF_RAPL_PKG),
.core_events = BIT(PERF_RAPL_CORE),
.msr_power_unit = MSR_AMD_RAPL_POWER_UNIT, .msr_power_unit = MSR_AMD_RAPL_POWER_UNIT,
.rapl_msrs = amd_rapl_msrs, .rapl_pkg_msrs = amd_rapl_pkg_msrs,
.rapl_core_msrs = amd_rapl_core_msrs,
}; };
static const struct x86_cpu_id rapl_model_match[] __initconst = { static const struct x86_cpu_id rapl_model_match[] __initconst = {
@ -804,45 +891,73 @@ MODULE_DEVICE_TABLE(x86cpu, rapl_model_match);
static int __init rapl_pmu_init(void) static int __init rapl_pmu_init(void)
{ {
const struct x86_cpu_id *id; const struct x86_cpu_id *id;
struct rapl_model *rm; int rapl_pkg_pmu_scope = PERF_PMU_SCOPE_DIE;
int ret; int ret;
if (rapl_pkg_pmu_is_pkg_scope())
rapl_pkg_pmu_scope = PERF_PMU_SCOPE_PKG;
id = x86_match_cpu(rapl_model_match); id = x86_match_cpu(rapl_model_match);
if (!id) if (!id)
return -ENODEV; return -ENODEV;
rm = (struct rapl_model *) id->driver_data; rapl_model = (struct rapl_model *) id->driver_data;
rapl_msrs = rm->rapl_msrs; ret = rapl_check_hw_unit();
rapl_cntr_mask = perf_msr_probe(rapl_msrs, PERF_RAPL_MAX,
false, (void *) &rm->events);
ret = rapl_check_hw_unit(rm);
if (ret) if (ret)
return ret; return ret;
ret = init_rapl_pmus(); ret = init_rapl_pmus(&rapl_pmus_pkg, rapl_pkg_pmu_scope, rapl_attr_groups,
rapl_attr_update);
if (ret) if (ret)
return ret; return ret;
ret = perf_pmu_register(&rapl_pmus->pmu, "power", -1); rapl_pmus_pkg->cntr_mask = perf_msr_probe(rapl_model->rapl_pkg_msrs,
PERF_RAPL_PKG_EVENTS_MAX, false,
(void *) &rapl_model->pkg_events);
ret = perf_pmu_register(&rapl_pmus_pkg->pmu, "power", -1);
if (ret) if (ret)
goto out; goto out;
if (rapl_model->core_events) {
ret = init_rapl_pmus(&rapl_pmus_core, PERF_PMU_SCOPE_CORE,
rapl_core_attr_groups,
rapl_core_attr_update);
if (ret) {
pr_warn("power-core PMU initialization failed (%d)\n", ret);
goto core_init_failed;
}
rapl_pmus_core->cntr_mask = perf_msr_probe(rapl_model->rapl_core_msrs,
PERF_RAPL_CORE_EVENTS_MAX, false,
(void *) &rapl_model->core_events);
ret = perf_pmu_register(&rapl_pmus_core->pmu, "power_core", -1);
if (ret) {
pr_warn("power-core PMU registration failed (%d)\n", ret);
cleanup_rapl_pmus(rapl_pmus_core);
}
}
core_init_failed:
rapl_advertise(); rapl_advertise();
return 0; return 0;
out: out:
pr_warn("Initialization failed (%d), disabled\n", ret); pr_warn("Initialization failed (%d), disabled\n", ret);
cleanup_rapl_pmus(); cleanup_rapl_pmus(rapl_pmus_pkg);
return ret; return ret;
} }
module_init(rapl_pmu_init); module_init(rapl_pmu_init);
static void __exit intel_rapl_exit(void) static void __exit intel_rapl_exit(void)
{ {
perf_pmu_unregister(&rapl_pmus->pmu); if (rapl_pmus_core) {
cleanup_rapl_pmus(); perf_pmu_unregister(&rapl_pmus_core->pmu);
cleanup_rapl_pmus(rapl_pmus_core);
}
perf_pmu_unregister(&rapl_pmus_pkg->pmu);
cleanup_rapl_pmus(rapl_pmus_pkg);
} }
module_exit(intel_rapl_exit); module_exit(intel_rapl_exit);

View File

@ -41,6 +41,7 @@
#define INTEL_FIXED_0_USER (1ULL << 1) #define INTEL_FIXED_0_USER (1ULL << 1)
#define INTEL_FIXED_0_ANYTHREAD (1ULL << 2) #define INTEL_FIXED_0_ANYTHREAD (1ULL << 2)
#define INTEL_FIXED_0_ENABLE_PMI (1ULL << 3) #define INTEL_FIXED_0_ENABLE_PMI (1ULL << 3)
#define INTEL_FIXED_3_METRICS_CLEAR (1ULL << 2)
#define HSW_IN_TX (1ULL << 32) #define HSW_IN_TX (1ULL << 32)
#define HSW_IN_TX_CHECKPOINTED (1ULL << 33) #define HSW_IN_TX_CHECKPOINTED (1ULL << 33)
@ -372,6 +373,9 @@ static inline bool use_fixed_pseudo_encoding(u64 code)
#define INTEL_TD_METRIC_MAX INTEL_TD_METRIC_MEM_BOUND #define INTEL_TD_METRIC_MAX INTEL_TD_METRIC_MEM_BOUND
#define INTEL_TD_METRIC_NUM 8 #define INTEL_TD_METRIC_NUM 8
#define INTEL_TD_CFG_METRIC_CLEAR_BIT 0
#define INTEL_TD_CFG_METRIC_CLEAR BIT_ULL(INTEL_TD_CFG_METRIC_CLEAR_BIT)
static inline bool is_metric_idx(int idx) static inline bool is_metric_idx(int idx)
{ {
return (unsigned)(idx - INTEL_PMC_IDX_METRIC_BASE) < INTEL_TD_METRIC_NUM; return (unsigned)(idx - INTEL_PMC_IDX_METRIC_BASE) < INTEL_TD_METRIC_NUM;
@ -422,7 +426,9 @@ static inline bool is_topdown_idx(int idx)
*/ */
struct pebs_basic { struct pebs_basic {
u64 format_size; u64 format_group:32,
retire_latency:16,
format_size:16;
u64 ip; u64 ip;
u64 applicable_counters; u64 applicable_counters;
u64 tsc; u64 tsc;
@ -431,7 +437,17 @@ struct pebs_basic {
struct pebs_meminfo { struct pebs_meminfo {
u64 address; u64 address;
u64 aux; u64 aux;
u64 latency; union {
/* pre Alder Lake */
u64 mem_latency;
/* Alder Lake and later */
struct {
u64 instr_latency:16;
u64 pad2:16;
u64 cache_latency:16;
u64 pad3:16;
};
};
u64 tsx_tuning; u64 tsx_tuning;
}; };

View File

@ -98,6 +98,7 @@ struct cpuinfo_topology {
// Logical ID mappings // Logical ID mappings
u32 logical_pkg_id; u32 logical_pkg_id;
u32 logical_die_id; u32 logical_die_id;
u32 logical_core_id;
// AMD Node ID and Nodes per Package info // AMD Node ID and Nodes per Package info
u32 amd_node_id; u32 amd_node_id;

View File

@ -143,6 +143,7 @@ extern const struct cpumask *cpu_clustergroup_mask(int cpu);
#define topology_logical_package_id(cpu) (cpu_data(cpu).topo.logical_pkg_id) #define topology_logical_package_id(cpu) (cpu_data(cpu).topo.logical_pkg_id)
#define topology_physical_package_id(cpu) (cpu_data(cpu).topo.pkg_id) #define topology_physical_package_id(cpu) (cpu_data(cpu).topo.pkg_id)
#define topology_logical_die_id(cpu) (cpu_data(cpu).topo.logical_die_id) #define topology_logical_die_id(cpu) (cpu_data(cpu).topo.logical_die_id)
#define topology_logical_core_id(cpu) (cpu_data(cpu).topo.logical_core_id)
#define topology_die_id(cpu) (cpu_data(cpu).topo.die_id) #define topology_die_id(cpu) (cpu_data(cpu).topo.die_id)
#define topology_core_id(cpu) (cpu_data(cpu).topo.core_id) #define topology_core_id(cpu) (cpu_data(cpu).topo.core_id)
#define topology_ppin(cpu) (cpu_data(cpu).ppin) #define topology_ppin(cpu) (cpu_data(cpu).ppin)

View File

@ -25,6 +25,7 @@ static int cpu_debug_show(struct seq_file *m, void *p)
seq_printf(m, "cpu_type: %s\n", get_topology_cpu_type_name(c)); seq_printf(m, "cpu_type: %s\n", get_topology_cpu_type_name(c));
seq_printf(m, "logical_pkg_id: %u\n", c->topo.logical_pkg_id); seq_printf(m, "logical_pkg_id: %u\n", c->topo.logical_pkg_id);
seq_printf(m, "logical_die_id: %u\n", c->topo.logical_die_id); seq_printf(m, "logical_die_id: %u\n", c->topo.logical_die_id);
seq_printf(m, "logical_core_id: %u\n", c->topo.logical_core_id);
seq_printf(m, "llc_id: %u\n", c->topo.llc_id); seq_printf(m, "llc_id: %u\n", c->topo.llc_id);
seq_printf(m, "l2c_id: %u\n", c->topo.l2c_id); seq_printf(m, "l2c_id: %u\n", c->topo.l2c_id);
seq_printf(m, "amd_node_id: %u\n", c->topo.amd_node_id); seq_printf(m, "amd_node_id: %u\n", c->topo.amd_node_id);

View File

@ -185,6 +185,7 @@ static void topo_set_ids(struct topo_scan *tscan, bool early)
if (!early) { if (!early) {
c->topo.logical_pkg_id = topology_get_logical_id(apicid, TOPO_PKG_DOMAIN); c->topo.logical_pkg_id = topology_get_logical_id(apicid, TOPO_PKG_DOMAIN);
c->topo.logical_die_id = topology_get_logical_id(apicid, TOPO_DIE_DOMAIN); c->topo.logical_die_id = topology_get_logical_id(apicid, TOPO_DIE_DOMAIN);
c->topo.logical_core_id = topology_get_logical_id(apicid, TOPO_CORE_DOMAIN);
} }
/* Package relative core ID */ /* Package relative core ID */

View File

@ -711,7 +711,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
* we don't rely on for anything - the mm_lock_seq read against which we * we don't rely on for anything - the mm_lock_seq read against which we
* need ordering is below. * need ordering is below.
*/ */
if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq)) if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq.sequence))
return false; return false;
if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0)) if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0))
@ -728,7 +728,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
* after it has been unlocked. * after it has been unlocked.
* This pairs with RELEASE semantics in vma_end_write_all(). * This pairs with RELEASE semantics in vma_end_write_all().
*/ */
if (unlikely(vma->vm_lock_seq == smp_load_acquire(&vma->vm_mm->mm_lock_seq))) { if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&vma->vm_mm->mm_lock_seq))) {
up_read(&vma->vm_lock->lock); up_read(&vma->vm_lock->lock);
return false; return false;
} }
@ -743,7 +743,7 @@ static inline void vma_end_read(struct vm_area_struct *vma)
} }
/* WARNING! Can only be used if mmap_lock is expected to be write-locked */ /* WARNING! Can only be used if mmap_lock is expected to be write-locked */
static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq) static bool __is_vma_write_locked(struct vm_area_struct *vma, unsigned int *mm_lock_seq)
{ {
mmap_assert_write_locked(vma->vm_mm); mmap_assert_write_locked(vma->vm_mm);
@ -751,7 +751,7 @@ static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq)
* current task is holding mmap_write_lock, both vma->vm_lock_seq and * current task is holding mmap_write_lock, both vma->vm_lock_seq and
* mm->mm_lock_seq can't be concurrently modified. * mm->mm_lock_seq can't be concurrently modified.
*/ */
*mm_lock_seq = vma->vm_mm->mm_lock_seq; *mm_lock_seq = vma->vm_mm->mm_lock_seq.sequence;
return (vma->vm_lock_seq == *mm_lock_seq); return (vma->vm_lock_seq == *mm_lock_seq);
} }
@ -762,7 +762,7 @@ static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq)
*/ */
static inline void vma_start_write(struct vm_area_struct *vma) static inline void vma_start_write(struct vm_area_struct *vma)
{ {
int mm_lock_seq; unsigned int mm_lock_seq;
if (__is_vma_write_locked(vma, &mm_lock_seq)) if (__is_vma_write_locked(vma, &mm_lock_seq))
return; return;
@ -780,7 +780,7 @@ static inline void vma_start_write(struct vm_area_struct *vma)
static inline void vma_assert_write_locked(struct vm_area_struct *vma) static inline void vma_assert_write_locked(struct vm_area_struct *vma)
{ {
int mm_lock_seq; unsigned int mm_lock_seq;
VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma); VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma);
} }

View File

@ -727,7 +727,7 @@ struct vm_area_struct {
* counter reuse can only lead to occasional unnecessary use of the * counter reuse can only lead to occasional unnecessary use of the
* slowpath. * slowpath.
*/ */
int vm_lock_seq; unsigned int vm_lock_seq;
/* Unstable RCU readers are allowed to read this. */ /* Unstable RCU readers are allowed to read this. */
struct vma_lock *vm_lock; struct vma_lock *vm_lock;
#endif #endif
@ -921,6 +921,9 @@ struct mm_struct {
* Roughly speaking, incrementing the sequence number is * Roughly speaking, incrementing the sequence number is
* equivalent to releasing locks on VMAs; reading the sequence * equivalent to releasing locks on VMAs; reading the sequence
* number can be part of taking a read lock on a VMA. * number can be part of taking a read lock on a VMA.
* Incremented every time mmap_lock is write-locked/unlocked.
* Initialized to 0, therefore odd values indicate mmap_lock
* is write-locked and even values that it's released.
* *
* Can be modified under write mmap_lock using RELEASE * Can be modified under write mmap_lock using RELEASE
* semantics. * semantics.
@ -929,7 +932,7 @@ struct mm_struct {
* Can be read with ACQUIRE semantics if not holding write * Can be read with ACQUIRE semantics if not holding write
* mmap_lock. * mmap_lock.
*/ */
int mm_lock_seq; seqcount_t mm_lock_seq;
#endif #endif

View File

@ -71,6 +71,91 @@ static inline void mmap_assert_write_locked(const struct mm_struct *mm)
} }
#ifdef CONFIG_PER_VMA_LOCK #ifdef CONFIG_PER_VMA_LOCK
static inline void mm_lock_seqcount_init(struct mm_struct *mm)
{
seqcount_init(&mm->mm_lock_seq);
}
static inline void mm_lock_seqcount_begin(struct mm_struct *mm)
{
do_raw_write_seqcount_begin(&mm->mm_lock_seq);
}
static inline void mm_lock_seqcount_end(struct mm_struct *mm)
{
ASSERT_EXCLUSIVE_WRITER(mm->mm_lock_seq);
do_raw_write_seqcount_end(&mm->mm_lock_seq);
}
static inline bool mmap_lock_speculate_try_begin(struct mm_struct *mm, unsigned int *seq)
{
/*
* Since mmap_lock is a sleeping lock, and waiting for it to become
* unlocked is more or less equivalent with taking it ourselves, don't
* bother with the speculative path if mmap_lock is already write-locked
* and take the slow path, which takes the lock.
*/
return raw_seqcount_try_begin(&mm->mm_lock_seq, *seq);
}
static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int seq)
{
return read_seqcount_retry(&mm->mm_lock_seq, seq);
}
#else /* CONFIG_PER_VMA_LOCK */
static inline void mm_lock_seqcount_init(struct mm_struct *mm) {}
static inline void mm_lock_seqcount_begin(struct mm_struct *mm) {}
static inline void mm_lock_seqcount_end(struct mm_struct *mm) {}
static inline bool mmap_lock_speculate_try_begin(struct mm_struct *mm, unsigned int *seq)
{
return false;
}
static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int seq)
{
return true;
}
#endif /* CONFIG_PER_VMA_LOCK */
static inline void mmap_init_lock(struct mm_struct *mm)
{
init_rwsem(&mm->mmap_lock);
mm_lock_seqcount_init(mm);
}
static inline void mmap_write_lock(struct mm_struct *mm)
{
__mmap_lock_trace_start_locking(mm, true);
down_write(&mm->mmap_lock);
mm_lock_seqcount_begin(mm);
__mmap_lock_trace_acquire_returned(mm, true, true);
}
static inline void mmap_write_lock_nested(struct mm_struct *mm, int subclass)
{
__mmap_lock_trace_start_locking(mm, true);
down_write_nested(&mm->mmap_lock, subclass);
mm_lock_seqcount_begin(mm);
__mmap_lock_trace_acquire_returned(mm, true, true);
}
static inline int mmap_write_lock_killable(struct mm_struct *mm)
{
int ret;
__mmap_lock_trace_start_locking(mm, true);
ret = down_write_killable(&mm->mmap_lock);
if (!ret)
mm_lock_seqcount_begin(mm);
__mmap_lock_trace_acquire_returned(mm, true, ret == 0);
return ret;
}
/* /*
* Drop all currently-held per-VMA locks. * Drop all currently-held per-VMA locks.
* This is called from the mmap_lock implementation directly before releasing * This is called from the mmap_lock implementation directly before releasing
@ -82,46 +167,7 @@ static inline void mmap_assert_write_locked(const struct mm_struct *mm)
static inline void vma_end_write_all(struct mm_struct *mm) static inline void vma_end_write_all(struct mm_struct *mm)
{ {
mmap_assert_write_locked(mm); mmap_assert_write_locked(mm);
/* mm_lock_seqcount_end(mm);
* Nobody can concurrently modify mm->mm_lock_seq due to exclusive
* mmap_lock being held.
* We need RELEASE semantics here to ensure that preceding stores into
* the VMA take effect before we unlock it with this store.
* Pairs with ACQUIRE semantics in vma_start_read().
*/
smp_store_release(&mm->mm_lock_seq, mm->mm_lock_seq + 1);
}
#else
static inline void vma_end_write_all(struct mm_struct *mm) {}
#endif
static inline void mmap_init_lock(struct mm_struct *mm)
{
init_rwsem(&mm->mmap_lock);
}
static inline void mmap_write_lock(struct mm_struct *mm)
{
__mmap_lock_trace_start_locking(mm, true);
down_write(&mm->mmap_lock);
__mmap_lock_trace_acquire_returned(mm, true, true);
}
static inline void mmap_write_lock_nested(struct mm_struct *mm, int subclass)
{
__mmap_lock_trace_start_locking(mm, true);
down_write_nested(&mm->mmap_lock, subclass);
__mmap_lock_trace_acquire_returned(mm, true, true);
}
static inline int mmap_write_lock_killable(struct mm_struct *mm)
{
int ret;
__mmap_lock_trace_start_locking(mm, true);
ret = down_write_killable(&mm->mmap_lock);
__mmap_lock_trace_acquire_returned(mm, true, ret == 0);
return ret;
} }
static inline void mmap_write_unlock(struct mm_struct *mm) static inline void mmap_write_unlock(struct mm_struct *mm)

View File

@ -1279,6 +1279,11 @@ static inline void perf_sample_save_callchain(struct perf_sample_data *data,
{ {
int size = 1; int size = 1;
if (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN))
return;
if (WARN_ON_ONCE(data->sample_flags & PERF_SAMPLE_CALLCHAIN))
return;
data->callchain = perf_callchain(event, regs); data->callchain = perf_callchain(event, regs);
size += data->callchain->nr; size += data->callchain->nr;
@ -1287,12 +1292,18 @@ static inline void perf_sample_save_callchain(struct perf_sample_data *data,
} }
static inline void perf_sample_save_raw_data(struct perf_sample_data *data, static inline void perf_sample_save_raw_data(struct perf_sample_data *data,
struct perf_event *event,
struct perf_raw_record *raw) struct perf_raw_record *raw)
{ {
struct perf_raw_frag *frag = &raw->frag; struct perf_raw_frag *frag = &raw->frag;
u32 sum = 0; u32 sum = 0;
int size; int size;
if (!(event->attr.sample_type & PERF_SAMPLE_RAW))
return;
if (WARN_ON_ONCE(data->sample_flags & PERF_SAMPLE_RAW))
return;
do { do {
sum += frag->size; sum += frag->size;
if (perf_raw_frag_last(frag)) if (perf_raw_frag_last(frag))
@ -1309,6 +1320,11 @@ static inline void perf_sample_save_raw_data(struct perf_sample_data *data,
data->sample_flags |= PERF_SAMPLE_RAW; data->sample_flags |= PERF_SAMPLE_RAW;
} }
static inline bool has_branch_stack(struct perf_event *event)
{
return event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK;
}
static inline void perf_sample_save_brstack(struct perf_sample_data *data, static inline void perf_sample_save_brstack(struct perf_sample_data *data,
struct perf_event *event, struct perf_event *event,
struct perf_branch_stack *brs, struct perf_branch_stack *brs,
@ -1316,6 +1332,11 @@ static inline void perf_sample_save_brstack(struct perf_sample_data *data,
{ {
int size = sizeof(u64); /* nr */ int size = sizeof(u64); /* nr */
if (!has_branch_stack(event))
return;
if (WARN_ON_ONCE(data->sample_flags & PERF_SAMPLE_BRANCH_STACK))
return;
if (branch_sample_hw_index(event)) if (branch_sample_hw_index(event))
size += sizeof(u64); size += sizeof(u64);
size += brs->nr * sizeof(struct perf_branch_entry); size += brs->nr * sizeof(struct perf_branch_entry);
@ -1669,6 +1690,8 @@ static inline int perf_allow_tracepoint(struct perf_event_attr *attr)
return security_perf_event_open(attr, PERF_SECURITY_TRACEPOINT); return security_perf_event_open(attr, PERF_SECURITY_TRACEPOINT);
} }
extern int perf_exclude_event(struct perf_event *event, struct pt_regs *regs);
extern void perf_event_init(void); extern void perf_event_init(void);
extern void perf_tp_event(u16 event_type, u64 count, void *record, extern void perf_tp_event(u16 event_type, u64 count, void *record,
int entry_size, struct pt_regs *regs, int entry_size, struct pt_regs *regs,
@ -1705,11 +1728,6 @@ static inline unsigned long perf_arch_guest_misc_flags(struct pt_regs *regs)
# define perf_arch_guest_misc_flags(regs) perf_arch_guest_misc_flags(regs) # define perf_arch_guest_misc_flags(regs) perf_arch_guest_misc_flags(regs)
#endif #endif
static inline bool has_branch_stack(struct perf_event *event)
{
return event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK;
}
static inline bool needs_branch_stack(struct perf_event *event) static inline bool needs_branch_stack(struct perf_event *event)
{ {
return event->attr.branch_sample_type != 0; return event->attr.branch_sample_type != 0;
@ -1879,6 +1897,10 @@ static inline u64 perf_event_pause(struct perf_event *event, bool reset)
{ {
return 0; return 0;
} }
static inline int perf_exclude_event(struct perf_event *event, struct pt_regs *regs)
{
return 0;
}
#endif #endif
#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL) #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL)

View File

@ -318,6 +318,28 @@ SEQCOUNT_LOCKNAME(mutex, struct mutex, true, mutex)
__seq; \ __seq; \
}) })
/**
* raw_seqcount_try_begin() - begin a seqcount_t read critical section
* w/o lockdep and w/o counter stabilization
* @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
*
* Similar to raw_seqcount_begin(), except it enables eliding the critical
* section entirely if odd, instead of doing the speculation knowing it will
* fail.
*
* Useful when counter stabilization is more or less equivalent to taking
* the lock and there is a slowpath that does that.
*
* If true, start will be set to the (even) sequence count read.
*
* Return: true when a read critical section is started.
*/
#define raw_seqcount_try_begin(s, start) \
({ \
start = raw_read_seqcount(s); \
!(start & 1); \
})
/** /**
* raw_seqcount_begin() - begin a seqcount_t read critical section w/o * raw_seqcount_begin() - begin a seqcount_t read critical section w/o
* lockdep and w/o counter stabilization * lockdep and w/o counter stabilization

View File

@ -16,6 +16,7 @@
#include <linux/types.h> #include <linux/types.h>
#include <linux/wait.h> #include <linux/wait.h>
#include <linux/timer.h> #include <linux/timer.h>
#include <linux/seqlock.h>
struct uprobe; struct uprobe;
struct vm_area_struct; struct vm_area_struct;
@ -124,6 +125,10 @@ struct uprobe_task {
unsigned int depth; unsigned int depth;
struct return_instance *return_instances; struct return_instance *return_instances;
struct return_instance *ri_pool;
struct timer_list ri_timer;
seqcount_t ri_seqcount;
union { union {
struct { struct {
struct arch_uprobe_task autask; struct arch_uprobe_task autask;
@ -137,7 +142,6 @@ struct uprobe_task {
}; };
struct uprobe *active_uprobe; struct uprobe *active_uprobe;
struct timer_list ri_timer;
unsigned long xol_vaddr; unsigned long xol_vaddr;
struct arch_uprobe *auprobe; struct arch_uprobe *auprobe;
@ -154,12 +158,18 @@ struct return_instance {
unsigned long stack; /* stack pointer */ unsigned long stack; /* stack pointer */
unsigned long orig_ret_vaddr; /* original return address */ unsigned long orig_ret_vaddr; /* original return address */
bool chained; /* true, if instance is nested */ bool chained; /* true, if instance is nested */
int consumers_cnt; int cons_cnt; /* total number of session consumers */
struct return_instance *next; /* keep as stack */ struct return_instance *next; /* keep as stack */
struct rcu_head rcu; struct rcu_head rcu;
struct return_consumer consumers[] __counted_by(consumers_cnt); /* singular pre-allocated return_consumer instance for common case */
struct return_consumer consumer;
/*
* extra return_consumer instances for rare cases of multiple session consumers,
* contains (cons_cnt - 1) elements
*/
struct return_consumer *extra_consumers;
} ____cacheline_aligned; } ____cacheline_aligned;
enum rp_check { enum rp_check {

View File

@ -6277,41 +6277,6 @@ unlock:
} }
EXPORT_SYMBOL_GPL(perf_event_update_userpage); EXPORT_SYMBOL_GPL(perf_event_update_userpage);
static vm_fault_t perf_mmap_fault(struct vm_fault *vmf)
{
struct perf_event *event = vmf->vma->vm_file->private_data;
struct perf_buffer *rb;
vm_fault_t ret = VM_FAULT_SIGBUS;
if (vmf->flags & FAULT_FLAG_MKWRITE) {
if (vmf->pgoff == 0)
ret = 0;
return ret;
}
rcu_read_lock();
rb = rcu_dereference(event->rb);
if (!rb)
goto unlock;
if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
goto unlock;
vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
if (!vmf->page)
goto unlock;
get_page(vmf->page);
vmf->page->mapping = vmf->vma->vm_file->f_mapping;
vmf->page->index = vmf->pgoff;
ret = 0;
unlock:
rcu_read_unlock();
return ret;
}
static void ring_buffer_attach(struct perf_event *event, static void ring_buffer_attach(struct perf_event *event,
struct perf_buffer *rb) struct perf_buffer *rb)
{ {
@ -6551,13 +6516,87 @@ out_put:
ring_buffer_put(rb); /* could be last */ ring_buffer_put(rb); /* could be last */
} }
static vm_fault_t perf_mmap_pfn_mkwrite(struct vm_fault *vmf)
{
/* The first page is the user control page, others are read-only. */
return vmf->pgoff == 0 ? 0 : VM_FAULT_SIGBUS;
}
static const struct vm_operations_struct perf_mmap_vmops = { static const struct vm_operations_struct perf_mmap_vmops = {
.open = perf_mmap_open, .open = perf_mmap_open,
.close = perf_mmap_close, /* non mergeable */ .close = perf_mmap_close, /* non mergeable */
.fault = perf_mmap_fault, .pfn_mkwrite = perf_mmap_pfn_mkwrite,
.page_mkwrite = perf_mmap_fault,
}; };
static int map_range(struct perf_buffer *rb, struct vm_area_struct *vma)
{
unsigned long nr_pages = vma_pages(vma);
int err = 0;
unsigned long pagenum;
/*
* We map this as a VM_PFNMAP VMA.
*
* This is not ideal as this is designed broadly for mappings of PFNs
* referencing memory-mapped I/O ranges or non-system RAM i.e. for which
* !pfn_valid(pfn).
*
* We are mapping kernel-allocated memory (memory we manage ourselves)
* which would more ideally be mapped using vm_insert_page() or a
* similar mechanism, that is as a VM_MIXEDMAP mapping.
*
* However this won't work here, because:
*
* 1. It uses vma->vm_page_prot, but this field has not been completely
* setup at the point of the f_op->mmp() hook, so we are unable to
* indicate that this should be mapped CoW in order that the
* mkwrite() hook can be invoked to make the first page R/W and the
* rest R/O as desired.
*
* 2. Anything other than a VM_PFNMAP of valid PFNs will result in
* vm_normal_page() returning a struct page * pointer, which means
* vm_ops->page_mkwrite() will be invoked rather than
* vm_ops->pfn_mkwrite(), and this means we have to set page->mapping
* to work around retry logic in the fault handler, however this
* field is no longer allowed to be used within struct page.
*
* 3. Having a struct page * made available in the fault logic also
* means that the page gets put on the rmap and becomes
* inappropriately accessible and subject to map and ref counting.
*
* Ideally we would have a mechanism that could explicitly express our
* desires, but this is not currently the case, so we instead use
* VM_PFNMAP.
*
* We manage the lifetime of these mappings with internal refcounts (see
* perf_mmap_open() and perf_mmap_close()) so we ensure the lifetime of
* this mapping is maintained correctly.
*/
for (pagenum = 0; pagenum < nr_pages; pagenum++) {
unsigned long va = vma->vm_start + PAGE_SIZE * pagenum;
struct page *page = perf_mmap_to_page(rb, vma->vm_pgoff + pagenum);
if (page == NULL) {
err = -EINVAL;
break;
}
/* Map readonly, perf_mmap_pfn_mkwrite() called on write fault. */
err = remap_pfn_range(vma, va, page_to_pfn(page), PAGE_SIZE,
vm_get_page_prot(vma->vm_flags & ~VM_SHARED));
if (err)
break;
}
#ifdef CONFIG_MMU
/* Clear any partial mappings on error. */
if (err)
zap_page_range_single(vma, vma->vm_start, nr_pages * PAGE_SIZE, NULL);
#endif
return err;
}
static int perf_mmap(struct file *file, struct vm_area_struct *vma) static int perf_mmap(struct file *file, struct vm_area_struct *vma)
{ {
struct perf_event *event = file->private_data; struct perf_event *event = file->private_data;
@ -6682,6 +6721,8 @@ again:
goto again; goto again;
} }
/* We need the rb to map pages. */
rb = event->rb;
goto unlock; goto unlock;
} }
@ -6776,6 +6817,9 @@ aux_unlock:
vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP); vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP);
vma->vm_ops = &perf_mmap_vmops; vma->vm_ops = &perf_mmap_vmops;
if (!ret)
ret = map_range(rb, vma);
if (event->pmu->event_mapped) if (event->pmu->event_mapped)
event->pmu->event_mapped(event, vma->vm_mm); event->pmu->event_mapped(event, vma->vm_mm);
@ -10039,8 +10083,7 @@ static void perf_swevent_event(struct perf_event *event, u64 nr,
perf_swevent_overflow(event, 0, data, regs); perf_swevent_overflow(event, 0, data, regs);
} }
static int perf_exclude_event(struct perf_event *event, int perf_exclude_event(struct perf_event *event, struct pt_regs *regs)
struct pt_regs *regs)
{ {
if (event->hw.state & PERF_HES_STOPPED) if (event->hw.state & PERF_HES_STOPPED)
return 1; return 1;
@ -10425,9 +10468,9 @@ static struct pmu perf_tracepoint = {
}; };
static int perf_tp_filter_match(struct perf_event *event, static int perf_tp_filter_match(struct perf_event *event,
struct perf_sample_data *data) struct perf_raw_record *raw)
{ {
void *record = data->raw->frag.data; void *record = raw->frag.data;
/* only top level events have filters set */ /* only top level events have filters set */
if (event->parent) if (event->parent)
@ -10439,7 +10482,7 @@ static int perf_tp_filter_match(struct perf_event *event,
} }
static int perf_tp_event_match(struct perf_event *event, static int perf_tp_event_match(struct perf_event *event,
struct perf_sample_data *data, struct perf_raw_record *raw,
struct pt_regs *regs) struct pt_regs *regs)
{ {
if (event->hw.state & PERF_HES_STOPPED) if (event->hw.state & PERF_HES_STOPPED)
@ -10450,7 +10493,7 @@ static int perf_tp_event_match(struct perf_event *event,
if (event->attr.exclude_kernel && !user_mode(regs)) if (event->attr.exclude_kernel && !user_mode(regs))
return 0; return 0;
if (!perf_tp_filter_match(event, data)) if (!perf_tp_filter_match(event, raw))
return 0; return 0;
return 1; return 1;
@ -10476,6 +10519,7 @@ EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit);
static void __perf_tp_event_target_task(u64 count, void *record, static void __perf_tp_event_target_task(u64 count, void *record,
struct pt_regs *regs, struct pt_regs *regs,
struct perf_sample_data *data, struct perf_sample_data *data,
struct perf_raw_record *raw,
struct perf_event *event) struct perf_event *event)
{ {
struct trace_entry *entry = record; struct trace_entry *entry = record;
@ -10485,13 +10529,17 @@ static void __perf_tp_event_target_task(u64 count, void *record,
/* Cannot deliver synchronous signal to other task. */ /* Cannot deliver synchronous signal to other task. */
if (event->attr.sigtrap) if (event->attr.sigtrap)
return; return;
if (perf_tp_event_match(event, data, regs)) if (perf_tp_event_match(event, raw, regs)) {
perf_sample_data_init(data, 0, 0);
perf_sample_save_raw_data(data, event, raw);
perf_swevent_event(event, count, data, regs); perf_swevent_event(event, count, data, regs);
}
} }
static void perf_tp_event_target_task(u64 count, void *record, static void perf_tp_event_target_task(u64 count, void *record,
struct pt_regs *regs, struct pt_regs *regs,
struct perf_sample_data *data, struct perf_sample_data *data,
struct perf_raw_record *raw,
struct perf_event_context *ctx) struct perf_event_context *ctx)
{ {
unsigned int cpu = smp_processor_id(); unsigned int cpu = smp_processor_id();
@ -10499,15 +10547,15 @@ static void perf_tp_event_target_task(u64 count, void *record,
struct perf_event *event, *sibling; struct perf_event *event, *sibling;
perf_event_groups_for_cpu_pmu(event, &ctx->pinned_groups, cpu, pmu) { perf_event_groups_for_cpu_pmu(event, &ctx->pinned_groups, cpu, pmu) {
__perf_tp_event_target_task(count, record, regs, data, event); __perf_tp_event_target_task(count, record, regs, data, raw, event);
for_each_sibling_event(sibling, event) for_each_sibling_event(sibling, event)
__perf_tp_event_target_task(count, record, regs, data, sibling); __perf_tp_event_target_task(count, record, regs, data, raw, sibling);
} }
perf_event_groups_for_cpu_pmu(event, &ctx->flexible_groups, cpu, pmu) { perf_event_groups_for_cpu_pmu(event, &ctx->flexible_groups, cpu, pmu) {
__perf_tp_event_target_task(count, record, regs, data, event); __perf_tp_event_target_task(count, record, regs, data, raw, event);
for_each_sibling_event(sibling, event) for_each_sibling_event(sibling, event)
__perf_tp_event_target_task(count, record, regs, data, sibling); __perf_tp_event_target_task(count, record, regs, data, raw, sibling);
} }
} }
@ -10525,15 +10573,10 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
}, },
}; };
perf_sample_data_init(&data, 0, 0);
perf_sample_save_raw_data(&data, &raw);
perf_trace_buf_update(record, event_type); perf_trace_buf_update(record, event_type);
hlist_for_each_entry_rcu(event, head, hlist_entry) { hlist_for_each_entry_rcu(event, head, hlist_entry) {
if (perf_tp_event_match(event, &data, regs)) { if (perf_tp_event_match(event, &raw, regs)) {
perf_swevent_event(event, count, &data, regs);
/* /*
* Here use the same on-stack perf_sample_data, * Here use the same on-stack perf_sample_data,
* some members in data are event-specific and * some members in data are event-specific and
@ -10543,7 +10586,8 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
* because data->sample_flags is set. * because data->sample_flags is set.
*/ */
perf_sample_data_init(&data, 0, 0); perf_sample_data_init(&data, 0, 0);
perf_sample_save_raw_data(&data, &raw); perf_sample_save_raw_data(&data, event, &raw);
perf_swevent_event(event, count, &data, regs);
} }
} }
@ -10560,7 +10604,7 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
goto unlock; goto unlock;
raw_spin_lock(&ctx->lock); raw_spin_lock(&ctx->lock);
perf_tp_event_target_task(count, record, regs, &data, ctx); perf_tp_event_target_task(count, record, regs, &data, &raw, ctx);
raw_spin_unlock(&ctx->lock); raw_spin_unlock(&ctx->lock);
unlock: unlock:
rcu_read_unlock(); rcu_read_unlock();

View File

@ -643,7 +643,6 @@ static void rb_free_aux_page(struct perf_buffer *rb, int idx)
struct page *page = virt_to_page(rb->aux_pages[idx]); struct page *page = virt_to_page(rb->aux_pages[idx]);
ClearPagePrivate(page); ClearPagePrivate(page);
page->mapping = NULL;
__free_page(page); __free_page(page);
} }
@ -819,7 +818,6 @@ static void perf_mmap_free_page(void *addr)
{ {
struct page *page = virt_to_page(addr); struct page *page = virt_to_page(addr);
page->mapping = NULL;
__free_page(page); __free_page(page);
} }
@ -890,28 +888,13 @@ __perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff)
return vmalloc_to_page((void *)rb->user_page + pgoff * PAGE_SIZE); return vmalloc_to_page((void *)rb->user_page + pgoff * PAGE_SIZE);
} }
static void perf_mmap_unmark_page(void *addr)
{
struct page *page = vmalloc_to_page(addr);
page->mapping = NULL;
}
static void rb_free_work(struct work_struct *work) static void rb_free_work(struct work_struct *work)
{ {
struct perf_buffer *rb; struct perf_buffer *rb;
void *base;
int i, nr;
rb = container_of(work, struct perf_buffer, work); rb = container_of(work, struct perf_buffer, work);
nr = data_page_nr(rb);
base = rb->user_page; vfree(rb->user_page);
/* The '<=' counts in the user page. */
for (i = 0; i <= nr; i++)
perf_mmap_unmark_page(base + (i * PAGE_SIZE));
vfree(base);
kfree(rb); kfree(rb);
} }

View File

@ -1888,9 +1888,33 @@ unsigned long uprobe_get_trap_addr(struct pt_regs *regs)
return instruction_pointer(regs); return instruction_pointer(regs);
} }
static struct return_instance *free_ret_instance(struct return_instance *ri, bool cleanup_hprobe) static void ri_pool_push(struct uprobe_task *utask, struct return_instance *ri)
{ {
struct return_instance *next = ri->next; ri->cons_cnt = 0;
ri->next = utask->ri_pool;
utask->ri_pool = ri;
}
static struct return_instance *ri_pool_pop(struct uprobe_task *utask)
{
struct return_instance *ri = utask->ri_pool;
if (likely(ri))
utask->ri_pool = ri->next;
return ri;
}
static void ri_free(struct return_instance *ri)
{
kfree(ri->extra_consumers);
kfree_rcu(ri, rcu);
}
static void free_ret_instance(struct uprobe_task *utask,
struct return_instance *ri, bool cleanup_hprobe)
{
unsigned seq;
if (cleanup_hprobe) { if (cleanup_hprobe) {
enum hprobe_state hstate; enum hprobe_state hstate;
@ -1899,8 +1923,22 @@ static struct return_instance *free_ret_instance(struct return_instance *ri, boo
hprobe_finalize(&ri->hprobe, hstate); hprobe_finalize(&ri->hprobe, hstate);
} }
kfree_rcu(ri, rcu); /*
return next; * At this point return_instance is unlinked from utask's
* return_instances list and this has become visible to ri_timer().
* If seqcount now indicates that ri_timer's return instance
* processing loop isn't active, we can return ri into the pool of
* to-be-reused return instances for future uretprobes. If ri_timer()
* happens to be running right now, though, we fallback to safety and
* just perform RCU-delated freeing of ri.
*/
if (raw_seqcount_try_begin(&utask->ri_seqcount, seq)) {
/* immediate reuse of ri without RCU GP is OK */
ri_pool_push(utask, ri);
} else {
/* we might be racing with ri_timer(), so play it safe */
ri_free(ri);
}
} }
/* /*
@ -1910,7 +1948,7 @@ static struct return_instance *free_ret_instance(struct return_instance *ri, boo
void uprobe_free_utask(struct task_struct *t) void uprobe_free_utask(struct task_struct *t)
{ {
struct uprobe_task *utask = t->utask; struct uprobe_task *utask = t->utask;
struct return_instance *ri; struct return_instance *ri, *ri_next;
if (!utask) if (!utask)
return; return;
@ -1921,8 +1959,19 @@ void uprobe_free_utask(struct task_struct *t)
timer_delete_sync(&utask->ri_timer); timer_delete_sync(&utask->ri_timer);
ri = utask->return_instances; ri = utask->return_instances;
while (ri) while (ri) {
ri = free_ret_instance(ri, true /* cleanup_hprobe */); ri_next = ri->next;
free_ret_instance(utask, ri, true /* cleanup_hprobe */);
ri = ri_next;
}
/* free_ret_instance() above might add to ri_pool, so this loop should come last */
ri = utask->ri_pool;
while (ri) {
ri_next = ri->next;
ri_free(ri);
ri = ri_next;
}
kfree(utask); kfree(utask);
} }
@ -1942,8 +1991,12 @@ static void ri_timer(struct timer_list *timer)
/* RCU protects return_instance from freeing. */ /* RCU protects return_instance from freeing. */
guard(rcu)(); guard(rcu)();
write_seqcount_begin(&utask->ri_seqcount);
for_each_ret_instance_rcu(ri, utask->return_instances) for_each_ret_instance_rcu(ri, utask->return_instances)
hprobe_expire(&ri->hprobe, false); hprobe_expire(&ri->hprobe, false);
write_seqcount_end(&utask->ri_seqcount);
} }
static struct uprobe_task *alloc_utask(void) static struct uprobe_task *alloc_utask(void)
@ -1955,6 +2008,7 @@ static struct uprobe_task *alloc_utask(void)
return NULL; return NULL;
timer_setup(&utask->ri_timer, ri_timer, 0); timer_setup(&utask->ri_timer, ri_timer, 0);
seqcount_init(&utask->ri_seqcount);
return utask; return utask;
} }
@ -1974,32 +2028,40 @@ static struct uprobe_task *get_utask(void)
return current->utask; return current->utask;
} }
static size_t ri_size(int consumers_cnt) static struct return_instance *alloc_return_instance(struct uprobe_task *utask)
{ {
struct return_instance *ri; struct return_instance *ri;
return sizeof(*ri) + sizeof(ri->consumers[0]) * consumers_cnt; ri = ri_pool_pop(utask);
} if (ri)
return ri;
#define DEF_CNT 4 ri = kzalloc(sizeof(*ri), GFP_KERNEL);
static struct return_instance *alloc_return_instance(void)
{
struct return_instance *ri;
ri = kzalloc(ri_size(DEF_CNT), GFP_KERNEL);
if (!ri) if (!ri)
return ZERO_SIZE_PTR; return ZERO_SIZE_PTR;
ri->consumers_cnt = DEF_CNT;
return ri; return ri;
} }
static struct return_instance *dup_return_instance(struct return_instance *old) static struct return_instance *dup_return_instance(struct return_instance *old)
{ {
size_t size = ri_size(old->consumers_cnt); struct return_instance *ri;
return kmemdup(old, size, GFP_KERNEL); ri = kmemdup(old, sizeof(*ri), GFP_KERNEL);
if (!ri)
return NULL;
if (unlikely(old->cons_cnt > 1)) {
ri->extra_consumers = kmemdup(old->extra_consumers,
sizeof(ri->extra_consumers[0]) * (old->cons_cnt - 1),
GFP_KERNEL);
if (!ri->extra_consumers) {
kfree(ri);
return NULL;
}
}
return ri;
} }
static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask) static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask)
@ -2108,14 +2170,17 @@ unsigned long uprobe_get_trampoline_vaddr(void)
static void cleanup_return_instances(struct uprobe_task *utask, bool chained, static void cleanup_return_instances(struct uprobe_task *utask, bool chained,
struct pt_regs *regs) struct pt_regs *regs)
{ {
struct return_instance *ri = utask->return_instances; struct return_instance *ri = utask->return_instances, *ri_next;
enum rp_check ctx = chained ? RP_CHECK_CHAIN_CALL : RP_CHECK_CALL; enum rp_check ctx = chained ? RP_CHECK_CHAIN_CALL : RP_CHECK_CALL;
while (ri && !arch_uretprobe_is_alive(ri, ctx, regs)) { while (ri && !arch_uretprobe_is_alive(ri, ctx, regs)) {
ri = free_ret_instance(ri, true /* cleanup_hprobe */); ri_next = ri->next;
rcu_assign_pointer(utask->return_instances, ri_next);
utask->depth--; utask->depth--;
free_ret_instance(utask, ri, true /* cleanup_hprobe */);
ri = ri_next;
} }
rcu_assign_pointer(utask->return_instances, ri);
} }
static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs, static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs,
@ -2180,7 +2245,7 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs,
return; return;
free: free:
kfree(ri); ri_free(ri);
} }
/* Prepare to single-step probed instruction out of line. */ /* Prepare to single-step probed instruction out of line. */
@ -2294,6 +2359,47 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
return is_trap_insn(&opcode); return is_trap_insn(&opcode);
} }
static struct uprobe *find_active_uprobe_speculative(unsigned long bp_vaddr)
{
struct mm_struct *mm = current->mm;
struct uprobe *uprobe = NULL;
struct vm_area_struct *vma;
struct file *vm_file;
loff_t offset;
unsigned int seq;
guard(rcu)();
if (!mmap_lock_speculate_try_begin(mm, &seq))
return NULL;
vma = vma_lookup(mm, bp_vaddr);
if (!vma)
return NULL;
/*
* vm_file memory can be reused for another instance of struct file,
* but can't be freed from under us, so it's safe to read fields from
* it, even if the values are some garbage values; ultimately
* find_uprobe_rcu() + mmap_lock_speculation_end() check will ensure
* that whatever we speculatively found is correct
*/
vm_file = READ_ONCE(vma->vm_file);
if (!vm_file)
return NULL;
offset = (loff_t)(vma->vm_pgoff << PAGE_SHIFT) + (bp_vaddr - vma->vm_start);
uprobe = find_uprobe_rcu(vm_file->f_inode, offset);
if (!uprobe)
return NULL;
/* now double check that nothing about MM changed */
if (mmap_lock_speculate_retry(mm, seq))
return NULL;
return uprobe;
}
/* assumes being inside RCU protected region */ /* assumes being inside RCU protected region */
static struct uprobe *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swbp) static struct uprobe *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swbp)
{ {
@ -2301,10 +2407,14 @@ static struct uprobe *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swb
struct uprobe *uprobe = NULL; struct uprobe *uprobe = NULL;
struct vm_area_struct *vma; struct vm_area_struct *vma;
uprobe = find_active_uprobe_speculative(bp_vaddr);
if (uprobe)
return uprobe;
mmap_read_lock(mm); mmap_read_lock(mm);
vma = vma_lookup(mm, bp_vaddr); vma = vma_lookup(mm, bp_vaddr);
if (vma) { if (vma) {
if (valid_vma(vma, false)) { if (vma->vm_file) {
struct inode *inode = file_inode(vma->vm_file); struct inode *inode = file_inode(vma->vm_file);
loff_t offset = vaddr_to_offset(vma, bp_vaddr); loff_t offset = vaddr_to_offset(vma, bp_vaddr);
@ -2324,25 +2434,27 @@ static struct uprobe *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swb
return uprobe; return uprobe;
} }
static struct return_instance* static struct return_instance *push_consumer(struct return_instance *ri, __u64 id, __u64 cookie)
push_consumer(struct return_instance *ri, int idx, __u64 id, __u64 cookie)
{ {
struct return_consumer *ric;
if (unlikely(ri == ZERO_SIZE_PTR)) if (unlikely(ri == ZERO_SIZE_PTR))
return ri; return ri;
if (unlikely(idx >= ri->consumers_cnt)) { if (unlikely(ri->cons_cnt > 0)) {
struct return_instance *old_ri = ri; ric = krealloc(ri->extra_consumers, sizeof(*ric) * ri->cons_cnt, GFP_KERNEL);
if (!ric) {
ri->consumers_cnt += DEF_CNT; ri_free(ri);
ri = krealloc(old_ri, ri_size(old_ri->consumers_cnt), GFP_KERNEL);
if (!ri) {
kfree(old_ri);
return ZERO_SIZE_PTR; return ZERO_SIZE_PTR;
} }
ri->extra_consumers = ric;
} }
ri->consumers[idx].id = id; ric = likely(ri->cons_cnt == 0) ? &ri->consumer : &ri->extra_consumers[ri->cons_cnt - 1];
ri->consumers[idx].cookie = cookie; ric->id = id;
ric->cookie = cookie;
ri->cons_cnt++;
return ri; return ri;
} }
@ -2350,14 +2462,17 @@ static struct return_consumer *
return_consumer_find(struct return_instance *ri, int *iter, int id) return_consumer_find(struct return_instance *ri, int *iter, int id)
{ {
struct return_consumer *ric; struct return_consumer *ric;
int idx = *iter; int idx;
for (ric = &ri->consumers[idx]; idx < ri->consumers_cnt; idx++, ric++) { for (idx = *iter; idx < ri->cons_cnt; idx++)
{
ric = likely(idx == 0) ? &ri->consumer : &ri->extra_consumers[idx - 1];
if (ric->id == id) { if (ric->id == id) {
*iter = idx + 1; *iter = idx + 1;
return ric; return ric;
} }
} }
return NULL; return NULL;
} }
@ -2371,9 +2486,9 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
struct uprobe_consumer *uc; struct uprobe_consumer *uc;
bool has_consumers = false, remove = true; bool has_consumers = false, remove = true;
struct return_instance *ri = NULL; struct return_instance *ri = NULL;
int push_idx = 0; struct uprobe_task *utask = current->utask;
current->utask->auprobe = &uprobe->arch; utask->auprobe = &uprobe->arch;
list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) { list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) {
bool session = uc->handler && uc->ret_handler; bool session = uc->handler && uc->ret_handler;
@ -2393,21 +2508,15 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
continue; continue;
if (!ri) if (!ri)
ri = alloc_return_instance(); ri = alloc_return_instance(utask);
if (session) if (session)
ri = push_consumer(ri, push_idx++, uc->id, cookie); ri = push_consumer(ri, uc->id, cookie);
} }
current->utask->auprobe = NULL; utask->auprobe = NULL;
if (!ZERO_OR_NULL_PTR(ri)) { if (!ZERO_OR_NULL_PTR(ri))
/*
* The push_idx value has the final number of return consumers,
* and ri->consumers_cnt has number of allocated consumers.
*/
ri->consumers_cnt = push_idx;
prepare_uretprobe(uprobe, regs, ri); prepare_uretprobe(uprobe, regs, ri);
}
if (remove && has_consumers) { if (remove && has_consumers) {
down_read(&uprobe->register_rwsem); down_read(&uprobe->register_rwsem);
@ -2461,7 +2570,7 @@ static struct return_instance *find_next_ret_chain(struct return_instance *ri)
void uprobe_handle_trampoline(struct pt_regs *regs) void uprobe_handle_trampoline(struct pt_regs *regs)
{ {
struct uprobe_task *utask; struct uprobe_task *utask;
struct return_instance *ri, *next; struct return_instance *ri, *ri_next, *next_chain;
struct uprobe *uprobe; struct uprobe *uprobe;
enum hprobe_state hstate; enum hprobe_state hstate;
bool valid; bool valid;
@ -2481,8 +2590,8 @@ void uprobe_handle_trampoline(struct pt_regs *regs)
* or NULL; the latter case means that nobody but ri->func * or NULL; the latter case means that nobody but ri->func
* could hit this trampoline on return. TODO: sigaltstack(). * could hit this trampoline on return. TODO: sigaltstack().
*/ */
next = find_next_ret_chain(ri); next_chain = find_next_ret_chain(ri);
valid = !next || arch_uretprobe_is_alive(next, RP_CHECK_RET, regs); valid = !next_chain || arch_uretprobe_is_alive(next_chain, RP_CHECK_RET, regs);
instruction_pointer_set(regs, ri->orig_ret_vaddr); instruction_pointer_set(regs, ri->orig_ret_vaddr);
do { do {
@ -2494,7 +2603,9 @@ void uprobe_handle_trampoline(struct pt_regs *regs)
* trampoline addresses on the stack are replaced with correct * trampoline addresses on the stack are replaced with correct
* original return addresses * original return addresses
*/ */
rcu_assign_pointer(utask->return_instances, ri->next); ri_next = ri->next;
rcu_assign_pointer(utask->return_instances, ri_next);
utask->depth--;
uprobe = hprobe_consume(&ri->hprobe, &hstate); uprobe = hprobe_consume(&ri->hprobe, &hstate);
if (valid) if (valid)
@ -2502,9 +2613,9 @@ void uprobe_handle_trampoline(struct pt_regs *regs)
hprobe_finalize(&ri->hprobe, hstate); hprobe_finalize(&ri->hprobe, hstate);
/* We already took care of hprobe, no need to waste more time on that. */ /* We already took care of hprobe, no need to waste more time on that. */
ri = free_ret_instance(ri, false /* !cleanup_hprobe */); free_ret_instance(utask, ri, false /* !cleanup_hprobe */);
utask->depth--; ri = ri_next;
} while (ri != next); } while (ri != next_chain);
} while (!valid); } while (!valid);
return; return;

View File

@ -448,7 +448,7 @@ static bool vma_lock_alloc(struct vm_area_struct *vma)
return false; return false;
init_rwsem(&vma->vm_lock->lock); init_rwsem(&vma->vm_lock->lock);
vma->vm_lock_seq = -1; vma->vm_lock_seq = UINT_MAX;
return true; return true;
} }
@ -1262,9 +1262,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
seqcount_init(&mm->write_protect_seq); seqcount_init(&mm->write_protect_seq);
mmap_init_lock(mm); mmap_init_lock(mm);
INIT_LIST_HEAD(&mm->mmlist); INIT_LIST_HEAD(&mm->mmlist);
#ifdef CONFIG_PER_VMA_LOCK
mm->mm_lock_seq = 0;
#endif
mm_pgtables_bytes_init(mm); mm_pgtables_bytes_init(mm);
mm->map_count = 0; mm->map_count = 0;
mm->locked_vm = 0; mm->locked_vm = 0;

View File

@ -619,7 +619,8 @@ static const struct bpf_func_proto bpf_perf_event_read_value_proto = {
static __always_inline u64 static __always_inline u64
__bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
u64 flags, struct perf_sample_data *sd) u64 flags, struct perf_raw_record *raw,
struct perf_sample_data *sd)
{ {
struct bpf_array *array = container_of(map, struct bpf_array, map); struct bpf_array *array = container_of(map, struct bpf_array, map);
unsigned int cpu = smp_processor_id(); unsigned int cpu = smp_processor_id();
@ -644,6 +645,8 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
if (unlikely(event->oncpu != cpu)) if (unlikely(event->oncpu != cpu))
return -EOPNOTSUPP; return -EOPNOTSUPP;
perf_sample_save_raw_data(sd, event, raw);
return perf_event_output(event, sd, regs); return perf_event_output(event, sd, regs);
} }
@ -687,9 +690,8 @@ BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map,
} }
perf_sample_data_init(sd, 0, 0); perf_sample_data_init(sd, 0, 0);
perf_sample_save_raw_data(sd, &raw);
err = __bpf_perf_event_output(regs, map, flags, sd); err = __bpf_perf_event_output(regs, map, flags, &raw, sd);
out: out:
this_cpu_dec(bpf_trace_nest_level); this_cpu_dec(bpf_trace_nest_level);
preempt_enable(); preempt_enable();
@ -748,9 +750,8 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
perf_fetch_caller_regs(regs); perf_fetch_caller_regs(regs);
perf_sample_data_init(sd, 0, 0); perf_sample_data_init(sd, 0, 0);
perf_sample_save_raw_data(sd, &raw);
ret = __bpf_perf_event_output(regs, map, flags, sd); ret = __bpf_perf_event_output(regs, map, flags, &raw, sd);
out: out:
this_cpu_dec(bpf_event_output_nest_level); this_cpu_dec(bpf_event_output_nest_level);
preempt_enable(); preempt_enable();

View File

@ -3360,8 +3360,7 @@ static unsigned long gup_fast(unsigned long start, unsigned long end,
return 0; return 0;
if (gup_flags & FOLL_PIN) { if (gup_flags & FOLL_PIN) {
seq = raw_read_seqcount(&current->mm->write_protect_seq); if (!raw_seqcount_try_begin(&current->mm->write_protect_seq, seq))
if (seq & 1)
return 0; return 0;
} }

View File

@ -40,7 +40,7 @@ struct mm_struct init_mm = {
.arg_lock = __SPIN_LOCK_UNLOCKED(init_mm.arg_lock), .arg_lock = __SPIN_LOCK_UNLOCKED(init_mm.arg_lock),
.mmlist = LIST_HEAD_INIT(init_mm.mmlist), .mmlist = LIST_HEAD_INIT(init_mm.mmlist),
#ifdef CONFIG_PER_VMA_LOCK #ifdef CONFIG_PER_VMA_LOCK
.mm_lock_seq = 0, .mm_lock_seq = SEQCNT_ZERO(init_mm.mm_lock_seq),
#endif #endif
.user_ns = &init_user_ns, .user_ns = &init_user_ns,
.cpu_bitmap = CPU_BITS_NONE, .cpu_bitmap = CPU_BITS_NONE,

View File

@ -89,7 +89,7 @@ static struct vm_area_struct *alloc_and_link_vma(struct mm_struct *mm,
* begun. Linking to the tree will have caused this to be incremented, * begun. Linking to the tree will have caused this to be incremented,
* which means we will get a false positive otherwise. * which means we will get a false positive otherwise.
*/ */
vma->vm_lock_seq = -1; vma->vm_lock_seq = UINT_MAX;
return vma; return vma;
} }
@ -214,7 +214,7 @@ static bool vma_write_started(struct vm_area_struct *vma)
int seq = vma->vm_lock_seq; int seq = vma->vm_lock_seq;
/* We reset after each check. */ /* We reset after each check. */
vma->vm_lock_seq = -1; vma->vm_lock_seq = UINT_MAX;
/* The vma_start_write() stub simply increments this value. */ /* The vma_start_write() stub simply increments this value. */
return seq > -1; return seq > -1;

View File

@ -241,7 +241,7 @@ struct vm_area_struct {
* counter reuse can only lead to occasional unnecessary use of the * counter reuse can only lead to occasional unnecessary use of the
* slowpath. * slowpath.
*/ */
int vm_lock_seq; unsigned int vm_lock_seq;
struct vma_lock *vm_lock; struct vma_lock *vm_lock;
#endif #endif
@ -416,7 +416,7 @@ static inline bool vma_lock_alloc(struct vm_area_struct *vma)
return false; return false;
init_rwsem(&vma->vm_lock->lock); init_rwsem(&vma->vm_lock->lock);
vma->vm_lock_seq = -1; vma->vm_lock_seq = UINT_MAX;
return true; return true;
} }