perf c2c: Add report option to show false sharing in adjacent cachelines

Many platforms have feature of adjacent cachelines prefetch, when it is
enabled, for data in RAM of 2 cachelines (2N and 2N+1) granularity, if
one is fetched to cache, the other one could likely be fetched too,
which sort of extends the cacheline size to double, thus the false
sharing could happens in adjacent cachelines.

0Day has captured performance changed related with this [1], and some
commercial software explicitly makes its hot global variables 128 bytes
aligned (2 cache lines) to avoid this kind of extended false sharing.

So add an option "--double-cl" for 'perf c2c report' to show false
sharing in double cache line granularity, which acts just like the
cacheline size is doubled. There is no change to c2c record. The
hardware events of shared cacheline are still per cacheline, and this
option just changes the granularity of how events are grouped and
displayed.

In the 'perf c2c report' output below (will-it-scale's 'pagefault2' case
on old kernel):

  ----------------------------------------------------------------------
     26       31        2        0        0        0  0xffff888103ec6000
  ----------------------------------------------------------------------
   35.48%   50.00%    0.00%    0.00%    0.00%   0x10     0       1  0xffffffff8133148b   1153   66    971   3748   74  [k] get_mem_cgroup_from_mm
    6.45%    0.00%    0.00%    0.00%    0.00%   0x10     0       1  0xffffffff813396e4    570    0   1531    879   75  [k] mem_cgroup_charge
   25.81%   50.00%    0.00%    0.00%    0.00%   0x54     0       1  0xffffffff81331472    949   70    593   3359   74  [k] get_mem_cgroup_from_mm
   19.35%    0.00%    0.00%    0.00%    0.00%   0x54     0       1  0xffffffff81339686   1352    0   1073   1022   74  [k] mem_cgroup_charge
    9.68%    0.00%    0.00%    0.00%    0.00%   0x54     0       1  0xffffffff813396d6   1401    0    863    768   74  [k] mem_cgroup_charge
    3.23%    0.00%    0.00%    0.00%    0.00%   0x54     0       1  0xffffffff81333106    618    0    804     11    9  [k] uncharge_batch

The offset 0x10 and 0x54 used to displayed in 2 groups, and now they are
listed together to give users a hint of extended false sharing.

[1]. https://lore.kernel.org/lkml/20201102091543.GM31092@shao2-debian/

Committer notes:

Link: https://lore.kernel.org/r/Y+wvVNWqXb70l4uy@feng-clx

Removed -a, leaving just as --double-cl, as this probably is not used so
frequently and perhaps will be even auto-detected if we manage to record
the MSR where this is configured.

Reviewed-by: Andi Kleen <ak@linux.intel.com>
Reviewed-by: Leo Yan <leo.yan@linaro.org>
Signed-off-by: Feng Tang <feng.tang@intel.com>
Tested-by: Leo Yan <leo.yan@linaro.org>
Acked-by: Joe Mario <jmario@redhat.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tim Chen <tim.c.chen@intel.com>
Cc: Xing Zhengjun <zhengjun.xing@linux.intel.com>
Link: https://lore.kernel.org/r/20230214075823.246414-1-feng.tang@intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
This commit is contained in:
Feng Tang 2023-02-14 15:58:23 +08:00 committed by Arnaldo Carvalho de Melo
parent 91621be65d
commit 1470a108a6
5 changed files with 49 additions and 17 deletions

View File

@ -130,6 +130,12 @@ REPORT OPTIONS
The known limitations include exception handing such as
setjmp/longjmp will have calls/returns not match.
--double-cl::
Group the detection of shared cacheline events into double cacheline
granularity. Some architectures have an Adjacent Cacheline Prefetch
feature, which causes cacheline sharing to behave like the cacheline
size is doubled.
C2C RECORD
----------
The perf c2c record command setup options related to HITM cacheline analysis

View File

@ -524,7 +524,7 @@ static int dcacheline_entry(struct perf_hpp_fmt *fmt, struct perf_hpp *hpp,
char buf[20];
if (he->mem_info)
addr = cl_address(he->mem_info->daddr.addr);
addr = cl_address(he->mem_info->daddr.addr, chk_double_cl);
return scnprintf(hpp->buf, hpp->size, "%*s", width, HEX_STR(buf, addr));
}
@ -562,7 +562,7 @@ static int offset_entry(struct perf_hpp_fmt *fmt, struct perf_hpp *hpp,
char buf[20];
if (he->mem_info)
addr = cl_offset(he->mem_info->daddr.al_addr);
addr = cl_offset(he->mem_info->daddr.al_addr, chk_double_cl);
return scnprintf(hpp->buf, hpp->size, "%*s", width, HEX_STR(buf, addr));
}
@ -574,9 +574,10 @@ offset_cmp(struct perf_hpp_fmt *fmt __maybe_unused,
uint64_t l = 0, r = 0;
if (left->mem_info)
l = cl_offset(left->mem_info->daddr.addr);
l = cl_offset(left->mem_info->daddr.addr, chk_double_cl);
if (right->mem_info)
r = cl_offset(right->mem_info->daddr.addr);
r = cl_offset(right->mem_info->daddr.addr, chk_double_cl);
return (int64_t)(r - l);
}
@ -2590,7 +2591,7 @@ perf_c2c_cacheline_browser__title(struct hist_browser *browser,
he = cl_browser->he;
if (he->mem_info)
addr = cl_address(he->mem_info->daddr.addr);
addr = cl_address(he->mem_info->daddr.addr, chk_double_cl);
scnprintf(bf, size, "Cacheline 0x%lx", addr);
return 0;
@ -2788,15 +2789,16 @@ static int ui_quirks(void)
if (!c2c.use_stdio) {
dim_offset.width = 5;
dim_offset.header = header_offset_tui;
nodestr = "CL";
nodestr = chk_double_cl ? "Double-CL" : "CL";
}
dim_percent_costly_snoop.header = percent_costly_snoop_header[c2c.display];
/* Fix the zero line for dcacheline column. */
buf = fill_line("Cacheline", dim_dcacheline.width +
dim_dcacheline_node.width +
dim_dcacheline_count.width + 4);
buf = fill_line(chk_double_cl ? "Double-Cacheline" : "Cacheline",
dim_dcacheline.width +
dim_dcacheline_node.width +
dim_dcacheline_count.width + 4);
if (!buf)
return -ENOMEM;
@ -3037,6 +3039,7 @@ static int perf_c2c__report(int argc, const char **argv)
OPT_BOOLEAN('f', "force", &symbol_conf.force, "don't complain, do it"),
OPT_BOOLEAN(0, "stitch-lbr", &c2c.stitch_lbr,
"Enable LBR callgraph stitching approach"),
OPT_BOOLEAN(0, "double-cl", &chk_double_cl, "Detect adjacent cacheline false sharing"),
OPT_PARENT(c2c_options),
OPT_END()
};

View File

@ -6,16 +6,31 @@
int __pure cacheline_size(void);
static inline u64 cl_address(u64 address)
/*
* Some architectures have 'Adjacent Cacheline Prefetch' feature,
* which performs like the cacheline size being doubled.
*/
static inline u64 cl_address(u64 address, bool double_cl)
{
u64 size = cacheline_size();
if (double_cl)
size *= 2;
/* return the cacheline of the address */
return (address & ~(cacheline_size() - 1));
return (address & ~(size - 1));
}
static inline u64 cl_offset(u64 address)
static inline u64 cl_offset(u64 address, bool double_cl)
{
/* return the cacheline of the address */
return (address & (cacheline_size() - 1));
u64 size = cacheline_size();
if (double_cl)
size *= 2;
/* return the offset inside cacheline */
return (address & (size - 1));
}
#endif // PERF_CACHELINE_H

View File

@ -53,6 +53,13 @@ enum sort_mode sort__mode = SORT_MODE__NORMAL;
static const char *const dynamic_headers[] = {"local_ins_lat", "ins_lat", "local_p_stage_cyc", "p_stage_cyc"};
static const char *const arch_specific_sort_keys[] = {"local_p_stage_cyc", "p_stage_cyc"};
/*
* Some architectures have Adjacent Cacheline Prefetch feature, which
* behaves like the cacheline size is doubled. Enable this flag to
* check things in double cacheline granularity.
*/
bool chk_double_cl;
/*
* Replaces all occurrences of a char used with the:
*
@ -1500,8 +1507,8 @@ sort__dcacheline_cmp(struct hist_entry *left, struct hist_entry *right)
addr:
/* al_addr does all the right addr - start + offset calculations */
l = cl_address(left->mem_info->daddr.al_addr);
r = cl_address(right->mem_info->daddr.al_addr);
l = cl_address(left->mem_info->daddr.al_addr, chk_double_cl);
r = cl_address(right->mem_info->daddr.al_addr, chk_double_cl);
if (l > r) return -1;
if (l < r) return 1;
@ -1520,7 +1527,7 @@ static int hist_entry__dcacheline_snprintf(struct hist_entry *he, char *bf,
if (he->mem_info) {
struct map *map = he->mem_info->daddr.ms.map;
addr = cl_address(he->mem_info->daddr.al_addr);
addr = cl_address(he->mem_info->daddr.al_addr, chk_double_cl);
ms = &he->mem_info->daddr.ms;
/* print [s] for shared data mmaps */

View File

@ -35,6 +35,7 @@ extern struct sort_entry sort_sym_from;
extern struct sort_entry sort_sym_to;
extern struct sort_entry sort_srcline;
extern const char default_mem_sort_order[];
extern bool chk_double_cl;
struct res_sample {
u64 time;