2005-04-16 22:20:36 +00:00
|
|
|
/*
|
|
|
|
* include/linux/topology.h
|
|
|
|
*
|
|
|
|
* Written by: Matthew Dobson, IBM Corporation
|
|
|
|
*
|
|
|
|
* Copyright (C) 2002, IBM Corp.
|
|
|
|
*
|
2008-01-25 20:08:20 +00:00
|
|
|
* All rights reserved.
|
2005-04-16 22:20:36 +00:00
|
|
|
*
|
|
|
|
* This program is free software; you can redistribute it and/or modify
|
|
|
|
* it under the terms of the GNU General Public License as published by
|
|
|
|
* the Free Software Foundation; either version 2 of the License, or
|
|
|
|
* (at your option) any later version.
|
|
|
|
*
|
|
|
|
* This program is distributed in the hope that it will be useful, but
|
|
|
|
* WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
* MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
|
|
|
|
* NON INFRINGEMENT. See the GNU General Public License for more
|
|
|
|
* details.
|
|
|
|
*
|
|
|
|
* You should have received a copy of the GNU General Public License
|
|
|
|
* along with this program; if not, write to the Free Software
|
|
|
|
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
|
|
|
*
|
|
|
|
* Send feedback to <colpatch@us.ibm.com>
|
|
|
|
*/
|
|
|
|
#ifndef _LINUX_TOPOLOGY_H
|
|
|
|
#define _LINUX_TOPOLOGY_H
|
|
|
|
|
2019-06-27 19:52:58 +00:00
|
|
|
#include <linux/arch_topology.h>
|
2005-04-16 22:20:36 +00:00
|
|
|
#include <linux/cpumask.h>
|
|
|
|
#include <linux/bitops.h>
|
|
|
|
#include <linux/mmzone.h>
|
|
|
|
#include <linux/smp.h>
|
2010-05-26 21:44:56 +00:00
|
|
|
#include <linux/percpu.h>
|
2005-04-16 22:20:36 +00:00
|
|
|
#include <asm/topology.h>
|
|
|
|
|
|
|
|
/*
 * nr_cpus_node(node) - weight of the cpumask returned by
 * cpumask_of_node(node).
 *
 * This is only a fallback: if nr_cpus_node is already defined as a macro
 * by the time this point is reached, that definition is kept.
 */
#ifndef nr_cpus_node
#define nr_cpus_node(node)	cpumask_weight(cpumask_of_node(node))
#endif
|
|
|
|
|
2008-04-05 01:11:11 +00:00
|
|
|
/*
 * for_each_node_with_cpus(node) - iterate over online nodes that have CPUs.
 * @node: loop cursor, handed straight to for_each_online_node().
 *
 * The statement following the macro runs only for nodes where
 * nr_cpus_node(node) is non-zero.
 *
 * The filter is spelled "if (!cond) {} else" rather than a bare "if (cond)"
 * so that the hidden 'if' already owns an 'else'.  With a bare 'if', an
 * 'else' written by the caller after an unbraced use of this macro would
 * silently bind to the macro's internal 'if' instead of the caller's own.
 */
#define for_each_node_with_cpus(node)			\
	for_each_online_node(node)			\
		if (!nr_cpus_node(node)) {} else
|
|
|
|
|
2008-12-09 17:49:50 +00:00
|
|
|
int arch_update_cpu_topology(void);
|
2008-03-12 17:31:59 +00:00
|
|
|
|
2005-04-16 22:20:36 +00:00
|
|
|
/*
 * NUMA distance values; these conform to the ACPI 2.0 SLIT distance
 * definitions.
 */
#define LOCAL_DISTANCE		10
#define REMOTE_DISTANCE		20
#define DISTANCE_BITS		8

/*
 * Fallback node_distance(): a node is LOCAL_DISTANCE away from itself and
 * REMOTE_DISTANCE away from any other node.  Not used when node_distance
 * has already been defined as a macro.
 */
#ifndef node_distance
#define node_distance(from, to) \
	((from) != (to) ? REMOTE_DISTANCE : LOCAL_DISTANCE)
#endif
|
2006-01-19 01:42:31 +00:00
|
|
|
#ifndef RECLAIM_DISTANCE
|
|
|
|
/*
|
|
|
|
* If the distance between nodes in a system is larger than RECLAIM_DISTANCE
|
|
|
|
* (in whatever arch specific measurement units returned by node_distance())
|
2016-07-28 22:46:32 +00:00
|
|
|
* and node_reclaim_mode is enabled then the VM will only call node_reclaim()
|
2014-06-04 23:07:14 +00:00
|
|
|
* on nodes within this distance.
|
2006-01-19 01:42:31 +00:00
|
|
|
*/
|
mm: increase RECLAIM_DISTANCE to 30
Recently, Robert Mueller reported (http://lkml.org/lkml/2010/9/12/236)
that zone_reclaim_mode doesn't work properly on his new NUMA server (Dual
Xeon E5520 + Intel S5520UR MB). He is using Cyrus IMAPd and it's built on
a very traditional single-process model.
* a master process which reads config files and manages the other
process
* multiple imapd processes, one per connection
* multiple pop3d processes, one per connection
* multiple lmtpd processes, one per connection
* periodical "cleanup" processes.
There are thousands of independent processes. The problem is, recent
Intel motherboards turn on zone_reclaim_mode by default and traditional
prefork model software doesn't work well on it. Unfortunately, such models
are still typical even in the 21st century. We can't ignore them.
This patch raises the zone_reclaim_mode threshold to 30. 30 doesn't have
any specific meaning. but 20 means that one-hop QPI/Hypertransport and
such relatively cheap 2-4 socket machine are often used for traditional
servers as above. The intention is that these machines don't use
zone_reclaim_mode.
Note: ia64 and Power have arch specific RECLAIM_DISTANCE definitions.
This patch doesn't change such high-end NUMA machine behavior.
Dave Hansen said:
: I know specifically of pieces of x86 hardware that set the information
: in the BIOS to '21' *specifically* so they'll get the zone_reclaim_mode
: behavior which that implies.
:
: They've done performance testing and run very large and scary benchmarks
: to make sure that they _want_ this turned on. What this means for them
: is that they'll probably be de-optimized, at least on newer versions of
: the kernel.
:
: If you want to do this for particular systems, maybe _that_'s what we
: should do. Have a list of specific configurations that need the
: defaults overridden either because they're buggy, or they have an
: unusual hardware configuration not really reflected in the distance
: table.
And later said:
: The original change in the hardware tables was for the benefit of a
: benchmark. Said benchmark isn't going to get run on mainline until the
: next batch of enterprise distros drops, at which point the hardware where
: this was done will be irrelevant for the benchmark. I'm sure any new
: hardware will just set this distance to another yet arbitrary value to
: make the kernel do what it wants. :)
:
: Also, when the hardware got _set_ to this initially, I complained. So, I
: guess I'm getting my way now, with this patch. I'm cool with it.
Reported-by: Robert Mueller <robm@fastmail.fm>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Acked-by: Christoph Lameter <cl@linux.com>
Acked-by: David Rientjes <rientjes@google.com>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: "Luck, Tony" <tony.luck@intel.com>
Acked-by: Dave Hansen <dave@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2011-06-15 22:08:20 +00:00
|
|
|
/* Default value; only takes effect when RECLAIM_DISTANCE was not defined earlier. */
#define RECLAIM_DISTANCE 30
|
2006-01-19 01:42:31 +00:00
|
|
|
#endif
|
2019-08-08 19:53:01 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* The following tunable allows platforms to override the default node
|
|
|
|
* reclaim distance (RECLAIM_DISTANCE) if remote memory accesses are
|
|
|
|
* sufficiently fast that the default value actually hurts
|
|
|
|
* performance.
|
|
|
|
*
|
|
|
|
* AMD EPYC machines use this because even though the 2-hop distance
|
|
|
|
* is 32 (3.2x slower than a local memory access) performance actually
|
|
|
|
* *improves* if allowed to reclaim memory and load balance tasks
|
|
|
|
* between NUMA nodes 2-hops apart.
|
|
|
|
*/
|
|
|
|
extern int __read_mostly node_reclaim_distance;
|
|
|
|
|
2005-04-16 22:20:36 +00:00
|
|
|
/* Default PENALTY_FOR_NODE_WITH_CPUS, used unless it was defined earlier. */
#ifndef PENALTY_FOR_NODE_WITH_CPUS
#define PENALTY_FOR_NODE_WITH_CPUS	(1)
#endif
|
|
|
|
|
2010-05-26 21:44:56 +00:00
|
|
|
#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
|
|
|
|
DECLARE_PER_CPU(int, numa_node);
|
|
|
|
|
|
|
|
#ifndef numa_node_id
|
|
|
|
/* Returns the number of the current Node. */
|
|
|
|
static inline int numa_node_id(void)
|
|
|
|
{
|
2014-04-07 22:39:38 +00:00
|
|
|
return raw_cpu_read(numa_node);
|
2010-05-26 21:44:56 +00:00
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#ifndef cpu_to_node
|
|
|
|
static inline int cpu_to_node(int cpu)
|
|
|
|
{
|
|
|
|
return per_cpu(numa_node, cpu);
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#ifndef set_numa_node
|
|
|
|
static inline void set_numa_node(int node)
|
|
|
|
{
|
2012-05-11 07:35:27 +00:00
|
|
|
this_cpu_write(numa_node, node);
|
2010-05-26 21:44:56 +00:00
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#ifndef set_cpu_numa_node
|
|
|
|
static inline void set_cpu_numa_node(int cpu, int node)
|
|
|
|
{
|
|
|
|
per_cpu(numa_node, cpu) = node;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#else /* !CONFIG_USE_PERCPU_NUMA_NODE_ID */
|
|
|
|
|
|
|
|
/* Returns the number of the current Node. */
|
|
|
|
#ifndef numa_node_id
|
|
|
|
/*
 * numa_node_id - number of the current node.
 *
 * Variant used when CONFIG_USE_PERCPU_NUMA_NODE_ID is not set: the node is
 * looked up with cpu_to_node() for the CPU reported by
 * raw_smp_processor_id().
 */
static inline int numa_node_id(void)
{
	int cpu = raw_smp_processor_id();

	return cpu_to_node(cpu);
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#endif /* [!]CONFIG_USE_PERCPU_NUMA_NODE_ID */
|
|
|
|
|
2010-05-26 21:45:00 +00:00
|
|
|
#ifdef CONFIG_HAVE_MEMORYLESS_NODES
|
|
|
|
|
|
|
|
/*
|
|
|
|
* N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
|
|
|
|
* It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined.
|
|
|
|
* Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem().
|
|
|
|
*/
|
|
|
|
DECLARE_PER_CPU(int, _numa_mem_);
|
|
|
|
|
|
|
|
#ifndef set_numa_mem
|
|
|
|
static inline void set_numa_mem(int node)
|
|
|
|
{
|
2012-05-11 07:35:27 +00:00
|
|
|
this_cpu_write(_numa_mem_, node);
|
2010-05-26 21:45:00 +00:00
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#ifndef numa_mem_id
|
|
|
|
/* Returns the number of the nearest Node with memory */
|
|
|
|
static inline int numa_mem_id(void)
|
|
|
|
{
|
2014-04-07 22:39:38 +00:00
|
|
|
return raw_cpu_read(_numa_mem_);
|
2010-05-26 21:45:00 +00:00
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#ifndef cpu_to_mem
|
|
|
|
static inline int cpu_to_mem(int cpu)
|
|
|
|
{
|
|
|
|
return per_cpu(_numa_mem_, cpu);
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#ifndef set_cpu_numa_mem
|
|
|
|
static inline void set_cpu_numa_mem(int cpu, int node)
|
|
|
|
{
|
|
|
|
per_cpu(_numa_mem_, cpu) = node;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#else /* !CONFIG_HAVE_MEMORYLESS_NODES */
|
|
|
|
|
|
|
|
#ifndef numa_mem_id
|
|
|
|
/*
 * numa_mem_id - number of the nearest node with memory.
 *
 * Variant used when CONFIG_HAVE_MEMORYLESS_NODES is not set: this is just
 * whatever numa_node_id() returns.
 */
static inline int numa_mem_id(void)
{
	int nid = numa_node_id();

	return nid;
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#ifndef cpu_to_mem
|
|
|
|
/*
 * cpu_to_mem - memory node for @cpu.
 * @cpu: CPU to look up.
 *
 * Variant used when CONFIG_HAVE_MEMORYLESS_NODES is not set: this is just
 * whatever cpu_to_node(@cpu) returns.
 */
static inline int cpu_to_mem(int cpu)
{
	int nid = cpu_to_node(cpu);

	return nid;
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#endif /* [!]CONFIG_HAVE_MEMORYLESS_NODES */
|
|
|
|
|
2021-11-29 13:03:07 +00:00
|
|
|
/*
 * TOPOLOGY_<LEVEL>_SYSFS is defined only when both the <level>_id and the
 * <level>_cpumask macros already exist at this point.  These checks sit
 * ahead of the generic fallback definitions of those macros later in this
 * header, so a fallback never causes a flag to be set.
 */
#if defined(topology_die_id) && defined(topology_die_cpumask)
#define TOPOLOGY_DIE_SYSFS
#endif

#if defined(topology_cluster_id) && defined(topology_cluster_cpumask)
#define TOPOLOGY_CLUSTER_SYSFS
#endif

#if defined(topology_book_id) && defined(topology_book_cpumask)
#define TOPOLOGY_BOOK_SYSFS
#endif

#if defined(topology_drawer_id) && defined(topology_drawer_cpumask)
#define TOPOLOGY_DRAWER_SYSFS
#endif
|
2021-11-29 13:03:07 +00:00
|
|
|
|
2008-06-05 04:47:29 +00:00
|
|
|
/*
 * Generic fallbacks for topology ID macros that are not already defined.
 * Each one evaluates @cpu (discarding the value) and yields a constant:
 * 0 for the core ID, -1 for every other level.
 */
#ifndef topology_physical_package_id
#define topology_physical_package_id(cpu)	((void)(cpu), -1)
#endif

#ifndef topology_die_id
#define topology_die_id(cpu)			((void)(cpu), -1)
#endif

#ifndef topology_cluster_id
#define topology_cluster_id(cpu)		((void)(cpu), -1)
#endif

#ifndef topology_core_id
#define topology_core_id(cpu)			((void)(cpu), 0)
#endif

#ifndef topology_book_id
#define topology_book_id(cpu)			((void)(cpu), -1)
#endif

#ifndef topology_drawer_id
#define topology_drawer_id(cpu)			((void)(cpu), -1)
#endif
|
2015-05-26 13:11:28 +00:00
|
|
|
/*
 * Generic fallbacks for the per-level CPU mask macros: when a level's mask
 * macro is not already defined, it expands to cpumask_of(cpu).
 */
#ifndef topology_sibling_cpumask
#define topology_sibling_cpumask(cpu)		cpumask_of(cpu)
#endif

#ifndef topology_core_cpumask
#define topology_core_cpumask(cpu)		cpumask_of(cpu)
#endif

#ifndef topology_cluster_cpumask
#define topology_cluster_cpumask(cpu)		cpumask_of(cpu)
#endif

#ifndef topology_die_cpumask
#define topology_die_cpumask(cpu)		cpumask_of(cpu)
#endif

#ifndef topology_book_cpumask
#define topology_book_cpumask(cpu)		cpumask_of(cpu)
#endif

#ifndef topology_drawer_cpumask
#define topology_drawer_cpumask(cpu)		cpumask_of(cpu)
#endif
|
2008-06-05 04:47:29 +00:00
|
|
|
|
2020-08-07 07:45:16 +00:00
|
|
|
#if defined(CONFIG_SCHED_SMT) && !defined(cpu_smt_mask)
|
2014-04-11 09:44:37 +00:00
|
|
|
/*
 * cpu_smt_mask - SMT-level mask for @cpu.
 * @cpu: CPU to look up.
 *
 * Returns whatever topology_sibling_cpumask(@cpu) evaluates to.
 */
static inline const struct cpumask *cpu_smt_mask(int cpu)
{
	const struct cpumask *siblings = topology_sibling_cpumask(cpu);

	return siblings;
}
|
|
|
|
#endif
|
|
|
|
|
sched: Add cluster scheduler level in core and related Kconfig for ARM64
This patch adds scheduler level for clusters and automatically enables
the load balance among clusters. It will directly benefit a lot of
workload which loves more resources such as memory bandwidth, caches.
Testing has widely been done in two different hardware configurations of
Kunpeng920:
24 cores in one NUMA(6 clusters in each NUMA node);
32 cores in one NUMA(8 clusters in each NUMA node)
Workload is running on either one NUMA node or four NUMA nodes, thus,
this can estimate the effect of cluster spreading w/ and w/o NUMA load
balance.
* Stream benchmark:
4threads stream (on 1NUMA * 24cores = 24cores)
stream stream
w/o patch w/ patch
MB/sec copy 29929.64 ( 0.00%) 32932.68 ( 10.03%)
MB/sec scale 29861.10 ( 0.00%) 32710.58 ( 9.54%)
MB/sec add 27034.42 ( 0.00%) 32400.68 ( 19.85%)
MB/sec triad 27225.26 ( 0.00%) 31965.36 ( 17.41%)
6threads stream (on 1NUMA * 24cores = 24cores)
stream stream
w/o patch w/ patch
MB/sec copy 40330.24 ( 0.00%) 42377.68 ( 5.08%)
MB/sec scale 40196.42 ( 0.00%) 42197.90 ( 4.98%)
MB/sec add 37427.00 ( 0.00%) 41960.78 ( 12.11%)
MB/sec triad 37841.36 ( 0.00%) 42513.64 ( 12.35%)
12threads stream (on 1NUMA * 24cores = 24cores)
stream stream
w/o patch w/ patch
MB/sec copy 52639.82 ( 0.00%) 53818.04 ( 2.24%)
MB/sec scale 52350.30 ( 0.00%) 53253.38 ( 1.73%)
MB/sec add 53607.68 ( 0.00%) 55198.82 ( 2.97%)
MB/sec triad 54776.66 ( 0.00%) 56360.40 ( 2.89%)
Thus, it could help memory-bound workload especially under medium load.
Similar improvement is also seen in lkp-pbzip2:
* lkp-pbzip2 benchmark
2-96 threads (on 4NUMA * 24cores = 96cores)
lkp-pbzip2 lkp-pbzip2
w/o patch w/ patch
Hmean tput-2 11062841.57 ( 0.00%) 11341817.51 * 2.52%*
Hmean tput-5 26815503.70 ( 0.00%) 27412872.65 * 2.23%*
Hmean tput-8 41873782.21 ( 0.00%) 43326212.92 * 3.47%*
Hmean tput-12 61875980.48 ( 0.00%) 64578337.51 * 4.37%*
Hmean tput-21 105814963.07 ( 0.00%) 111381851.01 * 5.26%*
Hmean tput-30 150349470.98 ( 0.00%) 156507070.73 * 4.10%*
Hmean tput-48 237195937.69 ( 0.00%) 242353597.17 * 2.17%*
Hmean tput-79 360252509.37 ( 0.00%) 362635169.23 * 0.66%*
Hmean tput-96 394571737.90 ( 0.00%) 400952978.48 * 1.62%*
2-24 threads (on 1NUMA * 24cores = 24cores)
lkp-pbzip2 lkp-pbzip2
w/o patch w/ patch
Hmean tput-2 11071705.49 ( 0.00%) 11296869.10 * 2.03%*
Hmean tput-4 20782165.19 ( 0.00%) 21949232.15 * 5.62%*
Hmean tput-6 30489565.14 ( 0.00%) 33023026.96 * 8.31%*
Hmean tput-8 40376495.80 ( 0.00%) 42779286.27 * 5.95%*
Hmean tput-12 61264033.85 ( 0.00%) 62995632.78 * 2.83%*
Hmean tput-18 86697139.39 ( 0.00%) 86461545.74 ( -0.27%)
Hmean tput-24 104854637.04 ( 0.00%) 104522649.46 * -0.32%*
In the case of 6 threads and 8 threads, we see the greatest performance
improvement.
Similar improvement can be seen on lkp-pixz though the improvement is
smaller:
* lkp-pixz benchmark
2-24 threads lkp-pixz (on 1NUMA * 24cores = 24cores)
lkp-pixz lkp-pixz
w/o patch w/ patch
Hmean tput-2 6486981.16 ( 0.00%) 6561515.98 * 1.15%*
Hmean tput-4 11645766.38 ( 0.00%) 11614628.43 ( -0.27%)
Hmean tput-6 15429943.96 ( 0.00%) 15957350.76 * 3.42%*
Hmean tput-8 19974087.63 ( 0.00%) 20413746.98 * 2.20%*
Hmean tput-12 28172068.18 ( 0.00%) 28751997.06 * 2.06%*
Hmean tput-18 39413409.54 ( 0.00%) 39896830.55 * 1.23%*
Hmean tput-24 49101815.85 ( 0.00%) 49418141.47 * 0.64%*
* SPECrate benchmark
4,8,16 copies mcf_r(on 1NUMA * 32cores = 32cores)
Base Base
Run Time Rate
------- ---------
4 Copies w/o 580 (w/ 570) w/o 11.1 (w/ 11.3)
8 Copies w/o 647 (w/ 605) w/o 20.0 (w/ 21.4, +7%)
16 Copies w/o 844 (w/ 844) w/o 30.6 (w/ 30.6)
32 Copies(on 4NUMA * 32 cores = 128cores)
[w/o patch]
Base Base Base
Benchmarks Copies Run Time Rate
--------------- ------- --------- ---------
500.perlbench_r 32 584 87.2 *
502.gcc_r 32 503 90.2 *
505.mcf_r 32 745 69.4 *
520.omnetpp_r 32 1031 40.7 *
523.xalancbmk_r 32 597 56.6 *
525.x264_r 1 -- CE
531.deepsjeng_r 32 336 109 *
541.leela_r 32 556 95.4 *
548.exchange2_r 32 513 163 *
557.xz_r 32 530 65.2 *
Est. SPECrate2017_int_base 80.3
[w/ patch]
Base Base Base
Benchmarks Copies Run Time Rate
--------------- ------- --------- ---------
500.perlbench_r 32 580 87.8 (+0.688%) *
502.gcc_r 32 477 95.1 (+5.432%) *
505.mcf_r 32 644 80.3 (+13.574%) *
520.omnetpp_r 32 942 44.6 (+9.58%) *
523.xalancbmk_r 32 560 60.4 (+6.714%) *
525.x264_r 1 -- CE
531.deepsjeng_r 32 337 109 (+0.000%) *
541.leela_r 32 554 95.6 (+0.210%) *
548.exchange2_r 32 515 163 (+0.000%) *
557.xz_r 32 524 66.0 (+1.227%) *
Est. SPECrate2017_int_base 83.7 (+4.062%)
On the other hand, it is slightly helpful to CPU-bound tasks like
kernbench:
* 24-96 threads kernbench (on 4NUMA * 24cores = 96cores)
kernbench kernbench
w/o cluster w/ cluster
Min user-24 12054.67 ( 0.00%) 12024.19 ( 0.25%)
Min syst-24 1751.51 ( 0.00%) 1731.68 ( 1.13%)
Min elsp-24 600.46 ( 0.00%) 598.64 ( 0.30%)
Min user-48 12361.93 ( 0.00%) 12315.32 ( 0.38%)
Min syst-48 1917.66 ( 0.00%) 1892.73 ( 1.30%)
Min elsp-48 333.96 ( 0.00%) 332.57 ( 0.42%)
Min user-96 12922.40 ( 0.00%) 12921.17 ( 0.01%)
Min syst-96 2143.94 ( 0.00%) 2110.39 ( 1.56%)
Min elsp-96 211.22 ( 0.00%) 210.47 ( 0.36%)
Amean user-24 12063.99 ( 0.00%) 12030.78 * 0.28%*
Amean syst-24 1755.20 ( 0.00%) 1735.53 * 1.12%*
Amean elsp-24 601.60 ( 0.00%) 600.19 ( 0.23%)
Amean user-48 12362.62 ( 0.00%) 12315.56 * 0.38%*
Amean syst-48 1921.59 ( 0.00%) 1894.95 * 1.39%*
Amean elsp-48 334.10 ( 0.00%) 332.82 * 0.38%*
Amean user-96 12925.27 ( 0.00%) 12922.63 ( 0.02%)
Amean syst-96 2146.66 ( 0.00%) 2122.20 * 1.14%*
Amean elsp-96 211.96 ( 0.00%) 211.79 ( 0.08%)
Note this patch isn't a universal win; it might hurt those workloads
which can benefit from packing. Though tasks which want to take
advantage of the lower communication latency of one cluster won't
necessarily be packed in one cluster while the kernel is not aware of
clusters, they have some chance to be randomly packed. But this
patch will make them more likely to be spread.
Signed-off-by: Barry Song <song.bao.hua@hisilicon.com>
Tested-by: Yicong Yang <yangyicong@hisilicon.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
2021-09-24 08:51:03 +00:00
|
|
|
#if defined(CONFIG_SCHED_CLUSTER) && !defined(cpu_cluster_mask)
|
|
|
|
/*
 * cpu_cluster_mask - cluster-level mask for @cpu.
 * @cpu: CPU to look up.
 *
 * Returns whatever topology_cluster_cpumask(@cpu) evaluates to.
 */
static inline const struct cpumask *cpu_cluster_mask(int cpu)
{
	const struct cpumask *cluster = topology_cluster_cpumask(cpu);

	return cluster;
}
|
|
|
|
#endif
|
|
|
|
|
2014-04-11 09:44:37 +00:00
|
|
|
/*
 * cpu_cpu_mask - CPU mask of the node that @cpu maps to.
 * @cpu: CPU to look up.
 *
 * Resolves @cpu to a node with cpu_to_node() and returns that node's mask
 * as given by cpumask_of_node().
 */
static inline const struct cpumask *cpu_cpu_mask(int cpu)
{
	int nid = cpu_to_node(cpu);

	return cpumask_of_node(nid);
}
|
|
|
|
|
|
|
|
|
2005-04-16 22:20:36 +00:00
|
|
|
#endif /* _LINUX_TOPOLOGY_H */
|