2005-04-16 22:20:36 +00:00
|
|
|
/*
|
|
|
|
* include/linux/topology.h
|
|
|
|
*
|
|
|
|
* Written by: Matthew Dobson, IBM Corporation
|
|
|
|
*
|
|
|
|
* Copyright (C) 2002, IBM Corp.
|
|
|
|
*
|
2008-01-25 20:08:20 +00:00
|
|
|
* All rights reserved.
|
2005-04-16 22:20:36 +00:00
|
|
|
*
|
|
|
|
* This program is free software; you can redistribute it and/or modify
|
|
|
|
* it under the terms of the GNU General Public License as published by
|
|
|
|
* the Free Software Foundation; either version 2 of the License, or
|
|
|
|
* (at your option) any later version.
|
|
|
|
*
|
|
|
|
* This program is distributed in the hope that it will be useful, but
|
|
|
|
* WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
* MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
|
|
|
|
* NON INFRINGEMENT. See the GNU General Public License for more
|
|
|
|
* details.
|
|
|
|
*
|
|
|
|
* You should have received a copy of the GNU General Public License
|
|
|
|
* along with this program; if not, write to the Free Software
|
|
|
|
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
|
|
|
*
|
|
|
|
* Send feedback to <colpatch@us.ibm.com>
|
|
|
|
*/
|
|
|
|
#ifndef _LINUX_TOPOLOGY_H
|
|
|
|
#define _LINUX_TOPOLOGY_H
|
|
|
|
|
2019-06-27 19:52:58 +00:00
|
|
|
#include <linux/arch_topology.h>
|
2005-04-16 22:20:36 +00:00
|
|
|
#include <linux/cpumask.h>
|
|
|
|
#include <linux/bitops.h>
|
|
|
|
#include <linux/mmzone.h>
|
|
|
|
#include <linux/smp.h>
|
2010-05-26 21:44:56 +00:00
|
|
|
#include <linux/percpu.h>
|
2005-04-16 22:20:36 +00:00
|
|
|
#include <asm/topology.h>
|
|
|
|
|
|
|
|
/*
 * nr_cpus_node - default CPU count for a node.
 *
 * Expands to cpumask_weight() applied to cpumask_of_node(node).  Only
 * defined when nothing earlier (e.g. <asm/topology.h>) supplied its own
 * nr_cpus_node.
 */
#ifndef nr_cpus_node
#define nr_cpus_node(node) cpumask_weight(cpumask_of_node(node))
#endif
|
|
|
|
|
2008-04-05 01:11:11 +00:00
|
|
|
/*
 * for_each_node_with_cpus - run the loop body for every node visited by
 * for_each_online_node() for which nr_cpus_node(node) is non-zero.
 *
 * The filter is spelled "if (!cond) {} else" instead of a bare trailing
 * "if (cond)" so that an "else" written after the loop body by the caller
 * cannot attach to the macro's hidden "if".
 */
#define for_each_node_with_cpus(node)			\
	for_each_online_node(node)			\
		if (!nr_cpus_node(node)) {} else
|
|
|
|
|
2008-12-09 17:49:50 +00:00
|
|
|
/*
 * Declaration only; no definition is visible in this header.
 * NOTE(review): the meaning of the int return value is not documented
 * here - confirm against the implementation.
 */
int arch_update_cpu_topology(void);
|
2008-03-12 17:31:59 +00:00
|
|
|
|
2005-04-16 22:20:36 +00:00
|
|
|
/* Conform to ACPI 2.0 SLIT distance definitions */
#define LOCAL_DISTANCE		10
#define REMOTE_DISTANCE		20
#define DISTANCE_BITS		8

/*
 * Default node_distance(): LOCAL_DISTANCE when both arguments compare
 * equal, REMOTE_DISTANCE otherwise.  Skipped when node_distance is
 * already defined.
 */
#ifndef node_distance
#define node_distance(from,to)	((from) == (to) ? LOCAL_DISTANCE : REMOTE_DISTANCE)
#endif
|
2006-01-19 01:42:31 +00:00
|
|
|
#ifndef RECLAIM_DISTANCE
|
|
|
|
/*
 * If the distance between nodes in a system is larger than RECLAIM_DISTANCE
 * (in whatever arch specific measurement units returned by node_distance())
 * and node_reclaim_mode is enabled then the VM will only call node_reclaim()
 * on nodes within this distance.
 */
|
mm: increase RECLAIM_DISTANCE to 30
Recently, Robert Mueller reported (http://lkml.org/lkml/2010/9/12/236)
that zone_reclaim_mode doesn't work properly on his new NUMA server (Dual
Xeon E5520 + Intel S5520UR MB). He is using Cyrus IMAPd and it's built on
a very traditional single-process model.
* a master process which reads config files and manages the other
process
* multiple imapd processes, one per connection
* multiple pop3d processes, one per connection
* multiple lmtpd processes, one per connection
* periodical "cleanup" processes.
There are thousands of independent processes. The problem is that recent
Intel motherboards turn on zone_reclaim_mode by default, and traditional
prefork-model software doesn't work well with it. Unfortunately, such models
are still typical even in the 21st century. We can't ignore them.
This patch raises the zone_reclaim_mode threshold to 30. The value 30 has no
specific meaning, but 20 corresponds to one-hop QPI/HyperTransport, and
such relatively cheap 2-4 socket machines are often used for traditional
servers like the one above. The intention is that these machines don't use
zone_reclaim_mode.
Note: ia64 and Power have arch specific RECLAIM_DISTANCE definitions.
This patch doesn't change such high-end NUMA machine behavior.
Dave Hansen said:
: I know specifically of pieces of x86 hardware that set the information
: in the BIOS to '21' *specifically* so they'll get the zone_reclaim_mode
: behavior which that implies.
:
: They've done performance testing and run very large and scary benchmarks
: to make sure that they _want_ this turned on. What this means for them
: is that they'll probably be de-optimized, at least on newer versions of
: the kernel.
:
: If you want to do this for particular systems, maybe _that_'s what we
: should do. Have a list of specific configurations that need the
: defaults overridden either because they're buggy, or they have an
: unusual hardware configuration not really reflected in the distance
: table.
And later said:
: The original change in the hardware tables was for the benefit of a
: benchmark. Said benchmark isn't going to get run on mainline until the
: next batch of enterprise distros drops, at which point the hardware where
: this was done will be irrelevant for the benchmark. I'm sure any new
: hardware will just set this distance to another yet arbitrary value to
: make the kernel do what it wants. :)
:
: Also, when the hardware got _set_ to this initially, I complained. So, I
: guess I'm getting my way now, with this patch. I'm cool with it.
Reported-by: Robert Mueller <robm@fastmail.fm>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Acked-by: Christoph Lameter <cl@linux.com>
Acked-by: David Rientjes <rientjes@google.com>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: "Luck, Tony" <tony.luck@intel.com>
Acked-by: Dave Hansen <dave@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2011-06-15 22:08:20 +00:00
|
|
|
/* Fallback value; only used when RECLAIM_DISTANCE was not defined earlier. */
#define RECLAIM_DISTANCE 30
|
2006-01-19 01:42:31 +00:00
|
|
|
#endif
|
2019-08-08 19:53:01 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* The following tunable allows platforms to override the default node
|
|
|
|
* reclaim distance (RECLAIM_DISTANCE) if remote memory accesses are
|
|
|
|
* sufficiently fast that the default value actually hurts
|
|
|
|
* performance.
|
|
|
|
*
|
|
|
|
* AMD EPYC machines use this because even though the 2-hop distance
|
|
|
|
* is 32 (3.2x slower than a local memory access) performance actually
|
|
|
|
* *improves* if allowed to reclaim memory and load balance tasks
|
|
|
|
* between NUMA nodes 2-hops apart.
|
|
|
|
*/
|
|
|
|
extern int __read_mostly node_reclaim_distance;
|
|
|
|
|
2005-04-16 22:20:36 +00:00
|
|
|
/* Default penalty value; skipped when PENALTY_FOR_NODE_WITH_CPUS is already defined. */
#ifndef PENALTY_FOR_NODE_WITH_CPUS
#define PENALTY_FOR_NODE_WITH_CPUS	(1)
#endif
|
|
|
|
|
2010-05-26 21:44:56 +00:00
|
|
|
#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
|
|
|
|
/*
 * Per-CPU int read and written by the numa_node_id(), cpu_to_node(),
 * set_numa_node() and set_cpu_numa_node() defaults in this
 * CONFIG_USE_PERCPU_NUMA_NODE_ID section.
 */
DECLARE_PER_CPU(int, numa_node);
|
|
|
|
|
|
|
|
#ifndef numa_node_id
|
|
|
|
/* Returns the number of the current Node. */
|
|
|
|
static inline int numa_node_id(void)
|
|
|
|
{
|
2014-04-07 22:39:38 +00:00
|
|
|
return raw_cpu_read(numa_node);
|
2010-05-26 21:44:56 +00:00
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#ifndef cpu_to_node
|
|
|
|
static inline int cpu_to_node(int cpu)
|
|
|
|
{
|
|
|
|
return per_cpu(numa_node, cpu);
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#ifndef set_numa_node
|
|
|
|
static inline void set_numa_node(int node)
|
|
|
|
{
|
2012-05-11 07:35:27 +00:00
|
|
|
this_cpu_write(numa_node, node);
|
2010-05-26 21:44:56 +00:00
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#ifndef set_cpu_numa_node
|
|
|
|
static inline void set_cpu_numa_node(int cpu, int node)
|
|
|
|
{
|
|
|
|
per_cpu(numa_node, cpu) = node;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#else /* !CONFIG_USE_PERCPU_NUMA_NODE_ID */
|
|
|
|
|
|
|
|
/* Returns the number of the current Node. */
|
|
|
|
#ifndef numa_node_id
|
|
|
|
/*
 * Variant for builds without CONFIG_USE_PERCPU_NUMA_NODE_ID: maps the
 * id returned by raw_smp_processor_id() through cpu_to_node().
 */
static inline int numa_node_id(void)
{
	return cpu_to_node(raw_smp_processor_id());
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#endif /* [!]CONFIG_USE_PERCPU_NUMA_NODE_ID */
|
|
|
|
|
2010-05-26 21:45:00 +00:00
|
|
|
#ifdef CONFIG_HAVE_MEMORYLESS_NODES
|
|
|
|
|
|
|
|
/*
|
|
|
|
* N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
|
|
|
|
* It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined.
|
|
|
|
* Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem().
|
|
|
|
*/
|
|
|
|
DECLARE_PER_CPU(int, _numa_mem_);
|
|
|
|
|
|
|
|
#ifndef set_numa_mem
|
|
|
|
static inline void set_numa_mem(int node)
|
|
|
|
{
|
2012-05-11 07:35:27 +00:00
|
|
|
this_cpu_write(_numa_mem_, node);
|
2010-05-26 21:45:00 +00:00
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#ifndef numa_mem_id
|
|
|
|
/* Returns the number of the nearest Node with memory */
|
|
|
|
static inline int numa_mem_id(void)
|
|
|
|
{
|
2014-04-07 22:39:38 +00:00
|
|
|
return raw_cpu_read(_numa_mem_);
|
2010-05-26 21:45:00 +00:00
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#ifndef cpu_to_mem
|
|
|
|
static inline int cpu_to_mem(int cpu)
|
|
|
|
{
|
|
|
|
return per_cpu(_numa_mem_, cpu);
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#ifndef set_cpu_numa_mem
|
|
|
|
static inline void set_cpu_numa_mem(int cpu, int node)
|
|
|
|
{
|
|
|
|
per_cpu(_numa_mem_, cpu) = node;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#else /* !CONFIG_HAVE_MEMORYLESS_NODES */
|
|
|
|
|
|
|
|
#ifndef numa_mem_id
|
|
|
|
/*
 * Returns the number of the nearest Node with memory.
 *
 * Without CONFIG_HAVE_MEMORYLESS_NODES this is simply numa_node_id().
 */
static inline int numa_mem_id(void)
{
	return numa_node_id();
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#ifndef cpu_to_mem
|
|
|
|
/* Without CONFIG_HAVE_MEMORYLESS_NODES this is simply cpu_to_node(@cpu). */
static inline int cpu_to_mem(int cpu)
{
	return cpu_to_node(cpu);
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#endif /* [!]CONFIG_HAVE_MEMORYLESS_NODES */
|
|
|
|
|
2021-11-29 13:03:07 +00:00
|
|
|
/*
 * Each TOPOLOGY_<level>_SYSFS marker is defined only when both the id
 * macro and the cpumask macro for that level were already defined before
 * this point, i.e. before the generic defaults further down are applied.
 */
#if defined(topology_die_id) && defined(topology_die_cpumask)
#define TOPOLOGY_DIE_SYSFS
#endif
#if defined(topology_cluster_id) && defined(topology_cluster_cpumask)
#define TOPOLOGY_CLUSTER_SYSFS
#endif
#if defined(topology_book_id) && defined(topology_book_cpumask)
#define TOPOLOGY_BOOK_SYSFS
#endif
#if defined(topology_drawer_id) && defined(topology_drawer_cpumask)
#define TOPOLOGY_DRAWER_SYSFS
#endif
|
2021-11-29 13:03:07 +00:00
|
|
|
|
2008-06-05 04:47:29 +00:00
|
|
|
/*
 * Generic fallbacks for the per-CPU topology identifiers.  Each one is
 * used only when no earlier definition exists.  The argument is evaluated
 * and discarded via the (void) cast; the result is the constant shown.
 */
#ifndef topology_physical_package_id
#define topology_physical_package_id(cpu)	((void)(cpu), -1)
#endif
#ifndef topology_die_id
#define topology_die_id(cpu)			((void)(cpu), -1)
#endif
#ifndef topology_cluster_id
#define topology_cluster_id(cpu)		((void)(cpu), -1)
#endif
#ifndef topology_core_id
#define topology_core_id(cpu)			((void)(cpu), 0)
#endif
#ifndef topology_book_id
#define topology_book_id(cpu)			((void)(cpu), -1)
#endif
#ifndef topology_drawer_id
#define topology_drawer_id(cpu)			((void)(cpu), -1)
#endif
#ifndef topology_ppin
#define topology_ppin(cpu)			((void)(cpu), 0ull)
#endif
|
2015-05-26 13:11:28 +00:00
|
|
|
/*
 * Generic fallbacks for the per-level topology cpumasks.  Each one is
 * used only when no earlier definition exists and expands to
 * cpumask_of(cpu).
 */
#ifndef topology_sibling_cpumask
#define topology_sibling_cpumask(cpu)		cpumask_of(cpu)
#endif
#ifndef topology_core_cpumask
#define topology_core_cpumask(cpu)		cpumask_of(cpu)
#endif
#ifndef topology_cluster_cpumask
#define topology_cluster_cpumask(cpu)		cpumask_of(cpu)
#endif
#ifndef topology_die_cpumask
#define topology_die_cpumask(cpu)		cpumask_of(cpu)
#endif
#ifndef topology_book_cpumask
#define topology_book_cpumask(cpu)		cpumask_of(cpu)
#endif
#ifndef topology_drawer_cpumask
#define topology_drawer_cpumask(cpu)		cpumask_of(cpu)
#endif
|
2008-06-05 04:47:29 +00:00
|
|
|
|
2020-08-07 07:45:16 +00:00
|
|
|
#if defined(CONFIG_SCHED_SMT) && !defined(cpu_smt_mask)
/*
 * Default cpu_smt_mask(): returns topology_sibling_cpumask(@cpu).
 * Only provided when CONFIG_SCHED_SMT is set and cpu_smt_mask is not
 * already defined as a macro.
 */
static inline const struct cpumask *cpu_smt_mask(int cpu)
{
	return topology_sibling_cpumask(cpu);
}
#endif
|
|
|
|
|
|
|
|
/* Returns cpumask_of_node() for the node that cpu_to_node() reports for @cpu. */
static inline const struct cpumask *cpu_cpu_mask(int cpu)
{
	return cpumask_of_node(cpu_to_node(cpu));
}
|
|
|
|
|
2023-01-21 04:24:30 +00:00
|
|
|
#ifdef CONFIG_NUMA
|
|
|
|
/*
 * CONFIG_NUMA build: declaration only, the definition is not in this
 * header.  Builds without CONFIG_NUMA use the inline version in the
 * #else branch instead.
 */
int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node);
|
2023-01-21 04:24:33 +00:00
|
|
|
/*
 * CONFIG_NUMA build: declaration only, the definition is not in this
 * header.  Builds without CONFIG_NUMA use the inline stub in the #else
 * branch, which always returns ERR_PTR(-EOPNOTSUPP).
 */
extern const struct cpumask *sched_numa_hop_mask(unsigned int node, unsigned int hops);
|
2023-01-21 04:24:30 +00:00
|
|
|
#else
|
|
|
|
static __always_inline int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node)
|
|
|
|
{
|
2023-08-19 14:12:36 +00:00
|
|
|
return cpumask_nth_and(cpu, cpus, cpu_online_mask);
|
2023-01-21 04:24:30 +00:00
|
|
|
}
|
2023-01-21 04:24:33 +00:00
|
|
|
|
|
|
|
static inline const struct cpumask *
|
|
|
|
sched_numa_hop_mask(unsigned int node, unsigned int hops)
|
|
|
|
{
|
|
|
|
return ERR_PTR(-EOPNOTSUPP);
|
|
|
|
}
|
2023-01-21 04:24:30 +00:00
|
|
|
#endif /* CONFIG_NUMA */
|
2014-04-11 09:44:37 +00:00
|
|
|
|
2023-01-21 04:24:34 +00:00
|
|
|
/**
 * for_each_numa_hop_mask - iterate over cpumasks of increasing NUMA distance
 *                          from a given node.
 * @mask: the iteration variable.
 * @node: the NUMA node to start the search from.
 *
 * Requires rcu_lock to be held.
 *
 * Yields cpu_online_mask for @node == NUMA_NO_NODE.
 *
 * The loop ends as soon as the fetched mask satisfies IS_ERR_OR_NULL().
 * @node is re-evaluated on every iteration, so it should be free of side
 * effects.  Both arguments are parenthesized in the expansion so that an
 * expression argument (e.g. "c ? a : b") keeps its intended meaning.
 */
#define for_each_numa_hop_mask(mask, node)				       \
	for (unsigned int __hops = 0;					       \
	     (mask) = ((node) != NUMA_NO_NODE || __hops) ?		       \
		     sched_numa_hop_mask((node), __hops) :		       \
		     cpu_online_mask,					       \
	     !IS_ERR_OR_NULL(mask);					       \
	     __hops++)
|
|
|
|
|
2005-04-16 22:20:36 +00:00
|
|
|
#endif /* _LINUX_TOPOLOGY_H */
|