cpumask: re-introduce constant-sized cpumask optimizations

Commit aa47a7c215e7 ("lib/cpumask: deprecate nr_cpumask_bits") resulted
in the cpumask operations potentially becoming hugely less efficient,
because suddenly the cpumask was always considered to be variable-sized.

The optimization was then later added back in a limited form by commit
6f9c07be9d02 ("lib/cpumask: add FORCE_NR_CPUS config option"), but that
FORCE_NR_CPUS option is not useful in a generic kernel and more of a
special case for embedded situations with fixed hardware.

Instead, just re-introduce the optimization, with some changes.

Instead of depending on CPUMASK_OFFSTACK being false, and then always
using the full constant cpumask width, this introduces three different
cpumask "sizes":

 - the exact size (nr_cpumask_bits) remains identical to nr_cpu_ids.

   This is used for situations where we should use the exact size.

 - the "small" size (small_cpumask_bits) is the NR_CPUS constant if it
   fits in a single word and the bitmap operations thus end up able
   to trigger the "small_const_nbits()" optimizations.

   This is used for the operations that have optimized single-word
   cases that get inlined, notably the bit find and scanning functions.

 - the "large" size (large_cpumask_bits) is the NR_CPUS constant if it
   is a sufficiently small constant that makes simple "copy" and
   "clear" operations more efficient.

   This is arbitrarily set at four words or less.

As an example of this situation, without this fixed size optimization,
cpumask_clear() will generate code like

        movl    nr_cpu_ids(%rip), %edx
        addq    $63, %rdx
        shrq    $3, %rdx
        andl    $-8, %edx
        callq   memset@PLT

on x86-64, because it would calculate the "exact" number of longwords
that need to be cleared.

In contrast, with this patch, using an NR_CPUS of 64 (which is quite a
reasonable value to use), the above becomes a single

	movq $0,cpumask

instruction instead, because instead of caring to figure out exactly how
many CPUs the system has, it just knows that the cpumask will be a
single word and can just clear it all.

Note that this does end up tightening the rules a bit from the original
version in another way: operations that set bits in the cpumask are now
limited to the actual nr_cpu_ids limit, whereas we used to do the
nr_cpumask_bits thing almost everywhere in the cpumask code.

But if you just clear bits, or scan for bits, we can use the simpler
compile-time constants.

In the process, remove 'cpumask_complement()' and 'for_each_cpu_not()'
which were not useful, and which fundamentally have to be limited to
'nr_cpu_ids'.  Better remove them now than have somebody introduce use
of them later.

Of course, on x86-64 with MAXSMP there is no sane small compile-time
constant for the cpumask sizes, and we end up using the actual CPU bits,
and will generate the above kind of horrors regardless.  Please don't
use MAXSMP unless you really expect to have machines with thousands of
cores.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
This commit is contained in:
Linus Torvalds 2023-03-04 13:35:43 -08:00
parent f915322fe0
commit 596ff4a09b
4 changed files with 72 additions and 72 deletions

View File

@ -226,7 +226,6 @@ ForEachMacros:
- 'for_each_console_srcu'
- 'for_each_cpu'
- 'for_each_cpu_and'
- 'for_each_cpu_not'
- 'for_each_cpu_wrap'
- 'for_each_dapm_widgets'
- 'for_each_dedup_cand'

View File

@ -783,11 +783,9 @@ __init void prefill_possible_map(void)
static int _acpi_map_lsapic(acpi_handle handle, int physid, int *pcpu)
{
cpumask_t tmp_map;
int cpu;
cpumask_complement(&tmp_map, cpu_present_mask);
cpu = cpumask_first(&tmp_map);
cpu = cpumask_first_zero(cpu_present_mask);
if (cpu >= nr_cpu_ids)
return -EINVAL;

View File

@ -50,8 +50,41 @@ static inline void set_nr_cpu_ids(unsigned int nr)
#endif
}
/* Deprecated. Always use nr_cpu_ids. */
#define nr_cpumask_bits nr_cpu_ids
/*
* We have several different "preferred sizes" for the cpumask
* operations, depending on operation.
*
* For example, the bitmap scanning and operating operations have
* optimized routines that work for the single-word case, but only when
* the size is constant. So if NR_CPUS fits in one single word, we are
* better off using that small constant, in order to trigger the
* optimized bit finding. That is 'small_cpumask_bits'.
*
* The clearing and copying operations will similarly perform better
* with a constant size, but we limit that size arbitrarily to four
* words. We call this 'large_cpumask_bits'.
*
* Finally, some operations just want the exact limit, either because
* they set bits or just don't have any faster fixed-sized versions. We
* call this just 'nr_cpumask_bits'.
*
* Note that these optional constants are always guaranteed to be at
* least as big as 'nr_cpu_ids' itself is, and all our cpumask
* allocations are at least that size (see cpumask_size()). The
* optimization comes from being able to potentially use a compile-time
* constant instead of a run-time generated exact number of CPUs.
*/
#if NR_CPUS <= BITS_PER_LONG
#define small_cpumask_bits ((unsigned int)NR_CPUS)
#define large_cpumask_bits ((unsigned int)NR_CPUS)
#elif NR_CPUS <= 4*BITS_PER_LONG
#define small_cpumask_bits nr_cpu_ids
#define large_cpumask_bits ((unsigned int)NR_CPUS)
#else
#define small_cpumask_bits nr_cpu_ids
#define large_cpumask_bits nr_cpu_ids
#endif
#define nr_cpumask_bits nr_cpu_ids
/*
* The following particular system cpumasks and operations manage
@ -126,7 +159,7 @@ static __always_inline unsigned int cpumask_check(unsigned int cpu)
*/
static inline unsigned int cpumask_first(const struct cpumask *srcp)
{
return find_first_bit(cpumask_bits(srcp), nr_cpumask_bits);
return find_first_bit(cpumask_bits(srcp), small_cpumask_bits);
}
/**
@ -137,7 +170,7 @@ static inline unsigned int cpumask_first(const struct cpumask *srcp)
*/
static inline unsigned int cpumask_first_zero(const struct cpumask *srcp)
{
return find_first_zero_bit(cpumask_bits(srcp), nr_cpumask_bits);
return find_first_zero_bit(cpumask_bits(srcp), small_cpumask_bits);
}
/**
@ -150,7 +183,7 @@ static inline unsigned int cpumask_first_zero(const struct cpumask *srcp)
static inline
unsigned int cpumask_first_and(const struct cpumask *srcp1, const struct cpumask *srcp2)
{
return find_first_and_bit(cpumask_bits(srcp1), cpumask_bits(srcp2), nr_cpumask_bits);
return find_first_and_bit(cpumask_bits(srcp1), cpumask_bits(srcp2), small_cpumask_bits);
}
/**
@ -161,7 +194,7 @@ unsigned int cpumask_first_and(const struct cpumask *srcp1, const struct cpumask
*/
static inline unsigned int cpumask_last(const struct cpumask *srcp)
{
return find_last_bit(cpumask_bits(srcp), nr_cpumask_bits);
return find_last_bit(cpumask_bits(srcp), small_cpumask_bits);
}
/**
@ -177,7 +210,7 @@ unsigned int cpumask_next(int n, const struct cpumask *srcp)
/* -1 is a legal arg here. */
if (n != -1)
cpumask_check(n);
return find_next_bit(cpumask_bits(srcp), nr_cpumask_bits, n + 1);
return find_next_bit(cpumask_bits(srcp), small_cpumask_bits, n + 1);
}
/**
@ -192,7 +225,7 @@ static inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp)
/* -1 is a legal arg here. */
if (n != -1)
cpumask_check(n);
return find_next_zero_bit(cpumask_bits(srcp), nr_cpumask_bits, n+1);
return find_next_zero_bit(cpumask_bits(srcp), small_cpumask_bits, n+1);
}
#if NR_CPUS == 1
@ -235,7 +268,7 @@ unsigned int cpumask_next_and(int n, const struct cpumask *src1p,
if (n != -1)
cpumask_check(n);
return find_next_and_bit(cpumask_bits(src1p), cpumask_bits(src2p),
nr_cpumask_bits, n + 1);
small_cpumask_bits, n + 1);
}
/**
@ -246,17 +279,7 @@ unsigned int cpumask_next_and(int n, const struct cpumask *src1p,
* After the loop, cpu is >= nr_cpu_ids.
*/
#define for_each_cpu(cpu, mask) \
for_each_set_bit(cpu, cpumask_bits(mask), nr_cpumask_bits)
/**
* for_each_cpu_not - iterate over every cpu in a complemented mask
* @cpu: the (optionally unsigned) integer iterator
* @mask: the cpumask pointer
*
* After the loop, cpu is >= nr_cpu_ids.
*/
#define for_each_cpu_not(cpu, mask) \
for_each_clear_bit(cpu, cpumask_bits(mask), nr_cpumask_bits)
for_each_set_bit(cpu, cpumask_bits(mask), small_cpumask_bits)
#if NR_CPUS == 1
static inline
@ -290,7 +313,7 @@ unsigned int __pure cpumask_next_wrap(int n, const struct cpumask *mask, int sta
* After the loop, cpu is >= nr_cpu_ids.
*/
#define for_each_cpu_wrap(cpu, mask, start) \
for_each_set_bit_wrap(cpu, cpumask_bits(mask), nr_cpumask_bits, start)
for_each_set_bit_wrap(cpu, cpumask_bits(mask), small_cpumask_bits, start)
/**
* for_each_cpu_and - iterate over every cpu in both masks
@ -307,7 +330,7 @@ unsigned int __pure cpumask_next_wrap(int n, const struct cpumask *mask, int sta
* After the loop, cpu is >= nr_cpu_ids.
*/
#define for_each_cpu_and(cpu, mask1, mask2) \
for_each_and_bit(cpu, cpumask_bits(mask1), cpumask_bits(mask2), nr_cpumask_bits)
for_each_and_bit(cpu, cpumask_bits(mask1), cpumask_bits(mask2), small_cpumask_bits)
/**
* for_each_cpu_andnot - iterate over every cpu present in one mask, excluding
@ -325,7 +348,7 @@ unsigned int __pure cpumask_next_wrap(int n, const struct cpumask *mask, int sta
* After the loop, cpu is >= nr_cpu_ids.
*/
#define for_each_cpu_andnot(cpu, mask1, mask2) \
for_each_andnot_bit(cpu, cpumask_bits(mask1), cpumask_bits(mask2), nr_cpumask_bits)
for_each_andnot_bit(cpu, cpumask_bits(mask1), cpumask_bits(mask2), small_cpumask_bits)
/**
* cpumask_any_but - return a "random" in a cpumask, but not this one.
@ -356,7 +379,7 @@ unsigned int cpumask_any_but(const struct cpumask *mask, unsigned int cpu)
*/
static inline unsigned int cpumask_nth(unsigned int cpu, const struct cpumask *srcp)
{
return find_nth_bit(cpumask_bits(srcp), nr_cpumask_bits, cpumask_check(cpu));
return find_nth_bit(cpumask_bits(srcp), small_cpumask_bits, cpumask_check(cpu));
}
/**
@ -372,7 +395,7 @@ unsigned int cpumask_nth_and(unsigned int cpu, const struct cpumask *srcp1,
const struct cpumask *srcp2)
{
return find_nth_and_bit(cpumask_bits(srcp1), cpumask_bits(srcp2),
nr_cpumask_bits, cpumask_check(cpu));
small_cpumask_bits, cpumask_check(cpu));
}
/**
@ -388,7 +411,7 @@ unsigned int cpumask_nth_andnot(unsigned int cpu, const struct cpumask *srcp1,
const struct cpumask *srcp2)
{
return find_nth_andnot_bit(cpumask_bits(srcp1), cpumask_bits(srcp2),
nr_cpumask_bits, cpumask_check(cpu));
small_cpumask_bits, cpumask_check(cpu));
}
/**
@ -408,7 +431,7 @@ unsigned int cpumask_nth_and_andnot(unsigned int cpu, const struct cpumask *srcp
return find_nth_and_andnot_bit(cpumask_bits(srcp1),
cpumask_bits(srcp2),
cpumask_bits(srcp3),
nr_cpumask_bits, cpumask_check(cpu));
small_cpumask_bits, cpumask_check(cpu));
}
#define CPU_BITS_NONE \
@ -495,10 +518,14 @@ static __always_inline bool cpumask_test_and_clear_cpu(int cpu, struct cpumask *
/**
* cpumask_setall - set all cpus (< nr_cpu_ids) in a cpumask
* @dstp: the cpumask pointer
*
* Note: since we set bits, we should use the tighter 'bitmap_set()' with
* the exact number of bits, not 'bitmap_fill()' that will fill past the
* end.
*/
static inline void cpumask_setall(struct cpumask *dstp)
{
bitmap_fill(cpumask_bits(dstp), nr_cpumask_bits);
bitmap_set(cpumask_bits(dstp), 0, nr_cpumask_bits);
}
/**
@ -507,7 +534,7 @@ static inline void cpumask_setall(struct cpumask *dstp)
*/
static inline void cpumask_clear(struct cpumask *dstp)
{
bitmap_zero(cpumask_bits(dstp), nr_cpumask_bits);
bitmap_zero(cpumask_bits(dstp), large_cpumask_bits);
}
/**
@ -523,7 +550,7 @@ static inline bool cpumask_and(struct cpumask *dstp,
const struct cpumask *src2p)
{
return bitmap_and(cpumask_bits(dstp), cpumask_bits(src1p),
cpumask_bits(src2p), nr_cpumask_bits);
cpumask_bits(src2p), small_cpumask_bits);
}
/**
@ -536,7 +563,7 @@ static inline void cpumask_or(struct cpumask *dstp, const struct cpumask *src1p,
const struct cpumask *src2p)
{
bitmap_or(cpumask_bits(dstp), cpumask_bits(src1p),
cpumask_bits(src2p), nr_cpumask_bits);
cpumask_bits(src2p), small_cpumask_bits);
}
/**
@ -550,7 +577,7 @@ static inline void cpumask_xor(struct cpumask *dstp,
const struct cpumask *src2p)
{
bitmap_xor(cpumask_bits(dstp), cpumask_bits(src1p),
cpumask_bits(src2p), nr_cpumask_bits);
cpumask_bits(src2p), small_cpumask_bits);
}
/**
@ -566,19 +593,7 @@ static inline bool cpumask_andnot(struct cpumask *dstp,
const struct cpumask *src2p)
{
return bitmap_andnot(cpumask_bits(dstp), cpumask_bits(src1p),
cpumask_bits(src2p), nr_cpumask_bits);
}
/**
* cpumask_complement - *dstp = ~*srcp
* @dstp: the cpumask result
* @srcp: the input to invert
*/
static inline void cpumask_complement(struct cpumask *dstp,
const struct cpumask *srcp)
{
bitmap_complement(cpumask_bits(dstp), cpumask_bits(srcp),
nr_cpumask_bits);
cpumask_bits(src2p), small_cpumask_bits);
}
/**
@ -590,7 +605,7 @@ static inline bool cpumask_equal(const struct cpumask *src1p,
const struct cpumask *src2p)
{
return bitmap_equal(cpumask_bits(src1p), cpumask_bits(src2p),
nr_cpumask_bits);
small_cpumask_bits);
}
/**
@ -604,7 +619,7 @@ static inline bool cpumask_or_equal(const struct cpumask *src1p,
const struct cpumask *src3p)
{
return bitmap_or_equal(cpumask_bits(src1p), cpumask_bits(src2p),
cpumask_bits(src3p), nr_cpumask_bits);
cpumask_bits(src3p), small_cpumask_bits);
}
/**
@ -616,7 +631,7 @@ static inline bool cpumask_intersects(const struct cpumask *src1p,
const struct cpumask *src2p)
{
return bitmap_intersects(cpumask_bits(src1p), cpumask_bits(src2p),
nr_cpumask_bits);
small_cpumask_bits);
}
/**
@ -630,7 +645,7 @@ static inline bool cpumask_subset(const struct cpumask *src1p,
const struct cpumask *src2p)
{
return bitmap_subset(cpumask_bits(src1p), cpumask_bits(src2p),
nr_cpumask_bits);
small_cpumask_bits);
}
/**
@ -639,7 +654,7 @@ static inline bool cpumask_subset(const struct cpumask *src1p,
*/
static inline bool cpumask_empty(const struct cpumask *srcp)
{
return bitmap_empty(cpumask_bits(srcp), nr_cpumask_bits);
return bitmap_empty(cpumask_bits(srcp), small_cpumask_bits);
}
/**
@ -657,7 +672,7 @@ static inline bool cpumask_full(const struct cpumask *srcp)
*/
static inline unsigned int cpumask_weight(const struct cpumask *srcp)
{
return bitmap_weight(cpumask_bits(srcp), nr_cpumask_bits);
return bitmap_weight(cpumask_bits(srcp), small_cpumask_bits);
}
/**
@ -668,7 +683,7 @@ static inline unsigned int cpumask_weight(const struct cpumask *srcp)
static inline unsigned int cpumask_weight_and(const struct cpumask *srcp1,
const struct cpumask *srcp2)
{
return bitmap_weight_and(cpumask_bits(srcp1), cpumask_bits(srcp2), nr_cpumask_bits);
return bitmap_weight_and(cpumask_bits(srcp1), cpumask_bits(srcp2), small_cpumask_bits);
}
/**
@ -681,7 +696,7 @@ static inline void cpumask_shift_right(struct cpumask *dstp,
const struct cpumask *srcp, int n)
{
bitmap_shift_right(cpumask_bits(dstp), cpumask_bits(srcp), n,
nr_cpumask_bits);
small_cpumask_bits);
}
/**
@ -705,7 +720,7 @@ static inline void cpumask_shift_left(struct cpumask *dstp,
static inline void cpumask_copy(struct cpumask *dstp,
const struct cpumask *srcp)
{
bitmap_copy(cpumask_bits(dstp), cpumask_bits(srcp), nr_cpumask_bits);
bitmap_copy(cpumask_bits(dstp), cpumask_bits(srcp), large_cpumask_bits);
}
/**
@ -789,7 +804,7 @@ static inline int cpulist_parse(const char *buf, struct cpumask *dstp)
*/
static inline unsigned int cpumask_size(void)
{
return BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long);
return BITS_TO_LONGS(large_cpumask_bits) * sizeof(long);
}
/*

View File

@ -23,16 +23,6 @@
KUNIT_EXPECT_EQ_MSG((test), mask_weight, iter, MASK_MSG(mask)); \
} while (0)
#define EXPECT_FOR_EACH_CPU_NOT_EQ(test, mask) \
do { \
const cpumask_t *m = (mask); \
int mask_weight = cpumask_weight(m); \
int cpu, iter = 0; \
for_each_cpu_not(cpu, m) \
iter++; \
KUNIT_EXPECT_EQ_MSG((test), nr_cpu_ids - mask_weight, iter, MASK_MSG(mask)); \
} while (0)
#define EXPECT_FOR_EACH_CPU_OP_EQ(test, op, mask1, mask2) \
do { \
const cpumask_t *m1 = (mask1); \
@ -77,7 +67,7 @@ static void test_cpumask_weight(struct kunit *test)
KUNIT_EXPECT_EQ_MSG(test, 0, cpumask_weight(&mask_empty), MASK_MSG(&mask_empty));
KUNIT_EXPECT_EQ_MSG(test, nr_cpu_ids, cpumask_weight(cpu_possible_mask),
MASK_MSG(cpu_possible_mask));
KUNIT_EXPECT_EQ_MSG(test, nr_cpumask_bits, cpumask_weight(&mask_all), MASK_MSG(&mask_all));
KUNIT_EXPECT_EQ_MSG(test, nr_cpu_ids, cpumask_weight(&mask_all), MASK_MSG(&mask_all));
}
static void test_cpumask_first(struct kunit *test)
@ -113,14 +103,12 @@ static void test_cpumask_next(struct kunit *test)
static void test_cpumask_iterators(struct kunit *test)
{
EXPECT_FOR_EACH_CPU_EQ(test, &mask_empty);
EXPECT_FOR_EACH_CPU_NOT_EQ(test, &mask_empty);
EXPECT_FOR_EACH_CPU_WRAP_EQ(test, &mask_empty);
EXPECT_FOR_EACH_CPU_OP_EQ(test, and, &mask_empty, &mask_empty);
EXPECT_FOR_EACH_CPU_OP_EQ(test, and, cpu_possible_mask, &mask_empty);
EXPECT_FOR_EACH_CPU_OP_EQ(test, andnot, &mask_empty, &mask_empty);
EXPECT_FOR_EACH_CPU_EQ(test, cpu_possible_mask);
EXPECT_FOR_EACH_CPU_NOT_EQ(test, cpu_possible_mask);
EXPECT_FOR_EACH_CPU_WRAP_EQ(test, cpu_possible_mask);
EXPECT_FOR_EACH_CPU_OP_EQ(test, and, cpu_possible_mask, cpu_possible_mask);
EXPECT_FOR_EACH_CPU_OP_EQ(test, andnot, cpu_possible_mask, &mask_empty);