lib/crc32: improve support for arch-specific overrides

Currently the CRC32 library functions are defined as weak symbols, and
the arm64 and riscv architectures override them.

This method of arch-specific overrides has the limitation that it only
works when both the base and the arch code are built-in.  It also
causes the arch-specific code to be silently unused if it is
accidentally built with lib-y instead of obj-y; unfortunately the
RISC-V code does this.

This commit reorganizes the code to have explicit *_arch() functions
that are called when they are enabled, similar to how some of the crypto
library code works (e.g. chacha_crypt() calls chacha_crypt_arch()).

Make the existing kconfig choice for the CRC32 implementation also
control whether the arch-optimized implementation (if one is available)
is enabled or not.  Make it enabled by default if CRC32 is also enabled.

The result is that arch-optimized CRC32 library functions will be
included automatically when appropriate, but it is now possible to
disable them.  They can also now be built as loadable modules if the
CRC32 library functions happen to be used only by loadable modules, in
which case the arch and base CRC32 modules will be automatically loaded
via direct symbol dependency when appropriate.
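
To illustrate the loadable-module case, here is a hypothetical module
(not part of this commit) that does nothing but call crc32_le(); with
CONFIG_CRC32=m and CONFIG_CRC32_ARCH=m, loading it pulls in the arch
and base CRC32 modules through the normal exported-symbol dependency
mechanism:

// SPDX-License-Identifier: GPL-2.0-only
/* Hypothetical example module: its call to crc32_le() creates a direct
 * symbol dependency on the CRC32 library modules, so modprobe loads
 * them automatically. */
#include <linux/crc32.h>
#include <linux/module.h>
#include <linux/printk.h>

static int __init crc32_user_init(void)
{
	static const u8 msg[] = "123456789";

	/* Standard CRC-32: init with all-ones, invert the final value. */
	pr_info("crc32(\"123456789\") = 0x%08x\n",
		~crc32_le(~0U, msg, sizeof(msg) - 1));
	return 0;
}
module_init(crc32_user_init);

static void __exit crc32_user_exit(void)
{
}
module_exit(crc32_user_exit);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Hypothetical CRC32 library user (example only)");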

Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20241202010844.144356-3-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@google.com>
Eric Biggers 2024-12-01 17:08:27 -08:00
parent 0a499a7e98
commit d36cebe03c
11 changed files with 118 additions and 51 deletions

arch/arm64/Kconfig

@@ -21,6 +21,7 @@ config ARM64
select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE
select ARCH_HAS_CACHE_LINE_SIZE
select ARCH_HAS_CC_PLATFORM
select ARCH_HAS_CRC32
select ARCH_HAS_CURRENT_STACK_POINTER
select ARCH_HAS_DEBUG_VIRTUAL
select ARCH_HAS_DEBUG_VM_PGTABLE

arch/arm64/lib/Makefile

@@ -13,7 +13,8 @@ endif
lib-$(CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE) += uaccess_flushcache.o
obj-$(CONFIG_CRC32) += crc32.o crc32-glue.o
obj-$(CONFIG_CRC32_ARCH) += crc32-arm64.o
crc32-arm64-y := crc32.o crc32-glue.o
obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o

arch/arm64/lib/crc32-glue.c

@@ -2,6 +2,7 @@
#include <linux/crc32.h>
#include <linux/linkage.h>
#include <linux/module.h>
#include <asm/alternative.h>
#include <asm/cpufeature.h>
@@ -21,7 +22,7 @@ asmlinkage u32 crc32_le_arm64_4way(u32 crc, unsigned char const *p, size_t len);
asmlinkage u32 crc32c_le_arm64_4way(u32 crc, unsigned char const *p, size_t len);
asmlinkage u32 crc32_be_arm64_4way(u32 crc, unsigned char const *p, size_t len);
u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len)
u32 __pure crc32_le_arch(u32 crc, const u8 *p, size_t len)
{
if (!alternative_has_cap_likely(ARM64_HAS_CRC32))
return crc32_le_base(crc, p, len);
@@ -40,8 +41,9 @@ u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len)
return crc32_le_arm64(crc, p, len);
}
EXPORT_SYMBOL(crc32_le_arch);
u32 __pure __crc32c_le(u32 crc, unsigned char const *p, size_t len)
u32 __pure crc32c_le_arch(u32 crc, const u8 *p, size_t len)
{
if (!alternative_has_cap_likely(ARM64_HAS_CRC32))
return crc32c_le_base(crc, p, len);
@@ -60,8 +62,9 @@ u32 __pure __crc32c_le(u32 crc, unsigned char const *p, size_t len)
return crc32c_le_arm64(crc, p, len);
}
EXPORT_SYMBOL(crc32c_le_arch);
u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len)
u32 __pure crc32_be_arch(u32 crc, const u8 *p, size_t len)
{
if (!alternative_has_cap_likely(ARM64_HAS_CRC32))
return crc32_be_base(crc, p, len);
@@ -80,3 +83,7 @@ u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len)
return crc32_be_arm64(crc, p, len);
}
EXPORT_SYMBOL(crc32_be_arch);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("arm64-optimized CRC32 functions");

arch/riscv/Kconfig

@@ -24,6 +24,7 @@ config RISCV
select ARCH_ENABLE_SPLIT_PMD_PTLOCK if PGTABLE_LEVELS > 2
select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE
select ARCH_HAS_BINFMT_FLAT
select ARCH_HAS_CRC32 if RISCV_ISA_ZBC
select ARCH_HAS_CURRENT_STACK_POINTER
select ARCH_HAS_DEBUG_VIRTUAL if MMU
select ARCH_HAS_DEBUG_VM_PGTABLE

arch/riscv/lib/Makefile

@@ -15,8 +15,7 @@ endif
lib-$(CONFIG_MMU) += uaccess.o
lib-$(CONFIG_64BIT) += tishift.o
lib-$(CONFIG_RISCV_ISA_ZICBOZ) += clear_page.o
lib-$(CONFIG_RISCV_ISA_ZBC) += crc32.o
obj-$(CONFIG_CRC32_ARCH) += crc32-riscv.o
obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
lib-$(CONFIG_RISCV_ISA_V) += xor.o
lib-$(CONFIG_RISCV_ISA_V) += riscv_v_helpers.o

arch/riscv/lib/crc32-riscv.c

@@ -14,6 +14,7 @@
#include <linux/crc32poly.h>
#include <linux/crc32.h>
#include <linux/byteorder/generic.h>
#include <linux/module.h>
/*
* Refer to https://www.corsix.org/content/barrett-reduction-polynomials for
@@ -217,17 +218,19 @@ static inline u32 __pure crc32_le_generic(u32 crc, unsigned char const *p,
return crc_fb(crc, p, len);
}
u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len)
u32 __pure crc32_le_arch(u32 crc, const u8 *p, size_t len)
{
return crc32_le_generic(crc, p, len, CRC32_POLY_LE, CRC32_POLY_QT_LE,
crc32_le_base);
}
EXPORT_SYMBOL(crc32_le_arch);
u32 __pure __crc32c_le(u32 crc, unsigned char const *p, size_t len)
u32 __pure crc32c_le_arch(u32 crc, const u8 *p, size_t len)
{
return crc32_le_generic(crc, p, len, CRC32C_POLY_LE,
CRC32C_POLY_QT_LE, crc32c_le_base);
}
EXPORT_SYMBOL(crc32c_le_arch);
static inline u32 crc32_be_unaligned(u32 crc, unsigned char const *p,
size_t len)
@@ -253,7 +256,7 @@ static inline u32 crc32_be_unaligned(u32 crc, unsigned char const *p,
return crc;
}
u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len)
u32 __pure crc32_be_arch(u32 crc, const u8 *p, size_t len)
{
size_t offset, head_len, tail_len;
unsigned long const *p_ul;
@@ -292,3 +295,7 @@ u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len)
legacy:
return crc32_be_base(crc, p, len);
}
EXPORT_SYMBOL(crc32_be_arch);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Accelerated CRC32 implementation with Zbc extension");

crypto/crc32_generic.c

@@ -160,12 +160,12 @@ static struct shash_alg algs[] = {{
static int __init crc32_mod_init(void)
{
/* register the arch flavor only if it differs from the generic one */
return crypto_register_shashes(algs, 1 + (&crc32_le != &crc32_le_base));
return crypto_register_shashes(algs, 1 + IS_ENABLED(CONFIG_CRC32_ARCH));
}
static void __exit crc32_mod_fini(void)
{
crypto_unregister_shashes(algs, 1 + (&crc32_le != &crc32_le_base));
crypto_unregister_shashes(algs, 1 + IS_ENABLED(CONFIG_CRC32_ARCH));
}
subsys_initcall(crc32_mod_init);

crypto/crc32c_generic.c

@@ -200,12 +200,12 @@ static struct shash_alg algs[] = {{
static int __init crc32c_mod_init(void)
{
/* register the arch flavor only if it differs from the generic one */
return crypto_register_shashes(algs, 1 + (&__crc32c_le != &crc32c_le_base));
return crypto_register_shashes(algs, 1 + IS_ENABLED(CONFIG_CRC32_ARCH));
}
static void __exit crc32c_mod_fini(void)
{
crypto_unregister_shashes(algs, 1 + (&__crc32c_le != &crc32c_le_base));
crypto_unregister_shashes(algs, 1 + IS_ENABLED(CONFIG_CRC32_ARCH));
}
subsys_initcall(crc32c_mod_init);

include/linux/crc32.h

@@ -8,10 +8,34 @@
#include <linux/types.h>
#include <linux/bitrev.h>
u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len);
u32 __pure crc32_le_base(u32 crc, unsigned char const *p, size_t len);
u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len);
u32 __pure crc32_be_base(u32 crc, unsigned char const *p, size_t len);
u32 __pure crc32_le_arch(u32 crc, const u8 *p, size_t len);
u32 __pure crc32_le_base(u32 crc, const u8 *p, size_t len);
u32 __pure crc32_be_arch(u32 crc, const u8 *p, size_t len);
u32 __pure crc32_be_base(u32 crc, const u8 *p, size_t len);
u32 __pure crc32c_le_arch(u32 crc, const u8 *p, size_t len);
u32 __pure crc32c_le_base(u32 crc, const u8 *p, size_t len);
static inline u32 __pure crc32_le(u32 crc, const u8 *p, size_t len)
{
if (IS_ENABLED(CONFIG_CRC32_ARCH))
return crc32_le_arch(crc, p, len);
return crc32_le_base(crc, p, len);
}
static inline u32 __pure crc32_be(u32 crc, const u8 *p, size_t len)
{
if (IS_ENABLED(CONFIG_CRC32_ARCH))
return crc32_be_arch(crc, p, len);
return crc32_be_base(crc, p, len);
}
/* TODO: leading underscores should be dropped once callers have been updated */
static inline u32 __pure __crc32c_le(u32 crc, const u8 *p, size_t len)
{
if (IS_ENABLED(CONFIG_CRC32_ARCH))
return crc32c_le_arch(crc, p, len);
return crc32c_le_base(crc, p, len);
}
/**
* crc32_le_combine - Combine two crc32 check values into one. For two
@@ -38,9 +62,6 @@ static inline u32 crc32_le_combine(u32 crc1, u32 crc2, size_t len2)
return crc32_le_shift(crc1, len2) ^ crc2;
}
u32 __pure __crc32c_le(u32 crc, unsigned char const *p, size_t len);
u32 __pure crc32c_le_base(u32 crc, unsigned char const *p, size_t len);
/**
* __crc32c_le_combine - Combine two crc32c check values into one. For two
* sequences of bytes, seq1 and seq2 with lengths len1

lib/Kconfig

@@ -190,6 +190,9 @@ config CRC32
the kernel tree does. Such modules that use library CRC32/CRC32c
functions require M here.
config ARCH_HAS_CRC32
bool
config CRC32_SELFTEST
tristate "CRC32 perform self test on init"
depends on CRC32
@@ -202,24 +205,39 @@ config CRC32_SELFTEST
choice
prompt "CRC32 implementation"
depends on CRC32
default CRC32_SLICEBY8
default CRC32_IMPL_ARCH_PLUS_SLICEBY8 if ARCH_HAS_CRC32
default CRC32_IMPL_SLICEBY8 if !ARCH_HAS_CRC32
help
This option allows a kernel builder to override the default choice
of CRC32 algorithm. Choose the default ("slice by 8") unless you
know that you need one of the others.
This option allows you to override the default choice of CRC32
implementation. Choose the default unless you know that you need one
of the others.
config CRC32_SLICEBY8
config CRC32_IMPL_ARCH_PLUS_SLICEBY8
bool "Arch-optimized, with fallback to slice-by-8" if ARCH_HAS_CRC32
help
Use architecture-optimized implementation of CRC32. Fall back to
slice-by-8 in cases where the arch-optimized implementation cannot be
used, e.g. if the CPU lacks support for the needed instructions.
This is the default when an arch-optimized implementation exists.
config CRC32_IMPL_ARCH_PLUS_SLICEBY1
bool "Arch-optimized, with fallback to slice-by-1" if ARCH_HAS_CRC32
help
Use architecture-optimized implementation of CRC32, but fall back to
slice-by-1 instead of slice-by-8 in order to reduce the binary size.
config CRC32_IMPL_SLICEBY8
bool "Slice by 8 bytes"
help
Calculate checksum 8 bytes at a time with a clever slicing algorithm.
This is the fastest algorithm, but comes with a 8KiB lookup table.
Most modern processors have enough cache to hold this table without
thrashing the cache.
This is much slower than the architecture-optimized implementation of
CRC32 (if the selected arch has one), but it is portable and is the
fastest implementation when no arch-optimized implementation is
available. It uses an 8KiB lookup table. Most modern processors have
enough cache to hold this table without thrashing the cache.
This is the default implementation choice. Choose this one unless
you have a good reason not to.
config CRC32_SLICEBY4
config CRC32_IMPL_SLICEBY4
bool "Slice by 4 bytes"
help
Calculate checksum 4 bytes at a time with a clever slicing algorithm.
@@ -228,15 +246,15 @@ config CRC32_SLICEBY4
Only choose this option if you know what you are doing.
config CRC32_SARWATE
bool "Sarwate's Algorithm (one byte at a time)"
config CRC32_IMPL_SLICEBY1
bool "Slice by 1 byte (Sarwate's algorithm)"
help
Calculate checksum a byte at a time using Sarwate's algorithm. This
is not particularly fast, but has a small 256 byte lookup table.
is not particularly fast, but has a small 1KiB lookup table.
Only choose this option if you know what you are doing.
config CRC32_BIT
config CRC32_IMPL_BIT
bool "Classic Algorithm (one bit at a time)"
help
Calculate checksum one bit at a time. This is VERY slow, but has
@@ -246,6 +264,26 @@ config CRC32_BIT
endchoice
config CRC32_ARCH
tristate
default CRC32 if CRC32_IMPL_ARCH_PLUS_SLICEBY8 || CRC32_IMPL_ARCH_PLUS_SLICEBY1
config CRC32_SLICEBY8
bool
default y if CRC32_IMPL_SLICEBY8 || CRC32_IMPL_ARCH_PLUS_SLICEBY8
config CRC32_SLICEBY4
bool
default y if CRC32_IMPL_SLICEBY4
config CRC32_SARWATE
bool
default y if CRC32_IMPL_SLICEBY1 || CRC32_IMPL_ARCH_PLUS_SLICEBY1
config CRC32_BIT
bool
default y if CRC32_IMPL_BIT
config CRC64
tristate "CRC64 functions"
help

lib/crc32.c

@@ -183,35 +183,27 @@ static inline u32 __pure crc32_le_generic(u32 crc, unsigned char const *p,
}
#if CRC_LE_BITS == 1
u32 __pure __weak crc32_le(u32 crc, unsigned char const *p, size_t len)
u32 __pure crc32_le_base(u32 crc, const u8 *p, size_t len)
{
return crc32_le_generic(crc, p, len, NULL, CRC32_POLY_LE);
}
u32 __pure __weak __crc32c_le(u32 crc, unsigned char const *p, size_t len)
u32 __pure crc32c_le_base(u32 crc, const u8 *p, size_t len)
{
return crc32_le_generic(crc, p, len, NULL, CRC32C_POLY_LE);
}
#else
u32 __pure __weak crc32_le(u32 crc, unsigned char const *p, size_t len)
u32 __pure crc32_le_base(u32 crc, const u8 *p, size_t len)
{
return crc32_le_generic(crc, p, len, crc32table_le, CRC32_POLY_LE);
}
u32 __pure __weak __crc32c_le(u32 crc, unsigned char const *p, size_t len)
u32 __pure crc32c_le_base(u32 crc, const u8 *p, size_t len)
{
return crc32_le_generic(crc, p, len, crc32ctable_le, CRC32C_POLY_LE);
}
#endif
EXPORT_SYMBOL(crc32_le);
EXPORT_SYMBOL(__crc32c_le);
u32 __pure crc32_le_base(u32, unsigned char const *, size_t) __alias(crc32_le);
EXPORT_SYMBOL(crc32_le_base);
u32 __pure crc32c_le_base(u32, unsigned char const *, size_t) __alias(__crc32c_le);
EXPORT_SYMBOL(crc32c_le_base);
u32 __pure crc32_be_base(u32, unsigned char const *, size_t) __alias(crc32_be);
/*
* This multiplies the polynomials x and y modulo the given modulus.
* This follows the "little-endian" CRC convention that the lsbit
@@ -335,14 +327,14 @@ static inline u32 __pure crc32_be_generic(u32 crc, unsigned char const *p,
}
#if CRC_BE_BITS == 1
u32 __pure __weak crc32_be(u32 crc, unsigned char const *p, size_t len)
u32 __pure crc32_be_base(u32 crc, const u8 *p, size_t len)
{
return crc32_be_generic(crc, p, len, NULL, CRC32_POLY_BE);
}
#else
u32 __pure __weak crc32_be(u32 crc, unsigned char const *p, size_t len)
u32 __pure crc32_be_base(u32 crc, const u8 *p, size_t len)
{
return crc32_be_generic(crc, p, len, crc32table_be, CRC32_POLY_BE);
}
#endif
EXPORT_SYMBOL(crc32_be);
EXPORT_SYMBOL(crc32_be_base);