mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git
synced 2025-01-16 21:35:07 +00:00
a6478d69cf
Now that kernel mode NEON no longer disables preemption, using FP/SIMD in library code which is not obviously part of the crypto subsystem is no longer problematic, as it will no longer incur unexpected latencies. So accelerate the CRC-32 library code on arm64 to use a 4-way interleave, using PMULL instructions to implement the folding. On Apple M2, this results in a speedup of 2 - 2.8x when using input sizes of 1k - 8k. For smaller sizes, the overhead of preserving and restoring the FP/SIMD register file may not be worth it, so 1k is used as a threshold for choosing this code path. The coefficient tables were generated using code provided by Eric. [0] [0] https://github.com/ebiggers/libdeflate/blob/master/scripts/gen_crc32_multipliers.c Cc: Eric Biggers <ebiggers@kernel.org> Signed-off-by: Ard Biesheuvel <ardb@kernel.org> Reviewed-by: Eric Biggers <ebiggers@google.com> Link: https://lore.kernel.org/r/20241018075347.2821102-8-ardb+git@google.com Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
83 lines
2.1 KiB
C
83 lines
2.1 KiB
C
// SPDX-License-Identifier: GPL-2.0-only
|
|
|
|
#include <linux/crc32.h>
|
|
#include <linux/linkage.h>
|
|
|
|
#include <asm/alternative.h>
|
|
#include <asm/cpufeature.h>
|
|
#include <asm/neon.h>
|
|
#include <asm/simd.h>
|
|
|
|
#include <crypto/internal/simd.h>
|
|
|
|
// Inputs shorter than this many bytes are never handed to the 4-way
// interleaved code path; crc32_le(), __crc32c_le() and crc32_be() all
// compare their len argument against it.
static const size_t min_len = 1024;
|
|
|
|
asmlinkage u32 crc32_le_arm64(u32 crc, unsigned char const *p, size_t len);
|
|
asmlinkage u32 crc32c_le_arm64(u32 crc, unsigned char const *p, size_t len);
|
|
asmlinkage u32 crc32_be_arm64(u32 crc, unsigned char const *p, size_t len);
|
|
|
|
asmlinkage u32 crc32_le_arm64_4way(u32 crc, unsigned char const *p, size_t len);
|
|
asmlinkage u32 crc32c_le_arm64_4way(u32 crc, unsigned char const *p, size_t len);
|
|
asmlinkage u32 crc32_be_arm64_4way(u32 crc, unsigned char const *p, size_t len);
|
|
|
|
// crc32_le() - arm64 entry point for crc32_le.
// @crc: CRC value supplied by the caller, to be updated with the input
// @p:   input buffer
// @len: number of bytes at @p
//
// Picks one of three implementations:
//  - crc32_le_base() when the ARM64_HAS_CRC32 capability is absent;
//  - crc32_le_arm64_4way(), bracketed by kernel_neon_begin()/kernel_neon_end(),
//    when @len >= min_len, the PMULL feature is present and
//    crypto_simd_usable() is true, followed by crc32_le_arm64() on the
//    trailing len % 64 bytes if there are any;
//  - crc32_le_arm64() on the whole buffer otherwise.
//
// Returns the updated CRC value.
u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len)
{
	size_t tail;

	if (!alternative_has_cap_likely(ARM64_HAS_CRC32))
		return crc32_le_base(crc, p, len);

	// Too short, no PMULL, or SIMD not usable right now: skip the 4-way path.
	if (len < min_len || !cpu_have_named_feature(PMULL) || !crypto_simd_usable())
		return crc32_le_arm64(crc, p, len);

	kernel_neon_begin();
	crc = crc32_le_arm64_4way(crc, p, len);
	kernel_neon_end();

	// Everything except the last len % 64 bytes is treated as consumed by
	// the 4-way call above.
	tail = len % 64;
	if (tail == 0)
		return crc;

	return crc32_le_arm64(crc, p + (len - tail), tail);
}
|
|
|
|
// __crc32c_le() - arm64 entry point for __crc32c_le.
// @crc: CRC value supplied by the caller, to be updated with the input
// @p:   input buffer
// @len: number of bytes at @p
//
// Picks one of three implementations:
//  - __crc32c_le_base() when the ARM64_HAS_CRC32 capability is absent;
//  - crc32c_le_arm64_4way(), bracketed by
//    kernel_neon_begin()/kernel_neon_end(), when @len >= min_len, the PMULL
//    feature is present and crypto_simd_usable() is true, followed by
//    crc32c_le_arm64() on the trailing len % 64 bytes if there are any;
//  - crc32c_le_arm64() on the whole buffer otherwise.
//
// Returns the updated CRC value.
u32 __pure __crc32c_le(u32 crc, unsigned char const *p, size_t len)
{
	size_t tail;

	if (!alternative_has_cap_likely(ARM64_HAS_CRC32))
		return __crc32c_le_base(crc, p, len);

	// Too short, no PMULL, or SIMD not usable right now: skip the 4-way path.
	if (len < min_len || !cpu_have_named_feature(PMULL) || !crypto_simd_usable())
		return crc32c_le_arm64(crc, p, len);

	kernel_neon_begin();
	crc = crc32c_le_arm64_4way(crc, p, len);
	kernel_neon_end();

	// Everything except the last len % 64 bytes is treated as consumed by
	// the 4-way call above.
	tail = len % 64;
	if (tail == 0)
		return crc;

	return crc32c_le_arm64(crc, p + (len - tail), tail);
}
|
|
|
|
// crc32_be() - arm64 entry point for crc32_be.
// @crc: CRC value supplied by the caller, to be updated with the input
// @p:   input buffer
// @len: number of bytes at @p
//
// Picks one of three implementations:
//  - crc32_be_base() when the ARM64_HAS_CRC32 capability is absent;
//  - crc32_be_arm64_4way(), bracketed by kernel_neon_begin()/kernel_neon_end(),
//    when @len >= min_len, the PMULL feature is present and
//    crypto_simd_usable() is true, followed by crc32_be_arm64() on the
//    trailing len % 64 bytes if there are any;
//  - crc32_be_arm64() on the whole buffer otherwise.
//
// Returns the updated CRC value.
u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len)
{
	size_t tail;

	if (!alternative_has_cap_likely(ARM64_HAS_CRC32))
		return crc32_be_base(crc, p, len);

	// Too short, no PMULL, or SIMD not usable right now: skip the 4-way path.
	if (len < min_len || !cpu_have_named_feature(PMULL) || !crypto_simd_usable())
		return crc32_be_arm64(crc, p, len);

	kernel_neon_begin();
	crc = crc32_be_arm64_4way(crc, p, len);
	kernel_neon_end();

	// Everything except the last len % 64 bytes is treated as consumed by
	// the 4-way call above.
	tail = len % 64;
	if (tail == 0)
		return crc;

	return crc32_be_arm64(crc, p + (len - tail), tail);
}
|