mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git
synced 2025-01-16 21:35:07 +00:00
f209132104
Similar to the syndrome calculation, the recovery algorithms also work on 64 bytes at a time to align with the L1 cache line size of current and future LoongArch cores (that we care about), which means unrolled-by-4 LSX and unrolled-by-2 LASX code.

The assembly is originally based on the x86 SSSE3/AVX2 ports, but register allocation has been redone to take advantage of LSX/LASX's 32 vector registers, and the instruction sequence has been optimized to suit (e.g. LoongArch can perform per-byte srl and andi on vectors, but x86 cannot).

Performance numbers measured by instrumenting the raid6test code, on a 3A5000 system clocked at 2.5GHz:

> lasx 2data: 354.987 MiB/s
> lasx datap: 350.430 MiB/s
> lsx 2data: 340.026 MiB/s
> lsx datap: 337.318 MiB/s
> intx1 2data: 164.280 MiB/s
> intx1 datap: 187.966 MiB/s

Because recovery algorithms are chosen solely based on priority and availability, lasx is marked as priority 2 and lsx as priority 1. At least for the current generation of LoongArch micro-architectures, LASX should always be faster than LSX whenever supported, and have similar power consumption characteristics (because the only known LASX-capable uarch, the LA464, always computes the full 256-bit result for vector ops).

Acked-by: Song Liu <song@kernel.org>
Signed-off-by: WANG Xuerui <git@xen0n.name>
Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
96 lines
3.4 KiB
Makefile
96 lines
3.4 KiB
Makefile
# SPDX-License-Identifier: GPL-2.0
# Build raid6_pq.o according to the value of CONFIG_RAID6_PQ.
obj-$(CONFIG_RAID6_PQ) += raid6_pq.o

# Objects that are always part of raid6_pq.
raid6_pq-y += algos.o recov.o tables.o int1.o int2.o int4.o \
              int8.o int16.o int32.o

# Objects that are added only when the corresponding config symbol is set.
raid6_pq-$(CONFIG_X86) += recov_ssse3.o recov_avx2.o mmx.o sse1.o sse2.o avx2.o avx512.o recov_avx512.o
raid6_pq-$(CONFIG_ALTIVEC) += altivec1.o altivec2.o altivec4.o altivec8.o \
                              vpermxor1.o vpermxor2.o vpermxor4.o vpermxor8.o
raid6_pq-$(CONFIG_KERNEL_MODE_NEON) += neon.o neon1.o neon2.o neon4.o neon8.o recov_neon.o recov_neon_inner.o
raid6_pq-$(CONFIG_S390) += s390vx8.o recov_s390xc.o
raid6_pq-$(CONFIG_LOONGARCH) += loongarch_simd.o recov_loongarch_simd.o

# Host program; the tables.c rule in this file runs $(obj)/mktables to
# generate the table source.
hostprogs += mktables

# altivec_flags is only defined when CONFIG_ALTIVEC=y; it is appended to the
# per-object CFLAGS of the altivec*.o and vpermxor*.o objects in this file.
ifeq ($(CONFIG_ALTIVEC),y)
altivec_flags := -maltivec $(call cc-option,-mabi=altivec)
# Enable <altivec.h>
altivec_flags += -isystem $(shell $(CC) -print-file-name=include)

ifdef CONFIG_CC_IS_CLANG
# clang ppc port does not yet support -maltivec when -msoft-float is
# enabled. A future release of clang will resolve this
# https://bugs.llvm.org/show_bug.cgi?id=31177
CFLAGS_REMOVE_altivec1.o += -msoft-float
CFLAGS_REMOVE_altivec2.o += -msoft-float
CFLAGS_REMOVE_altivec4.o += -msoft-float
CFLAGS_REMOVE_altivec8.o += -msoft-float
CFLAGS_REMOVE_vpermxor1.o += -msoft-float
CFLAGS_REMOVE_vpermxor2.o += -msoft-float
CFLAGS_REMOVE_vpermxor4.o += -msoft-float
CFLAGS_REMOVE_vpermxor8.o += -msoft-float
endif
endif

# The GCC option -ffreestanding is required in order to compile code containing
# ARM/NEON intrinsics in a non C99-compliant environment (such as the kernel)
ifeq ($(CONFIG_KERNEL_MODE_NEON),y)
NEON_FLAGS := -ffreestanding
# Enable <arm_neon.h>
NEON_FLAGS += -isystem $(shell $(CC) -print-file-name=include)
# Extra code-generation flags that are added for ARCH=arm only.
ifeq ($(ARCH),arm)
NEON_FLAGS += -march=armv7-a -mfloat-abi=softfp -mfpu=neon
endif
CFLAGS_recov_neon_inner.o += $(NEON_FLAGS)
# For ARCH=arm64, strip -mgeneral-regs-only from the objects listed here.
ifeq ($(ARCH),arm64)
CFLAGS_REMOVE_recov_neon_inner.o += -mgeneral-regs-only
CFLAGS_REMOVE_neon1.o += -mgeneral-regs-only
CFLAGS_REMOVE_neon2.o += -mgeneral-regs-only
CFLAGS_REMOVE_neon4.o += -mgeneral-regs-only
CFLAGS_REMOVE_neon8.o += -mgeneral-regs-only
endif
endif

# "unroll" command: run unroll.awk with N set to the pattern stem ($*),
# reading the first prerequisite on stdin and writing the target on stdout.
quiet_cmd_unroll = UNROLL $@
cmd_unroll = $(AWK) -v N=$* -f $(srctree)/$(src)/unroll.awk < $< > $@

# Generate intN.c from the int.uc template. if_changed is a kbuild helper
# (not defined in this file); the FORCE prerequisite and the targets entry
# follow the usage kbuild documents for it.
targets += int1.c int2.c int4.c int8.c int16.c int32.c
$(obj)/int%.c: $(src)/int.uc $(src)/unroll.awk FORCE
	$(call if_changed,unroll)

# Per-object compiler flags for the AltiVec objects. altivec_flags is set
# only inside the CONFIG_ALTIVEC=y conditional in this file and expands to
# nothing otherwise.
CFLAGS_altivec1.o += $(altivec_flags)
CFLAGS_altivec2.o += $(altivec_flags)
CFLAGS_altivec4.o += $(altivec_flags)
CFLAGS_altivec8.o += $(altivec_flags)
# Generate altivecN.c from the altivec.uc template via the unroll command.
targets += altivec1.c altivec2.c altivec4.c altivec8.c
$(obj)/altivec%.c: $(src)/altivec.uc $(src)/unroll.awk FORCE
	$(call if_changed,unroll)

# Per-object compiler flags for the vpermxor objects; they share
# altivec_flags with the altivec*.o objects.
CFLAGS_vpermxor1.o += $(altivec_flags)
CFLAGS_vpermxor2.o += $(altivec_flags)
CFLAGS_vpermxor4.o += $(altivec_flags)
CFLAGS_vpermxor8.o += $(altivec_flags)
# Generate vpermxorN.c from the vpermxor.uc template via the unroll command.
targets += vpermxor1.c vpermxor2.c vpermxor4.c vpermxor8.c
$(obj)/vpermxor%.c: $(src)/vpermxor.uc $(src)/unroll.awk FORCE
	$(call if_changed,unroll)

# Per-object compiler flags for the generated NEON objects. NEON_FLAGS is
# set only inside the CONFIG_KERNEL_MODE_NEON=y conditional in this file.
CFLAGS_neon1.o += $(NEON_FLAGS)
CFLAGS_neon2.o += $(NEON_FLAGS)
CFLAGS_neon4.o += $(NEON_FLAGS)
CFLAGS_neon8.o += $(NEON_FLAGS)
# Generate neonN.c from the neon.uc template via the unroll command.
targets += neon1.c neon2.c neon4.c neon8.c
$(obj)/neon%.c: $(src)/neon.uc $(src)/unroll.awk FORCE
	$(call if_changed,unroll)

# Generate s390vxN.c from the s390vx.uc template via the unroll command;
# only the N=8 variant is listed in targets.
targets += s390vx8.c
$(obj)/s390vx%.c: $(src)/s390vx.uc $(src)/unroll.awk FORCE
	$(call if_changed,unroll)

# "mktable" command: run the mktables host program and capture its stdout
# as the target file.
quiet_cmd_mktable = TABLE $@
cmd_mktable = $(obj)/mktables > $@

# tables.c depends on the mktables binary, so it is regenerated when that
# program is rebuilt.
targets += tables.c
$(obj)/tables.c: $(obj)/mktables FORCE
	$(call if_changed,mktable)