mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git
synced 2024-12-28 16:52:18 +00:00
raid6: Add LoongArch SIMD recovery implementation
Similar to the syndrome calculation, the recovery algorithms also work on 64 bytes at a time to align with the L1 cache line size of current and future LoongArch cores (that we care about). Which means unrolled-by-4 LSX and unrolled-by-2 LASX code. The assembly is originally based on the x86 SSSE3/AVX2 ports, but register allocation has been redone to take advantage of LSX/LASX's 32 vector registers, and instruction sequence has been optimized to suit (e.g. LoongArch can perform per-byte srl and andi on vectors, but x86 cannot). Performance numbers measured by instrumenting the raid6test code, on a 3A5000 system clocked at 2.5GHz: > lasx 2data: 354.987 MiB/s > lasx datap: 350.430 MiB/s > lsx 2data: 340.026 MiB/s > lsx datap: 337.318 MiB/s > intx1 2data: 164.280 MiB/s > intx1 datap: 187.966 MiB/s Because recovery algorithms are chosen solely based on priority and availability, lasx is marked as priority 2 and lsx priority 1. At least for the current generation of LoongArch micro-architectures, LASX should always be faster than LSX whenever supported, and have similar power consumption characteristics (because the only known LASX-capable uarch, the LA464, always compute the full 256-bit result for vector ops). Acked-by: Song Liu <song@kernel.org> Signed-off-by: WANG Xuerui <git@xen0n.name> Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
This commit is contained in:
parent
8f3f06dfd6
commit
f209132104
@ -125,6 +125,8 @@ extern const struct raid6_recov_calls raid6_recov_avx2;
|
||||
extern const struct raid6_recov_calls raid6_recov_avx512;
|
||||
extern const struct raid6_recov_calls raid6_recov_s390xc;
|
||||
extern const struct raid6_recov_calls raid6_recov_neon;
|
||||
extern const struct raid6_recov_calls raid6_recov_lsx;
|
||||
extern const struct raid6_recov_calls raid6_recov_lasx;
|
||||
|
||||
extern const struct raid6_calls raid6_neonx1;
|
||||
extern const struct raid6_calls raid6_neonx2;
|
||||
|
@ -9,7 +9,7 @@ raid6_pq-$(CONFIG_ALTIVEC) += altivec1.o altivec2.o altivec4.o altivec8.o \
|
||||
vpermxor1.o vpermxor2.o vpermxor4.o vpermxor8.o
|
||||
raid6_pq-$(CONFIG_KERNEL_MODE_NEON) += neon.o neon1.o neon2.o neon4.o neon8.o recov_neon.o recov_neon_inner.o
|
||||
raid6_pq-$(CONFIG_S390) += s390vx8.o recov_s390xc.o
|
||||
raid6_pq-$(CONFIG_LOONGARCH) += loongarch_simd.o
|
||||
raid6_pq-$(CONFIG_LOONGARCH) += loongarch_simd.o recov_loongarch_simd.o
|
||||
|
||||
hostprogs += mktables
|
||||
|
||||
|
@ -111,6 +111,14 @@ const struct raid6_recov_calls *const raid6_recov_algos[] = {
|
||||
#endif
|
||||
#if defined(CONFIG_KERNEL_MODE_NEON)
|
||||
&raid6_recov_neon,
|
||||
#endif
|
||||
#ifdef CONFIG_LOONGARCH
|
||||
#ifdef CONFIG_CPU_HAS_LASX
|
||||
&raid6_recov_lasx,
|
||||
#endif
|
||||
#ifdef CONFIG_CPU_HAS_LSX
|
||||
&raid6_recov_lsx,
|
||||
#endif
|
||||
#endif
|
||||
&raid6_recov_intx1,
|
||||
NULL
|
||||
|
513
lib/raid6/recov_loongarch_simd.c
Normal file
513
lib/raid6/recov_loongarch_simd.c
Normal file
@ -0,0 +1,513 @@
|
||||
// SPDX-License-Identifier: GPL-2.0-only
|
||||
/*
|
||||
* RAID6 recovery algorithms in LoongArch SIMD (LSX & LASX)
|
||||
*
|
||||
* Copyright (C) 2023 WANG Xuerui <git@xen0n.name>
|
||||
*
|
||||
* Originally based on recov_avx2.c and recov_ssse3.c:
|
||||
*
|
||||
* Copyright (C) 2012 Intel Corporation
|
||||
* Author: Jim Kukunas <james.t.kukunas@linux.intel.com>
|
||||
*/
|
||||
|
||||
#include <linux/raid/pq.h>
|
||||
#include "loongarch.h"
|
||||
|
||||
/*
|
||||
* Unlike with the syndrome calculation algorithms, there's no boot-time
|
||||
* selection of recovery algorithms by benchmarking, so we have to specify
|
||||
* the priorities and hope the future cores will all have decent vector
|
||||
* support (i.e. no LASX slower than LSX, or even scalar code).
|
||||
*/
|
||||
|
||||
#ifdef CONFIG_CPU_HAS_LSX
|
||||
static int raid6_has_lsx(void)
|
||||
{
|
||||
return cpu_has_lsx;
|
||||
}
|
||||
|
||||
static void raid6_2data_recov_lsx(int disks, size_t bytes, int faila,
|
||||
int failb, void **ptrs)
|
||||
{
|
||||
u8 *p, *q, *dp, *dq;
|
||||
const u8 *pbmul; /* P multiplier table for B data */
|
||||
const u8 *qmul; /* Q multiplier table (for both) */
|
||||
|
||||
p = (u8 *)ptrs[disks - 2];
|
||||
q = (u8 *)ptrs[disks - 1];
|
||||
|
||||
/*
|
||||
* Compute syndrome with zero for the missing data pages
|
||||
* Use the dead data pages as temporary storage for
|
||||
* delta p and delta q
|
||||
*/
|
||||
dp = (u8 *)ptrs[faila];
|
||||
ptrs[faila] = (void *)raid6_empty_zero_page;
|
||||
ptrs[disks - 2] = dp;
|
||||
dq = (u8 *)ptrs[failb];
|
||||
ptrs[failb] = (void *)raid6_empty_zero_page;
|
||||
ptrs[disks - 1] = dq;
|
||||
|
||||
raid6_call.gen_syndrome(disks, bytes, ptrs);
|
||||
|
||||
/* Restore pointer table */
|
||||
ptrs[faila] = dp;
|
||||
ptrs[failb] = dq;
|
||||
ptrs[disks - 2] = p;
|
||||
ptrs[disks - 1] = q;
|
||||
|
||||
/* Now, pick the proper data tables */
|
||||
pbmul = raid6_vgfmul[raid6_gfexi[failb - faila]];
|
||||
qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^ raid6_gfexp[failb]]];
|
||||
|
||||
kernel_fpu_begin();
|
||||
|
||||
/*
|
||||
* vr20, vr21: qmul
|
||||
* vr22, vr23: pbmul
|
||||
*/
|
||||
asm volatile("vld $vr20, %0" : : "m" (qmul[0]));
|
||||
asm volatile("vld $vr21, %0" : : "m" (qmul[16]));
|
||||
asm volatile("vld $vr22, %0" : : "m" (pbmul[0]));
|
||||
asm volatile("vld $vr23, %0" : : "m" (pbmul[16]));
|
||||
|
||||
while (bytes) {
|
||||
/* vr4 - vr7: Q */
|
||||
asm volatile("vld $vr4, %0" : : "m" (q[0]));
|
||||
asm volatile("vld $vr5, %0" : : "m" (q[16]));
|
||||
asm volatile("vld $vr6, %0" : : "m" (q[32]));
|
||||
asm volatile("vld $vr7, %0" : : "m" (q[48]));
|
||||
/* vr4 - vr7: Q + Qxy */
|
||||
asm volatile("vld $vr8, %0" : : "m" (dq[0]));
|
||||
asm volatile("vld $vr9, %0" : : "m" (dq[16]));
|
||||
asm volatile("vld $vr10, %0" : : "m" (dq[32]));
|
||||
asm volatile("vld $vr11, %0" : : "m" (dq[48]));
|
||||
asm volatile("vxor.v $vr4, $vr4, $vr8");
|
||||
asm volatile("vxor.v $vr5, $vr5, $vr9");
|
||||
asm volatile("vxor.v $vr6, $vr6, $vr10");
|
||||
asm volatile("vxor.v $vr7, $vr7, $vr11");
|
||||
/* vr0 - vr3: P */
|
||||
asm volatile("vld $vr0, %0" : : "m" (p[0]));
|
||||
asm volatile("vld $vr1, %0" : : "m" (p[16]));
|
||||
asm volatile("vld $vr2, %0" : : "m" (p[32]));
|
||||
asm volatile("vld $vr3, %0" : : "m" (p[48]));
|
||||
/* vr0 - vr3: P + Pxy */
|
||||
asm volatile("vld $vr8, %0" : : "m" (dp[0]));
|
||||
asm volatile("vld $vr9, %0" : : "m" (dp[16]));
|
||||
asm volatile("vld $vr10, %0" : : "m" (dp[32]));
|
||||
asm volatile("vld $vr11, %0" : : "m" (dp[48]));
|
||||
asm volatile("vxor.v $vr0, $vr0, $vr8");
|
||||
asm volatile("vxor.v $vr1, $vr1, $vr9");
|
||||
asm volatile("vxor.v $vr2, $vr2, $vr10");
|
||||
asm volatile("vxor.v $vr3, $vr3, $vr11");
|
||||
|
||||
/* vr8 - vr11: higher 4 bits of each byte of (Q + Qxy) */
|
||||
asm volatile("vsrli.b $vr8, $vr4, 4");
|
||||
asm volatile("vsrli.b $vr9, $vr5, 4");
|
||||
asm volatile("vsrli.b $vr10, $vr6, 4");
|
||||
asm volatile("vsrli.b $vr11, $vr7, 4");
|
||||
/* vr4 - vr7: lower 4 bits of each byte of (Q + Qxy) */
|
||||
asm volatile("vandi.b $vr4, $vr4, 0x0f");
|
||||
asm volatile("vandi.b $vr5, $vr5, 0x0f");
|
||||
asm volatile("vandi.b $vr6, $vr6, 0x0f");
|
||||
asm volatile("vandi.b $vr7, $vr7, 0x0f");
|
||||
/* lookup from qmul[0] */
|
||||
asm volatile("vshuf.b $vr4, $vr20, $vr20, $vr4");
|
||||
asm volatile("vshuf.b $vr5, $vr20, $vr20, $vr5");
|
||||
asm volatile("vshuf.b $vr6, $vr20, $vr20, $vr6");
|
||||
asm volatile("vshuf.b $vr7, $vr20, $vr20, $vr7");
|
||||
/* lookup from qmul[16] */
|
||||
asm volatile("vshuf.b $vr8, $vr21, $vr21, $vr8");
|
||||
asm volatile("vshuf.b $vr9, $vr21, $vr21, $vr9");
|
||||
asm volatile("vshuf.b $vr10, $vr21, $vr21, $vr10");
|
||||
asm volatile("vshuf.b $vr11, $vr21, $vr21, $vr11");
|
||||
/* vr16 - vr19: B(Q + Qxy) */
|
||||
asm volatile("vxor.v $vr16, $vr8, $vr4");
|
||||
asm volatile("vxor.v $vr17, $vr9, $vr5");
|
||||
asm volatile("vxor.v $vr18, $vr10, $vr6");
|
||||
asm volatile("vxor.v $vr19, $vr11, $vr7");
|
||||
|
||||
/* vr4 - vr7: higher 4 bits of each byte of (P + Pxy) */
|
||||
asm volatile("vsrli.b $vr4, $vr0, 4");
|
||||
asm volatile("vsrli.b $vr5, $vr1, 4");
|
||||
asm volatile("vsrli.b $vr6, $vr2, 4");
|
||||
asm volatile("vsrli.b $vr7, $vr3, 4");
|
||||
/* vr12 - vr15: lower 4 bits of each byte of (P + Pxy) */
|
||||
asm volatile("vandi.b $vr12, $vr0, 0x0f");
|
||||
asm volatile("vandi.b $vr13, $vr1, 0x0f");
|
||||
asm volatile("vandi.b $vr14, $vr2, 0x0f");
|
||||
asm volatile("vandi.b $vr15, $vr3, 0x0f");
|
||||
/* lookup from pbmul[0] */
|
||||
asm volatile("vshuf.b $vr12, $vr22, $vr22, $vr12");
|
||||
asm volatile("vshuf.b $vr13, $vr22, $vr22, $vr13");
|
||||
asm volatile("vshuf.b $vr14, $vr22, $vr22, $vr14");
|
||||
asm volatile("vshuf.b $vr15, $vr22, $vr22, $vr15");
|
||||
/* lookup from pbmul[16] */
|
||||
asm volatile("vshuf.b $vr4, $vr23, $vr23, $vr4");
|
||||
asm volatile("vshuf.b $vr5, $vr23, $vr23, $vr5");
|
||||
asm volatile("vshuf.b $vr6, $vr23, $vr23, $vr6");
|
||||
asm volatile("vshuf.b $vr7, $vr23, $vr23, $vr7");
|
||||
/* vr4 - vr7: A(P + Pxy) */
|
||||
asm volatile("vxor.v $vr4, $vr4, $vr12");
|
||||
asm volatile("vxor.v $vr5, $vr5, $vr13");
|
||||
asm volatile("vxor.v $vr6, $vr6, $vr14");
|
||||
asm volatile("vxor.v $vr7, $vr7, $vr15");
|
||||
|
||||
/* vr4 - vr7: A(P + Pxy) + B(Q + Qxy) = Dx */
|
||||
asm volatile("vxor.v $vr4, $vr4, $vr16");
|
||||
asm volatile("vxor.v $vr5, $vr5, $vr17");
|
||||
asm volatile("vxor.v $vr6, $vr6, $vr18");
|
||||
asm volatile("vxor.v $vr7, $vr7, $vr19");
|
||||
asm volatile("vst $vr4, %0" : "=m" (dq[0]));
|
||||
asm volatile("vst $vr5, %0" : "=m" (dq[16]));
|
||||
asm volatile("vst $vr6, %0" : "=m" (dq[32]));
|
||||
asm volatile("vst $vr7, %0" : "=m" (dq[48]));
|
||||
|
||||
/* vr0 - vr3: P + Pxy + Dx = Dy */
|
||||
asm volatile("vxor.v $vr0, $vr0, $vr4");
|
||||
asm volatile("vxor.v $vr1, $vr1, $vr5");
|
||||
asm volatile("vxor.v $vr2, $vr2, $vr6");
|
||||
asm volatile("vxor.v $vr3, $vr3, $vr7");
|
||||
asm volatile("vst $vr0, %0" : "=m" (dp[0]));
|
||||
asm volatile("vst $vr1, %0" : "=m" (dp[16]));
|
||||
asm volatile("vst $vr2, %0" : "=m" (dp[32]));
|
||||
asm volatile("vst $vr3, %0" : "=m" (dp[48]));
|
||||
|
||||
bytes -= 64;
|
||||
p += 64;
|
||||
q += 64;
|
||||
dp += 64;
|
||||
dq += 64;
|
||||
}
|
||||
|
||||
kernel_fpu_end();
|
||||
}
|
||||
|
||||
static void raid6_datap_recov_lsx(int disks, size_t bytes, int faila,
|
||||
void **ptrs)
|
||||
{
|
||||
u8 *p, *q, *dq;
|
||||
const u8 *qmul; /* Q multiplier table */
|
||||
|
||||
p = (u8 *)ptrs[disks - 2];
|
||||
q = (u8 *)ptrs[disks - 1];
|
||||
|
||||
/*
|
||||
* Compute syndrome with zero for the missing data page
|
||||
* Use the dead data page as temporary storage for delta q
|
||||
*/
|
||||
dq = (u8 *)ptrs[faila];
|
||||
ptrs[faila] = (void *)raid6_empty_zero_page;
|
||||
ptrs[disks - 1] = dq;
|
||||
|
||||
raid6_call.gen_syndrome(disks, bytes, ptrs);
|
||||
|
||||
/* Restore pointer table */
|
||||
ptrs[faila] = dq;
|
||||
ptrs[disks - 1] = q;
|
||||
|
||||
/* Now, pick the proper data tables */
|
||||
qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]];
|
||||
|
||||
kernel_fpu_begin();
|
||||
|
||||
/* vr22, vr23: qmul */
|
||||
asm volatile("vld $vr22, %0" : : "m" (qmul[0]));
|
||||
asm volatile("vld $vr23, %0" : : "m" (qmul[16]));
|
||||
|
||||
while (bytes) {
|
||||
/* vr0 - vr3: P + Dx */
|
||||
asm volatile("vld $vr0, %0" : : "m" (p[0]));
|
||||
asm volatile("vld $vr1, %0" : : "m" (p[16]));
|
||||
asm volatile("vld $vr2, %0" : : "m" (p[32]));
|
||||
asm volatile("vld $vr3, %0" : : "m" (p[48]));
|
||||
/* vr4 - vr7: Qx */
|
||||
asm volatile("vld $vr4, %0" : : "m" (dq[0]));
|
||||
asm volatile("vld $vr5, %0" : : "m" (dq[16]));
|
||||
asm volatile("vld $vr6, %0" : : "m" (dq[32]));
|
||||
asm volatile("vld $vr7, %0" : : "m" (dq[48]));
|
||||
/* vr4 - vr7: Q + Qx */
|
||||
asm volatile("vld $vr8, %0" : : "m" (q[0]));
|
||||
asm volatile("vld $vr9, %0" : : "m" (q[16]));
|
||||
asm volatile("vld $vr10, %0" : : "m" (q[32]));
|
||||
asm volatile("vld $vr11, %0" : : "m" (q[48]));
|
||||
asm volatile("vxor.v $vr4, $vr4, $vr8");
|
||||
asm volatile("vxor.v $vr5, $vr5, $vr9");
|
||||
asm volatile("vxor.v $vr6, $vr6, $vr10");
|
||||
asm volatile("vxor.v $vr7, $vr7, $vr11");
|
||||
|
||||
/* vr8 - vr11: higher 4 bits of each byte of (Q + Qx) */
|
||||
asm volatile("vsrli.b $vr8, $vr4, 4");
|
||||
asm volatile("vsrli.b $vr9, $vr5, 4");
|
||||
asm volatile("vsrli.b $vr10, $vr6, 4");
|
||||
asm volatile("vsrli.b $vr11, $vr7, 4");
|
||||
/* vr4 - vr7: lower 4 bits of each byte of (Q + Qx) */
|
||||
asm volatile("vandi.b $vr4, $vr4, 0x0f");
|
||||
asm volatile("vandi.b $vr5, $vr5, 0x0f");
|
||||
asm volatile("vandi.b $vr6, $vr6, 0x0f");
|
||||
asm volatile("vandi.b $vr7, $vr7, 0x0f");
|
||||
/* lookup from qmul[0] */
|
||||
asm volatile("vshuf.b $vr4, $vr22, $vr22, $vr4");
|
||||
asm volatile("vshuf.b $vr5, $vr22, $vr22, $vr5");
|
||||
asm volatile("vshuf.b $vr6, $vr22, $vr22, $vr6");
|
||||
asm volatile("vshuf.b $vr7, $vr22, $vr22, $vr7");
|
||||
/* lookup from qmul[16] */
|
||||
asm volatile("vshuf.b $vr8, $vr23, $vr23, $vr8");
|
||||
asm volatile("vshuf.b $vr9, $vr23, $vr23, $vr9");
|
||||
asm volatile("vshuf.b $vr10, $vr23, $vr23, $vr10");
|
||||
asm volatile("vshuf.b $vr11, $vr23, $vr23, $vr11");
|
||||
/* vr4 - vr7: qmul(Q + Qx) = Dx */
|
||||
asm volatile("vxor.v $vr4, $vr4, $vr8");
|
||||
asm volatile("vxor.v $vr5, $vr5, $vr9");
|
||||
asm volatile("vxor.v $vr6, $vr6, $vr10");
|
||||
asm volatile("vxor.v $vr7, $vr7, $vr11");
|
||||
asm volatile("vst $vr4, %0" : "=m" (dq[0]));
|
||||
asm volatile("vst $vr5, %0" : "=m" (dq[16]));
|
||||
asm volatile("vst $vr6, %0" : "=m" (dq[32]));
|
||||
asm volatile("vst $vr7, %0" : "=m" (dq[48]));
|
||||
|
||||
/* vr0 - vr3: P + Dx + Dx = P */
|
||||
asm volatile("vxor.v $vr0, $vr0, $vr4");
|
||||
asm volatile("vxor.v $vr1, $vr1, $vr5");
|
||||
asm volatile("vxor.v $vr2, $vr2, $vr6");
|
||||
asm volatile("vxor.v $vr3, $vr3, $vr7");
|
||||
asm volatile("vst $vr0, %0" : "=m" (p[0]));
|
||||
asm volatile("vst $vr1, %0" : "=m" (p[16]));
|
||||
asm volatile("vst $vr2, %0" : "=m" (p[32]));
|
||||
asm volatile("vst $vr3, %0" : "=m" (p[48]));
|
||||
|
||||
bytes -= 64;
|
||||
p += 64;
|
||||
q += 64;
|
||||
dq += 64;
|
||||
}
|
||||
|
||||
kernel_fpu_end();
|
||||
}
|
||||
|
||||
const struct raid6_recov_calls raid6_recov_lsx = {
|
||||
.data2 = raid6_2data_recov_lsx,
|
||||
.datap = raid6_datap_recov_lsx,
|
||||
.valid = raid6_has_lsx,
|
||||
.name = "lsx",
|
||||
.priority = 1,
|
||||
};
|
||||
#endif /* CONFIG_CPU_HAS_LSX */
|
||||
|
||||
#ifdef CONFIG_CPU_HAS_LASX
|
||||
static int raid6_has_lasx(void)
|
||||
{
|
||||
return cpu_has_lasx;
|
||||
}
|
||||
|
||||
static void raid6_2data_recov_lasx(int disks, size_t bytes, int faila,
|
||||
int failb, void **ptrs)
|
||||
{
|
||||
u8 *p, *q, *dp, *dq;
|
||||
const u8 *pbmul; /* P multiplier table for B data */
|
||||
const u8 *qmul; /* Q multiplier table (for both) */
|
||||
|
||||
p = (u8 *)ptrs[disks - 2];
|
||||
q = (u8 *)ptrs[disks - 1];
|
||||
|
||||
/*
|
||||
* Compute syndrome with zero for the missing data pages
|
||||
* Use the dead data pages as temporary storage for
|
||||
* delta p and delta q
|
||||
*/
|
||||
dp = (u8 *)ptrs[faila];
|
||||
ptrs[faila] = (void *)raid6_empty_zero_page;
|
||||
ptrs[disks - 2] = dp;
|
||||
dq = (u8 *)ptrs[failb];
|
||||
ptrs[failb] = (void *)raid6_empty_zero_page;
|
||||
ptrs[disks - 1] = dq;
|
||||
|
||||
raid6_call.gen_syndrome(disks, bytes, ptrs);
|
||||
|
||||
/* Restore pointer table */
|
||||
ptrs[faila] = dp;
|
||||
ptrs[failb] = dq;
|
||||
ptrs[disks - 2] = p;
|
||||
ptrs[disks - 1] = q;
|
||||
|
||||
/* Now, pick the proper data tables */
|
||||
pbmul = raid6_vgfmul[raid6_gfexi[failb - faila]];
|
||||
qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^ raid6_gfexp[failb]]];
|
||||
|
||||
kernel_fpu_begin();
|
||||
|
||||
/*
|
||||
* xr20, xr21: qmul
|
||||
* xr22, xr23: pbmul
|
||||
*/
|
||||
asm volatile("vld $vr20, %0" : : "m" (qmul[0]));
|
||||
asm volatile("vld $vr21, %0" : : "m" (qmul[16]));
|
||||
asm volatile("vld $vr22, %0" : : "m" (pbmul[0]));
|
||||
asm volatile("vld $vr23, %0" : : "m" (pbmul[16]));
|
||||
asm volatile("xvreplve0.q $xr20, $xr20");
|
||||
asm volatile("xvreplve0.q $xr21, $xr21");
|
||||
asm volatile("xvreplve0.q $xr22, $xr22");
|
||||
asm volatile("xvreplve0.q $xr23, $xr23");
|
||||
|
||||
while (bytes) {
|
||||
/* xr0, xr1: Q */
|
||||
asm volatile("xvld $xr0, %0" : : "m" (q[0]));
|
||||
asm volatile("xvld $xr1, %0" : : "m" (q[32]));
|
||||
/* xr0, xr1: Q + Qxy */
|
||||
asm volatile("xvld $xr4, %0" : : "m" (dq[0]));
|
||||
asm volatile("xvld $xr5, %0" : : "m" (dq[32]));
|
||||
asm volatile("xvxor.v $xr0, $xr0, $xr4");
|
||||
asm volatile("xvxor.v $xr1, $xr1, $xr5");
|
||||
/* xr2, xr3: P */
|
||||
asm volatile("xvld $xr2, %0" : : "m" (p[0]));
|
||||
asm volatile("xvld $xr3, %0" : : "m" (p[32]));
|
||||
/* xr2, xr3: P + Pxy */
|
||||
asm volatile("xvld $xr4, %0" : : "m" (dp[0]));
|
||||
asm volatile("xvld $xr5, %0" : : "m" (dp[32]));
|
||||
asm volatile("xvxor.v $xr2, $xr2, $xr4");
|
||||
asm volatile("xvxor.v $xr3, $xr3, $xr5");
|
||||
|
||||
/* xr4, xr5: higher 4 bits of each byte of (Q + Qxy) */
|
||||
asm volatile("xvsrli.b $xr4, $xr0, 4");
|
||||
asm volatile("xvsrli.b $xr5, $xr1, 4");
|
||||
/* xr0, xr1: lower 4 bits of each byte of (Q + Qxy) */
|
||||
asm volatile("xvandi.b $xr0, $xr0, 0x0f");
|
||||
asm volatile("xvandi.b $xr1, $xr1, 0x0f");
|
||||
/* lookup from qmul[0] */
|
||||
asm volatile("xvshuf.b $xr0, $xr20, $xr20, $xr0");
|
||||
asm volatile("xvshuf.b $xr1, $xr20, $xr20, $xr1");
|
||||
/* lookup from qmul[16] */
|
||||
asm volatile("xvshuf.b $xr4, $xr21, $xr21, $xr4");
|
||||
asm volatile("xvshuf.b $xr5, $xr21, $xr21, $xr5");
|
||||
/* xr6, xr7: B(Q + Qxy) */
|
||||
asm volatile("xvxor.v $xr6, $xr4, $xr0");
|
||||
asm volatile("xvxor.v $xr7, $xr5, $xr1");
|
||||
|
||||
/* xr4, xr5: higher 4 bits of each byte of (P + Pxy) */
|
||||
asm volatile("xvsrli.b $xr4, $xr2, 4");
|
||||
asm volatile("xvsrli.b $xr5, $xr3, 4");
|
||||
/* xr0, xr1: lower 4 bits of each byte of (P + Pxy) */
|
||||
asm volatile("xvandi.b $xr0, $xr2, 0x0f");
|
||||
asm volatile("xvandi.b $xr1, $xr3, 0x0f");
|
||||
/* lookup from pbmul[0] */
|
||||
asm volatile("xvshuf.b $xr0, $xr22, $xr22, $xr0");
|
||||
asm volatile("xvshuf.b $xr1, $xr22, $xr22, $xr1");
|
||||
/* lookup from pbmul[16] */
|
||||
asm volatile("xvshuf.b $xr4, $xr23, $xr23, $xr4");
|
||||
asm volatile("xvshuf.b $xr5, $xr23, $xr23, $xr5");
|
||||
/* xr0, xr1: A(P + Pxy) */
|
||||
asm volatile("xvxor.v $xr0, $xr0, $xr4");
|
||||
asm volatile("xvxor.v $xr1, $xr1, $xr5");
|
||||
|
||||
/* xr0, xr1: A(P + Pxy) + B(Q + Qxy) = Dx */
|
||||
asm volatile("xvxor.v $xr0, $xr0, $xr6");
|
||||
asm volatile("xvxor.v $xr1, $xr1, $xr7");
|
||||
|
||||
/* xr2, xr3: P + Pxy + Dx = Dy */
|
||||
asm volatile("xvxor.v $xr2, $xr2, $xr0");
|
||||
asm volatile("xvxor.v $xr3, $xr3, $xr1");
|
||||
|
||||
asm volatile("xvst $xr0, %0" : "=m" (dq[0]));
|
||||
asm volatile("xvst $xr1, %0" : "=m" (dq[32]));
|
||||
asm volatile("xvst $xr2, %0" : "=m" (dp[0]));
|
||||
asm volatile("xvst $xr3, %0" : "=m" (dp[32]));
|
||||
|
||||
bytes -= 64;
|
||||
p += 64;
|
||||
q += 64;
|
||||
dp += 64;
|
||||
dq += 64;
|
||||
}
|
||||
|
||||
kernel_fpu_end();
|
||||
}
|
||||
|
||||
static void raid6_datap_recov_lasx(int disks, size_t bytes, int faila,
|
||||
void **ptrs)
|
||||
{
|
||||
u8 *p, *q, *dq;
|
||||
const u8 *qmul; /* Q multiplier table */
|
||||
|
||||
p = (u8 *)ptrs[disks - 2];
|
||||
q = (u8 *)ptrs[disks - 1];
|
||||
|
||||
/*
|
||||
* Compute syndrome with zero for the missing data page
|
||||
* Use the dead data page as temporary storage for delta q
|
||||
*/
|
||||
dq = (u8 *)ptrs[faila];
|
||||
ptrs[faila] = (void *)raid6_empty_zero_page;
|
||||
ptrs[disks - 1] = dq;
|
||||
|
||||
raid6_call.gen_syndrome(disks, bytes, ptrs);
|
||||
|
||||
/* Restore pointer table */
|
||||
ptrs[faila] = dq;
|
||||
ptrs[disks - 1] = q;
|
||||
|
||||
/* Now, pick the proper data tables */
|
||||
qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]];
|
||||
|
||||
kernel_fpu_begin();
|
||||
|
||||
/* xr22, xr23: qmul */
|
||||
asm volatile("vld $vr22, %0" : : "m" (qmul[0]));
|
||||
asm volatile("xvreplve0.q $xr22, $xr22");
|
||||
asm volatile("vld $vr23, %0" : : "m" (qmul[16]));
|
||||
asm volatile("xvreplve0.q $xr23, $xr23");
|
||||
|
||||
while (bytes) {
|
||||
/* xr0, xr1: P + Dx */
|
||||
asm volatile("xvld $xr0, %0" : : "m" (p[0]));
|
||||
asm volatile("xvld $xr1, %0" : : "m" (p[32]));
|
||||
/* xr2, xr3: Qx */
|
||||
asm volatile("xvld $xr2, %0" : : "m" (dq[0]));
|
||||
asm volatile("xvld $xr3, %0" : : "m" (dq[32]));
|
||||
/* xr2, xr3: Q + Qx */
|
||||
asm volatile("xvld $xr4, %0" : : "m" (q[0]));
|
||||
asm volatile("xvld $xr5, %0" : : "m" (q[32]));
|
||||
asm volatile("xvxor.v $xr2, $xr2, $xr4");
|
||||
asm volatile("xvxor.v $xr3, $xr3, $xr5");
|
||||
|
||||
/* xr4, xr5: higher 4 bits of each byte of (Q + Qx) */
|
||||
asm volatile("xvsrli.b $xr4, $xr2, 4");
|
||||
asm volatile("xvsrli.b $xr5, $xr3, 4");
|
||||
/* xr2, xr3: lower 4 bits of each byte of (Q + Qx) */
|
||||
asm volatile("xvandi.b $xr2, $xr2, 0x0f");
|
||||
asm volatile("xvandi.b $xr3, $xr3, 0x0f");
|
||||
/* lookup from qmul[0] */
|
||||
asm volatile("xvshuf.b $xr2, $xr22, $xr22, $xr2");
|
||||
asm volatile("xvshuf.b $xr3, $xr22, $xr22, $xr3");
|
||||
/* lookup from qmul[16] */
|
||||
asm volatile("xvshuf.b $xr4, $xr23, $xr23, $xr4");
|
||||
asm volatile("xvshuf.b $xr5, $xr23, $xr23, $xr5");
|
||||
/* xr2, xr3: qmul(Q + Qx) = Dx */
|
||||
asm volatile("xvxor.v $xr2, $xr2, $xr4");
|
||||
asm volatile("xvxor.v $xr3, $xr3, $xr5");
|
||||
|
||||
/* xr0, xr1: P + Dx + Dx = P */
|
||||
asm volatile("xvxor.v $xr0, $xr0, $xr2");
|
||||
asm volatile("xvxor.v $xr1, $xr1, $xr3");
|
||||
|
||||
asm volatile("xvst $xr2, %0" : "=m" (dq[0]));
|
||||
asm volatile("xvst $xr3, %0" : "=m" (dq[32]));
|
||||
asm volatile("xvst $xr0, %0" : "=m" (p[0]));
|
||||
asm volatile("xvst $xr1, %0" : "=m" (p[32]));
|
||||
|
||||
bytes -= 64;
|
||||
p += 64;
|
||||
q += 64;
|
||||
dq += 64;
|
||||
}
|
||||
|
||||
kernel_fpu_end();
|
||||
}
|
||||
|
||||
const struct raid6_recov_calls raid6_recov_lasx = {
|
||||
.data2 = raid6_2data_recov_lasx,
|
||||
.datap = raid6_datap_recov_lasx,
|
||||
.valid = raid6_has_lasx,
|
||||
.name = "lasx",
|
||||
.priority = 2,
|
||||
};
|
||||
#endif /* CONFIG_CPU_HAS_LASX */
|
@ -65,7 +65,7 @@ else ifeq ($(HAS_ALTIVEC),yes)
|
||||
OBJS += altivec1.o altivec2.o altivec4.o altivec8.o \
|
||||
vpermxor1.o vpermxor2.o vpermxor4.o vpermxor8.o
|
||||
else ifeq ($(ARCH),loongarch64)
|
||||
OBJS += loongarch_simd.o
|
||||
OBJS += loongarch_simd.o recov_loongarch_simd.o
|
||||
endif
|
||||
|
||||
.c.o:
|
||||
|
Loading…
Reference in New Issue
Block a user