linux-next/arch/riscv/crypto/aes-riscv64-zvkned-zvbb-zvkg.S
Jerry Shih eb24af5d7a
crypto: riscv - add vector crypto accelerated AES-{ECB,CBC,CTR,XTS}
Add implementations of AES-ECB, AES-CBC, AES-CTR, and AES-XTS, as well
as bare (single-block) AES, using the RISC-V vector crypto extensions.
The assembly code is derived from OpenSSL code (openssl/openssl#21923)
that was dual-licensed so that it could be reused in the kernel.
Nevertheless, the assembly has been significantly reworked for
integration with the kernel, for example by using regular .S files
instead of the so-called perlasm, using the assembler instead of bare
'.inst', greatly reducing code duplication, supporting AES-192, and
making the code use the same AES key structure as the C code.

Co-developed-by: Phoebe Chen <phoebe.chen@sifive.com>
Signed-off-by: Phoebe Chen <phoebe.chen@sifive.com>
Signed-off-by: Jerry Shih <jerry.shih@sifive.com>
Co-developed-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Eric Biggers <ebiggers@google.com>
Link: https://lore.kernel.org/r/20240122002024.27477-5-ebiggers@kernel.org
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
2024-01-22 17:55:18 -08:00

313 lines
10 KiB
ArmAsm

/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
//
// This file is dual-licensed, meaning that you can use it under your
// choice of either of the following two licenses:
//
// Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the Apache License 2.0 (the "License"). You can obtain
// a copy in the file LICENSE in the source distribution or at
// https://www.openssl.org/source/license.html
//
// or
//
// Copyright (c) 2023, Jerry Shih <jerry.shih@sifive.com>
// Copyright 2024 Google LLC
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// The generated code of this file depends on the following RISC-V extensions:
// - RV64I
// - RISC-V Vector ('V') with VLEN >= 128 && VLEN < 2048
// - RISC-V Vector AES block cipher extension ('Zvkned')
// - RISC-V Vector Bit-manipulation extension ('Zvbb')
// - RISC-V Vector GCM/GMAC extension ('Zvkg')
#include <linux/linkage.h>
.text
.option arch, +zvkned, +zvbb, +zvkg
#include "aes-macros.S"
#define KEYP a0
#define INP a1
#define OUTP a2
#define LEN a3
#define TWEAKP a4
#define LEN32 a5
#define TAIL_LEN a6
#define VL a7
#define VLMAX t4
// v1-v15 contain the AES round keys, but they are used for temporaries before
// the AES round keys have been loaded.
#define TWEAKS v16 // LMUL=4 (most of the time)
#define TWEAKS_BREV v20 // LMUL=4 (most of the time)
#define MULTS_BREV v24 // LMUL=4 (most of the time)
#define TMP0 v28
#define TMP1 v29
#define TMP2 v30
#define TMP3 v31
// xts_init initializes the following values:
//
// TWEAKS: N 128-bit tweaks T*(x^i) for i in 0..(N - 1)
// TWEAKS_BREV: same as TWEAKS, but bit-reversed
// MULTS_BREV: N 128-bit values x^N, bit-reversed. Only if N > 1.
//
// N is the maximum number of blocks that will be processed per loop iteration,
// computed using vsetvli.
//
// The field convention used by XTS is the same as that of GHASH, but with the
// bits reversed within each byte. The zvkg extension provides the vgmul
// instruction which does multiplication in this field. Therefore, for tweak
// computation we use vgmul to do multiplications in parallel, instead of
// serially multiplying by x using shifting+xoring. Note that for this to work,
// the inputs and outputs to vgmul must be bit-reversed (we do it with vbrev8).
.macro xts_init
// Load the first tweak T.
vsetivli zero, 4, e32, m1, ta, ma
vle32.v TWEAKS, (TWEAKP)
// If there's only one block (or no blocks at all), then skip the tweak
// sequence computation because (at most) T itself is needed.
li t0, 16
ble LEN, t0, .Linit_single_block\@
// Save a copy of T bit-reversed in v12.
vbrev8.v v12, TWEAKS
//
// Generate x^i for i in 0..(N - 1), i.e. 128-bit values 1 << i assuming
// that N <= 128. Though, this code actually requires N < 64 (or
// equivalently VLEN < 2048) due to the use of 64-bit intermediate
// values here and in the x^N computation later.
//
vsetvli VL, LEN32, e32, m4, ta, ma
srli t0, VL, 2 // t0 = N (num blocks)
// Generate two sequences, each with N 32-bit values:
// v0=[1, 1, 1, ...] and v1=[0, 1, 2, ...].
vsetvli zero, t0, e32, m1, ta, ma
vmv.v.i v0, 1
vid.v v1
// Use vzext to zero-extend the sequences to 64 bits. Reinterpret them
// as two sequences, each with 2*N 32-bit values:
// v2=[1, 0, 1, 0, 1, 0, ...] and v4=[0, 0, 1, 0, 2, 0, ...].
vsetvli zero, t0, e64, m2, ta, ma
vzext.vf2 v2, v0
vzext.vf2 v4, v1
slli t1, t0, 1 // t1 = 2*N
vsetvli zero, t1, e32, m2, ta, ma
// Use vwsll to compute [1<<0, 0<<0, 1<<1, 0<<0, 1<<2, 0<<0, ...],
// widening to 64 bits per element. When reinterpreted as N 128-bit
// values, this is the needed sequence of 128-bit values 1 << i (x^i).
vwsll.vv v8, v2, v4
// Copy the bit-reversed T to all N elements of TWEAKS_BREV, then
// multiply by x^i. This gives the sequence T*(x^i), bit-reversed.
vsetvli zero, LEN32, e32, m4, ta, ma
vmv.v.i TWEAKS_BREV, 0
vaesz.vs TWEAKS_BREV, v12
vbrev8.v v8, v8
vgmul.vv TWEAKS_BREV, v8
// Save a copy of the sequence T*(x^i) with the bit reversal undone.
vbrev8.v TWEAKS, TWEAKS_BREV
// Generate N copies of x^N, i.e. 128-bit values 1 << N, bit-reversed.
li t1, 1
sll t1, t1, t0 // t1 = 1 << N
vsetivli zero, 2, e64, m1, ta, ma
vmv.v.i v0, 0
vsetivli zero, 1, e64, m1, tu, ma
vmv.v.x v0, t1
vbrev8.v v0, v0
vsetvli zero, LEN32, e32, m4, ta, ma
vmv.v.i MULTS_BREV, 0
vaesz.vs MULTS_BREV, v0
j .Linit_done\@
.Linit_single_block\@:
vbrev8.v TWEAKS_BREV, TWEAKS
.Linit_done\@:
.endm
// Set the first 128 bits of MULTS_BREV to 0x40, i.e. 'x' bit-reversed. This is
// the multiplier required to advance the tweak by one.
.macro load_x
li t0, 0x40
vsetivli zero, 4, e32, m1, ta, ma
vmv.v.i MULTS_BREV, 0
vsetivli zero, 1, e8, m1, tu, ma
vmv.v.x MULTS_BREV, t0
.endm
.macro __aes_xts_crypt enc, keylen
// With 16 < len <= 31, there's no main loop, just ciphertext stealing.
beqz LEN32, .Lcts_without_main_loop\@
vsetvli VLMAX, zero, e32, m4, ta, ma
1:
vsetvli VL, LEN32, e32, m4, ta, ma
2:
// Encrypt or decrypt VL/4 blocks.
vle32.v TMP0, (INP)
vxor.vv TMP0, TMP0, TWEAKS
aes_crypt TMP0, \enc, \keylen
vxor.vv TMP0, TMP0, TWEAKS
vse32.v TMP0, (OUTP)
// Update the pointers and the remaining length.
slli t0, VL, 2
add INP, INP, t0
add OUTP, OUTP, t0
sub LEN32, LEN32, VL
// Check whether more blocks remain.
beqz LEN32, .Lmain_loop_done\@
// Compute the next sequence of tweaks by multiplying the previous
// sequence by x^N. Store the result in both bit-reversed order and
// regular order (i.e. with the bit reversal undone).
vgmul.vv TWEAKS_BREV, MULTS_BREV
vbrev8.v TWEAKS, TWEAKS_BREV
// Since we compute the tweak multipliers x^N in advance, we require
// that each iteration process the same length except possibly the last.
// This conflicts slightly with the behavior allowed by RISC-V Vector
// Extension, where CPUs can select a lower length for both of the last
// two iterations. E.g., vl might take the sequence of values
// [16, 16, 16, 12, 12], whereas we need [16, 16, 16, 16, 8] so that we
// can use x^4 again instead of computing x^3. Therefore, we explicitly
// keep the vl at VLMAX if there is at least VLMAX remaining.
bge LEN32, VLMAX, 2b
j 1b
.Lmain_loop_done\@:
load_x
// Compute the next tweak.
addi t0, VL, -4
vsetivli zero, 4, e32, m4, ta, ma
vslidedown.vx TWEAKS_BREV, TWEAKS_BREV, t0 // Extract last tweak
vsetivli zero, 4, e32, m1, ta, ma
vgmul.vv TWEAKS_BREV, MULTS_BREV // Advance to next tweak
bnez TAIL_LEN, .Lcts\@
// Update *TWEAKP to contain the next tweak.
vbrev8.v TWEAKS, TWEAKS_BREV
vse32.v TWEAKS, (TWEAKP)
ret
.Lcts_without_main_loop\@:
load_x
.Lcts\@:
// TWEAKS_BREV now contains the next tweak. Compute the one after that.
vsetivli zero, 4, e32, m1, ta, ma
vmv.v.v TMP0, TWEAKS_BREV
vgmul.vv TMP0, MULTS_BREV
// Undo the bit reversal of the next two tweaks and store them in TMP1
// and TMP2, such that TMP1 is the first needed and TMP2 the second.
.if \enc
vbrev8.v TMP1, TWEAKS_BREV
vbrev8.v TMP2, TMP0
.else
vbrev8.v TMP1, TMP0
vbrev8.v TMP2, TWEAKS_BREV
.endif
// Encrypt/decrypt the last full block.
vle32.v TMP0, (INP)
vxor.vv TMP0, TMP0, TMP1
aes_crypt TMP0, \enc, \keylen
vxor.vv TMP0, TMP0, TMP1
// Swap the first TAIL_LEN bytes of the above result with the tail.
// Note that to support in-place encryption/decryption, the load from
// the input tail must happen before the store to the output tail.
addi t0, INP, 16
addi t1, OUTP, 16
vmv.v.v TMP3, TMP0
vsetvli zero, TAIL_LEN, e8, m1, tu, ma
vle8.v TMP0, (t0)
vse8.v TMP3, (t1)
// Encrypt/decrypt again and store the last full block.
vsetivli zero, 4, e32, m1, ta, ma
vxor.vv TMP0, TMP0, TMP2
aes_crypt TMP0, \enc, \keylen
vxor.vv TMP0, TMP0, TMP2
vse32.v TMP0, (OUTP)
ret
.endm
.macro aes_xts_crypt enc
// Check whether the length is a multiple of the AES block size.
andi TAIL_LEN, LEN, 15
beqz TAIL_LEN, 1f
// The length isn't a multiple of the AES block size, so ciphertext
// stealing will be required. Ciphertext stealing involves special
// handling of the partial block and the last full block, so subtract
// the length of both from the length to be processed in the main loop.
sub LEN, LEN, TAIL_LEN
addi LEN, LEN, -16
1:
srli LEN32, LEN, 2
// LEN and LEN32 now contain the total length of the blocks that will be
// processed in the main loop, in bytes and 32-bit words respectively.
xts_init
aes_begin KEYP, 128f, 192f
__aes_xts_crypt \enc, 256
128:
__aes_xts_crypt \enc, 128
192:
__aes_xts_crypt \enc, 192
.endm
// void aes_xts_encrypt_zvkned_zvbb_zvkg(const struct crypto_aes_ctx *key,
// const u8 *in, u8 *out, size_t len,
// u8 tweak[16]);
//
// |key| is the data key. |tweak| contains the next tweak; the encryption of
// the original IV with the tweak key was already done. This function supports
// incremental computation, but |len| must always be >= 16 (AES_BLOCK_SIZE), and
// |len| must be a multiple of 16 except on the last call. If |len| is a
// multiple of 16, then this function updates |tweak| to contain the next tweak.
SYM_FUNC_START(aes_xts_encrypt_zvkned_zvbb_zvkg)
aes_xts_crypt 1
SYM_FUNC_END(aes_xts_encrypt_zvkned_zvbb_zvkg)
// Same prototype and calling convention as the encryption function
SYM_FUNC_START(aes_xts_decrypt_zvkned_zvbb_zvkg)
aes_xts_crypt 0
SYM_FUNC_END(aes_xts_decrypt_zvkned_zvbb_zvkg)