2019-01-18 00:14:18 +01:00
|
|
|
/* SPDX-License-Identifier: GPL-2.0-or-later */
|
2018-05-11 14:12:49 +02:00
|
|
|
/*
 * AEGIS common definitions
 *
 * Copyright (c) 2018 Ondrej Mosnacek <omosnacek@gmail.com>
 * Copyright (c) 2018 Red Hat, Inc. All rights reserved.
 */
|
|
|
|
|
|
|
|
#ifndef _CRYPTO_AEGIS_H
|
|
|
|
#define _CRYPTO_AEGIS_H
|
|
|
|
|
|
|
|
#include <crypto/aes.h>
|
crypto: aegis - avoid prerotated AES tables
The generic AES code provides four sets of lookup tables, where each
set consists of four tables containing the same 32-bit values, but
rotated by 0, 8, 16 and 24 bits, respectively. This makes sense for
CISC architectures such as x86 which support memory operands, but
for other architectures, the rotates are quite cheap, and using all
four tables needlessly thrashes the D-cache, and actually hurts rather
than helps performance.
Since x86 already has its own implementation of AEGIS based on AES-NI
instructions, let's tweak the generic implementation towards other
architectures, and avoid the prerotated tables, and perform the
rotations inline. On ARM Cortex-A53, this results in a ~8% speedup.
Acked-by: Ondrej Mosnacek <omosnace@redhat.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2019-07-03 10:55:09 +02:00
|
|
|
#include <linux/bitops.h>
|
2018-05-11 14:12:49 +02:00
|
|
|
#include <linux/types.h>
|
|
|
|
|
|
|
|
#define AEGIS_BLOCK_SIZE 16
|
|
|
|
|
|
|
|
union aegis_block {
|
|
|
|
__le64 words64[AEGIS_BLOCK_SIZE / sizeof(__le64)];
|
2018-10-01 10:36:38 +02:00
|
|
|
__le32 words32[AEGIS_BLOCK_SIZE / sizeof(__le32)];
|
2018-05-11 14:12:49 +02:00
|
|
|
u8 bytes[AEGIS_BLOCK_SIZE];
|
|
|
|
};
|
|
|
|
|
2021-03-08 16:41:32 +11:00
|
|
|
struct aegis_state;
|
|
|
|
|
|
|
|
extern int aegis128_have_aes_insn;
|
|
|
|
|
2018-05-11 14:12:49 +02:00
|
|
|
/* Alignment, in bytes, that the compiler reports for union aegis_block. */
#define AEGIS_BLOCK_ALIGN (__alignof__(union aegis_block))

/*
 * Evaluates to true when pointer @p is aligned to AEGIS_BLOCK_ALIGN.
 *
 * @p is parenthesized before the cast so that the whole argument expression
 * is converted; without the parentheses "AEGIS_ALIGNED(buf + 1)" would cast
 * only "buf" and then add 1 to the resulting integer.
 */
#define AEGIS_ALIGNED(p) IS_ALIGNED((uintptr_t)(p), AEGIS_BLOCK_ALIGN)
|
|
|
|
|
2021-03-08 16:41:32 +11:00
|
|
|
bool crypto_aegis128_have_simd(void);
|
|
|
|
void crypto_aegis128_update_simd(struct aegis_state *state, const void *msg);
|
|
|
|
void crypto_aegis128_init_simd(struct aegis_state *state,
|
|
|
|
const union aegis_block *key,
|
|
|
|
const u8 *iv);
|
|
|
|
void crypto_aegis128_encrypt_chunk_simd(struct aegis_state *state, u8 *dst,
|
|
|
|
const u8 *src, unsigned int size);
|
|
|
|
void crypto_aegis128_decrypt_chunk_simd(struct aegis_state *state, u8 *dst,
|
|
|
|
const u8 *src, unsigned int size);
|
|
|
|
int crypto_aegis128_final_simd(struct aegis_state *state,
|
|
|
|
union aegis_block *tag_xor,
|
|
|
|
unsigned int assoclen,
|
|
|
|
unsigned int cryptlen,
|
|
|
|
unsigned int authsize);
|
|
|
|
|
2019-07-18 15:50:04 +02:00
|
|
|
static __always_inline void crypto_aegis_block_xor(union aegis_block *dst,
|
|
|
|
const union aegis_block *src)
|
2018-05-11 14:12:49 +02:00
|
|
|
{
|
|
|
|
dst->words64[0] ^= src->words64[0];
|
|
|
|
dst->words64[1] ^= src->words64[1];
|
|
|
|
}
|
|
|
|
|
2019-07-18 15:50:04 +02:00
|
|
|
static __always_inline void crypto_aegis_block_and(union aegis_block *dst,
|
|
|
|
const union aegis_block *src)
|
2018-05-11 14:12:49 +02:00
|
|
|
{
|
|
|
|
dst->words64[0] &= src->words64[0];
|
|
|
|
dst->words64[1] &= src->words64[1];
|
|
|
|
}
|
|
|
|
|
2019-07-18 15:50:04 +02:00
|
|
|
static __always_inline void crypto_aegis_aesenc(union aegis_block *dst,
|
|
|
|
const union aegis_block *src,
|
|
|
|
const union aegis_block *key)
|
2018-05-11 14:12:49 +02:00
|
|
|
{
|
|
|
|
const u8 *s = src->bytes;
|
crypto: aegis - avoid prerotated AES tables
The generic AES code provides four sets of lookup tables, where each
set consists of four tables containing the same 32-bit values, but
rotated by 0, 8, 16 and 24 bits, respectively. This makes sense for
CISC architectures such as x86 which support memory operands, but
for other architectures, the rotates are quite cheap, and using all
four tables needlessly thrashes the D-cache, and actually hurts rather
than helps performance.
Since x86 already has its own implementation of AEGIS based on AES-NI
instructions, let's tweak the generic implementation towards other
architectures, and avoid the prerotated tables, and perform the
rotations inline. On ARM Cortex-A53, this results in a ~8% speedup.
Acked-by: Ondrej Mosnacek <omosnace@redhat.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2019-07-03 10:55:09 +02:00
|
|
|
const u32 *t = crypto_ft_tab[0];
|
2018-05-11 14:12:49 +02:00
|
|
|
u32 d0, d1, d2, d3;
|
|
|
|
|
crypto: aegis - avoid prerotated AES tables
The generic AES code provides four sets of lookup tables, where each
set consists of four tables containing the same 32-bit values, but
rotated by 0, 8, 16 and 24 bits, respectively. This makes sense for
CISC architectures such as x86 which support memory operands, but
for other architectures, the rotates are quite cheap, and using all
four tables needlessly thrashes the D-cache, and actually hurts rather
than helps performance.
Since x86 already has its own implementation of AEGIS based on AES-NI
instructions, let's tweak the generic implementation towards other
architectures, and avoid the prerotated tables, and perform the
rotations inline. On ARM Cortex-A53, this results in a ~8% speedup.
Acked-by: Ondrej Mosnacek <omosnace@redhat.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2019-07-03 10:55:09 +02:00
|
|
|
d0 = t[s[ 0]] ^ rol32(t[s[ 5]], 8) ^ rol32(t[s[10]], 16) ^ rol32(t[s[15]], 24);
|
|
|
|
d1 = t[s[ 4]] ^ rol32(t[s[ 9]], 8) ^ rol32(t[s[14]], 16) ^ rol32(t[s[ 3]], 24);
|
|
|
|
d2 = t[s[ 8]] ^ rol32(t[s[13]], 8) ^ rol32(t[s[ 2]], 16) ^ rol32(t[s[ 7]], 24);
|
|
|
|
d3 = t[s[12]] ^ rol32(t[s[ 1]], 8) ^ rol32(t[s[ 6]], 16) ^ rol32(t[s[11]], 24);
|
2018-05-11 14:12:49 +02:00
|
|
|
|
2018-10-01 10:36:38 +02:00
|
|
|
dst->words32[0] = cpu_to_le32(d0) ^ key->words32[0];
|
|
|
|
dst->words32[1] = cpu_to_le32(d1) ^ key->words32[1];
|
|
|
|
dst->words32[2] = cpu_to_le32(d2) ^ key->words32[2];
|
|
|
|
dst->words32[3] = cpu_to_le32(d3) ^ key->words32[3];
|
2018-05-11 14:12:49 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
#endif /* _CRYPTO_AEGIS_H */
|