linux-next/arch/x86/crypto/aegis256-aesni-asm.S
Linus Torvalds f24d6f2654 Merge branch 'x86-asm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 asm updates from Thomas Gleixner:
 "The lowlevel and ASM code updates for x86:

   - Make stack trace unwinding more reliable

   - ASM instruction updates for better code generation

   - Various cleanups"

* 'x86-asm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/entry/64: Add two more instruction suffixes
  x86/asm/64: Use 32-bit XOR to zero registers
  x86/build/vdso: Simplify 'cmd_vdso2c'
  x86/build/vdso: Remove unused vdso-syms.lds
  x86/stacktrace: Enable HAVE_RELIABLE_STACKTRACE for the ORC unwinder
  x86/unwind/orc: Detect the end of the stack
  x86/stacktrace: Do not fail for ORC with regs on stack
  x86/stacktrace: Clarify the reliable success paths
  x86/stacktrace: Remove STACKTRACE_DUMP_ONCE
  x86/stacktrace: Do not unwind after user regs
  x86/asm: Use CC_SET/CC_OUT in percpu_cmpxchg8b_double() to micro-optimize code generation
2018-08-13 13:35:26 -07:00

704 lines
10 KiB
ArmAsm

/*
* AES-NI + SSE2 implementation of AEGIS-128L
*
* Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
* Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published
* by the Free Software Foundation.
*/
#include <linux/linkage.h>
#include <asm/frame.h>
#define STATE0 %xmm0
#define STATE1 %xmm1
#define STATE2 %xmm2
#define STATE3 %xmm3
#define STATE4 %xmm4
#define STATE5 %xmm5
#define MSG %xmm6
#define T0 %xmm7
#define T1 %xmm8
#define T2 %xmm9
#define T3 %xmm10
#define STATEP %rdi
#define LEN %rsi
#define SRC %rdx
#define DST %rcx
.section .rodata.cst16.aegis256_const, "aM", @progbits, 32
.align 16
.Laegis256_const_0:
.byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d
.byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62
.Laegis256_const_1:
.byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1
.byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd
.section .rodata.cst16.aegis256_counter, "aM", @progbits, 16
.align 16
.Laegis256_counter:
.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
.byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
.text
/*
* __load_partial: internal ABI
* input:
* LEN - bytes
* SRC - src
* output:
* MSG - message block
* changed:
* T0
* %r8
* %r9
*/
__load_partial:
xor %r9d, %r9d
pxor MSG, MSG
mov LEN, %r8
and $0x1, %r8
jz .Lld_partial_1
mov LEN, %r8
and $0x1E, %r8
add SRC, %r8
mov (%r8), %r9b
.Lld_partial_1:
mov LEN, %r8
and $0x2, %r8
jz .Lld_partial_2
mov LEN, %r8
and $0x1C, %r8
add SRC, %r8
shl $0x10, %r9
mov (%r8), %r9w
.Lld_partial_2:
mov LEN, %r8
and $0x4, %r8
jz .Lld_partial_4
mov LEN, %r8
and $0x18, %r8
add SRC, %r8
shl $32, %r9
mov (%r8), %r8d
xor %r8, %r9
.Lld_partial_4:
movq %r9, MSG
mov LEN, %r8
and $0x8, %r8
jz .Lld_partial_8
mov LEN, %r8
and $0x10, %r8
add SRC, %r8
pslldq $8, MSG
movq (%r8), T0
pxor T0, MSG
.Lld_partial_8:
ret
ENDPROC(__load_partial)
/*
* __store_partial: internal ABI
* input:
* LEN - bytes
* DST - dst
* output:
* T0 - message block
* changed:
* %r8
* %r9
* %r10
*/
__store_partial:
mov LEN, %r8
mov DST, %r9
movq T0, %r10
cmp $8, %r8
jl .Lst_partial_8
mov %r10, (%r9)
psrldq $8, T0
movq T0, %r10
sub $8, %r8
add $8, %r9
.Lst_partial_8:
cmp $4, %r8
jl .Lst_partial_4
mov %r10d, (%r9)
shr $32, %r10
sub $4, %r8
add $4, %r9
.Lst_partial_4:
cmp $2, %r8
jl .Lst_partial_2
mov %r10w, (%r9)
shr $0x10, %r10
sub $2, %r8
add $2, %r9
.Lst_partial_2:
cmp $1, %r8
jl .Lst_partial_1
mov %r10b, (%r9)
.Lst_partial_1:
ret
ENDPROC(__store_partial)
.macro update
movdqa STATE5, T0
aesenc STATE0, STATE5
aesenc STATE1, STATE0
aesenc STATE2, STATE1
aesenc STATE3, STATE2
aesenc STATE4, STATE3
aesenc T0, STATE4
.endm
.macro update0 m
update
pxor \m, STATE5
.endm
.macro update1 m
update
pxor \m, STATE4
.endm
.macro update2 m
update
pxor \m, STATE3
.endm
.macro update3 m
update
pxor \m, STATE2
.endm
.macro update4 m
update
pxor \m, STATE1
.endm
.macro update5 m
update
pxor \m, STATE0
.endm
.macro state_load
movdqu 0x00(STATEP), STATE0
movdqu 0x10(STATEP), STATE1
movdqu 0x20(STATEP), STATE2
movdqu 0x30(STATEP), STATE3
movdqu 0x40(STATEP), STATE4
movdqu 0x50(STATEP), STATE5
.endm
.macro state_store s0 s1 s2 s3 s4 s5
movdqu \s5, 0x00(STATEP)
movdqu \s0, 0x10(STATEP)
movdqu \s1, 0x20(STATEP)
movdqu \s2, 0x30(STATEP)
movdqu \s3, 0x40(STATEP)
movdqu \s4, 0x50(STATEP)
.endm
.macro state_store0
state_store STATE0 STATE1 STATE2 STATE3 STATE4 STATE5
.endm
.macro state_store1
state_store STATE5 STATE0 STATE1 STATE2 STATE3 STATE4
.endm
.macro state_store2
state_store STATE4 STATE5 STATE0 STATE1 STATE2 STATE3
.endm
.macro state_store3
state_store STATE3 STATE4 STATE5 STATE0 STATE1 STATE2
.endm
.macro state_store4
state_store STATE2 STATE3 STATE4 STATE5 STATE0 STATE1
.endm
.macro state_store5
state_store STATE1 STATE2 STATE3 STATE4 STATE5 STATE0
.endm
/*
* void crypto_aegis256_aesni_init(void *state, const void *key, const void *iv);
*/
ENTRY(crypto_aegis256_aesni_init)
FRAME_BEGIN
/* load key: */
movdqa 0x00(%rsi), MSG
movdqa 0x10(%rsi), T1
movdqa MSG, STATE4
movdqa T1, STATE5
/* load IV: */
movdqu 0x00(%rdx), T2
movdqu 0x10(%rdx), T3
pxor MSG, T2
pxor T1, T3
movdqa T2, STATE0
movdqa T3, STATE1
/* load the constants: */
movdqa .Laegis256_const_0, STATE3
movdqa .Laegis256_const_1, STATE2
pxor STATE3, STATE4
pxor STATE2, STATE5
/* update 10 times with IV and KEY: */
update0 MSG
update1 T1
update2 T2
update3 T3
update4 MSG
update5 T1
update0 T2
update1 T3
update2 MSG
update3 T1
update4 T2
update5 T3
update0 MSG
update1 T1
update2 T2
update3 T3
state_store3
FRAME_END
ret
ENDPROC(crypto_aegis256_aesni_init)
.macro ad_block a i
movdq\a (\i * 0x10)(SRC), MSG
update\i MSG
sub $0x10, LEN
cmp $0x10, LEN
jl .Lad_out_\i
.endm
/*
* void crypto_aegis256_aesni_ad(void *state, unsigned int length,
* const void *data);
*/
ENTRY(crypto_aegis256_aesni_ad)
FRAME_BEGIN
cmp $0x10, LEN
jb .Lad_out
state_load
mov SRC, %r8
and $0xf, %r8
jnz .Lad_u_loop
.align 8
.Lad_a_loop:
ad_block a 0
ad_block a 1
ad_block a 2
ad_block a 3
ad_block a 4
ad_block a 5
add $0x60, SRC
jmp .Lad_a_loop
.align 8
.Lad_u_loop:
ad_block u 0
ad_block u 1
ad_block u 2
ad_block u 3
ad_block u 4
ad_block u 5
add $0x60, SRC
jmp .Lad_u_loop
.Lad_out_0:
state_store0
FRAME_END
ret
.Lad_out_1:
state_store1
FRAME_END
ret
.Lad_out_2:
state_store2
FRAME_END
ret
.Lad_out_3:
state_store3
FRAME_END
ret
.Lad_out_4:
state_store4
FRAME_END
ret
.Lad_out_5:
state_store5
FRAME_END
ret
.Lad_out:
FRAME_END
ret
ENDPROC(crypto_aegis256_aesni_ad)
.macro crypt m s0 s1 s2 s3 s4 s5
pxor \s1, \m
pxor \s4, \m
pxor \s5, \m
movdqa \s2, T3
pand \s3, T3
pxor T3, \m
.endm
.macro crypt0 m
crypt \m STATE0 STATE1 STATE2 STATE3 STATE4 STATE5
.endm
.macro crypt1 m
crypt \m STATE5 STATE0 STATE1 STATE2 STATE3 STATE4
.endm
.macro crypt2 m
crypt \m STATE4 STATE5 STATE0 STATE1 STATE2 STATE3
.endm
.macro crypt3 m
crypt \m STATE3 STATE4 STATE5 STATE0 STATE1 STATE2
.endm
.macro crypt4 m
crypt \m STATE2 STATE3 STATE4 STATE5 STATE0 STATE1
.endm
.macro crypt5 m
crypt \m STATE1 STATE2 STATE3 STATE4 STATE5 STATE0
.endm
.macro encrypt_block a i
movdq\a (\i * 0x10)(SRC), MSG
movdqa MSG, T0
crypt\i T0
movdq\a T0, (\i * 0x10)(DST)
update\i MSG
sub $0x10, LEN
cmp $0x10, LEN
jl .Lenc_out_\i
.endm
.macro decrypt_block a i
movdq\a (\i * 0x10)(SRC), MSG
crypt\i MSG
movdq\a MSG, (\i * 0x10)(DST)
update\i MSG
sub $0x10, LEN
cmp $0x10, LEN
jl .Ldec_out_\i
.endm
/*
* void crypto_aegis256_aesni_enc(void *state, unsigned int length,
* const void *src, void *dst);
*/
ENTRY(crypto_aegis256_aesni_enc)
FRAME_BEGIN
cmp $0x10, LEN
jb .Lenc_out
state_load
mov SRC, %r8
or DST, %r8
and $0xf, %r8
jnz .Lenc_u_loop
.align 8
.Lenc_a_loop:
encrypt_block a 0
encrypt_block a 1
encrypt_block a 2
encrypt_block a 3
encrypt_block a 4
encrypt_block a 5
add $0x60, SRC
add $0x60, DST
jmp .Lenc_a_loop
.align 8
.Lenc_u_loop:
encrypt_block u 0
encrypt_block u 1
encrypt_block u 2
encrypt_block u 3
encrypt_block u 4
encrypt_block u 5
add $0x60, SRC
add $0x60, DST
jmp .Lenc_u_loop
.Lenc_out_0:
state_store0
FRAME_END
ret
.Lenc_out_1:
state_store1
FRAME_END
ret
.Lenc_out_2:
state_store2
FRAME_END
ret
.Lenc_out_3:
state_store3
FRAME_END
ret
.Lenc_out_4:
state_store4
FRAME_END
ret
.Lenc_out_5:
state_store5
FRAME_END
ret
.Lenc_out:
FRAME_END
ret
ENDPROC(crypto_aegis256_aesni_enc)
/*
* void crypto_aegis256_aesni_enc_tail(void *state, unsigned int length,
* const void *src, void *dst);
*/
ENTRY(crypto_aegis256_aesni_enc_tail)
FRAME_BEGIN
state_load
/* encrypt message: */
call __load_partial
movdqa MSG, T0
crypt0 T0
call __store_partial
update0 MSG
state_store0
FRAME_END
ret
ENDPROC(crypto_aegis256_aesni_enc_tail)
/*
* void crypto_aegis256_aesni_dec(void *state, unsigned int length,
* const void *src, void *dst);
*/
ENTRY(crypto_aegis256_aesni_dec)
FRAME_BEGIN
cmp $0x10, LEN
jb .Ldec_out
state_load
mov SRC, %r8
or DST, %r8
and $0xF, %r8
jnz .Ldec_u_loop
.align 8
.Ldec_a_loop:
decrypt_block a 0
decrypt_block a 1
decrypt_block a 2
decrypt_block a 3
decrypt_block a 4
decrypt_block a 5
add $0x60, SRC
add $0x60, DST
jmp .Ldec_a_loop
.align 8
.Ldec_u_loop:
decrypt_block u 0
decrypt_block u 1
decrypt_block u 2
decrypt_block u 3
decrypt_block u 4
decrypt_block u 5
add $0x60, SRC
add $0x60, DST
jmp .Ldec_u_loop
.Ldec_out_0:
state_store0
FRAME_END
ret
.Ldec_out_1:
state_store1
FRAME_END
ret
.Ldec_out_2:
state_store2
FRAME_END
ret
.Ldec_out_3:
state_store3
FRAME_END
ret
.Ldec_out_4:
state_store4
FRAME_END
ret
.Ldec_out_5:
state_store5
FRAME_END
ret
.Ldec_out:
FRAME_END
ret
ENDPROC(crypto_aegis256_aesni_dec)
/*
* void crypto_aegis256_aesni_dec_tail(void *state, unsigned int length,
* const void *src, void *dst);
*/
ENTRY(crypto_aegis256_aesni_dec_tail)
FRAME_BEGIN
state_load
/* decrypt message: */
call __load_partial
crypt0 MSG
movdqa MSG, T0
call __store_partial
/* mask with byte count: */
movq LEN, T0
punpcklbw T0, T0
punpcklbw T0, T0
punpcklbw T0, T0
punpcklbw T0, T0
movdqa .Laegis256_counter, T1
pcmpgtb T1, T0
pand T0, MSG
update0 MSG
state_store0
FRAME_END
ret
ENDPROC(crypto_aegis256_aesni_dec_tail)
/*
* void crypto_aegis256_aesni_final(void *state, void *tag_xor,
* u64 assoclen, u64 cryptlen);
*/
ENTRY(crypto_aegis256_aesni_final)
FRAME_BEGIN
state_load
/* prepare length block: */
movq %rdx, MSG
movq %rcx, T0
pslldq $8, T0
pxor T0, MSG
psllq $3, MSG /* multiply by 8 (to get bit count) */
pxor STATE3, MSG
/* update state: */
update0 MSG
update1 MSG
update2 MSG
update3 MSG
update4 MSG
update5 MSG
update0 MSG
/* xor tag: */
movdqu (%rsi), MSG
pxor STATE0, MSG
pxor STATE1, MSG
pxor STATE2, MSG
pxor STATE3, MSG
pxor STATE4, MSG
pxor STATE5, MSG
movdqu MSG, (%rsi)
FRAME_END
ret
ENDPROC(crypto_aegis256_aesni_final)