mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2025-01-19 20:05:08 +00:00
Merge git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6
Pull crypto update from Herbert Xu: - XTS mode optimisation for twofish/cast6/camellia/aes on x86 - AVX2/x86_64 implementation for blowfish/twofish/serpent/camellia - SSSE3/AVX/AVX2 optimisations for sha256/sha512 - Added driver for SAHARA2 crypto accelerator - Fix for GMAC when used in non-IPsec secnarios - Added generic CMAC implementation (including IPsec glue) - IP update for crypto/atmel - Support for more than one device in hwrng/timeriomem - Added Broadcom BCM2835 RNG driver - Misc fixes * git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6: (59 commits) crypto: caam - fix job ring cleanup code crypto: camellia - add AVX2/AES-NI/x86_64 assembler implementation of camellia cipher crypto: serpent - add AVX2/x86_64 assembler implementation of serpent cipher crypto: twofish - add AVX2/x86_64 assembler implementation of twofish cipher crypto: blowfish - add AVX2/x86_64 implementation of blowfish cipher crypto: tcrypt - add async cipher speed tests for blowfish crypto: testmgr - extend camellia test-vectors for camellia-aesni/avx2 crypto: aesni_intel - fix Kconfig problem with CRYPTO_GLUE_HELPER_X86 crypto: aesni_intel - add more optimized XTS mode for x86-64 crypto: x86/camellia-aesni-avx - add more optimized XTS code crypto: cast6-avx: use new optimized XTS code crypto: x86/twofish-avx - use optimized XTS code crypto: x86 - add more optimized XTS-mode for serpent-avx xfrm: add rfc4494 AES-CMAC-96 support crypto: add CMAC support to CryptoAPI crypto: testmgr - add empty test vectors for null ciphers crypto: testmgr - add AES GMAC test vectors crypto: gcm - fix rfc4543 to handle async crypto correctly crypto: gcm - make GMAC work when dst and src are different hwrng: timeriomem - added devicetree hooks ...
This commit is contained in:
commit
797994f81a
15
Documentation/devicetree/bindings/crypto/fsl-imx-sahara.txt
Normal file
15
Documentation/devicetree/bindings/crypto/fsl-imx-sahara.txt
Normal file
@ -0,0 +1,15 @@
|
||||
Freescale SAHARA Cryptographic Accelerator included in some i.MX chips.
|
||||
Currently only i.MX27 is supported.
|
||||
|
||||
Required properties:
|
||||
- compatible : Should be "fsl,<soc>-sahara"
|
||||
- reg : Should contain SAHARA registers location and length
|
||||
- interrupts : Should contain SAHARA interrupt number
|
||||
|
||||
Example:
|
||||
|
||||
sah@10025000 {
|
||||
compatible = "fsl,imx27-sahara";
|
||||
reg = < 0x10025000 0x800>;
|
||||
interrupts = <75>;
|
||||
};
|
18
Documentation/devicetree/bindings/hwrng/timeriomem_rng.txt
Normal file
18
Documentation/devicetree/bindings/hwrng/timeriomem_rng.txt
Normal file
@ -0,0 +1,18 @@
|
||||
HWRNG support for the timeriomem_rng driver
|
||||
|
||||
Required properties:
|
||||
- compatible : "timeriomem_rng"
|
||||
- reg : base address to sample from
|
||||
- period : wait time in microseconds to use between samples
|
||||
|
||||
N.B. currently 'reg' must be four bytes wide and aligned
|
||||
|
||||
Example:
|
||||
|
||||
hwrng@44 {
|
||||
#address-cells = <1>;
|
||||
#size-cells = <1>;
|
||||
compatible = "timeriomem_rng";
|
||||
reg = <0x44 0x04>;
|
||||
period = <1000000>;
|
||||
};
|
13
Documentation/devicetree/bindings/rng/brcm,bcm2835.txt
Normal file
13
Documentation/devicetree/bindings/rng/brcm,bcm2835.txt
Normal file
@ -0,0 +1,13 @@
|
||||
BCM2835 Random number generator
|
||||
|
||||
Required properties:
|
||||
|
||||
- compatible : should be "brcm,bcm2835-rng"
|
||||
- reg : Specifies base physical address and size of the registers.
|
||||
|
||||
Example:
|
||||
|
||||
rng {
|
||||
compatible = "brcm,bcm2835-rng";
|
||||
reg = <0x7e104000 0x10>;
|
||||
};
|
@ -63,7 +63,7 @@ Intel RNG Driver notes:
|
||||
|
||||
* FIXME: support poll(2)
|
||||
|
||||
NOTE: request_mem_region was removed, for two reasons:
|
||||
NOTE: request_mem_region was removed, for three reasons:
|
||||
1) Only one RNG is supported by this driver, 2) The location
|
||||
used by the RNG is a fixed location in MMIO-addressable memory,
|
||||
3) users with properly working BIOS e820 handling will always
|
||||
|
@ -18,7 +18,7 @@
|
||||
#include <linux/platform_device.h>
|
||||
#include <linux/i2c-gpio.h>
|
||||
#include <linux/atmel-mci.h>
|
||||
#include <linux/platform_data/atmel-aes.h>
|
||||
#include <linux/platform_data/crypto-atmel.h>
|
||||
|
||||
#include <linux/platform_data/at91_adc.h>
|
||||
|
||||
@ -1900,7 +1900,8 @@ static void __init at91_add_device_tdes(void) {}
|
||||
* -------------------------------------------------------------------- */
|
||||
|
||||
#if defined(CONFIG_CRYPTO_DEV_ATMEL_AES) || defined(CONFIG_CRYPTO_DEV_ATMEL_AES_MODULE)
|
||||
static struct aes_platform_data aes_data;
|
||||
static struct crypto_platform_data aes_data;
|
||||
static struct crypto_dma_data alt_atslave;
|
||||
static u64 aes_dmamask = DMA_BIT_MASK(32);
|
||||
|
||||
static struct resource aes_resources[] = {
|
||||
@ -1931,23 +1932,20 @@ static struct platform_device at91sam9g45_aes_device = {
|
||||
static void __init at91_add_device_aes(void)
|
||||
{
|
||||
struct at_dma_slave *atslave;
|
||||
struct aes_dma_data *alt_atslave;
|
||||
|
||||
alt_atslave = kzalloc(sizeof(struct aes_dma_data), GFP_KERNEL);
|
||||
|
||||
/* DMA TX slave channel configuration */
|
||||
atslave = &alt_atslave->txdata;
|
||||
atslave = &alt_atslave.txdata;
|
||||
atslave->dma_dev = &at_hdmac_device.dev;
|
||||
atslave->cfg = ATC_FIFOCFG_ENOUGHSPACE | ATC_SRC_H2SEL_HW |
|
||||
ATC_SRC_PER(AT_DMA_ID_AES_RX);
|
||||
|
||||
/* DMA RX slave channel configuration */
|
||||
atslave = &alt_atslave->rxdata;
|
||||
atslave = &alt_atslave.rxdata;
|
||||
atslave->dma_dev = &at_hdmac_device.dev;
|
||||
atslave->cfg = ATC_FIFOCFG_ENOUGHSPACE | ATC_DST_H2SEL_HW |
|
||||
ATC_DST_PER(AT_DMA_ID_AES_TX);
|
||||
|
||||
aes_data.dma_slave = alt_atslave;
|
||||
aes_data.dma_slave = &alt_atslave;
|
||||
platform_device_register(&at91sam9g45_aes_device);
|
||||
}
|
||||
#else
|
||||
|
@ -2,6 +2,10 @@
|
||||
# Arch-specific CryptoAPI modules.
|
||||
#
|
||||
|
||||
avx_supported := $(call as-instr,vpxor %xmm0$(comma)%xmm0$(comma)%xmm0,yes,no)
|
||||
avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\
|
||||
$(comma)4)$(comma)%ymm2,yes,no)
|
||||
|
||||
obj-$(CONFIG_CRYPTO_ABLK_HELPER_X86) += ablk_helper.o
|
||||
obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o
|
||||
|
||||
@ -12,22 +16,37 @@ obj-$(CONFIG_CRYPTO_SERPENT_SSE2_586) += serpent-sse2-i586.o
|
||||
|
||||
obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o
|
||||
obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o
|
||||
obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64) += camellia-aesni-avx-x86_64.o
|
||||
obj-$(CONFIG_CRYPTO_CAST5_AVX_X86_64) += cast5-avx-x86_64.o
|
||||
obj-$(CONFIG_CRYPTO_CAST6_AVX_X86_64) += cast6-avx-x86_64.o
|
||||
obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o
|
||||
obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
|
||||
obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o
|
||||
obj-$(CONFIG_CRYPTO_TWOFISH_AVX_X86_64) += twofish-avx-x86_64.o
|
||||
obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o
|
||||
obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o
|
||||
obj-$(CONFIG_CRYPTO_SERPENT_AVX_X86_64) += serpent-avx-x86_64.o
|
||||
obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
|
||||
obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
|
||||
|
||||
obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o
|
||||
obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o
|
||||
obj-$(CONFIG_CRYPTO_CRC32_PCLMUL) += crc32-pclmul.o
|
||||
obj-$(CONFIG_CRYPTO_SHA256_SSSE3) += sha256-ssse3.o
|
||||
obj-$(CONFIG_CRYPTO_SHA512_SSSE3) += sha512-ssse3.o
|
||||
|
||||
# These modules require assembler to support AVX.
|
||||
ifeq ($(avx_supported),yes)
|
||||
obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64) += \
|
||||
camellia-aesni-avx-x86_64.o
|
||||
obj-$(CONFIG_CRYPTO_CAST5_AVX_X86_64) += cast5-avx-x86_64.o
|
||||
obj-$(CONFIG_CRYPTO_CAST6_AVX_X86_64) += cast6-avx-x86_64.o
|
||||
obj-$(CONFIG_CRYPTO_TWOFISH_AVX_X86_64) += twofish-avx-x86_64.o
|
||||
obj-$(CONFIG_CRYPTO_SERPENT_AVX_X86_64) += serpent-avx-x86_64.o
|
||||
endif
|
||||
|
||||
# These modules require assembler to support AVX2.
|
||||
ifeq ($(avx2_supported),yes)
|
||||
obj-$(CONFIG_CRYPTO_BLOWFISH_AVX2_X86_64) += blowfish-avx2.o
|
||||
obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX2_X86_64) += camellia-aesni-avx2.o
|
||||
obj-$(CONFIG_CRYPTO_SERPENT_AVX2_X86_64) += serpent-avx2.o
|
||||
obj-$(CONFIG_CRYPTO_TWOFISH_AVX2_X86_64) += twofish-avx2.o
|
||||
endif
|
||||
|
||||
aes-i586-y := aes-i586-asm_32.o aes_glue.o
|
||||
twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o
|
||||
@ -36,21 +55,35 @@ serpent-sse2-i586-y := serpent-sse2-i586-asm_32.o serpent_sse2_glue.o
|
||||
|
||||
aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o
|
||||
camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o
|
||||
camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \
|
||||
camellia_aesni_avx_glue.o
|
||||
cast5-avx-x86_64-y := cast5-avx-x86_64-asm_64.o cast5_avx_glue.o
|
||||
cast6-avx-x86_64-y := cast6-avx-x86_64-asm_64.o cast6_avx_glue.o
|
||||
blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o
|
||||
twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
|
||||
twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o
|
||||
twofish-avx-x86_64-y := twofish-avx-x86_64-asm_64.o twofish_avx_glue.o
|
||||
salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o
|
||||
serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o
|
||||
serpent-avx-x86_64-y := serpent-avx-x86_64-asm_64.o serpent_avx_glue.o
|
||||
|
||||
ifeq ($(avx_supported),yes)
|
||||
camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \
|
||||
camellia_aesni_avx_glue.o
|
||||
cast5-avx-x86_64-y := cast5-avx-x86_64-asm_64.o cast5_avx_glue.o
|
||||
cast6-avx-x86_64-y := cast6-avx-x86_64-asm_64.o cast6_avx_glue.o
|
||||
twofish-avx-x86_64-y := twofish-avx-x86_64-asm_64.o \
|
||||
twofish_avx_glue.o
|
||||
serpent-avx-x86_64-y := serpent-avx-x86_64-asm_64.o \
|
||||
serpent_avx_glue.o
|
||||
endif
|
||||
|
||||
ifeq ($(avx2_supported),yes)
|
||||
blowfish-avx2-y := blowfish-avx2-asm_64.o blowfish_avx2_glue.o
|
||||
camellia-aesni-avx2-y := camellia-aesni-avx2-asm_64.o camellia_aesni_avx2_glue.o
|
||||
serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o
|
||||
twofish-avx2-y := twofish-avx2-asm_64.o twofish_avx2_glue.o
|
||||
endif
|
||||
|
||||
aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o
|
||||
ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
|
||||
sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o
|
||||
crc32c-intel-y := crc32c-intel_glue.o
|
||||
crc32c-intel-$(CONFIG_CRYPTO_CRC32C_X86_64) += crc32c-pcl-intel-asm_64.o
|
||||
crc32c-intel-$(CONFIG_64BIT) += crc32c-pcl-intel-asm_64.o
|
||||
crc32-pclmul-y := crc32-pclmul_asm.o crc32-pclmul_glue.o
|
||||
sha256-ssse3-y := sha256-ssse3-asm.o sha256-avx-asm.o sha256-avx2-asm.o sha256_ssse3_glue.o
|
||||
sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o
|
||||
|
@ -34,6 +34,10 @@
|
||||
|
||||
#ifdef __x86_64__
|
||||
.data
|
||||
.align 16
|
||||
.Lgf128mul_x_ble_mask:
|
||||
.octa 0x00000000000000010000000000000087
|
||||
|
||||
POLY: .octa 0xC2000000000000000000000000000001
|
||||
TWOONE: .octa 0x00000001000000000000000000000001
|
||||
|
||||
@ -105,6 +109,8 @@ enc: .octa 0x2
|
||||
#define CTR %xmm11
|
||||
#define INC %xmm12
|
||||
|
||||
#define GF128MUL_MASK %xmm10
|
||||
|
||||
#ifdef __x86_64__
|
||||
#define AREG %rax
|
||||
#define KEYP %rdi
|
||||
@ -2636,4 +2642,115 @@ ENTRY(aesni_ctr_enc)
|
||||
.Lctr_enc_just_ret:
|
||||
ret
|
||||
ENDPROC(aesni_ctr_enc)
|
||||
|
||||
/*
|
||||
* _aesni_gf128mul_x_ble: internal ABI
|
||||
* Multiply in GF(2^128) for XTS IVs
|
||||
* input:
|
||||
* IV: current IV
|
||||
* GF128MUL_MASK == mask with 0x87 and 0x01
|
||||
* output:
|
||||
* IV: next IV
|
||||
* changed:
|
||||
* CTR: == temporary value
|
||||
*/
|
||||
#define _aesni_gf128mul_x_ble() \
|
||||
pshufd $0x13, IV, CTR; \
|
||||
paddq IV, IV; \
|
||||
psrad $31, CTR; \
|
||||
pand GF128MUL_MASK, CTR; \
|
||||
pxor CTR, IV;
|
||||
|
||||
/*
|
||||
* void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
|
||||
* bool enc, u8 *iv)
|
||||
*/
|
||||
ENTRY(aesni_xts_crypt8)
|
||||
cmpb $0, %cl
|
||||
movl $0, %ecx
|
||||
movl $240, %r10d
|
||||
leaq _aesni_enc4, %r11
|
||||
leaq _aesni_dec4, %rax
|
||||
cmovel %r10d, %ecx
|
||||
cmoveq %rax, %r11
|
||||
|
||||
movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
|
||||
movups (IVP), IV
|
||||
|
||||
mov 480(KEYP), KLEN
|
||||
addq %rcx, KEYP
|
||||
|
||||
movdqa IV, STATE1
|
||||
pxor 0x00(INP), STATE1
|
||||
movdqu IV, 0x00(OUTP)
|
||||
|
||||
_aesni_gf128mul_x_ble()
|
||||
movdqa IV, STATE2
|
||||
pxor 0x10(INP), STATE2
|
||||
movdqu IV, 0x10(OUTP)
|
||||
|
||||
_aesni_gf128mul_x_ble()
|
||||
movdqa IV, STATE3
|
||||
pxor 0x20(INP), STATE3
|
||||
movdqu IV, 0x20(OUTP)
|
||||
|
||||
_aesni_gf128mul_x_ble()
|
||||
movdqa IV, STATE4
|
||||
pxor 0x30(INP), STATE4
|
||||
movdqu IV, 0x30(OUTP)
|
||||
|
||||
call *%r11
|
||||
|
||||
pxor 0x00(OUTP), STATE1
|
||||
movdqu STATE1, 0x00(OUTP)
|
||||
|
||||
_aesni_gf128mul_x_ble()
|
||||
movdqa IV, STATE1
|
||||
pxor 0x40(INP), STATE1
|
||||
movdqu IV, 0x40(OUTP)
|
||||
|
||||
pxor 0x10(OUTP), STATE2
|
||||
movdqu STATE2, 0x10(OUTP)
|
||||
|
||||
_aesni_gf128mul_x_ble()
|
||||
movdqa IV, STATE2
|
||||
pxor 0x50(INP), STATE2
|
||||
movdqu IV, 0x50(OUTP)
|
||||
|
||||
pxor 0x20(OUTP), STATE3
|
||||
movdqu STATE3, 0x20(OUTP)
|
||||
|
||||
_aesni_gf128mul_x_ble()
|
||||
movdqa IV, STATE3
|
||||
pxor 0x60(INP), STATE3
|
||||
movdqu IV, 0x60(OUTP)
|
||||
|
||||
pxor 0x30(OUTP), STATE4
|
||||
movdqu STATE4, 0x30(OUTP)
|
||||
|
||||
_aesni_gf128mul_x_ble()
|
||||
movdqa IV, STATE4
|
||||
pxor 0x70(INP), STATE4
|
||||
movdqu IV, 0x70(OUTP)
|
||||
|
||||
_aesni_gf128mul_x_ble()
|
||||
movups IV, (IVP)
|
||||
|
||||
call *%r11
|
||||
|
||||
pxor 0x40(OUTP), STATE1
|
||||
movdqu STATE1, 0x40(OUTP)
|
||||
|
||||
pxor 0x50(OUTP), STATE2
|
||||
movdqu STATE2, 0x50(OUTP)
|
||||
|
||||
pxor 0x60(OUTP), STATE3
|
||||
movdqu STATE3, 0x60(OUTP)
|
||||
|
||||
pxor 0x70(OUTP), STATE4
|
||||
movdqu STATE4, 0x70(OUTP)
|
||||
|
||||
ret
|
||||
ENDPROC(aesni_xts_crypt8)
|
||||
|
||||
#endif
|
||||
|
@ -39,6 +39,9 @@
|
||||
#include <crypto/internal/aead.h>
|
||||
#include <linux/workqueue.h>
|
||||
#include <linux/spinlock.h>
|
||||
#ifdef CONFIG_X86_64
|
||||
#include <asm/crypto/glue_helper.h>
|
||||
#endif
|
||||
|
||||
#if defined(CONFIG_CRYPTO_PCBC) || defined(CONFIG_CRYPTO_PCBC_MODULE)
|
||||
#define HAS_PCBC
|
||||
@ -102,6 +105,9 @@ void crypto_fpu_exit(void);
|
||||
asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out,
|
||||
const u8 *in, unsigned int len, u8 *iv);
|
||||
|
||||
asmlinkage void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *out,
|
||||
const u8 *in, bool enc, u8 *iv);
|
||||
|
||||
/* asmlinkage void aesni_gcm_enc()
|
||||
* void *ctx, AES Key schedule. Starts on a 16 byte boundary.
|
||||
* u8 *out, Ciphertext output. Encrypt in-place is allowed.
|
||||
@ -510,6 +516,78 @@ static void aesni_xts_tweak(void *ctx, u8 *out, const u8 *in)
|
||||
aesni_enc(ctx, out, in);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_X86_64
|
||||
|
||||
static void aesni_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv)
|
||||
{
|
||||
glue_xts_crypt_128bit_one(ctx, dst, src, iv, GLUE_FUNC_CAST(aesni_enc));
|
||||
}
|
||||
|
||||
static void aesni_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv)
|
||||
{
|
||||
glue_xts_crypt_128bit_one(ctx, dst, src, iv, GLUE_FUNC_CAST(aesni_dec));
|
||||
}
|
||||
|
||||
static void aesni_xts_enc8(void *ctx, u128 *dst, const u128 *src, le128 *iv)
|
||||
{
|
||||
aesni_xts_crypt8(ctx, (u8 *)dst, (const u8 *)src, true, (u8 *)iv);
|
||||
}
|
||||
|
||||
static void aesni_xts_dec8(void *ctx, u128 *dst, const u128 *src, le128 *iv)
|
||||
{
|
||||
aesni_xts_crypt8(ctx, (u8 *)dst, (const u8 *)src, false, (u8 *)iv);
|
||||
}
|
||||
|
||||
static const struct common_glue_ctx aesni_enc_xts = {
|
||||
.num_funcs = 2,
|
||||
.fpu_blocks_limit = 1,
|
||||
|
||||
.funcs = { {
|
||||
.num_blocks = 8,
|
||||
.fn_u = { .xts = GLUE_XTS_FUNC_CAST(aesni_xts_enc8) }
|
||||
}, {
|
||||
.num_blocks = 1,
|
||||
.fn_u = { .xts = GLUE_XTS_FUNC_CAST(aesni_xts_enc) }
|
||||
} }
|
||||
};
|
||||
|
||||
static const struct common_glue_ctx aesni_dec_xts = {
|
||||
.num_funcs = 2,
|
||||
.fpu_blocks_limit = 1,
|
||||
|
||||
.funcs = { {
|
||||
.num_blocks = 8,
|
||||
.fn_u = { .xts = GLUE_XTS_FUNC_CAST(aesni_xts_dec8) }
|
||||
}, {
|
||||
.num_blocks = 1,
|
||||
.fn_u = { .xts = GLUE_XTS_FUNC_CAST(aesni_xts_dec) }
|
||||
} }
|
||||
};
|
||||
|
||||
static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
|
||||
struct scatterlist *src, unsigned int nbytes)
|
||||
{
|
||||
struct aesni_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
|
||||
|
||||
return glue_xts_crypt_128bit(&aesni_enc_xts, desc, dst, src, nbytes,
|
||||
XTS_TWEAK_CAST(aesni_xts_tweak),
|
||||
aes_ctx(ctx->raw_tweak_ctx),
|
||||
aes_ctx(ctx->raw_crypt_ctx));
|
||||
}
|
||||
|
||||
static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
|
||||
struct scatterlist *src, unsigned int nbytes)
|
||||
{
|
||||
struct aesni_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
|
||||
|
||||
return glue_xts_crypt_128bit(&aesni_dec_xts, desc, dst, src, nbytes,
|
||||
XTS_TWEAK_CAST(aesni_xts_tweak),
|
||||
aes_ctx(ctx->raw_tweak_ctx),
|
||||
aes_ctx(ctx->raw_crypt_ctx));
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
|
||||
struct scatterlist *src, unsigned int nbytes)
|
||||
{
|
||||
@ -560,6 +638,8 @@ static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
|
||||
return ret;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_X86_64
|
||||
static int rfc4106_init(struct crypto_tfm *tfm)
|
||||
{
|
||||
|
449
arch/x86/crypto/blowfish-avx2-asm_64.S
Normal file
449
arch/x86/crypto/blowfish-avx2-asm_64.S
Normal file
@ -0,0 +1,449 @@
|
||||
/*
|
||||
* x86_64/AVX2 assembler optimized version of Blowfish
|
||||
*
|
||||
* Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
*/
|
||||
|
||||
#include <linux/linkage.h>
|
||||
|
||||
.file "blowfish-avx2-asm_64.S"
|
||||
|
||||
.data
|
||||
.align 32
|
||||
|
||||
.Lprefetch_mask:
|
||||
.long 0*64
|
||||
.long 1*64
|
||||
.long 2*64
|
||||
.long 3*64
|
||||
.long 4*64
|
||||
.long 5*64
|
||||
.long 6*64
|
||||
.long 7*64
|
||||
|
||||
.Lbswap32_mask:
|
||||
.long 0x00010203
|
||||
.long 0x04050607
|
||||
.long 0x08090a0b
|
||||
.long 0x0c0d0e0f
|
||||
|
||||
.Lbswap128_mask:
|
||||
.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
|
||||
.Lbswap_iv_mask:
|
||||
.byte 7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0
|
||||
|
||||
.text
|
||||
/* structure of crypto context */
|
||||
#define p 0
|
||||
#define s0 ((16 + 2) * 4)
|
||||
#define s1 ((16 + 2 + (1 * 256)) * 4)
|
||||
#define s2 ((16 + 2 + (2 * 256)) * 4)
|
||||
#define s3 ((16 + 2 + (3 * 256)) * 4)
|
||||
|
||||
/* register macros */
|
||||
#define CTX %rdi
|
||||
#define RIO %rdx
|
||||
|
||||
#define RS0 %rax
|
||||
#define RS1 %r8
|
||||
#define RS2 %r9
|
||||
#define RS3 %r10
|
||||
|
||||
#define RLOOP %r11
|
||||
#define RLOOPd %r11d
|
||||
|
||||
#define RXr0 %ymm8
|
||||
#define RXr1 %ymm9
|
||||
#define RXr2 %ymm10
|
||||
#define RXr3 %ymm11
|
||||
#define RXl0 %ymm12
|
||||
#define RXl1 %ymm13
|
||||
#define RXl2 %ymm14
|
||||
#define RXl3 %ymm15
|
||||
|
||||
/* temp regs */
|
||||
#define RT0 %ymm0
|
||||
#define RT0x %xmm0
|
||||
#define RT1 %ymm1
|
||||
#define RT1x %xmm1
|
||||
#define RIDX0 %ymm2
|
||||
#define RIDX1 %ymm3
|
||||
#define RIDX1x %xmm3
|
||||
#define RIDX2 %ymm4
|
||||
#define RIDX3 %ymm5
|
||||
|
||||
/* vpgatherdd mask and '-1' */
|
||||
#define RNOT %ymm6
|
||||
|
||||
/* byte mask, (-1 >> 24) */
|
||||
#define RBYTE %ymm7
|
||||
|
||||
/***********************************************************************
|
||||
* 32-way AVX2 blowfish
|
||||
***********************************************************************/
|
||||
#define F(xl, xr) \
|
||||
vpsrld $24, xl, RIDX0; \
|
||||
vpsrld $16, xl, RIDX1; \
|
||||
vpsrld $8, xl, RIDX2; \
|
||||
vpand RBYTE, RIDX1, RIDX1; \
|
||||
vpand RBYTE, RIDX2, RIDX2; \
|
||||
vpand RBYTE, xl, RIDX3; \
|
||||
\
|
||||
vpgatherdd RNOT, (RS0, RIDX0, 4), RT0; \
|
||||
vpcmpeqd RNOT, RNOT, RNOT; \
|
||||
vpcmpeqd RIDX0, RIDX0, RIDX0; \
|
||||
\
|
||||
vpgatherdd RNOT, (RS1, RIDX1, 4), RT1; \
|
||||
vpcmpeqd RIDX1, RIDX1, RIDX1; \
|
||||
vpaddd RT0, RT1, RT0; \
|
||||
\
|
||||
vpgatherdd RIDX0, (RS2, RIDX2, 4), RT1; \
|
||||
vpxor RT0, RT1, RT0; \
|
||||
\
|
||||
vpgatherdd RIDX1, (RS3, RIDX3, 4), RT1; \
|
||||
vpcmpeqd RNOT, RNOT, RNOT; \
|
||||
vpaddd RT0, RT1, RT0; \
|
||||
\
|
||||
vpxor RT0, xr, xr;
|
||||
|
||||
#define add_roundkey(xl, nmem) \
|
||||
vpbroadcastd nmem, RT0; \
|
||||
vpxor RT0, xl ## 0, xl ## 0; \
|
||||
vpxor RT0, xl ## 1, xl ## 1; \
|
||||
vpxor RT0, xl ## 2, xl ## 2; \
|
||||
vpxor RT0, xl ## 3, xl ## 3;
|
||||
|
||||
#define round_enc() \
|
||||
add_roundkey(RXr, p(CTX,RLOOP,4)); \
|
||||
F(RXl0, RXr0); \
|
||||
F(RXl1, RXr1); \
|
||||
F(RXl2, RXr2); \
|
||||
F(RXl3, RXr3); \
|
||||
\
|
||||
add_roundkey(RXl, p+4(CTX,RLOOP,4)); \
|
||||
F(RXr0, RXl0); \
|
||||
F(RXr1, RXl1); \
|
||||
F(RXr2, RXl2); \
|
||||
F(RXr3, RXl3);
|
||||
|
||||
#define round_dec() \
|
||||
add_roundkey(RXr, p+4*2(CTX,RLOOP,4)); \
|
||||
F(RXl0, RXr0); \
|
||||
F(RXl1, RXr1); \
|
||||
F(RXl2, RXr2); \
|
||||
F(RXl3, RXr3); \
|
||||
\
|
||||
add_roundkey(RXl, p+4(CTX,RLOOP,4)); \
|
||||
F(RXr0, RXl0); \
|
||||
F(RXr1, RXl1); \
|
||||
F(RXr2, RXl2); \
|
||||
F(RXr3, RXl3);
|
||||
|
||||
#define init_round_constants() \
|
||||
vpcmpeqd RNOT, RNOT, RNOT; \
|
||||
leaq s0(CTX), RS0; \
|
||||
leaq s1(CTX), RS1; \
|
||||
leaq s2(CTX), RS2; \
|
||||
leaq s3(CTX), RS3; \
|
||||
vpsrld $24, RNOT, RBYTE;
|
||||
|
||||
#define transpose_2x2(x0, x1, t0) \
|
||||
vpunpckldq x0, x1, t0; \
|
||||
vpunpckhdq x0, x1, x1; \
|
||||
\
|
||||
vpunpcklqdq t0, x1, x0; \
|
||||
vpunpckhqdq t0, x1, x1;
|
||||
|
||||
#define read_block(xl, xr) \
|
||||
vbroadcasti128 .Lbswap32_mask, RT1; \
|
||||
\
|
||||
vpshufb RT1, xl ## 0, xl ## 0; \
|
||||
vpshufb RT1, xr ## 0, xr ## 0; \
|
||||
vpshufb RT1, xl ## 1, xl ## 1; \
|
||||
vpshufb RT1, xr ## 1, xr ## 1; \
|
||||
vpshufb RT1, xl ## 2, xl ## 2; \
|
||||
vpshufb RT1, xr ## 2, xr ## 2; \
|
||||
vpshufb RT1, xl ## 3, xl ## 3; \
|
||||
vpshufb RT1, xr ## 3, xr ## 3; \
|
||||
\
|
||||
transpose_2x2(xl ## 0, xr ## 0, RT0); \
|
||||
transpose_2x2(xl ## 1, xr ## 1, RT0); \
|
||||
transpose_2x2(xl ## 2, xr ## 2, RT0); \
|
||||
transpose_2x2(xl ## 3, xr ## 3, RT0);
|
||||
|
||||
#define write_block(xl, xr) \
|
||||
vbroadcasti128 .Lbswap32_mask, RT1; \
|
||||
\
|
||||
transpose_2x2(xl ## 0, xr ## 0, RT0); \
|
||||
transpose_2x2(xl ## 1, xr ## 1, RT0); \
|
||||
transpose_2x2(xl ## 2, xr ## 2, RT0); \
|
||||
transpose_2x2(xl ## 3, xr ## 3, RT0); \
|
||||
\
|
||||
vpshufb RT1, xl ## 0, xl ## 0; \
|
||||
vpshufb RT1, xr ## 0, xr ## 0; \
|
||||
vpshufb RT1, xl ## 1, xl ## 1; \
|
||||
vpshufb RT1, xr ## 1, xr ## 1; \
|
||||
vpshufb RT1, xl ## 2, xl ## 2; \
|
||||
vpshufb RT1, xr ## 2, xr ## 2; \
|
||||
vpshufb RT1, xl ## 3, xl ## 3; \
|
||||
vpshufb RT1, xr ## 3, xr ## 3;
|
||||
|
||||
.align 8
|
||||
__blowfish_enc_blk32:
|
||||
/* input:
|
||||
* %rdi: ctx, CTX
|
||||
* RXl0..4, RXr0..4: plaintext
|
||||
* output:
|
||||
* RXl0..4, RXr0..4: ciphertext (RXl <=> RXr swapped)
|
||||
*/
|
||||
init_round_constants();
|
||||
|
||||
read_block(RXl, RXr);
|
||||
|
||||
movl $1, RLOOPd;
|
||||
add_roundkey(RXl, p+4*(0)(CTX));
|
||||
|
||||
.align 4
|
||||
.L__enc_loop:
|
||||
round_enc();
|
||||
|
||||
leal 2(RLOOPd), RLOOPd;
|
||||
cmpl $17, RLOOPd;
|
||||
jne .L__enc_loop;
|
||||
|
||||
add_roundkey(RXr, p+4*(17)(CTX));
|
||||
|
||||
write_block(RXl, RXr);
|
||||
|
||||
ret;
|
||||
ENDPROC(__blowfish_enc_blk32)
|
||||
|
||||
.align 8
|
||||
__blowfish_dec_blk32:
|
||||
/* input:
|
||||
* %rdi: ctx, CTX
|
||||
* RXl0..4, RXr0..4: ciphertext
|
||||
* output:
|
||||
* RXl0..4, RXr0..4: plaintext (RXl <=> RXr swapped)
|
||||
*/
|
||||
init_round_constants();
|
||||
|
||||
read_block(RXl, RXr);
|
||||
|
||||
movl $14, RLOOPd;
|
||||
add_roundkey(RXl, p+4*(17)(CTX));
|
||||
|
||||
.align 4
|
||||
.L__dec_loop:
|
||||
round_dec();
|
||||
|
||||
addl $-2, RLOOPd;
|
||||
jns .L__dec_loop;
|
||||
|
||||
add_roundkey(RXr, p+4*(0)(CTX));
|
||||
|
||||
write_block(RXl, RXr);
|
||||
|
||||
ret;
|
||||
ENDPROC(__blowfish_dec_blk32)
|
||||
|
||||
ENTRY(blowfish_ecb_enc_32way)
|
||||
/* input:
|
||||
* %rdi: ctx, CTX
|
||||
* %rsi: dst
|
||||
* %rdx: src
|
||||
*/
|
||||
|
||||
vzeroupper;
|
||||
|
||||
vmovdqu 0*32(%rdx), RXl0;
|
||||
vmovdqu 1*32(%rdx), RXr0;
|
||||
vmovdqu 2*32(%rdx), RXl1;
|
||||
vmovdqu 3*32(%rdx), RXr1;
|
||||
vmovdqu 4*32(%rdx), RXl2;
|
||||
vmovdqu 5*32(%rdx), RXr2;
|
||||
vmovdqu 6*32(%rdx), RXl3;
|
||||
vmovdqu 7*32(%rdx), RXr3;
|
||||
|
||||
call __blowfish_enc_blk32;
|
||||
|
||||
vmovdqu RXr0, 0*32(%rsi);
|
||||
vmovdqu RXl0, 1*32(%rsi);
|
||||
vmovdqu RXr1, 2*32(%rsi);
|
||||
vmovdqu RXl1, 3*32(%rsi);
|
||||
vmovdqu RXr2, 4*32(%rsi);
|
||||
vmovdqu RXl2, 5*32(%rsi);
|
||||
vmovdqu RXr3, 6*32(%rsi);
|
||||
vmovdqu RXl3, 7*32(%rsi);
|
||||
|
||||
vzeroupper;
|
||||
|
||||
ret;
|
||||
ENDPROC(blowfish_ecb_enc_32way)
|
||||
|
||||
ENTRY(blowfish_ecb_dec_32way)
|
||||
/* input:
|
||||
* %rdi: ctx, CTX
|
||||
* %rsi: dst
|
||||
* %rdx: src
|
||||
*/
|
||||
|
||||
vzeroupper;
|
||||
|
||||
vmovdqu 0*32(%rdx), RXl0;
|
||||
vmovdqu 1*32(%rdx), RXr0;
|
||||
vmovdqu 2*32(%rdx), RXl1;
|
||||
vmovdqu 3*32(%rdx), RXr1;
|
||||
vmovdqu 4*32(%rdx), RXl2;
|
||||
vmovdqu 5*32(%rdx), RXr2;
|
||||
vmovdqu 6*32(%rdx), RXl3;
|
||||
vmovdqu 7*32(%rdx), RXr3;
|
||||
|
||||
call __blowfish_dec_blk32;
|
||||
|
||||
vmovdqu RXr0, 0*32(%rsi);
|
||||
vmovdqu RXl0, 1*32(%rsi);
|
||||
vmovdqu RXr1, 2*32(%rsi);
|
||||
vmovdqu RXl1, 3*32(%rsi);
|
||||
vmovdqu RXr2, 4*32(%rsi);
|
||||
vmovdqu RXl2, 5*32(%rsi);
|
||||
vmovdqu RXr3, 6*32(%rsi);
|
||||
vmovdqu RXl3, 7*32(%rsi);
|
||||
|
||||
vzeroupper;
|
||||
|
||||
ret;
|
||||
ENDPROC(blowfish_ecb_dec_32way)
|
||||
|
||||
ENTRY(blowfish_cbc_dec_32way)
|
||||
/* input:
|
||||
* %rdi: ctx, CTX
|
||||
* %rsi: dst
|
||||
* %rdx: src
|
||||
*/
|
||||
|
||||
vzeroupper;
|
||||
|
||||
vmovdqu 0*32(%rdx), RXl0;
|
||||
vmovdqu 1*32(%rdx), RXr0;
|
||||
vmovdqu 2*32(%rdx), RXl1;
|
||||
vmovdqu 3*32(%rdx), RXr1;
|
||||
vmovdqu 4*32(%rdx), RXl2;
|
||||
vmovdqu 5*32(%rdx), RXr2;
|
||||
vmovdqu 6*32(%rdx), RXl3;
|
||||
vmovdqu 7*32(%rdx), RXr3;
|
||||
|
||||
call __blowfish_dec_blk32;
|
||||
|
||||
/* xor with src */
|
||||
vmovq (%rdx), RT0x;
|
||||
vpshufd $0x4f, RT0x, RT0x;
|
||||
vinserti128 $1, 8(%rdx), RT0, RT0;
|
||||
vpxor RT0, RXr0, RXr0;
|
||||
vpxor 0*32+24(%rdx), RXl0, RXl0;
|
||||
vpxor 1*32+24(%rdx), RXr1, RXr1;
|
||||
vpxor 2*32+24(%rdx), RXl1, RXl1;
|
||||
vpxor 3*32+24(%rdx), RXr2, RXr2;
|
||||
vpxor 4*32+24(%rdx), RXl2, RXl2;
|
||||
vpxor 5*32+24(%rdx), RXr3, RXr3;
|
||||
vpxor 6*32+24(%rdx), RXl3, RXl3;
|
||||
|
||||
vmovdqu RXr0, (0*32)(%rsi);
|
||||
vmovdqu RXl0, (1*32)(%rsi);
|
||||
vmovdqu RXr1, (2*32)(%rsi);
|
||||
vmovdqu RXl1, (3*32)(%rsi);
|
||||
vmovdqu RXr2, (4*32)(%rsi);
|
||||
vmovdqu RXl2, (5*32)(%rsi);
|
||||
vmovdqu RXr3, (6*32)(%rsi);
|
||||
vmovdqu RXl3, (7*32)(%rsi);
|
||||
|
||||
vzeroupper;
|
||||
|
||||
ret;
|
||||
ENDPROC(blowfish_cbc_dec_32way)
|
||||
|
||||
ENTRY(blowfish_ctr_32way)
|
||||
/* input:
|
||||
* %rdi: ctx, CTX
|
||||
* %rsi: dst
|
||||
* %rdx: src
|
||||
* %rcx: iv (big endian, 64bit)
|
||||
*/
|
||||
|
||||
vzeroupper;
|
||||
|
||||
vpcmpeqd RT0, RT0, RT0;
|
||||
vpsrldq $8, RT0, RT0; /* a: -1, b: 0, c: -1, d: 0 */
|
||||
|
||||
vpcmpeqd RT1x, RT1x, RT1x;
|
||||
vpaddq RT1x, RT1x, RT1x; /* a: -2, b: -2 */
|
||||
vpxor RIDX0, RIDX0, RIDX0;
|
||||
vinserti128 $1, RT1x, RIDX0, RIDX0; /* a: 0, b: 0, c: -2, d: -2 */
|
||||
|
||||
vpaddq RIDX0, RT0, RT0; /* a: -1, b: 0, c: -3, d: -2 */
|
||||
|
||||
vpcmpeqd RT1, RT1, RT1;
|
||||
vpaddq RT1, RT1, RT1; /* a: -2, b: -2, c: -2, d: -2 */
|
||||
vpaddq RT1, RT1, RIDX2; /* a: -4, b: -4, c: -4, d: -4 */
|
||||
|
||||
vbroadcasti128 .Lbswap_iv_mask, RIDX0;
|
||||
vbroadcasti128 .Lbswap128_mask, RIDX1;
|
||||
|
||||
/* load IV and byteswap */
|
||||
vmovq (%rcx), RT1x;
|
||||
vinserti128 $1, RT1x, RT1, RT1; /* a: BE, b: 0, c: BE, d: 0 */
|
||||
vpshufb RIDX0, RT1, RT1; /* a: LE, b: LE, c: LE, d: LE */
|
||||
|
||||
/* construct IVs */
|
||||
vpsubq RT0, RT1, RT1; /* a: le1, b: le0, c: le3, d: le2 */
|
||||
vpshufb RIDX1, RT1, RXl0; /* a: be0, b: be1, c: be2, d: be3 */
|
||||
vpsubq RIDX2, RT1, RT1; /* le5, le4, le7, le6 */
|
||||
vpshufb RIDX1, RT1, RXr0; /* be4, be5, be6, be7 */
|
||||
vpsubq RIDX2, RT1, RT1;
|
||||
vpshufb RIDX1, RT1, RXl1;
|
||||
vpsubq RIDX2, RT1, RT1;
|
||||
vpshufb RIDX1, RT1, RXr1;
|
||||
vpsubq RIDX2, RT1, RT1;
|
||||
vpshufb RIDX1, RT1, RXl2;
|
||||
vpsubq RIDX2, RT1, RT1;
|
||||
vpshufb RIDX1, RT1, RXr2;
|
||||
vpsubq RIDX2, RT1, RT1;
|
||||
vpshufb RIDX1, RT1, RXl3;
|
||||
vpsubq RIDX2, RT1, RT1;
|
||||
vpshufb RIDX1, RT1, RXr3;
|
||||
|
||||
/* store last IV */
|
||||
vpsubq RIDX2, RT1, RT1; /* a: le33, b: le32, ... */
|
||||
vpshufb RIDX1x, RT1x, RT1x; /* a: be32, ... */
|
||||
vmovq RT1x, (%rcx);
|
||||
|
||||
call __blowfish_enc_blk32;
|
||||
|
||||
/* dst = src ^ iv */
|
||||
vpxor 0*32(%rdx), RXr0, RXr0;
|
||||
vpxor 1*32(%rdx), RXl0, RXl0;
|
||||
vpxor 2*32(%rdx), RXr1, RXr1;
|
||||
vpxor 3*32(%rdx), RXl1, RXl1;
|
||||
vpxor 4*32(%rdx), RXr2, RXr2;
|
||||
vpxor 5*32(%rdx), RXl2, RXl2;
|
||||
vpxor 6*32(%rdx), RXr3, RXr3;
|
||||
vpxor 7*32(%rdx), RXl3, RXl3;
|
||||
vmovdqu RXr0, (0*32)(%rsi);
|
||||
vmovdqu RXl0, (1*32)(%rsi);
|
||||
vmovdqu RXr1, (2*32)(%rsi);
|
||||
vmovdqu RXl1, (3*32)(%rsi);
|
||||
vmovdqu RXr2, (4*32)(%rsi);
|
||||
vmovdqu RXl2, (5*32)(%rsi);
|
||||
vmovdqu RXr3, (6*32)(%rsi);
|
||||
vmovdqu RXl3, (7*32)(%rsi);
|
||||
|
||||
vzeroupper;
|
||||
|
||||
ret;
|
||||
ENDPROC(blowfish_ctr_32way)
|
585
arch/x86/crypto/blowfish_avx2_glue.c
Normal file
585
arch/x86/crypto/blowfish_avx2_glue.c
Normal file
@ -0,0 +1,585 @@
|
||||
/*
|
||||
* Glue Code for x86_64/AVX2 assembler optimized version of Blowfish
|
||||
*
|
||||
* Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
|
||||
*
|
||||
* CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by:
|
||||
* Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
|
||||
* CTR part based on code (crypto/ctr.c) by:
|
||||
* (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/crypto.h>
|
||||
#include <linux/err.h>
|
||||
#include <crypto/algapi.h>
|
||||
#include <crypto/blowfish.h>
|
||||
#include <crypto/cryptd.h>
|
||||
#include <crypto/ctr.h>
|
||||
#include <asm/i387.h>
|
||||
#include <asm/xcr.h>
|
||||
#include <asm/xsave.h>
|
||||
#include <asm/crypto/blowfish.h>
|
||||
#include <asm/crypto/ablk_helper.h>
|
||||
#include <crypto/scatterwalk.h>
|
||||
|
||||
#define BF_AVX2_PARALLEL_BLOCKS 32
|
||||
|
||||
/* 32-way AVX2 parallel cipher functions */
|
||||
asmlinkage void blowfish_ecb_enc_32way(struct bf_ctx *ctx, u8 *dst,
|
||||
const u8 *src);
|
||||
asmlinkage void blowfish_ecb_dec_32way(struct bf_ctx *ctx, u8 *dst,
|
||||
const u8 *src);
|
||||
asmlinkage void blowfish_cbc_dec_32way(struct bf_ctx *ctx, u8 *dst,
|
||||
const u8 *src);
|
||||
asmlinkage void blowfish_ctr_32way(struct bf_ctx *ctx, u8 *dst, const u8 *src,
|
||||
__be64 *iv);
|
||||
|
||||
static inline bool bf_fpu_begin(bool fpu_enabled, unsigned int nbytes)
|
||||
{
|
||||
if (fpu_enabled)
|
||||
return true;
|
||||
|
||||
/* FPU is only used when chunk to be processed is large enough, so
|
||||
* do not enable FPU until it is necessary.
|
||||
*/
|
||||
if (nbytes < BF_BLOCK_SIZE * BF_AVX2_PARALLEL_BLOCKS)
|
||||
return false;
|
||||
|
||||
kernel_fpu_begin();
|
||||
return true;
|
||||
}
|
||||
|
||||
static inline void bf_fpu_end(bool fpu_enabled)
|
||||
{
|
||||
if (fpu_enabled)
|
||||
kernel_fpu_end();
|
||||
}
|
||||
|
||||
static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
|
||||
bool enc)
|
||||
{
|
||||
bool fpu_enabled = false;
|
||||
struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
|
||||
const unsigned int bsize = BF_BLOCK_SIZE;
|
||||
unsigned int nbytes;
|
||||
int err;
|
||||
|
||||
err = blkcipher_walk_virt(desc, walk);
|
||||
desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
|
||||
|
||||
while ((nbytes = walk->nbytes)) {
|
||||
u8 *wsrc = walk->src.virt.addr;
|
||||
u8 *wdst = walk->dst.virt.addr;
|
||||
|
||||
fpu_enabled = bf_fpu_begin(fpu_enabled, nbytes);
|
||||
|
||||
/* Process multi-block AVX2 batch */
|
||||
if (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS) {
|
||||
do {
|
||||
if (enc)
|
||||
blowfish_ecb_enc_32way(ctx, wdst, wsrc);
|
||||
else
|
||||
blowfish_ecb_dec_32way(ctx, wdst, wsrc);
|
||||
|
||||
wsrc += bsize * BF_AVX2_PARALLEL_BLOCKS;
|
||||
wdst += bsize * BF_AVX2_PARALLEL_BLOCKS;
|
||||
nbytes -= bsize * BF_AVX2_PARALLEL_BLOCKS;
|
||||
} while (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS);
|
||||
|
||||
if (nbytes < bsize)
|
||||
goto done;
|
||||
}
|
||||
|
||||
/* Process multi-block batch */
|
||||
if (nbytes >= bsize * BF_PARALLEL_BLOCKS) {
|
||||
do {
|
||||
if (enc)
|
||||
blowfish_enc_blk_4way(ctx, wdst, wsrc);
|
||||
else
|
||||
blowfish_dec_blk_4way(ctx, wdst, wsrc);
|
||||
|
||||
wsrc += bsize * BF_PARALLEL_BLOCKS;
|
||||
wdst += bsize * BF_PARALLEL_BLOCKS;
|
||||
nbytes -= bsize * BF_PARALLEL_BLOCKS;
|
||||
} while (nbytes >= bsize * BF_PARALLEL_BLOCKS);
|
||||
|
||||
if (nbytes < bsize)
|
||||
goto done;
|
||||
}
|
||||
|
||||
/* Handle leftovers */
|
||||
do {
|
||||
if (enc)
|
||||
blowfish_enc_blk(ctx, wdst, wsrc);
|
||||
else
|
||||
blowfish_dec_blk(ctx, wdst, wsrc);
|
||||
|
||||
wsrc += bsize;
|
||||
wdst += bsize;
|
||||
nbytes -= bsize;
|
||||
} while (nbytes >= bsize);
|
||||
|
||||
done:
|
||||
err = blkcipher_walk_done(desc, walk, nbytes);
|
||||
}
|
||||
|
||||
bf_fpu_end(fpu_enabled);
|
||||
return err;
|
||||
}
|
||||
|
||||
static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
|
||||
struct scatterlist *src, unsigned int nbytes)
|
||||
{
|
||||
struct blkcipher_walk walk;
|
||||
|
||||
blkcipher_walk_init(&walk, dst, src, nbytes);
|
||||
return ecb_crypt(desc, &walk, true);
|
||||
}
|
||||
|
||||
static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
|
||||
struct scatterlist *src, unsigned int nbytes)
|
||||
{
|
||||
struct blkcipher_walk walk;
|
||||
|
||||
blkcipher_walk_init(&walk, dst, src, nbytes);
|
||||
return ecb_crypt(desc, &walk, false);
|
||||
}
|
||||
|
||||
static unsigned int __cbc_encrypt(struct blkcipher_desc *desc,
|
||||
struct blkcipher_walk *walk)
|
||||
{
|
||||
struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
|
||||
unsigned int bsize = BF_BLOCK_SIZE;
|
||||
unsigned int nbytes = walk->nbytes;
|
||||
u64 *src = (u64 *)walk->src.virt.addr;
|
||||
u64 *dst = (u64 *)walk->dst.virt.addr;
|
||||
u64 *iv = (u64 *)walk->iv;
|
||||
|
||||
do {
|
||||
*dst = *src ^ *iv;
|
||||
blowfish_enc_blk(ctx, (u8 *)dst, (u8 *)dst);
|
||||
iv = dst;
|
||||
|
||||
src += 1;
|
||||
dst += 1;
|
||||
nbytes -= bsize;
|
||||
} while (nbytes >= bsize);
|
||||
|
||||
*(u64 *)walk->iv = *iv;
|
||||
return nbytes;
|
||||
}
|
||||
|
||||
static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
|
||||
struct scatterlist *src, unsigned int nbytes)
|
||||
{
|
||||
struct blkcipher_walk walk;
|
||||
int err;
|
||||
|
||||
blkcipher_walk_init(&walk, dst, src, nbytes);
|
||||
err = blkcipher_walk_virt(desc, &walk);
|
||||
|
||||
while ((nbytes = walk.nbytes)) {
|
||||
nbytes = __cbc_encrypt(desc, &walk);
|
||||
err = blkcipher_walk_done(desc, &walk, nbytes);
|
||||
}
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
static unsigned int __cbc_decrypt(struct blkcipher_desc *desc,
|
||||
struct blkcipher_walk *walk)
|
||||
{
|
||||
struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
|
||||
const unsigned int bsize = BF_BLOCK_SIZE;
|
||||
unsigned int nbytes = walk->nbytes;
|
||||
u64 *src = (u64 *)walk->src.virt.addr;
|
||||
u64 *dst = (u64 *)walk->dst.virt.addr;
|
||||
u64 last_iv;
|
||||
int i;
|
||||
|
||||
/* Start of the last block. */
|
||||
src += nbytes / bsize - 1;
|
||||
dst += nbytes / bsize - 1;
|
||||
|
||||
last_iv = *src;
|
||||
|
||||
/* Process multi-block AVX2 batch */
|
||||
if (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS) {
|
||||
do {
|
||||
nbytes -= bsize * (BF_AVX2_PARALLEL_BLOCKS - 1);
|
||||
src -= BF_AVX2_PARALLEL_BLOCKS - 1;
|
||||
dst -= BF_AVX2_PARALLEL_BLOCKS - 1;
|
||||
|
||||
blowfish_cbc_dec_32way(ctx, (u8 *)dst, (u8 *)src);
|
||||
|
||||
nbytes -= bsize;
|
||||
if (nbytes < bsize)
|
||||
goto done;
|
||||
|
||||
*dst ^= *(src - 1);
|
||||
src -= 1;
|
||||
dst -= 1;
|
||||
} while (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS);
|
||||
|
||||
if (nbytes < bsize)
|
||||
goto done;
|
||||
}
|
||||
|
||||
/* Process multi-block batch */
|
||||
if (nbytes >= bsize * BF_PARALLEL_BLOCKS) {
|
||||
u64 ivs[BF_PARALLEL_BLOCKS - 1];
|
||||
|
||||
do {
|
||||
nbytes -= bsize * (BF_PARALLEL_BLOCKS - 1);
|
||||
src -= BF_PARALLEL_BLOCKS - 1;
|
||||
dst -= BF_PARALLEL_BLOCKS - 1;
|
||||
|
||||
for (i = 0; i < BF_PARALLEL_BLOCKS - 1; i++)
|
||||
ivs[i] = src[i];
|
||||
|
||||
blowfish_dec_blk_4way(ctx, (u8 *)dst, (u8 *)src);
|
||||
|
||||
for (i = 0; i < BF_PARALLEL_BLOCKS - 1; i++)
|
||||
dst[i + 1] ^= ivs[i];
|
||||
|
||||
nbytes -= bsize;
|
||||
if (nbytes < bsize)
|
||||
goto done;
|
||||
|
||||
*dst ^= *(src - 1);
|
||||
src -= 1;
|
||||
dst -= 1;
|
||||
} while (nbytes >= bsize * BF_PARALLEL_BLOCKS);
|
||||
|
||||
if (nbytes < bsize)
|
||||
goto done;
|
||||
}
|
||||
|
||||
/* Handle leftovers */
|
||||
for (;;) {
|
||||
blowfish_dec_blk(ctx, (u8 *)dst, (u8 *)src);
|
||||
|
||||
nbytes -= bsize;
|
||||
if (nbytes < bsize)
|
||||
break;
|
||||
|
||||
*dst ^= *(src - 1);
|
||||
src -= 1;
|
||||
dst -= 1;
|
||||
}
|
||||
|
||||
done:
|
||||
*dst ^= *(u64 *)walk->iv;
|
||||
*(u64 *)walk->iv = last_iv;
|
||||
|
||||
return nbytes;
|
||||
}
|
||||
|
||||
static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
|
||||
struct scatterlist *src, unsigned int nbytes)
|
||||
{
|
||||
bool fpu_enabled = false;
|
||||
struct blkcipher_walk walk;
|
||||
int err;
|
||||
|
||||
blkcipher_walk_init(&walk, dst, src, nbytes);
|
||||
err = blkcipher_walk_virt(desc, &walk);
|
||||
desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
|
||||
|
||||
while ((nbytes = walk.nbytes)) {
|
||||
fpu_enabled = bf_fpu_begin(fpu_enabled, nbytes);
|
||||
nbytes = __cbc_decrypt(desc, &walk);
|
||||
err = blkcipher_walk_done(desc, &walk, nbytes);
|
||||
}
|
||||
|
||||
bf_fpu_end(fpu_enabled);
|
||||
return err;
|
||||
}
|
||||
|
||||
static void ctr_crypt_final(struct blkcipher_desc *desc,
|
||||
struct blkcipher_walk *walk)
|
||||
{
|
||||
struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
|
||||
u8 *ctrblk = walk->iv;
|
||||
u8 keystream[BF_BLOCK_SIZE];
|
||||
u8 *src = walk->src.virt.addr;
|
||||
u8 *dst = walk->dst.virt.addr;
|
||||
unsigned int nbytes = walk->nbytes;
|
||||
|
||||
blowfish_enc_blk(ctx, keystream, ctrblk);
|
||||
crypto_xor(keystream, src, nbytes);
|
||||
memcpy(dst, keystream, nbytes);
|
||||
|
||||
crypto_inc(ctrblk, BF_BLOCK_SIZE);
|
||||
}
|
||||
|
||||
static unsigned int __ctr_crypt(struct blkcipher_desc *desc,
|
||||
struct blkcipher_walk *walk)
|
||||
{
|
||||
struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
|
||||
unsigned int bsize = BF_BLOCK_SIZE;
|
||||
unsigned int nbytes = walk->nbytes;
|
||||
u64 *src = (u64 *)walk->src.virt.addr;
|
||||
u64 *dst = (u64 *)walk->dst.virt.addr;
|
||||
int i;
|
||||
|
||||
/* Process multi-block AVX2 batch */
|
||||
if (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS) {
|
||||
do {
|
||||
blowfish_ctr_32way(ctx, (u8 *)dst, (u8 *)src,
|
||||
(__be64 *)walk->iv);
|
||||
|
||||
src += BF_AVX2_PARALLEL_BLOCKS;
|
||||
dst += BF_AVX2_PARALLEL_BLOCKS;
|
||||
nbytes -= bsize * BF_AVX2_PARALLEL_BLOCKS;
|
||||
} while (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS);
|
||||
|
||||
if (nbytes < bsize)
|
||||
goto done;
|
||||
}
|
||||
|
||||
/* Process four block batch */
|
||||
if (nbytes >= bsize * BF_PARALLEL_BLOCKS) {
|
||||
__be64 ctrblocks[BF_PARALLEL_BLOCKS];
|
||||
u64 ctrblk = be64_to_cpu(*(__be64 *)walk->iv);
|
||||
|
||||
do {
|
||||
/* create ctrblks for parallel encrypt */
|
||||
for (i = 0; i < BF_PARALLEL_BLOCKS; i++) {
|
||||
if (dst != src)
|
||||
dst[i] = src[i];
|
||||
|
||||
ctrblocks[i] = cpu_to_be64(ctrblk++);
|
||||
}
|
||||
|
||||
blowfish_enc_blk_xor_4way(ctx, (u8 *)dst,
|
||||
(u8 *)ctrblocks);
|
||||
|
||||
src += BF_PARALLEL_BLOCKS;
|
||||
dst += BF_PARALLEL_BLOCKS;
|
||||
nbytes -= bsize * BF_PARALLEL_BLOCKS;
|
||||
} while (nbytes >= bsize * BF_PARALLEL_BLOCKS);
|
||||
|
||||
*(__be64 *)walk->iv = cpu_to_be64(ctrblk);
|
||||
|
||||
if (nbytes < bsize)
|
||||
goto done;
|
||||
}
|
||||
|
||||
/* Handle leftovers */
|
||||
do {
|
||||
u64 ctrblk;
|
||||
|
||||
if (dst != src)
|
||||
*dst = *src;
|
||||
|
||||
ctrblk = *(u64 *)walk->iv;
|
||||
be64_add_cpu((__be64 *)walk->iv, 1);
|
||||
|
||||
blowfish_enc_blk_xor(ctx, (u8 *)dst, (u8 *)&ctrblk);
|
||||
|
||||
src += 1;
|
||||
dst += 1;
|
||||
} while ((nbytes -= bsize) >= bsize);
|
||||
|
||||
done:
|
||||
return nbytes;
|
||||
}
|
||||
|
||||
static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
|
||||
struct scatterlist *src, unsigned int nbytes)
|
||||
{
|
||||
bool fpu_enabled = false;
|
||||
struct blkcipher_walk walk;
|
||||
int err;
|
||||
|
||||
blkcipher_walk_init(&walk, dst, src, nbytes);
|
||||
err = blkcipher_walk_virt_block(desc, &walk, BF_BLOCK_SIZE);
|
||||
desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
|
||||
|
||||
while ((nbytes = walk.nbytes) >= BF_BLOCK_SIZE) {
|
||||
fpu_enabled = bf_fpu_begin(fpu_enabled, nbytes);
|
||||
nbytes = __ctr_crypt(desc, &walk);
|
||||
err = blkcipher_walk_done(desc, &walk, nbytes);
|
||||
}
|
||||
|
||||
bf_fpu_end(fpu_enabled);
|
||||
|
||||
if (walk.nbytes) {
|
||||
ctr_crypt_final(desc, &walk);
|
||||
err = blkcipher_walk_done(desc, &walk, 0);
|
||||
}
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
static struct crypto_alg bf_algs[6] = { {
|
||||
.cra_name = "__ecb-blowfish-avx2",
|
||||
.cra_driver_name = "__driver-ecb-blowfish-avx2",
|
||||
.cra_priority = 0,
|
||||
.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
|
||||
.cra_blocksize = BF_BLOCK_SIZE,
|
||||
.cra_ctxsize = sizeof(struct bf_ctx),
|
||||
.cra_alignmask = 0,
|
||||
.cra_type = &crypto_blkcipher_type,
|
||||
.cra_module = THIS_MODULE,
|
||||
.cra_u = {
|
||||
.blkcipher = {
|
||||
.min_keysize = BF_MIN_KEY_SIZE,
|
||||
.max_keysize = BF_MAX_KEY_SIZE,
|
||||
.setkey = blowfish_setkey,
|
||||
.encrypt = ecb_encrypt,
|
||||
.decrypt = ecb_decrypt,
|
||||
},
|
||||
},
|
||||
}, {
|
||||
.cra_name = "__cbc-blowfish-avx2",
|
||||
.cra_driver_name = "__driver-cbc-blowfish-avx2",
|
||||
.cra_priority = 0,
|
||||
.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
|
||||
.cra_blocksize = BF_BLOCK_SIZE,
|
||||
.cra_ctxsize = sizeof(struct bf_ctx),
|
||||
.cra_alignmask = 0,
|
||||
.cra_type = &crypto_blkcipher_type,
|
||||
.cra_module = THIS_MODULE,
|
||||
.cra_u = {
|
||||
.blkcipher = {
|
||||
.min_keysize = BF_MIN_KEY_SIZE,
|
||||
.max_keysize = BF_MAX_KEY_SIZE,
|
||||
.setkey = blowfish_setkey,
|
||||
.encrypt = cbc_encrypt,
|
||||
.decrypt = cbc_decrypt,
|
||||
},
|
||||
},
|
||||
}, {
|
||||
.cra_name = "__ctr-blowfish-avx2",
|
||||
.cra_driver_name = "__driver-ctr-blowfish-avx2",
|
||||
.cra_priority = 0,
|
||||
.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
|
||||
.cra_blocksize = 1,
|
||||
.cra_ctxsize = sizeof(struct bf_ctx),
|
||||
.cra_alignmask = 0,
|
||||
.cra_type = &crypto_blkcipher_type,
|
||||
.cra_module = THIS_MODULE,
|
||||
.cra_u = {
|
||||
.blkcipher = {
|
||||
.min_keysize = BF_MIN_KEY_SIZE,
|
||||
.max_keysize = BF_MAX_KEY_SIZE,
|
||||
.ivsize = BF_BLOCK_SIZE,
|
||||
.setkey = blowfish_setkey,
|
||||
.encrypt = ctr_crypt,
|
||||
.decrypt = ctr_crypt,
|
||||
},
|
||||
},
|
||||
}, {
|
||||
.cra_name = "ecb(blowfish)",
|
||||
.cra_driver_name = "ecb-blowfish-avx2",
|
||||
.cra_priority = 400,
|
||||
.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
|
||||
.cra_blocksize = BF_BLOCK_SIZE,
|
||||
.cra_ctxsize = sizeof(struct async_helper_ctx),
|
||||
.cra_alignmask = 0,
|
||||
.cra_type = &crypto_ablkcipher_type,
|
||||
.cra_module = THIS_MODULE,
|
||||
.cra_init = ablk_init,
|
||||
.cra_exit = ablk_exit,
|
||||
.cra_u = {
|
||||
.ablkcipher = {
|
||||
.min_keysize = BF_MIN_KEY_SIZE,
|
||||
.max_keysize = BF_MAX_KEY_SIZE,
|
||||
.setkey = ablk_set_key,
|
||||
.encrypt = ablk_encrypt,
|
||||
.decrypt = ablk_decrypt,
|
||||
},
|
||||
},
|
||||
}, {
|
||||
.cra_name = "cbc(blowfish)",
|
||||
.cra_driver_name = "cbc-blowfish-avx2",
|
||||
.cra_priority = 400,
|
||||
.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
|
||||
.cra_blocksize = BF_BLOCK_SIZE,
|
||||
.cra_ctxsize = sizeof(struct async_helper_ctx),
|
||||
.cra_alignmask = 0,
|
||||
.cra_type = &crypto_ablkcipher_type,
|
||||
.cra_module = THIS_MODULE,
|
||||
.cra_init = ablk_init,
|
||||
.cra_exit = ablk_exit,
|
||||
.cra_u = {
|
||||
.ablkcipher = {
|
||||
.min_keysize = BF_MIN_KEY_SIZE,
|
||||
.max_keysize = BF_MAX_KEY_SIZE,
|
||||
.ivsize = BF_BLOCK_SIZE,
|
||||
.setkey = ablk_set_key,
|
||||
.encrypt = __ablk_encrypt,
|
||||
.decrypt = ablk_decrypt,
|
||||
},
|
||||
},
|
||||
}, {
|
||||
.cra_name = "ctr(blowfish)",
|
||||
.cra_driver_name = "ctr-blowfish-avx2",
|
||||
.cra_priority = 400,
|
||||
.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
|
||||
.cra_blocksize = 1,
|
||||
.cra_ctxsize = sizeof(struct async_helper_ctx),
|
||||
.cra_alignmask = 0,
|
||||
.cra_type = &crypto_ablkcipher_type,
|
||||
.cra_module = THIS_MODULE,
|
||||
.cra_init = ablk_init,
|
||||
.cra_exit = ablk_exit,
|
||||
.cra_u = {
|
||||
.ablkcipher = {
|
||||
.min_keysize = BF_MIN_KEY_SIZE,
|
||||
.max_keysize = BF_MAX_KEY_SIZE,
|
||||
.ivsize = BF_BLOCK_SIZE,
|
||||
.setkey = ablk_set_key,
|
||||
.encrypt = ablk_encrypt,
|
||||
.decrypt = ablk_encrypt,
|
||||
.geniv = "chainiv",
|
||||
},
|
||||
},
|
||||
} };
|
||||
|
||||
|
||||
static int __init init(void)
|
||||
{
|
||||
u64 xcr0;
|
||||
|
||||
if (!cpu_has_avx2 || !cpu_has_osxsave) {
|
||||
pr_info("AVX2 instructions are not detected.\n");
|
||||
return -ENODEV;
|
||||
}
|
||||
|
||||
xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
|
||||
if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
|
||||
pr_info("AVX detected but unusable.\n");
|
||||
return -ENODEV;
|
||||
}
|
||||
|
||||
return crypto_register_algs(bf_algs, ARRAY_SIZE(bf_algs));
|
||||
}
|
||||
|
||||
static void __exit fini(void)
|
||||
{
|
||||
crypto_unregister_algs(bf_algs, ARRAY_SIZE(bf_algs));
|
||||
}
|
||||
|
||||
module_init(init);
|
||||
module_exit(fini);
|
||||
|
||||
MODULE_LICENSE("GPL");
|
||||
MODULE_DESCRIPTION("Blowfish Cipher Algorithm, AVX2 optimized");
|
||||
MODULE_ALIAS("blowfish");
|
||||
MODULE_ALIAS("blowfish-asm");
|
@ -1,7 +1,7 @@
|
||||
/*
|
||||
* Glue Code for assembler optimized version of Blowfish
|
||||
*
|
||||
* Copyright (c) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
|
||||
* Copyright © 2011-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
|
||||
*
|
||||
* CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by:
|
||||
* Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
|
||||
@ -32,40 +32,24 @@
|
||||
#include <linux/module.h>
|
||||
#include <linux/types.h>
|
||||
#include <crypto/algapi.h>
|
||||
#include <asm/crypto/blowfish.h>
|
||||
|
||||
/* regular block cipher functions */
|
||||
asmlinkage void __blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src,
|
||||
bool xor);
|
||||
EXPORT_SYMBOL_GPL(__blowfish_enc_blk);
|
||||
|
||||
asmlinkage void blowfish_dec_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src);
|
||||
EXPORT_SYMBOL_GPL(blowfish_dec_blk);
|
||||
|
||||
/* 4-way parallel cipher functions */
|
||||
asmlinkage void __blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst,
|
||||
const u8 *src, bool xor);
|
||||
EXPORT_SYMBOL_GPL(__blowfish_enc_blk_4way);
|
||||
|
||||
asmlinkage void blowfish_dec_blk_4way(struct bf_ctx *ctx, u8 *dst,
|
||||
const u8 *src);
|
||||
|
||||
static inline void blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src)
|
||||
{
|
||||
__blowfish_enc_blk(ctx, dst, src, false);
|
||||
}
|
||||
|
||||
static inline void blowfish_enc_blk_xor(struct bf_ctx *ctx, u8 *dst,
|
||||
const u8 *src)
|
||||
{
|
||||
__blowfish_enc_blk(ctx, dst, src, true);
|
||||
}
|
||||
|
||||
static inline void blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst,
|
||||
const u8 *src)
|
||||
{
|
||||
__blowfish_enc_blk_4way(ctx, dst, src, false);
|
||||
}
|
||||
|
||||
static inline void blowfish_enc_blk_xor_4way(struct bf_ctx *ctx, u8 *dst,
|
||||
const u8 *src)
|
||||
{
|
||||
__blowfish_enc_blk_4way(ctx, dst, src, true);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(blowfish_dec_blk_4way);
|
||||
|
||||
static void blowfish_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
|
||||
{
|
||||
|
@ -1,7 +1,7 @@
|
||||
/*
|
||||
* x86_64/AVX/AES-NI assembler implementation of Camellia
|
||||
*
|
||||
* Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
|
||||
* Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
@ -589,6 +589,10 @@ ENDPROC(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
|
||||
.Lbswap128_mask:
|
||||
.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
|
||||
|
||||
/* For XTS mode IV generation */
|
||||
.Lxts_gf128mul_and_shl1_mask:
|
||||
.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
|
||||
|
||||
/*
|
||||
* pre-SubByte transform
|
||||
*
|
||||
@ -1090,3 +1094,177 @@ ENTRY(camellia_ctr_16way)
|
||||
|
||||
ret;
|
||||
ENDPROC(camellia_ctr_16way)
|
||||
|
||||
#define gf128mul_x_ble(iv, mask, tmp) \
|
||||
vpsrad $31, iv, tmp; \
|
||||
vpaddq iv, iv, iv; \
|
||||
vpshufd $0x13, tmp, tmp; \
|
||||
vpand mask, tmp, tmp; \
|
||||
vpxor tmp, iv, iv;
|
||||
|
||||
.align 8
|
||||
camellia_xts_crypt_16way:
|
||||
/* input:
|
||||
* %rdi: ctx, CTX
|
||||
* %rsi: dst (16 blocks)
|
||||
* %rdx: src (16 blocks)
|
||||
* %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
|
||||
* %r8: index for input whitening key
|
||||
* %r9: pointer to __camellia_enc_blk16 or __camellia_dec_blk16
|
||||
*/
|
||||
|
||||
subq $(16 * 16), %rsp;
|
||||
movq %rsp, %rax;
|
||||
|
||||
vmovdqa .Lxts_gf128mul_and_shl1_mask, %xmm14;
|
||||
|
||||
/* load IV */
|
||||
vmovdqu (%rcx), %xmm0;
|
||||
vpxor 0 * 16(%rdx), %xmm0, %xmm15;
|
||||
vmovdqu %xmm15, 15 * 16(%rax);
|
||||
vmovdqu %xmm0, 0 * 16(%rsi);
|
||||
|
||||
/* construct IVs */
|
||||
gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
|
||||
vpxor 1 * 16(%rdx), %xmm0, %xmm15;
|
||||
vmovdqu %xmm15, 14 * 16(%rax);
|
||||
vmovdqu %xmm0, 1 * 16(%rsi);
|
||||
|
||||
gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
|
||||
vpxor 2 * 16(%rdx), %xmm0, %xmm13;
|
||||
vmovdqu %xmm0, 2 * 16(%rsi);
|
||||
|
||||
gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
|
||||
vpxor 3 * 16(%rdx), %xmm0, %xmm12;
|
||||
vmovdqu %xmm0, 3 * 16(%rsi);
|
||||
|
||||
gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
|
||||
vpxor 4 * 16(%rdx), %xmm0, %xmm11;
|
||||
vmovdqu %xmm0, 4 * 16(%rsi);
|
||||
|
||||
gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
|
||||
vpxor 5 * 16(%rdx), %xmm0, %xmm10;
|
||||
vmovdqu %xmm0, 5 * 16(%rsi);
|
||||
|
||||
gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
|
||||
vpxor 6 * 16(%rdx), %xmm0, %xmm9;
|
||||
vmovdqu %xmm0, 6 * 16(%rsi);
|
||||
|
||||
gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
|
||||
vpxor 7 * 16(%rdx), %xmm0, %xmm8;
|
||||
vmovdqu %xmm0, 7 * 16(%rsi);
|
||||
|
||||
gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
|
||||
vpxor 8 * 16(%rdx), %xmm0, %xmm7;
|
||||
vmovdqu %xmm0, 8 * 16(%rsi);
|
||||
|
||||
gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
|
||||
vpxor 9 * 16(%rdx), %xmm0, %xmm6;
|
||||
vmovdqu %xmm0, 9 * 16(%rsi);
|
||||
|
||||
gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
|
||||
vpxor 10 * 16(%rdx), %xmm0, %xmm5;
|
||||
vmovdqu %xmm0, 10 * 16(%rsi);
|
||||
|
||||
gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
|
||||
vpxor 11 * 16(%rdx), %xmm0, %xmm4;
|
||||
vmovdqu %xmm0, 11 * 16(%rsi);
|
||||
|
||||
gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
|
||||
vpxor 12 * 16(%rdx), %xmm0, %xmm3;
|
||||
vmovdqu %xmm0, 12 * 16(%rsi);
|
||||
|
||||
gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
|
||||
vpxor 13 * 16(%rdx), %xmm0, %xmm2;
|
||||
vmovdqu %xmm0, 13 * 16(%rsi);
|
||||
|
||||
gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
|
||||
vpxor 14 * 16(%rdx), %xmm0, %xmm1;
|
||||
vmovdqu %xmm0, 14 * 16(%rsi);
|
||||
|
||||
gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
|
||||
vpxor 15 * 16(%rdx), %xmm0, %xmm15;
|
||||
vmovdqu %xmm15, 0 * 16(%rax);
|
||||
vmovdqu %xmm0, 15 * 16(%rsi);
|
||||
|
||||
gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
|
||||
vmovdqu %xmm0, (%rcx);
|
||||
|
||||
/* inpack16_pre: */
|
||||
vmovq (key_table)(CTX, %r8, 8), %xmm15;
|
||||
vpshufb .Lpack_bswap, %xmm15, %xmm15;
|
||||
vpxor 0 * 16(%rax), %xmm15, %xmm0;
|
||||
vpxor %xmm1, %xmm15, %xmm1;
|
||||
vpxor %xmm2, %xmm15, %xmm2;
|
||||
vpxor %xmm3, %xmm15, %xmm3;
|
||||
vpxor %xmm4, %xmm15, %xmm4;
|
||||
vpxor %xmm5, %xmm15, %xmm5;
|
||||
vpxor %xmm6, %xmm15, %xmm6;
|
||||
vpxor %xmm7, %xmm15, %xmm7;
|
||||
vpxor %xmm8, %xmm15, %xmm8;
|
||||
vpxor %xmm9, %xmm15, %xmm9;
|
||||
vpxor %xmm10, %xmm15, %xmm10;
|
||||
vpxor %xmm11, %xmm15, %xmm11;
|
||||
vpxor %xmm12, %xmm15, %xmm12;
|
||||
vpxor %xmm13, %xmm15, %xmm13;
|
||||
vpxor 14 * 16(%rax), %xmm15, %xmm14;
|
||||
vpxor 15 * 16(%rax), %xmm15, %xmm15;
|
||||
|
||||
call *%r9;
|
||||
|
||||
addq $(16 * 16), %rsp;
|
||||
|
||||
vpxor 0 * 16(%rsi), %xmm7, %xmm7;
|
||||
vpxor 1 * 16(%rsi), %xmm6, %xmm6;
|
||||
vpxor 2 * 16(%rsi), %xmm5, %xmm5;
|
||||
vpxor 3 * 16(%rsi), %xmm4, %xmm4;
|
||||
vpxor 4 * 16(%rsi), %xmm3, %xmm3;
|
||||
vpxor 5 * 16(%rsi), %xmm2, %xmm2;
|
||||
vpxor 6 * 16(%rsi), %xmm1, %xmm1;
|
||||
vpxor 7 * 16(%rsi), %xmm0, %xmm0;
|
||||
vpxor 8 * 16(%rsi), %xmm15, %xmm15;
|
||||
vpxor 9 * 16(%rsi), %xmm14, %xmm14;
|
||||
vpxor 10 * 16(%rsi), %xmm13, %xmm13;
|
||||
vpxor 11 * 16(%rsi), %xmm12, %xmm12;
|
||||
vpxor 12 * 16(%rsi), %xmm11, %xmm11;
|
||||
vpxor 13 * 16(%rsi), %xmm10, %xmm10;
|
||||
vpxor 14 * 16(%rsi), %xmm9, %xmm9;
|
||||
vpxor 15 * 16(%rsi), %xmm8, %xmm8;
|
||||
write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
|
||||
%xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
|
||||
%xmm8, %rsi);
|
||||
|
||||
ret;
|
||||
ENDPROC(camellia_xts_crypt_16way)
|
||||
|
||||
ENTRY(camellia_xts_enc_16way)
|
||||
/* input:
|
||||
* %rdi: ctx, CTX
|
||||
* %rsi: dst (16 blocks)
|
||||
* %rdx: src (16 blocks)
|
||||
* %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
|
||||
*/
|
||||
xorl %r8d, %r8d; /* input whitening key, 0 for enc */
|
||||
|
||||
leaq __camellia_enc_blk16, %r9;
|
||||
|
||||
jmp camellia_xts_crypt_16way;
|
||||
ENDPROC(camellia_xts_enc_16way)
|
||||
|
||||
ENTRY(camellia_xts_dec_16way)
|
||||
/* input:
|
||||
* %rdi: ctx, CTX
|
||||
* %rsi: dst (16 blocks)
|
||||
* %rdx: src (16 blocks)
|
||||
* %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
|
||||
*/
|
||||
|
||||
cmpl $16, key_length(CTX);
|
||||
movl $32, %r8d;
|
||||
movl $24, %eax;
|
||||
cmovel %eax, %r8d; /* input whitening key, last for dec */
|
||||
|
||||
leaq __camellia_dec_blk16, %r9;
|
||||
|
||||
jmp camellia_xts_crypt_16way;
|
||||
ENDPROC(camellia_xts_dec_16way)
|
||||
|
1368
arch/x86/crypto/camellia-aesni-avx2-asm_64.S
Normal file
1368
arch/x86/crypto/camellia-aesni-avx2-asm_64.S
Normal file
File diff suppressed because it is too large
Load Diff
586
arch/x86/crypto/camellia_aesni_avx2_glue.c
Normal file
586
arch/x86/crypto/camellia_aesni_avx2_glue.c
Normal file
@ -0,0 +1,586 @@
|
||||
/*
|
||||
* Glue Code for x86_64/AVX2/AES-NI assembler optimized version of Camellia
|
||||
*
|
||||
* Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/crypto.h>
|
||||
#include <linux/err.h>
|
||||
#include <crypto/algapi.h>
|
||||
#include <crypto/ctr.h>
|
||||
#include <crypto/lrw.h>
|
||||
#include <crypto/xts.h>
|
||||
#include <asm/xcr.h>
|
||||
#include <asm/xsave.h>
|
||||
#include <asm/crypto/camellia.h>
|
||||
#include <asm/crypto/ablk_helper.h>
|
||||
#include <asm/crypto/glue_helper.h>
|
||||
|
||||
#define CAMELLIA_AESNI_PARALLEL_BLOCKS 16
|
||||
#define CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS 32
|
||||
|
||||
/* 32-way AVX2/AES-NI parallel cipher functions */
|
||||
asmlinkage void camellia_ecb_enc_32way(struct camellia_ctx *ctx, u8 *dst,
|
||||
const u8 *src);
|
||||
asmlinkage void camellia_ecb_dec_32way(struct camellia_ctx *ctx, u8 *dst,
|
||||
const u8 *src);
|
||||
|
||||
asmlinkage void camellia_cbc_dec_32way(struct camellia_ctx *ctx, u8 *dst,
|
||||
const u8 *src);
|
||||
asmlinkage void camellia_ctr_32way(struct camellia_ctx *ctx, u8 *dst,
|
||||
const u8 *src, le128 *iv);
|
||||
|
||||
asmlinkage void camellia_xts_enc_32way(struct camellia_ctx *ctx, u8 *dst,
|
||||
const u8 *src, le128 *iv);
|
||||
asmlinkage void camellia_xts_dec_32way(struct camellia_ctx *ctx, u8 *dst,
|
||||
const u8 *src, le128 *iv);
|
||||
|
||||
static const struct common_glue_ctx camellia_enc = {
|
||||
.num_funcs = 4,
|
||||
.fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
|
||||
|
||||
.funcs = { {
|
||||
.num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS,
|
||||
.fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_enc_32way) }
|
||||
}, {
|
||||
.num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
|
||||
.fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_enc_16way) }
|
||||
}, {
|
||||
.num_blocks = 2,
|
||||
.fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk_2way) }
|
||||
}, {
|
||||
.num_blocks = 1,
|
||||
.fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk) }
|
||||
} }
|
||||
};
|
||||
|
||||
static const struct common_glue_ctx camellia_ctr = {
|
||||
.num_funcs = 4,
|
||||
.fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
|
||||
|
||||
.funcs = { {
|
||||
.num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS,
|
||||
.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_ctr_32way) }
|
||||
}, {
|
||||
.num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
|
||||
.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_ctr_16way) }
|
||||
}, {
|
||||
.num_blocks = 2,
|
||||
.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_crypt_ctr_2way) }
|
||||
}, {
|
||||
.num_blocks = 1,
|
||||
.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_crypt_ctr) }
|
||||
} }
|
||||
};
|
||||
|
||||
static const struct common_glue_ctx camellia_enc_xts = {
|
||||
.num_funcs = 3,
|
||||
.fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
|
||||
|
||||
.funcs = { {
|
||||
.num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS,
|
||||
.fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_enc_32way) }
|
||||
}, {
|
||||
.num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
|
||||
.fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_enc_16way) }
|
||||
}, {
|
||||
.num_blocks = 1,
|
||||
.fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_enc) }
|
||||
} }
|
||||
};
|
||||
|
||||
static const struct common_glue_ctx camellia_dec = {
|
||||
.num_funcs = 4,
|
||||
.fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
|
||||
|
||||
.funcs = { {
|
||||
.num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS,
|
||||
.fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_dec_32way) }
|
||||
}, {
|
||||
.num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
|
||||
.fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_dec_16way) }
|
||||
}, {
|
||||
.num_blocks = 2,
|
||||
.fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk_2way) }
|
||||
}, {
|
||||
.num_blocks = 1,
|
||||
.fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk) }
|
||||
} }
|
||||
};
|
||||
|
||||
static const struct common_glue_ctx camellia_dec_cbc = {
|
||||
.num_funcs = 4,
|
||||
.fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
|
||||
|
||||
.funcs = { {
|
||||
.num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS,
|
||||
.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_cbc_dec_32way) }
|
||||
}, {
|
||||
.num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
|
||||
.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_cbc_dec_16way) }
|
||||
}, {
|
||||
.num_blocks = 2,
|
||||
.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_decrypt_cbc_2way) }
|
||||
}, {
|
||||
.num_blocks = 1,
|
||||
.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_dec_blk) }
|
||||
} }
|
||||
};
|
||||
|
||||
static const struct common_glue_ctx camellia_dec_xts = {
|
||||
.num_funcs = 3,
|
||||
.fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
|
||||
|
||||
.funcs = { {
|
||||
.num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS,
|
||||
.fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_dec_32way) }
|
||||
}, {
|
||||
.num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
|
||||
.fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_dec_16way) }
|
||||
}, {
|
||||
.num_blocks = 1,
|
||||
.fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_dec) }
|
||||
} }
|
||||
};
|
||||
|
||||
static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
|
||||
struct scatterlist *src, unsigned int nbytes)
|
||||
{
|
||||
return glue_ecb_crypt_128bit(&camellia_enc, desc, dst, src, nbytes);
|
||||
}
|
||||
|
||||
static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
|
||||
struct scatterlist *src, unsigned int nbytes)
|
||||
{
|
||||
return glue_ecb_crypt_128bit(&camellia_dec, desc, dst, src, nbytes);
|
||||
}
|
||||
|
||||
static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
|
||||
struct scatterlist *src, unsigned int nbytes)
|
||||
{
|
||||
return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(camellia_enc_blk), desc,
|
||||
dst, src, nbytes);
|
||||
}
|
||||
|
||||
static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
|
||||
struct scatterlist *src, unsigned int nbytes)
|
||||
{
|
||||
return glue_cbc_decrypt_128bit(&camellia_dec_cbc, desc, dst, src,
|
||||
nbytes);
|
||||
}
|
||||
|
||||
static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
|
||||
struct scatterlist *src, unsigned int nbytes)
|
||||
{
|
||||
return glue_ctr_crypt_128bit(&camellia_ctr, desc, dst, src, nbytes);
|
||||
}
|
||||
|
||||
static inline bool camellia_fpu_begin(bool fpu_enabled, unsigned int nbytes)
|
||||
{
|
||||
return glue_fpu_begin(CAMELLIA_BLOCK_SIZE,
|
||||
CAMELLIA_AESNI_PARALLEL_BLOCKS, NULL, fpu_enabled,
|
||||
nbytes);
|
||||
}
|
||||
|
||||
static inline void camellia_fpu_end(bool fpu_enabled)
|
||||
{
|
||||
glue_fpu_end(fpu_enabled);
|
||||
}
|
||||
|
||||
static int camellia_setkey(struct crypto_tfm *tfm, const u8 *in_key,
|
||||
unsigned int key_len)
|
||||
{
|
||||
return __camellia_setkey(crypto_tfm_ctx(tfm), in_key, key_len,
|
||||
&tfm->crt_flags);
|
||||
}
|
||||
|
||||
struct crypt_priv {
|
||||
struct camellia_ctx *ctx;
|
||||
bool fpu_enabled;
|
||||
};
|
||||
|
||||
static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
|
||||
{
|
||||
const unsigned int bsize = CAMELLIA_BLOCK_SIZE;
|
||||
struct crypt_priv *ctx = priv;
|
||||
int i;
|
||||
|
||||
ctx->fpu_enabled = camellia_fpu_begin(ctx->fpu_enabled, nbytes);
|
||||
|
||||
if (nbytes >= CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS * bsize) {
|
||||
camellia_ecb_enc_32way(ctx->ctx, srcdst, srcdst);
|
||||
srcdst += bsize * CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS;
|
||||
nbytes -= bsize * CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS;
|
||||
}
|
||||
|
||||
if (nbytes >= CAMELLIA_AESNI_PARALLEL_BLOCKS * bsize) {
|
||||
camellia_ecb_enc_16way(ctx->ctx, srcdst, srcdst);
|
||||
srcdst += bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
|
||||
nbytes -= bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
|
||||
}
|
||||
|
||||
while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) {
|
||||
camellia_enc_blk_2way(ctx->ctx, srcdst, srcdst);
|
||||
srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS;
|
||||
nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS;
|
||||
}
|
||||
|
||||
for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
|
||||
camellia_enc_blk(ctx->ctx, srcdst, srcdst);
|
||||
}
|
||||
|
||||
static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
|
||||
{
|
||||
const unsigned int bsize = CAMELLIA_BLOCK_SIZE;
|
||||
struct crypt_priv *ctx = priv;
|
||||
int i;
|
||||
|
||||
ctx->fpu_enabled = camellia_fpu_begin(ctx->fpu_enabled, nbytes);
|
||||
|
||||
if (nbytes >= CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS * bsize) {
|
||||
camellia_ecb_dec_32way(ctx->ctx, srcdst, srcdst);
|
||||
srcdst += bsize * CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS;
|
||||
nbytes -= bsize * CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS;
|
||||
}
|
||||
|
||||
if (nbytes >= CAMELLIA_AESNI_PARALLEL_BLOCKS * bsize) {
|
||||
camellia_ecb_dec_16way(ctx->ctx, srcdst, srcdst);
|
||||
srcdst += bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
|
||||
nbytes -= bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
|
||||
}
|
||||
|
||||
while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) {
|
||||
camellia_dec_blk_2way(ctx->ctx, srcdst, srcdst);
|
||||
srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS;
|
||||
nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS;
|
||||
}
|
||||
|
||||
for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
|
||||
camellia_dec_blk(ctx->ctx, srcdst, srcdst);
|
||||
}
|
||||
|
||||
static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
|
||||
struct scatterlist *src, unsigned int nbytes)
|
||||
{
|
||||
struct camellia_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
|
||||
be128 buf[CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS];
|
||||
struct crypt_priv crypt_ctx = {
|
||||
.ctx = &ctx->camellia_ctx,
|
||||
.fpu_enabled = false,
|
||||
};
|
||||
struct lrw_crypt_req req = {
|
||||
.tbuf = buf,
|
||||
.tbuflen = sizeof(buf),
|
||||
|
||||
.table_ctx = &ctx->lrw_table,
|
||||
.crypt_ctx = &crypt_ctx,
|
||||
.crypt_fn = encrypt_callback,
|
||||
};
|
||||
int ret;
|
||||
|
||||
desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
|
||||
ret = lrw_crypt(desc, dst, src, nbytes, &req);
|
||||
camellia_fpu_end(crypt_ctx.fpu_enabled);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
|
||||
struct scatterlist *src, unsigned int nbytes)
|
||||
{
|
||||
struct camellia_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
|
||||
be128 buf[CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS];
|
||||
struct crypt_priv crypt_ctx = {
|
||||
.ctx = &ctx->camellia_ctx,
|
||||
.fpu_enabled = false,
|
||||
};
|
||||
struct lrw_crypt_req req = {
|
||||
.tbuf = buf,
|
||||
.tbuflen = sizeof(buf),
|
||||
|
||||
.table_ctx = &ctx->lrw_table,
|
||||
.crypt_ctx = &crypt_ctx,
|
||||
.crypt_fn = decrypt_callback,
|
||||
};
|
||||
int ret;
|
||||
|
||||
desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
|
||||
ret = lrw_crypt(desc, dst, src, nbytes, &req);
|
||||
camellia_fpu_end(crypt_ctx.fpu_enabled);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
|
||||
struct scatterlist *src, unsigned int nbytes)
|
||||
{
|
||||
struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
|
||||
|
||||
return glue_xts_crypt_128bit(&camellia_enc_xts, desc, dst, src, nbytes,
|
||||
XTS_TWEAK_CAST(camellia_enc_blk),
|
||||
&ctx->tweak_ctx, &ctx->crypt_ctx);
|
||||
}
|
||||
|
||||
static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
|
||||
struct scatterlist *src, unsigned int nbytes)
|
||||
{
|
||||
struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
|
||||
|
||||
return glue_xts_crypt_128bit(&camellia_dec_xts, desc, dst, src, nbytes,
|
||||
XTS_TWEAK_CAST(camellia_enc_blk),
|
||||
&ctx->tweak_ctx, &ctx->crypt_ctx);
|
||||
}
|
||||
|
||||
static struct crypto_alg cmll_algs[10] = { {
|
||||
.cra_name = "__ecb-camellia-aesni-avx2",
|
||||
.cra_driver_name = "__driver-ecb-camellia-aesni-avx2",
|
||||
.cra_priority = 0,
|
||||
.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
|
||||
.cra_blocksize = CAMELLIA_BLOCK_SIZE,
|
||||
.cra_ctxsize = sizeof(struct camellia_ctx),
|
||||
.cra_alignmask = 0,
|
||||
.cra_type = &crypto_blkcipher_type,
|
||||
.cra_module = THIS_MODULE,
|
||||
.cra_u = {
|
||||
.blkcipher = {
|
||||
.min_keysize = CAMELLIA_MIN_KEY_SIZE,
|
||||
.max_keysize = CAMELLIA_MAX_KEY_SIZE,
|
||||
.setkey = camellia_setkey,
|
||||
.encrypt = ecb_encrypt,
|
||||
.decrypt = ecb_decrypt,
|
||||
},
|
||||
},
|
||||
}, {
|
||||
.cra_name = "__cbc-camellia-aesni-avx2",
|
||||
.cra_driver_name = "__driver-cbc-camellia-aesni-avx2",
|
||||
.cra_priority = 0,
|
||||
.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
|
||||
.cra_blocksize = CAMELLIA_BLOCK_SIZE,
|
||||
.cra_ctxsize = sizeof(struct camellia_ctx),
|
||||
.cra_alignmask = 0,
|
||||
.cra_type = &crypto_blkcipher_type,
|
||||
.cra_module = THIS_MODULE,
|
||||
.cra_u = {
|
||||
.blkcipher = {
|
||||
.min_keysize = CAMELLIA_MIN_KEY_SIZE,
|
||||
.max_keysize = CAMELLIA_MAX_KEY_SIZE,
|
||||
.setkey = camellia_setkey,
|
||||
.encrypt = cbc_encrypt,
|
||||
.decrypt = cbc_decrypt,
|
||||
},
|
||||
},
|
||||
}, {
|
||||
.cra_name = "__ctr-camellia-aesni-avx2",
|
||||
.cra_driver_name = "__driver-ctr-camellia-aesni-avx2",
|
||||
.cra_priority = 0,
|
||||
.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
|
||||
.cra_blocksize = 1,
|
||||
.cra_ctxsize = sizeof(struct camellia_ctx),
|
||||
.cra_alignmask = 0,
|
||||
.cra_type = &crypto_blkcipher_type,
|
||||
.cra_module = THIS_MODULE,
|
||||
.cra_u = {
|
||||
.blkcipher = {
|
||||
.min_keysize = CAMELLIA_MIN_KEY_SIZE,
|
||||
.max_keysize = CAMELLIA_MAX_KEY_SIZE,
|
||||
.ivsize = CAMELLIA_BLOCK_SIZE,
|
||||
.setkey = camellia_setkey,
|
||||
.encrypt = ctr_crypt,
|
||||
.decrypt = ctr_crypt,
|
||||
},
|
||||
},
|
||||
}, {
|
||||
.cra_name = "__lrw-camellia-aesni-avx2",
|
||||
.cra_driver_name = "__driver-lrw-camellia-aesni-avx2",
|
||||
.cra_priority = 0,
|
||||
.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
|
||||
.cra_blocksize = CAMELLIA_BLOCK_SIZE,
|
||||
.cra_ctxsize = sizeof(struct camellia_lrw_ctx),
|
||||
.cra_alignmask = 0,
|
||||
.cra_type = &crypto_blkcipher_type,
|
||||
.cra_module = THIS_MODULE,
|
||||
.cra_exit = lrw_camellia_exit_tfm,
|
||||
.cra_u = {
|
||||
.blkcipher = {
|
||||
.min_keysize = CAMELLIA_MIN_KEY_SIZE +
|
||||
CAMELLIA_BLOCK_SIZE,
|
||||
.max_keysize = CAMELLIA_MAX_KEY_SIZE +
|
||||
CAMELLIA_BLOCK_SIZE,
|
||||
.ivsize = CAMELLIA_BLOCK_SIZE,
|
||||
.setkey = lrw_camellia_setkey,
|
||||
.encrypt = lrw_encrypt,
|
||||
.decrypt = lrw_decrypt,
|
||||
},
|
||||
},
|
||||
}, {
|
||||
.cra_name = "__xts-camellia-aesni-avx2",
|
||||
.cra_driver_name = "__driver-xts-camellia-aesni-avx2",
|
||||
.cra_priority = 0,
|
||||
.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
|
||||
.cra_blocksize = CAMELLIA_BLOCK_SIZE,
|
||||
.cra_ctxsize = sizeof(struct camellia_xts_ctx),
|
||||
.cra_alignmask = 0,
|
||||
.cra_type = &crypto_blkcipher_type,
|
||||
.cra_module = THIS_MODULE,
|
||||
.cra_u = {
|
||||
.blkcipher = {
|
||||
.min_keysize = CAMELLIA_MIN_KEY_SIZE * 2,
|
||||
.max_keysize = CAMELLIA_MAX_KEY_SIZE * 2,
|
||||
.ivsize = CAMELLIA_BLOCK_SIZE,
|
||||
.setkey = xts_camellia_setkey,
|
||||
.encrypt = xts_encrypt,
|
||||
.decrypt = xts_decrypt,
|
||||
},
|
||||
},
|
||||
}, {
|
||||
.cra_name = "ecb(camellia)",
|
||||
.cra_driver_name = "ecb-camellia-aesni-avx2",
|
||||
.cra_priority = 500,
|
||||
.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
|
||||
.cra_blocksize = CAMELLIA_BLOCK_SIZE,
|
||||
.cra_ctxsize = sizeof(struct async_helper_ctx),
|
||||
.cra_alignmask = 0,
|
||||
.cra_type = &crypto_ablkcipher_type,
|
||||
.cra_module = THIS_MODULE,
|
||||
.cra_init = ablk_init,
|
||||
.cra_exit = ablk_exit,
|
||||
.cra_u = {
|
||||
.ablkcipher = {
|
||||
.min_keysize = CAMELLIA_MIN_KEY_SIZE,
|
||||
.max_keysize = CAMELLIA_MAX_KEY_SIZE,
|
||||
.setkey = ablk_set_key,
|
||||
.encrypt = ablk_encrypt,
|
||||
.decrypt = ablk_decrypt,
|
||||
},
|
||||
},
|
||||
}, {
|
||||
.cra_name = "cbc(camellia)",
|
||||
.cra_driver_name = "cbc-camellia-aesni-avx2",
|
||||
.cra_priority = 500,
|
||||
.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
|
||||
.cra_blocksize = CAMELLIA_BLOCK_SIZE,
|
||||
.cra_ctxsize = sizeof(struct async_helper_ctx),
|
||||
.cra_alignmask = 0,
|
||||
.cra_type = &crypto_ablkcipher_type,
|
||||
.cra_module = THIS_MODULE,
|
||||
.cra_init = ablk_init,
|
||||
.cra_exit = ablk_exit,
|
||||
.cra_u = {
|
||||
.ablkcipher = {
|
||||
.min_keysize = CAMELLIA_MIN_KEY_SIZE,
|
||||
.max_keysize = CAMELLIA_MAX_KEY_SIZE,
|
||||
.ivsize = CAMELLIA_BLOCK_SIZE,
|
||||
.setkey = ablk_set_key,
|
||||
.encrypt = __ablk_encrypt,
|
||||
.decrypt = ablk_decrypt,
|
||||
},
|
||||
},
|
||||
}, {
|
||||
.cra_name = "ctr(camellia)",
|
||||
.cra_driver_name = "ctr-camellia-aesni-avx2",
|
||||
.cra_priority = 500,
|
||||
.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
|
||||
.cra_blocksize = 1,
|
||||
.cra_ctxsize = sizeof(struct async_helper_ctx),
|
||||
.cra_alignmask = 0,
|
||||
.cra_type = &crypto_ablkcipher_type,
|
||||
.cra_module = THIS_MODULE,
|
||||
.cra_init = ablk_init,
|
||||
.cra_exit = ablk_exit,
|
||||
.cra_u = {
|
||||
.ablkcipher = {
|
||||
.min_keysize = CAMELLIA_MIN_KEY_SIZE,
|
||||
.max_keysize = CAMELLIA_MAX_KEY_SIZE,
|
||||
.ivsize = CAMELLIA_BLOCK_SIZE,
|
||||
.setkey = ablk_set_key,
|
||||
.encrypt = ablk_encrypt,
|
||||
.decrypt = ablk_encrypt,
|
||||
.geniv = "chainiv",
|
||||
},
|
||||
},
|
||||
}, {
|
||||
.cra_name = "lrw(camellia)",
|
||||
.cra_driver_name = "lrw-camellia-aesni-avx2",
|
||||
.cra_priority = 500,
|
||||
.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
|
||||
.cra_blocksize = CAMELLIA_BLOCK_SIZE,
|
||||
.cra_ctxsize = sizeof(struct async_helper_ctx),
|
||||
.cra_alignmask = 0,
|
||||
.cra_type = &crypto_ablkcipher_type,
|
||||
.cra_module = THIS_MODULE,
|
||||
.cra_init = ablk_init,
|
||||
.cra_exit = ablk_exit,
|
||||
.cra_u = {
|
||||
.ablkcipher = {
|
||||
.min_keysize = CAMELLIA_MIN_KEY_SIZE +
|
||||
CAMELLIA_BLOCK_SIZE,
|
||||
.max_keysize = CAMELLIA_MAX_KEY_SIZE +
|
||||
CAMELLIA_BLOCK_SIZE,
|
||||
.ivsize = CAMELLIA_BLOCK_SIZE,
|
||||
.setkey = ablk_set_key,
|
||||
.encrypt = ablk_encrypt,
|
||||
.decrypt = ablk_decrypt,
|
||||
},
|
||||
},
|
||||
}, {
|
||||
.cra_name = "xts(camellia)",
|
||||
.cra_driver_name = "xts-camellia-aesni-avx2",
|
||||
.cra_priority = 500,
|
||||
.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
|
||||
.cra_blocksize = CAMELLIA_BLOCK_SIZE,
|
||||
.cra_ctxsize = sizeof(struct async_helper_ctx),
|
||||
.cra_alignmask = 0,
|
||||
.cra_type = &crypto_ablkcipher_type,
|
||||
.cra_module = THIS_MODULE,
|
||||
.cra_init = ablk_init,
|
||||
.cra_exit = ablk_exit,
|
||||
.cra_u = {
|
||||
.ablkcipher = {
|
||||
.min_keysize = CAMELLIA_MIN_KEY_SIZE * 2,
|
||||
.max_keysize = CAMELLIA_MAX_KEY_SIZE * 2,
|
||||
.ivsize = CAMELLIA_BLOCK_SIZE,
|
||||
.setkey = ablk_set_key,
|
||||
.encrypt = ablk_encrypt,
|
||||
.decrypt = ablk_decrypt,
|
||||
},
|
||||
},
|
||||
} };
|
||||
|
||||
static int __init camellia_aesni_init(void)
|
||||
{
|
||||
u64 xcr0;
|
||||
|
||||
if (!cpu_has_avx2 || !cpu_has_avx || !cpu_has_aes || !cpu_has_osxsave) {
|
||||
pr_info("AVX2 or AES-NI instructions are not detected.\n");
|
||||
return -ENODEV;
|
||||
}
|
||||
|
||||
xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
|
||||
if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
|
||||
pr_info("AVX2 detected but unusable.\n");
|
||||
return -ENODEV;
|
||||
}
|
||||
|
||||
return crypto_register_algs(cmll_algs, ARRAY_SIZE(cmll_algs));
|
||||
}
|
||||
|
||||
static void __exit camellia_aesni_fini(void)
|
||||
{
|
||||
crypto_unregister_algs(cmll_algs, ARRAY_SIZE(cmll_algs));
|
||||
}
|
||||
|
||||
module_init(camellia_aesni_init);
|
||||
module_exit(camellia_aesni_fini);
|
||||
|
||||
MODULE_LICENSE("GPL");
|
||||
MODULE_DESCRIPTION("Camellia Cipher Algorithm, AES-NI/AVX2 optimized");
|
||||
MODULE_ALIAS("camellia");
|
||||
MODULE_ALIAS("camellia-asm");
|
@ -1,7 +1,7 @@
|
||||
/*
|
||||
* Glue Code for x86_64/AVX/AES-NI assembler optimized version of Camellia
|
||||
*
|
||||
* Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
|
||||
* Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
@ -26,16 +26,44 @@
|
||||
|
||||
#define CAMELLIA_AESNI_PARALLEL_BLOCKS 16
|
||||
|
||||
/* 16-way AES-NI parallel cipher functions */
|
||||
/* 16-way parallel cipher functions (avx/aes-ni) */
|
||||
asmlinkage void camellia_ecb_enc_16way(struct camellia_ctx *ctx, u8 *dst,
|
||||
const u8 *src);
|
||||
EXPORT_SYMBOL_GPL(camellia_ecb_enc_16way);
|
||||
|
||||
asmlinkage void camellia_ecb_dec_16way(struct camellia_ctx *ctx, u8 *dst,
|
||||
const u8 *src);
|
||||
EXPORT_SYMBOL_GPL(camellia_ecb_dec_16way);
|
||||
|
||||
asmlinkage void camellia_cbc_dec_16way(struct camellia_ctx *ctx, u8 *dst,
|
||||
const u8 *src);
|
||||
EXPORT_SYMBOL_GPL(camellia_cbc_dec_16way);
|
||||
|
||||
asmlinkage void camellia_ctr_16way(struct camellia_ctx *ctx, u8 *dst,
|
||||
const u8 *src, le128 *iv);
|
||||
EXPORT_SYMBOL_GPL(camellia_ctr_16way);
|
||||
|
||||
asmlinkage void camellia_xts_enc_16way(struct camellia_ctx *ctx, u8 *dst,
|
||||
const u8 *src, le128 *iv);
|
||||
EXPORT_SYMBOL_GPL(camellia_xts_enc_16way);
|
||||
|
||||
asmlinkage void camellia_xts_dec_16way(struct camellia_ctx *ctx, u8 *dst,
|
||||
const u8 *src, le128 *iv);
|
||||
EXPORT_SYMBOL_GPL(camellia_xts_dec_16way);
|
||||
|
||||
void camellia_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv)
|
||||
{
|
||||
glue_xts_crypt_128bit_one(ctx, dst, src, iv,
|
||||
GLUE_FUNC_CAST(camellia_enc_blk));
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(camellia_xts_enc);
|
||||
|
||||
void camellia_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv)
|
||||
{
|
||||
glue_xts_crypt_128bit_one(ctx, dst, src, iv,
|
||||
GLUE_FUNC_CAST(camellia_dec_blk));
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(camellia_xts_dec);
|
||||
|
||||
static const struct common_glue_ctx camellia_enc = {
|
||||
.num_funcs = 3,
|
||||
@ -69,6 +97,19 @@ static const struct common_glue_ctx camellia_ctr = {
|
||||
} }
|
||||
};
|
||||
|
||||
static const struct common_glue_ctx camellia_enc_xts = {
|
||||
.num_funcs = 2,
|
||||
.fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
|
||||
|
||||
.funcs = { {
|
||||
.num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
|
||||
.fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_enc_16way) }
|
||||
}, {
|
||||
.num_blocks = 1,
|
||||
.fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_enc) }
|
||||
} }
|
||||
};
|
||||
|
||||
static const struct common_glue_ctx camellia_dec = {
|
||||
.num_funcs = 3,
|
||||
.fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
|
||||
@ -101,6 +142,19 @@ static const struct common_glue_ctx camellia_dec_cbc = {
|
||||
} }
|
||||
};
|
||||
|
||||
static const struct common_glue_ctx camellia_dec_xts = {
|
||||
.num_funcs = 2,
|
||||
.fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
|
||||
|
||||
.funcs = { {
|
||||
.num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
|
||||
.fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_dec_16way) }
|
||||
}, {
|
||||
.num_blocks = 1,
|
||||
.fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_dec) }
|
||||
} }
|
||||
};
|
||||
|
||||
static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
|
||||
struct scatterlist *src, unsigned int nbytes)
|
||||
{
|
||||
@ -261,54 +315,20 @@ static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
|
||||
struct scatterlist *src, unsigned int nbytes)
|
||||
{
|
||||
struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
|
||||
be128 buf[CAMELLIA_AESNI_PARALLEL_BLOCKS];
|
||||
struct crypt_priv crypt_ctx = {
|
||||
.ctx = &ctx->crypt_ctx,
|
||||
.fpu_enabled = false,
|
||||
};
|
||||
struct xts_crypt_req req = {
|
||||
.tbuf = buf,
|
||||
.tbuflen = sizeof(buf),
|
||||
|
||||
.tweak_ctx = &ctx->tweak_ctx,
|
||||
.tweak_fn = XTS_TWEAK_CAST(camellia_enc_blk),
|
||||
.crypt_ctx = &crypt_ctx,
|
||||
.crypt_fn = encrypt_callback,
|
||||
};
|
||||
int ret;
|
||||
|
||||
desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
|
||||
ret = xts_crypt(desc, dst, src, nbytes, &req);
|
||||
camellia_fpu_end(crypt_ctx.fpu_enabled);
|
||||
|
||||
return ret;
|
||||
return glue_xts_crypt_128bit(&camellia_enc_xts, desc, dst, src, nbytes,
|
||||
XTS_TWEAK_CAST(camellia_enc_blk),
|
||||
&ctx->tweak_ctx, &ctx->crypt_ctx);
|
||||
}
|
||||
|
||||
static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
|
||||
struct scatterlist *src, unsigned int nbytes)
|
||||
{
|
||||
struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
|
||||
be128 buf[CAMELLIA_AESNI_PARALLEL_BLOCKS];
|
||||
struct crypt_priv crypt_ctx = {
|
||||
.ctx = &ctx->crypt_ctx,
|
||||
.fpu_enabled = false,
|
||||
};
|
||||
struct xts_crypt_req req = {
|
||||
.tbuf = buf,
|
||||
.tbuflen = sizeof(buf),
|
||||
|
||||
.tweak_ctx = &ctx->tweak_ctx,
|
||||
.tweak_fn = XTS_TWEAK_CAST(camellia_enc_blk),
|
||||
.crypt_ctx = &crypt_ctx,
|
||||
.crypt_fn = decrypt_callback,
|
||||
};
|
||||
int ret;
|
||||
|
||||
desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
|
||||
ret = xts_crypt(desc, dst, src, nbytes, &req);
|
||||
camellia_fpu_end(crypt_ctx.fpu_enabled);
|
||||
|
||||
return ret;
|
||||
return glue_xts_crypt_128bit(&camellia_dec_xts, desc, dst, src, nbytes,
|
||||
XTS_TWEAK_CAST(camellia_enc_blk),
|
||||
&ctx->tweak_ctx, &ctx->crypt_ctx);
|
||||
}
|
||||
|
||||
static struct crypto_alg cmll_algs[10] = { {
|
||||
|
@ -4,7 +4,7 @@
|
||||
* Copyright (C) 2012 Johannes Goetzfried
|
||||
* <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
|
||||
*
|
||||
* Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
|
||||
* Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
@ -227,6 +227,8 @@
|
||||
.data
|
||||
|
||||
.align 16
|
||||
.Lxts_gf128mul_and_shl1_mask:
|
||||
.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
|
||||
.Lbswap_mask:
|
||||
.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
|
||||
.Lbswap128_mask:
|
||||
@ -424,3 +426,47 @@ ENTRY(cast6_ctr_8way)
|
||||
|
||||
ret;
|
||||
ENDPROC(cast6_ctr_8way)
|
||||
|
||||
ENTRY(cast6_xts_enc_8way)
|
||||
/* input:
|
||||
* %rdi: ctx, CTX
|
||||
* %rsi: dst
|
||||
* %rdx: src
|
||||
* %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
|
||||
*/
|
||||
|
||||
movq %rsi, %r11;
|
||||
|
||||
/* regs <= src, dst <= IVs, regs <= regs xor IVs */
|
||||
load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
|
||||
RX, RKR, RKM, .Lxts_gf128mul_and_shl1_mask);
|
||||
|
||||
call __cast6_enc_blk8;
|
||||
|
||||
/* dst <= regs xor IVs(in dst) */
|
||||
store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
|
||||
|
||||
ret;
|
||||
ENDPROC(cast6_xts_enc_8way)
|
||||
|
||||
ENTRY(cast6_xts_dec_8way)
|
||||
/* input:
|
||||
* %rdi: ctx, CTX
|
||||
* %rsi: dst
|
||||
* %rdx: src
|
||||
* %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
|
||||
*/
|
||||
|
||||
movq %rsi, %r11;
|
||||
|
||||
/* regs <= src, dst <= IVs, regs <= regs xor IVs */
|
||||
load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
|
||||
RX, RKR, RKM, .Lxts_gf128mul_and_shl1_mask);
|
||||
|
||||
call __cast6_dec_blk8;
|
||||
|
||||
/* dst <= regs xor IVs(in dst) */
|
||||
store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
|
||||
|
||||
ret;
|
||||
ENDPROC(cast6_xts_dec_8way)
|
||||
|
@ -4,6 +4,8 @@
|
||||
* Copyright (C) 2012 Johannes Goetzfried
|
||||
* <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
|
||||
*
|
||||
* Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
@ -50,6 +52,23 @@ asmlinkage void cast6_cbc_dec_8way(struct cast6_ctx *ctx, u8 *dst,
|
||||
asmlinkage void cast6_ctr_8way(struct cast6_ctx *ctx, u8 *dst, const u8 *src,
|
||||
le128 *iv);
|
||||
|
||||
asmlinkage void cast6_xts_enc_8way(struct cast6_ctx *ctx, u8 *dst,
|
||||
const u8 *src, le128 *iv);
|
||||
asmlinkage void cast6_xts_dec_8way(struct cast6_ctx *ctx, u8 *dst,
|
||||
const u8 *src, le128 *iv);
|
||||
|
||||
static void cast6_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv)
|
||||
{
|
||||
glue_xts_crypt_128bit_one(ctx, dst, src, iv,
|
||||
GLUE_FUNC_CAST(__cast6_encrypt));
|
||||
}
|
||||
|
||||
static void cast6_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv)
|
||||
{
|
||||
glue_xts_crypt_128bit_one(ctx, dst, src, iv,
|
||||
GLUE_FUNC_CAST(__cast6_decrypt));
|
||||
}
|
||||
|
||||
static void cast6_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
|
||||
{
|
||||
be128 ctrblk;
|
||||
@ -87,6 +106,19 @@ static const struct common_glue_ctx cast6_ctr = {
|
||||
} }
|
||||
};
|
||||
|
||||
static const struct common_glue_ctx cast6_enc_xts = {
|
||||
.num_funcs = 2,
|
||||
.fpu_blocks_limit = CAST6_PARALLEL_BLOCKS,
|
||||
|
||||
.funcs = { {
|
||||
.num_blocks = CAST6_PARALLEL_BLOCKS,
|
||||
.fn_u = { .xts = GLUE_XTS_FUNC_CAST(cast6_xts_enc_8way) }
|
||||
}, {
|
||||
.num_blocks = 1,
|
||||
.fn_u = { .xts = GLUE_XTS_FUNC_CAST(cast6_xts_enc) }
|
||||
} }
|
||||
};
|
||||
|
||||
static const struct common_glue_ctx cast6_dec = {
|
||||
.num_funcs = 2,
|
||||
.fpu_blocks_limit = CAST6_PARALLEL_BLOCKS,
|
||||
@ -113,6 +145,19 @@ static const struct common_glue_ctx cast6_dec_cbc = {
|
||||
} }
|
||||
};
|
||||
|
||||
static const struct common_glue_ctx cast6_dec_xts = {
|
||||
.num_funcs = 2,
|
||||
.fpu_blocks_limit = CAST6_PARALLEL_BLOCKS,
|
||||
|
||||
.funcs = { {
|
||||
.num_blocks = CAST6_PARALLEL_BLOCKS,
|
||||
.fn_u = { .xts = GLUE_XTS_FUNC_CAST(cast6_xts_dec_8way) }
|
||||
}, {
|
||||
.num_blocks = 1,
|
||||
.fn_u = { .xts = GLUE_XTS_FUNC_CAST(cast6_xts_dec) }
|
||||
} }
|
||||
};
|
||||
|
||||
static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
|
||||
struct scatterlist *src, unsigned int nbytes)
|
||||
{
|
||||
@ -307,54 +352,20 @@ static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
|
||||
struct scatterlist *src, unsigned int nbytes)
|
||||
{
|
||||
struct cast6_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
|
||||
be128 buf[CAST6_PARALLEL_BLOCKS];
|
||||
struct crypt_priv crypt_ctx = {
|
||||
.ctx = &ctx->crypt_ctx,
|
||||
.fpu_enabled = false,
|
||||
};
|
||||
struct xts_crypt_req req = {
|
||||
.tbuf = buf,
|
||||
.tbuflen = sizeof(buf),
|
||||
|
||||
.tweak_ctx = &ctx->tweak_ctx,
|
||||
.tweak_fn = XTS_TWEAK_CAST(__cast6_encrypt),
|
||||
.crypt_ctx = &crypt_ctx,
|
||||
.crypt_fn = encrypt_callback,
|
||||
};
|
||||
int ret;
|
||||
|
||||
desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
|
||||
ret = xts_crypt(desc, dst, src, nbytes, &req);
|
||||
cast6_fpu_end(crypt_ctx.fpu_enabled);
|
||||
|
||||
return ret;
|
||||
return glue_xts_crypt_128bit(&cast6_enc_xts, desc, dst, src, nbytes,
|
||||
XTS_TWEAK_CAST(__cast6_encrypt),
|
||||
&ctx->tweak_ctx, &ctx->crypt_ctx);
|
||||
}
|
||||
|
||||
static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
|
||||
struct scatterlist *src, unsigned int nbytes)
|
||||
{
|
||||
struct cast6_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
|
||||
be128 buf[CAST6_PARALLEL_BLOCKS];
|
||||
struct crypt_priv crypt_ctx = {
|
||||
.ctx = &ctx->crypt_ctx,
|
||||
.fpu_enabled = false,
|
||||
};
|
||||
struct xts_crypt_req req = {
|
||||
.tbuf = buf,
|
||||
.tbuflen = sizeof(buf),
|
||||
|
||||
.tweak_ctx = &ctx->tweak_ctx,
|
||||
.tweak_fn = XTS_TWEAK_CAST(__cast6_encrypt),
|
||||
.crypt_ctx = &crypt_ctx,
|
||||
.crypt_fn = decrypt_callback,
|
||||
};
|
||||
int ret;
|
||||
|
||||
desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
|
||||
ret = xts_crypt(desc, dst, src, nbytes, &req);
|
||||
cast6_fpu_end(crypt_ctx.fpu_enabled);
|
||||
|
||||
return ret;
|
||||
return glue_xts_crypt_128bit(&cast6_dec_xts, desc, dst, src, nbytes,
|
||||
XTS_TWEAK_CAST(__cast6_encrypt),
|
||||
&ctx->tweak_ctx, &ctx->crypt_ctx);
|
||||
}
|
||||
|
||||
static struct crypto_alg cast6_algs[10] = { {
|
||||
|
@ -101,9 +101,8 @@
|
||||
* uint crc32_pclmul_le_16(unsigned char const *buffer,
|
||||
* size_t len, uint crc32)
|
||||
*/
|
||||
.globl crc32_pclmul_le_16
|
||||
.align 4, 0x90
|
||||
crc32_pclmul_le_16:/* buffer and buffer size are 16 bytes aligned */
|
||||
|
||||
ENTRY(crc32_pclmul_le_16) /* buffer and buffer size are 16 bytes aligned */
|
||||
movdqa (BUF), %xmm1
|
||||
movdqa 0x10(BUF), %xmm2
|
||||
movdqa 0x20(BUF), %xmm3
|
||||
@ -244,3 +243,4 @@ fold_64:
|
||||
pextrd $0x01, %xmm1, %eax
|
||||
|
||||
ret
|
||||
ENDPROC(crc32_pclmul_le_16)
|
||||
|
@ -1,9 +1,10 @@
|
||||
/*
|
||||
* Implement fast CRC32C with PCLMULQDQ instructions. (x86_64)
|
||||
*
|
||||
* The white paper on CRC32C calculations with PCLMULQDQ instruction can be
|
||||
* The white papers on CRC32C calculations with PCLMULQDQ instruction can be
|
||||
* downloaded from:
|
||||
* http://download.intel.com/design/intarch/papers/323405.pdf
|
||||
* http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf
|
||||
* http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-paper.pdf
|
||||
*
|
||||
* Copyright (C) 2012 Intel Corporation.
|
||||
*
|
||||
@ -42,6 +43,7 @@
|
||||
* SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <asm/inst.h>
|
||||
#include <linux/linkage.h>
|
||||
|
||||
## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction
|
||||
@ -225,10 +227,10 @@ LABEL crc_ %i
|
||||
movdqa (bufp), %xmm0 # 2 consts: K1:K2
|
||||
|
||||
movq crc_init, %xmm1 # CRC for block 1
|
||||
pclmulqdq $0x00,%xmm0,%xmm1 # Multiply by K2
|
||||
PCLMULQDQ 0x00,%xmm0,%xmm1 # Multiply by K2
|
||||
|
||||
movq crc1, %xmm2 # CRC for block 2
|
||||
pclmulqdq $0x10, %xmm0, %xmm2 # Multiply by K1
|
||||
PCLMULQDQ 0x10, %xmm0, %xmm2 # Multiply by K1
|
||||
|
||||
pxor %xmm2,%xmm1
|
||||
movq %xmm1, %rax
|
||||
|
@ -1,7 +1,7 @@
|
||||
/*
|
||||
* Shared glue code for 128bit block ciphers, AVX assembler macros
|
||||
*
|
||||
* Copyright (c) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
|
||||
* Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
@ -89,3 +89,62 @@
|
||||
vpxor (6*16)(src), x6, x6; \
|
||||
vpxor (7*16)(src), x7, x7; \
|
||||
store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
|
||||
|
||||
#define gf128mul_x_ble(iv, mask, tmp) \
|
||||
vpsrad $31, iv, tmp; \
|
||||
vpaddq iv, iv, iv; \
|
||||
vpshufd $0x13, tmp, tmp; \
|
||||
vpand mask, tmp, tmp; \
|
||||
vpxor tmp, iv, iv;
|
||||
|
||||
#define load_xts_8way(iv, src, dst, x0, x1, x2, x3, x4, x5, x6, x7, tiv, t0, \
|
||||
t1, xts_gf128mul_and_shl1_mask) \
|
||||
vmovdqa xts_gf128mul_and_shl1_mask, t0; \
|
||||
\
|
||||
/* load IV */ \
|
||||
vmovdqu (iv), tiv; \
|
||||
vpxor (0*16)(src), tiv, x0; \
|
||||
vmovdqu tiv, (0*16)(dst); \
|
||||
\
|
||||
/* construct and store IVs, also xor with source */ \
|
||||
gf128mul_x_ble(tiv, t0, t1); \
|
||||
vpxor (1*16)(src), tiv, x1; \
|
||||
vmovdqu tiv, (1*16)(dst); \
|
||||
\
|
||||
gf128mul_x_ble(tiv, t0, t1); \
|
||||
vpxor (2*16)(src), tiv, x2; \
|
||||
vmovdqu tiv, (2*16)(dst); \
|
||||
\
|
||||
gf128mul_x_ble(tiv, t0, t1); \
|
||||
vpxor (3*16)(src), tiv, x3; \
|
||||
vmovdqu tiv, (3*16)(dst); \
|
||||
\
|
||||
gf128mul_x_ble(tiv, t0, t1); \
|
||||
vpxor (4*16)(src), tiv, x4; \
|
||||
vmovdqu tiv, (4*16)(dst); \
|
||||
\
|
||||
gf128mul_x_ble(tiv, t0, t1); \
|
||||
vpxor (5*16)(src), tiv, x5; \
|
||||
vmovdqu tiv, (5*16)(dst); \
|
||||
\
|
||||
gf128mul_x_ble(tiv, t0, t1); \
|
||||
vpxor (6*16)(src), tiv, x6; \
|
||||
vmovdqu tiv, (6*16)(dst); \
|
||||
\
|
||||
gf128mul_x_ble(tiv, t0, t1); \
|
||||
vpxor (7*16)(src), tiv, x7; \
|
||||
vmovdqu tiv, (7*16)(dst); \
|
||||
\
|
||||
gf128mul_x_ble(tiv, t0, t1); \
|
||||
vmovdqu tiv, (iv);
|
||||
|
||||
#define store_xts_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \
|
||||
vpxor (0*16)(dst), x0, x0; \
|
||||
vpxor (1*16)(dst), x1, x1; \
|
||||
vpxor (2*16)(dst), x2, x2; \
|
||||
vpxor (3*16)(dst), x3, x3; \
|
||||
vpxor (4*16)(dst), x4, x4; \
|
||||
vpxor (5*16)(dst), x5, x5; \
|
||||
vpxor (6*16)(dst), x6, x6; \
|
||||
vpxor (7*16)(dst), x7, x7; \
|
||||
store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
|
||||
|
180
arch/x86/crypto/glue_helper-asm-avx2.S
Normal file
180
arch/x86/crypto/glue_helper-asm-avx2.S
Normal file
@ -0,0 +1,180 @@
|
||||
/*
|
||||
* Shared glue code for 128bit block ciphers, AVX2 assembler macros
|
||||
*
|
||||
* Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
*/
|
||||
|
||||
#define load_16way(src, x0, x1, x2, x3, x4, x5, x6, x7) \
|
||||
vmovdqu (0*32)(src), x0; \
|
||||
vmovdqu (1*32)(src), x1; \
|
||||
vmovdqu (2*32)(src), x2; \
|
||||
vmovdqu (3*32)(src), x3; \
|
||||
vmovdqu (4*32)(src), x4; \
|
||||
vmovdqu (5*32)(src), x5; \
|
||||
vmovdqu (6*32)(src), x6; \
|
||||
vmovdqu (7*32)(src), x7;
|
||||
|
||||
#define store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \
|
||||
vmovdqu x0, (0*32)(dst); \
|
||||
vmovdqu x1, (1*32)(dst); \
|
||||
vmovdqu x2, (2*32)(dst); \
|
||||
vmovdqu x3, (3*32)(dst); \
|
||||
vmovdqu x4, (4*32)(dst); \
|
||||
vmovdqu x5, (5*32)(dst); \
|
||||
vmovdqu x6, (6*32)(dst); \
|
||||
vmovdqu x7, (7*32)(dst);
|
||||
|
||||
#define store_cbc_16way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7, t0) \
|
||||
vpxor t0, t0, t0; \
|
||||
vinserti128 $1, (src), t0, t0; \
|
||||
vpxor t0, x0, x0; \
|
||||
vpxor (0*32+16)(src), x1, x1; \
|
||||
vpxor (1*32+16)(src), x2, x2; \
|
||||
vpxor (2*32+16)(src), x3, x3; \
|
||||
vpxor (3*32+16)(src), x4, x4; \
|
||||
vpxor (4*32+16)(src), x5, x5; \
|
||||
vpxor (5*32+16)(src), x6, x6; \
|
||||
vpxor (6*32+16)(src), x7, x7; \
|
||||
store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
|
||||
|
||||
#define inc_le128(x, minus_one, tmp) \
|
||||
vpcmpeqq minus_one, x, tmp; \
|
||||
vpsubq minus_one, x, x; \
|
||||
vpslldq $8, tmp, tmp; \
|
||||
vpsubq tmp, x, x;
|
||||
|
||||
#define add2_le128(x, minus_one, minus_two, tmp1, tmp2) \
|
||||
vpcmpeqq minus_one, x, tmp1; \
|
||||
vpcmpeqq minus_two, x, tmp2; \
|
||||
vpsubq minus_two, x, x; \
|
||||
vpor tmp2, tmp1, tmp1; \
|
||||
vpslldq $8, tmp1, tmp1; \
|
||||
vpsubq tmp1, x, x;
|
||||
|
||||
#define load_ctr_16way(iv, bswap, x0, x1, x2, x3, x4, x5, x6, x7, t0, t0x, t1, \
|
||||
t1x, t2, t2x, t3, t3x, t4, t5) \
|
||||
vpcmpeqd t0, t0, t0; \
|
||||
vpsrldq $8, t0, t0; /* ab: -1:0 ; cd: -1:0 */ \
|
||||
vpaddq t0, t0, t4; /* ab: -2:0 ; cd: -2:0 */\
|
||||
\
|
||||
/* load IV and byteswap */ \
|
||||
vmovdqu (iv), t2x; \
|
||||
vmovdqa t2x, t3x; \
|
||||
inc_le128(t2x, t0x, t1x); \
|
||||
vbroadcasti128 bswap, t1; \
|
||||
vinserti128 $1, t2x, t3, t2; /* ab: le0 ; cd: le1 */ \
|
||||
vpshufb t1, t2, x0; \
|
||||
\
|
||||
/* construct IVs */ \
|
||||
add2_le128(t2, t0, t4, t3, t5); /* ab: le2 ; cd: le3 */ \
|
||||
vpshufb t1, t2, x1; \
|
||||
add2_le128(t2, t0, t4, t3, t5); \
|
||||
vpshufb t1, t2, x2; \
|
||||
add2_le128(t2, t0, t4, t3, t5); \
|
||||
vpshufb t1, t2, x3; \
|
||||
add2_le128(t2, t0, t4, t3, t5); \
|
||||
vpshufb t1, t2, x4; \
|
||||
add2_le128(t2, t0, t4, t3, t5); \
|
||||
vpshufb t1, t2, x5; \
|
||||
add2_le128(t2, t0, t4, t3, t5); \
|
||||
vpshufb t1, t2, x6; \
|
||||
add2_le128(t2, t0, t4, t3, t5); \
|
||||
vpshufb t1, t2, x7; \
|
||||
vextracti128 $1, t2, t2x; \
|
||||
inc_le128(t2x, t0x, t3x); \
|
||||
vmovdqu t2x, (iv);
|
||||
|
||||
#define store_ctr_16way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7) \
|
||||
vpxor (0*32)(src), x0, x0; \
|
||||
vpxor (1*32)(src), x1, x1; \
|
||||
vpxor (2*32)(src), x2, x2; \
|
||||
vpxor (3*32)(src), x3, x3; \
|
||||
vpxor (4*32)(src), x4, x4; \
|
||||
vpxor (5*32)(src), x5, x5; \
|
||||
vpxor (6*32)(src), x6, x6; \
|
||||
vpxor (7*32)(src), x7, x7; \
|
||||
store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
|
||||
|
||||
#define gf128mul_x_ble(iv, mask, tmp) \
|
||||
vpsrad $31, iv, tmp; \
|
||||
vpaddq iv, iv, iv; \
|
||||
vpshufd $0x13, tmp, tmp; \
|
||||
vpand mask, tmp, tmp; \
|
||||
vpxor tmp, iv, iv;
|
||||
|
||||
#define gf128mul_x2_ble(iv, mask1, mask2, tmp0, tmp1) \
|
||||
vpsrad $31, iv, tmp0; \
|
||||
vpaddq iv, iv, tmp1; \
|
||||
vpsllq $2, iv, iv; \
|
||||
vpshufd $0x13, tmp0, tmp0; \
|
||||
vpsrad $31, tmp1, tmp1; \
|
||||
vpand mask2, tmp0, tmp0; \
|
||||
vpshufd $0x13, tmp1, tmp1; \
|
||||
vpxor tmp0, iv, iv; \
|
||||
vpand mask1, tmp1, tmp1; \
|
||||
vpxor tmp1, iv, iv;
|
||||
|
||||
#define load_xts_16way(iv, src, dst, x0, x1, x2, x3, x4, x5, x6, x7, tiv, \
|
||||
tivx, t0, t0x, t1, t1x, t2, t2x, t3, \
|
||||
xts_gf128mul_and_shl1_mask_0, \
|
||||
xts_gf128mul_and_shl1_mask_1) \
|
||||
vbroadcasti128 xts_gf128mul_and_shl1_mask_0, t1; \
|
||||
\
|
||||
/* load IV and construct second IV */ \
|
||||
vmovdqu (iv), tivx; \
|
||||
vmovdqa tivx, t0x; \
|
||||
gf128mul_x_ble(tivx, t1x, t2x); \
|
||||
vbroadcasti128 xts_gf128mul_and_shl1_mask_1, t2; \
|
||||
vinserti128 $1, tivx, t0, tiv; \
|
||||
vpxor (0*32)(src), tiv, x0; \
|
||||
vmovdqu tiv, (0*32)(dst); \
|
||||
\
|
||||
/* construct and store IVs, also xor with source */ \
|
||||
gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
|
||||
vpxor (1*32)(src), tiv, x1; \
|
||||
vmovdqu tiv, (1*32)(dst); \
|
||||
\
|
||||
gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
|
||||
vpxor (2*32)(src), tiv, x2; \
|
||||
vmovdqu tiv, (2*32)(dst); \
|
||||
\
|
||||
gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
|
||||
vpxor (3*32)(src), tiv, x3; \
|
||||
vmovdqu tiv, (3*32)(dst); \
|
||||
\
|
||||
gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
|
||||
vpxor (4*32)(src), tiv, x4; \
|
||||
vmovdqu tiv, (4*32)(dst); \
|
||||
\
|
||||
gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
|
||||
vpxor (5*32)(src), tiv, x5; \
|
||||
vmovdqu tiv, (5*32)(dst); \
|
||||
\
|
||||
gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
|
||||
vpxor (6*32)(src), tiv, x6; \
|
||||
vmovdqu tiv, (6*32)(dst); \
|
||||
\
|
||||
gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
|
||||
vpxor (7*32)(src), tiv, x7; \
|
||||
vmovdqu tiv, (7*32)(dst); \
|
||||
\
|
||||
vextracti128 $1, tiv, tivx; \
|
||||
gf128mul_x_ble(tivx, t1x, t2x); \
|
||||
vmovdqu tivx, (iv);
|
||||
|
||||
#define store_xts_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \
|
||||
vpxor (0*32)(dst), x0, x0; \
|
||||
vpxor (1*32)(dst), x1, x1; \
|
||||
vpxor (2*32)(dst), x2, x2; \
|
||||
vpxor (3*32)(dst), x3, x3; \
|
||||
vpxor (4*32)(dst), x4, x4; \
|
||||
vpxor (5*32)(dst), x5, x5; \
|
||||
vpxor (6*32)(dst), x6, x6; \
|
||||
vpxor (7*32)(dst), x7, x7; \
|
||||
store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
|
@ -1,7 +1,7 @@
|
||||
/*
|
||||
* Shared glue code for 128bit block ciphers
|
||||
*
|
||||
* Copyright (c) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
|
||||
* Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
|
||||
*
|
||||
* CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by:
|
||||
* Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
|
||||
@ -304,4 +304,99 @@ int glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx,
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(glue_ctr_crypt_128bit);
|
||||
|
||||
static unsigned int __glue_xts_crypt_128bit(const struct common_glue_ctx *gctx,
|
||||
void *ctx,
|
||||
struct blkcipher_desc *desc,
|
||||
struct blkcipher_walk *walk)
|
||||
{
|
||||
const unsigned int bsize = 128 / 8;
|
||||
unsigned int nbytes = walk->nbytes;
|
||||
u128 *src = (u128 *)walk->src.virt.addr;
|
||||
u128 *dst = (u128 *)walk->dst.virt.addr;
|
||||
unsigned int num_blocks, func_bytes;
|
||||
unsigned int i;
|
||||
|
||||
/* Process multi-block batch */
|
||||
for (i = 0; i < gctx->num_funcs; i++) {
|
||||
num_blocks = gctx->funcs[i].num_blocks;
|
||||
func_bytes = bsize * num_blocks;
|
||||
|
||||
if (nbytes >= func_bytes) {
|
||||
do {
|
||||
gctx->funcs[i].fn_u.xts(ctx, dst, src,
|
||||
(le128 *)walk->iv);
|
||||
|
||||
src += num_blocks;
|
||||
dst += num_blocks;
|
||||
nbytes -= func_bytes;
|
||||
} while (nbytes >= func_bytes);
|
||||
|
||||
if (nbytes < bsize)
|
||||
goto done;
|
||||
}
|
||||
}
|
||||
|
||||
done:
|
||||
return nbytes;
|
||||
}
|
||||
|
||||
/* for implementations implementing faster XTS IV generator */
|
||||
int glue_xts_crypt_128bit(const struct common_glue_ctx *gctx,
|
||||
struct blkcipher_desc *desc, struct scatterlist *dst,
|
||||
struct scatterlist *src, unsigned int nbytes,
|
||||
void (*tweak_fn)(void *ctx, u8 *dst, const u8 *src),
|
||||
void *tweak_ctx, void *crypt_ctx)
|
||||
{
|
||||
const unsigned int bsize = 128 / 8;
|
||||
bool fpu_enabled = false;
|
||||
struct blkcipher_walk walk;
|
||||
int err;
|
||||
|
||||
blkcipher_walk_init(&walk, dst, src, nbytes);
|
||||
|
||||
err = blkcipher_walk_virt(desc, &walk);
|
||||
nbytes = walk.nbytes;
|
||||
if (!nbytes)
|
||||
return err;
|
||||
|
||||
/* set minimum length to bsize, for tweak_fn */
|
||||
fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
|
||||
desc, fpu_enabled,
|
||||
nbytes < bsize ? bsize : nbytes);
|
||||
|
||||
/* calculate first value of T */
|
||||
tweak_fn(tweak_ctx, walk.iv, walk.iv);
|
||||
|
||||
while (nbytes) {
|
||||
nbytes = __glue_xts_crypt_128bit(gctx, crypt_ctx, desc, &walk);
|
||||
|
||||
err = blkcipher_walk_done(desc, &walk, nbytes);
|
||||
nbytes = walk.nbytes;
|
||||
}
|
||||
|
||||
glue_fpu_end(fpu_enabled);
|
||||
|
||||
return err;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(glue_xts_crypt_128bit);
|
||||
|
||||
void glue_xts_crypt_128bit_one(void *ctx, u128 *dst, const u128 *src, le128 *iv,
|
||||
common_glue_func_t fn)
|
||||
{
|
||||
le128 ivblk = *iv;
|
||||
|
||||
/* generate next IV */
|
||||
le128_gf128mul_x_ble(iv, &ivblk);
|
||||
|
||||
/* CC <- T xor C */
|
||||
u128_xor(dst, src, (u128 *)&ivblk);
|
||||
|
||||
/* PP <- D(Key2,CC) */
|
||||
fn(ctx, (u8 *)dst, (u8 *)dst);
|
||||
|
||||
/* P <- T xor PP */
|
||||
u128_xor(dst, dst, (u128 *)&ivblk);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(glue_xts_crypt_128bit_one);
|
||||
|
||||
MODULE_LICENSE("GPL");
|
||||
|
@ -4,8 +4,7 @@
|
||||
* Copyright (C) 2012 Johannes Goetzfried
|
||||
* <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
|
||||
*
|
||||
* Based on arch/x86/crypto/serpent-sse2-x86_64-asm_64.S by
|
||||
* Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
|
||||
* Copyright © 2011-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
@ -34,6 +33,8 @@
|
||||
|
||||
.Lbswap128_mask:
|
||||
.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
|
||||
.Lxts_gf128mul_and_shl1_mask:
|
||||
.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
|
||||
|
||||
.text
|
||||
|
||||
@ -739,3 +740,43 @@ ENTRY(serpent_ctr_8way_avx)
|
||||
|
||||
ret;
|
||||
ENDPROC(serpent_ctr_8way_avx)
|
||||
|
||||
ENTRY(serpent_xts_enc_8way_avx)
|
||||
/* input:
|
||||
* %rdi: ctx, CTX
|
||||
* %rsi: dst
|
||||
* %rdx: src
|
||||
* %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
|
||||
*/
|
||||
|
||||
/* regs <= src, dst <= IVs, regs <= regs xor IVs */
|
||||
load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
|
||||
RK0, RK1, RK2, .Lxts_gf128mul_and_shl1_mask);
|
||||
|
||||
call __serpent_enc_blk8_avx;
|
||||
|
||||
/* dst <= regs xor IVs(in dst) */
|
||||
store_xts_8way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
|
||||
|
||||
ret;
|
||||
ENDPROC(serpent_xts_enc_8way_avx)
|
||||
|
||||
ENTRY(serpent_xts_dec_8way_avx)
|
||||
/* input:
|
||||
* %rdi: ctx, CTX
|
||||
* %rsi: dst
|
||||
* %rdx: src
|
||||
* %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
|
||||
*/
|
||||
|
||||
/* regs <= src, dst <= IVs, regs <= regs xor IVs */
|
||||
load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
|
||||
RK0, RK1, RK2, .Lxts_gf128mul_and_shl1_mask);
|
||||
|
||||
call __serpent_dec_blk8_avx;
|
||||
|
||||
/* dst <= regs xor IVs(in dst) */
|
||||
store_xts_8way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);
|
||||
|
||||
ret;
|
||||
ENDPROC(serpent_xts_dec_8way_avx)
|
||||
|
800
arch/x86/crypto/serpent-avx2-asm_64.S
Normal file
800
arch/x86/crypto/serpent-avx2-asm_64.S
Normal file
@ -0,0 +1,800 @@
|
||||
/*
|
||||
* x86_64/AVX2 assembler optimized version of Serpent
|
||||
*
|
||||
* Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
|
||||
*
|
||||
* Based on AVX assembler implementation of Serpent by:
|
||||
* Copyright © 2012 Johannes Goetzfried
|
||||
* <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
*/
|
||||
|
||||
#include <linux/linkage.h>
|
||||
#include "glue_helper-asm-avx2.S"
|
||||
|
||||
.file "serpent-avx2-asm_64.S"
|
||||
|
||||
.data
|
||||
.align 16
|
||||
|
||||
.Lbswap128_mask:
|
||||
.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
|
||||
.Lxts_gf128mul_and_shl1_mask_0:
|
||||
.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
|
||||
.Lxts_gf128mul_and_shl1_mask_1:
|
||||
.byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0
|
||||
|
||||
.text
|
||||
|
||||
#define CTX %rdi
|
||||
|
||||
#define RNOT %ymm0
|
||||
#define tp %ymm1
|
||||
|
||||
#define RA1 %ymm2
|
||||
#define RA2 %ymm3
|
||||
#define RB1 %ymm4
|
||||
#define RB2 %ymm5
|
||||
#define RC1 %ymm6
|
||||
#define RC2 %ymm7
|
||||
#define RD1 %ymm8
|
||||
#define RD2 %ymm9
|
||||
#define RE1 %ymm10
|
||||
#define RE2 %ymm11
|
||||
|
||||
#define RK0 %ymm12
|
||||
#define RK1 %ymm13
|
||||
#define RK2 %ymm14
|
||||
#define RK3 %ymm15
|
||||
|
||||
#define RK0x %xmm12
|
||||
#define RK1x %xmm13
|
||||
#define RK2x %xmm14
|
||||
#define RK3x %xmm15
|
||||
|
||||
#define S0_1(x0, x1, x2, x3, x4) \
|
||||
vpor x0, x3, tp; \
|
||||
vpxor x3, x0, x0; \
|
||||
vpxor x2, x3, x4; \
|
||||
vpxor RNOT, x4, x4; \
|
||||
vpxor x1, tp, x3; \
|
||||
vpand x0, x1, x1; \
|
||||
vpxor x4, x1, x1; \
|
||||
vpxor x0, x2, x2;
|
||||
#define S0_2(x0, x1, x2, x3, x4) \
|
||||
vpxor x3, x0, x0; \
|
||||
vpor x0, x4, x4; \
|
||||
vpxor x2, x0, x0; \
|
||||
vpand x1, x2, x2; \
|
||||
vpxor x2, x3, x3; \
|
||||
vpxor RNOT, x1, x1; \
|
||||
vpxor x4, x2, x2; \
|
||||
vpxor x2, x1, x1;
|
||||
|
||||
#define S1_1(x0, x1, x2, x3, x4) \
|
||||
vpxor x0, x1, tp; \
|
||||
vpxor x3, x0, x0; \
|
||||
vpxor RNOT, x3, x3; \
|
||||
vpand tp, x1, x4; \
|
||||
vpor tp, x0, x0; \
|
||||
vpxor x2, x3, x3; \
|
||||
vpxor x3, x0, x0; \
|
||||
vpxor x3, tp, x1;
|
||||
#define S1_2(x0, x1, x2, x3, x4) \
|
||||
vpxor x4, x3, x3; \
|
||||
vpor x4, x1, x1; \
|
||||
vpxor x2, x4, x4; \
|
||||
vpand x0, x2, x2; \
|
||||
vpxor x1, x2, x2; \
|
||||
vpor x0, x1, x1; \
|
||||
vpxor RNOT, x0, x0; \
|
||||
vpxor x2, x0, x0; \
|
||||
vpxor x1, x4, x4;
|
||||
|
||||
#define S2_1(x0, x1, x2, x3, x4) \
|
||||
vpxor RNOT, x3, x3; \
|
||||
vpxor x0, x1, x1; \
|
||||
vpand x2, x0, tp; \
|
||||
vpxor x3, tp, tp; \
|
||||
vpor x0, x3, x3; \
|
||||
vpxor x1, x2, x2; \
|
||||
vpxor x1, x3, x3; \
|
||||
vpand tp, x1, x1;
|
||||
#define S2_2(x0, x1, x2, x3, x4) \
|
||||
vpxor x2, tp, tp; \
|
||||
vpand x3, x2, x2; \
|
||||
vpor x1, x3, x3; \
|
||||
vpxor RNOT, tp, tp; \
|
||||
vpxor tp, x3, x3; \
|
||||
vpxor tp, x0, x4; \
|
||||
vpxor x2, tp, x0; \
|
||||
vpor x2, x1, x1;
|
||||
|
||||
#define S3_1(x0, x1, x2, x3, x4) \
|
||||
vpxor x3, x1, tp; \
|
||||
vpor x0, x3, x3; \
|
||||
vpand x0, x1, x4; \
|
||||
vpxor x2, x0, x0; \
|
||||
vpxor tp, x2, x2; \
|
||||
vpand x3, tp, x1; \
|
||||
vpxor x3, x2, x2; \
|
||||
vpor x4, x0, x0; \
|
||||
vpxor x3, x4, x4;
|
||||
#define S3_2(x0, x1, x2, x3, x4) \
|
||||
vpxor x0, x1, x1; \
|
||||
vpand x3, x0, x0; \
|
||||
vpand x4, x3, x3; \
|
||||
vpxor x2, x3, x3; \
|
||||
vpor x1, x4, x4; \
|
||||
vpand x1, x2, x2; \
|
||||
vpxor x3, x4, x4; \
|
||||
vpxor x3, x0, x0; \
|
||||
vpxor x2, x3, x3;
|
||||
|
||||
#define S4_1(x0, x1, x2, x3, x4) \
|
||||
vpand x0, x3, tp; \
|
||||
vpxor x3, x0, x0; \
|
||||
vpxor x2, tp, tp; \
|
||||
vpor x3, x2, x2; \
|
||||
vpxor x1, x0, x0; \
|
||||
vpxor tp, x3, x4; \
|
||||
vpor x0, x2, x2; \
|
||||
vpxor x1, x2, x2;
|
||||
#define S4_2(x0, x1, x2, x3, x4) \
|
||||
vpand x0, x1, x1; \
|
||||
vpxor x4, x1, x1; \
|
||||
vpand x2, x4, x4; \
|
||||
vpxor tp, x2, x2; \
|
||||
vpxor x0, x4, x4; \
|
||||
vpor x1, tp, x3; \
|
||||
vpxor RNOT, x1, x1; \
|
||||
vpxor x0, x3, x3;
|
||||
|
||||
#define S5_1(x0, x1, x2, x3, x4) \
|
||||
vpor x0, x1, tp; \
|
||||
vpxor tp, x2, x2; \
|
||||
vpxor RNOT, x3, x3; \
|
||||
vpxor x0, x1, x4; \
|
||||
vpxor x2, x0, x0; \
|
||||
vpand x4, tp, x1; \
|
||||
vpor x3, x4, x4; \
|
||||
vpxor x0, x4, x4;
|
||||
#define S5_2(x0, x1, x2, x3, x4) \
|
||||
vpand x3, x0, x0; \
|
||||
vpxor x3, x1, x1; \
|
||||
vpxor x2, x3, x3; \
|
||||
vpxor x1, x0, x0; \
|
||||
vpand x4, x2, x2; \
|
||||
vpxor x2, x1, x1; \
|
||||
vpand x0, x2, x2; \
|
||||
vpxor x2, x3, x3;
|
||||
|
||||
#define S6_1(x0, x1, x2, x3, x4) \
|
||||
vpxor x0, x3, x3; \
|
||||
vpxor x2, x1, tp; \
|
||||
vpxor x0, x2, x2; \
|
||||
vpand x3, x0, x0; \
|
||||
vpor x3, tp, tp; \
|
||||
vpxor RNOT, x1, x4; \
|
||||
vpxor tp, x0, x0; \
|
||||
vpxor x2, tp, x1;
|
||||
#define S6_2(x0, x1, x2, x3, x4) \
|
||||
vpxor x4, x3, x3; \
|
||||
vpxor x0, x4, x4; \
|
||||
vpand x0, x2, x2; \
|
||||
vpxor x1, x4, x4; \
|
||||
vpxor x3, x2, x2; \
|
||||
vpand x1, x3, x3; \
|
||||
vpxor x0, x3, x3; \
|
||||
vpxor x2, x1, x1;
|
||||
|
||||
#define S7_1(x0, x1, x2, x3, x4) \
|
||||
vpxor RNOT, x1, tp; \
|
||||
vpxor RNOT, x0, x0; \
|
||||
vpand x2, tp, x1; \
|
||||
vpxor x3, x1, x1; \
|
||||
vpor tp, x3, x3; \
|
||||
vpxor x2, tp, x4; \
|
||||
vpxor x3, x2, x2; \
|
||||
vpxor x0, x3, x3; \
|
||||
vpor x1, x0, x0;
|
||||
#define S7_2(x0, x1, x2, x3, x4) \
|
||||
vpand x0, x2, x2; \
|
||||
vpxor x4, x0, x0; \
|
||||
vpxor x3, x4, x4; \
|
||||
vpand x0, x3, x3; \
|
||||
vpxor x1, x4, x4; \
|
||||
vpxor x4, x2, x2; \
|
||||
vpxor x1, x3, x3; \
|
||||
vpor x0, x4, x4; \
|
||||
vpxor x1, x4, x4;
|
||||
|
||||
#define SI0_1(x0, x1, x2, x3, x4) \
|
||||
vpxor x0, x1, x1; \
|
||||
vpor x1, x3, tp; \
|
||||
vpxor x1, x3, x4; \
|
||||
vpxor RNOT, x0, x0; \
|
||||
vpxor tp, x2, x2; \
|
||||
vpxor x0, tp, x3; \
|
||||
vpand x1, x0, x0; \
|
||||
vpxor x2, x0, x0;
|
||||
#define SI0_2(x0, x1, x2, x3, x4) \
|
||||
vpand x3, x2, x2; \
|
||||
vpxor x4, x3, x3; \
|
||||
vpxor x3, x2, x2; \
|
||||
vpxor x3, x1, x1; \
|
||||
vpand x0, x3, x3; \
|
||||
vpxor x0, x1, x1; \
|
||||
vpxor x2, x0, x0; \
|
||||
vpxor x3, x4, x4;
|
||||
|
||||
#define SI1_1(x0, x1, x2, x3, x4) \
|
||||
vpxor x3, x1, x1; \
|
||||
vpxor x2, x0, tp; \
|
||||
vpxor RNOT, x2, x2; \
|
||||
vpor x1, x0, x4; \
|
||||
vpxor x3, x4, x4; \
|
||||
vpand x1, x3, x3; \
|
||||
vpxor x2, x1, x1; \
|
||||
vpand x4, x2, x2;
|
||||
#define SI1_2(x0, x1, x2, x3, x4) \
|
||||
vpxor x1, x4, x4; \
|
||||
vpor x3, x1, x1; \
|
||||
vpxor tp, x3, x3; \
|
||||
vpxor tp, x2, x2; \
|
||||
vpor x4, tp, x0; \
|
||||
vpxor x4, x2, x2; \
|
||||
vpxor x0, x1, x1; \
|
||||
vpxor x1, x4, x4;
|
||||
|
||||
#define SI2_1(x0, x1, x2, x3, x4) \
|
||||
vpxor x1, x2, x2; \
|
||||
vpxor RNOT, x3, tp; \
|
||||
vpor x2, tp, tp; \
|
||||
vpxor x3, x2, x2; \
|
||||
vpxor x0, x3, x4; \
|
||||
vpxor x1, tp, x3; \
|
||||
vpor x2, x1, x1; \
|
||||
vpxor x0, x2, x2;
|
||||
#define SI2_2(x0, x1, x2, x3, x4) \
|
||||
vpxor x4, x1, x1; \
|
||||
vpor x3, x4, x4; \
|
||||
vpxor x3, x2, x2; \
|
||||
vpxor x2, x4, x4; \
|
||||
vpand x1, x2, x2; \
|
||||
vpxor x3, x2, x2; \
|
||||
vpxor x4, x3, x3; \
|
||||
vpxor x0, x4, x4;
|
||||
|
||||
#define SI3_1(x0, x1, x2, x3, x4) \
|
||||
vpxor x1, x2, x2; \
|
||||
vpand x2, x1, tp; \
|
||||
vpxor x0, tp, tp; \
|
||||
vpor x1, x0, x0; \
|
||||
vpxor x3, x1, x4; \
|
||||
vpxor x3, x0, x0; \
|
||||
vpor tp, x3, x3; \
|
||||
vpxor x2, tp, x1;
|
||||
#define SI3_2(x0, x1, x2, x3, x4) \
|
||||
vpxor x3, x1, x1; \
|
||||
vpxor x2, x0, x0; \
|
||||
vpxor x3, x2, x2; \
|
||||
vpand x1, x3, x3; \
|
||||
vpxor x0, x1, x1; \
|
||||
vpand x2, x0, x0; \
|
||||
vpxor x3, x4, x4; \
|
||||
vpxor x0, x3, x3; \
|
||||
vpxor x1, x0, x0;
|
||||
|
||||
#define SI4_1(x0, x1, x2, x3, x4) \
|
||||
vpxor x3, x2, x2; \
|
||||
vpand x1, x0, tp; \
|
||||
vpxor x2, tp, tp; \
|
||||
vpor x3, x2, x2; \
|
||||
vpxor RNOT, x0, x4; \
|
||||
vpxor tp, x1, x1; \
|
||||
vpxor x2, tp, x0; \
|
||||
vpand x4, x2, x2;
|
||||
#define SI4_2(x0, x1, x2, x3, x4) \
|
||||
vpxor x0, x2, x2; \
|
||||
vpor x4, x0, x0; \
|
||||
vpxor x3, x0, x0; \
|
||||
vpand x2, x3, x3; \
|
||||
vpxor x3, x4, x4; \
|
||||
vpxor x1, x3, x3; \
|
||||
vpand x0, x1, x1; \
|
||||
vpxor x1, x4, x4; \
|
||||
vpxor x3, x0, x0;
|
||||
|
||||
#define SI5_1(x0, x1, x2, x3, x4) \
|
||||
vpor x2, x1, tp; \
|
||||
vpxor x1, x2, x2; \
|
||||
vpxor x3, tp, tp; \
|
||||
vpand x1, x3, x3; \
|
||||
vpxor x3, x2, x2; \
|
||||
vpor x0, x3, x3; \
|
||||
vpxor RNOT, x0, x0; \
|
||||
vpxor x2, x3, x3; \
|
||||
vpor x0, x2, x2;
|
||||
#define SI5_2(x0, x1, x2, x3, x4) \
|
||||
vpxor tp, x1, x4; \
|
||||
vpxor x4, x2, x2; \
|
||||
vpand x0, x4, x4; \
|
||||
vpxor tp, x0, x0; \
|
||||
vpxor x3, tp, x1; \
|
||||
vpand x2, x0, x0; \
|
||||
vpxor x3, x2, x2; \
|
||||
vpxor x2, x0, x0; \
|
||||
vpxor x4, x2, x2; \
|
||||
vpxor x3, x4, x4;
|
||||
|
||||
#define SI6_1(x0, x1, x2, x3, x4) \
|
||||
vpxor x2, x0, x0; \
|
||||
vpand x3, x0, tp; \
|
||||
vpxor x3, x2, x2; \
|
||||
vpxor x2, tp, tp; \
|
||||
vpxor x1, x3, x3; \
|
||||
vpor x0, x2, x2; \
|
||||
vpxor x3, x2, x2; \
|
||||
vpand tp, x3, x3;
|
||||
#define SI6_2(x0, x1, x2, x3, x4) \
|
||||
vpxor RNOT, tp, tp; \
|
||||
vpxor x1, x3, x3; \
|
||||
vpand x2, x1, x1; \
|
||||
vpxor tp, x0, x4; \
|
||||
vpxor x4, x3, x3; \
|
||||
vpxor x2, x4, x4; \
|
||||
vpxor x1, tp, x0; \
|
||||
vpxor x0, x2, x2;
|
||||
|
||||
#define SI7_1(x0, x1, x2, x3, x4) \
|
||||
vpand x0, x3, tp; \
|
||||
vpxor x2, x0, x0; \
|
||||
vpor x3, x2, x2; \
|
||||
vpxor x1, x3, x4; \
|
||||
vpxor RNOT, x0, x0; \
|
||||
vpor tp, x1, x1; \
|
||||
vpxor x0, x4, x4; \
|
||||
vpand x2, x0, x0; \
|
||||
vpxor x1, x0, x0;
|
||||
#define SI7_2(x0, x1, x2, x3, x4) \
|
||||
vpand x2, x1, x1; \
|
||||
vpxor x2, tp, x3; \
|
||||
vpxor x3, x4, x4; \
|
||||
vpand x3, x2, x2; \
|
||||
vpor x0, x3, x3; \
|
||||
vpxor x4, x1, x1; \
|
||||
vpxor x4, x3, x3; \
|
||||
vpand x0, x4, x4; \
|
||||
vpxor x2, x4, x4;
|
||||
|
||||
#define get_key(i,j,t) \
|
||||
vpbroadcastd (4*(i)+(j))*4(CTX), t;
|
||||
|
||||
#define K2(x0, x1, x2, x3, x4, i) \
|
||||
get_key(i, 0, RK0); \
|
||||
get_key(i, 1, RK1); \
|
||||
get_key(i, 2, RK2); \
|
||||
get_key(i, 3, RK3); \
|
||||
vpxor RK0, x0 ## 1, x0 ## 1; \
|
||||
vpxor RK1, x1 ## 1, x1 ## 1; \
|
||||
vpxor RK2, x2 ## 1, x2 ## 1; \
|
||||
vpxor RK3, x3 ## 1, x3 ## 1; \
|
||||
vpxor RK0, x0 ## 2, x0 ## 2; \
|
||||
vpxor RK1, x1 ## 2, x1 ## 2; \
|
||||
vpxor RK2, x2 ## 2, x2 ## 2; \
|
||||
vpxor RK3, x3 ## 2, x3 ## 2;
|
||||
|
||||
#define LK2(x0, x1, x2, x3, x4, i) \
|
||||
vpslld $13, x0 ## 1, x4 ## 1; \
|
||||
vpsrld $(32 - 13), x0 ## 1, x0 ## 1; \
|
||||
vpor x4 ## 1, x0 ## 1, x0 ## 1; \
|
||||
vpxor x0 ## 1, x1 ## 1, x1 ## 1; \
|
||||
vpslld $3, x2 ## 1, x4 ## 1; \
|
||||
vpsrld $(32 - 3), x2 ## 1, x2 ## 1; \
|
||||
vpor x4 ## 1, x2 ## 1, x2 ## 1; \
|
||||
vpxor x2 ## 1, x1 ## 1, x1 ## 1; \
|
||||
vpslld $13, x0 ## 2, x4 ## 2; \
|
||||
vpsrld $(32 - 13), x0 ## 2, x0 ## 2; \
|
||||
vpor x4 ## 2, x0 ## 2, x0 ## 2; \
|
||||
vpxor x0 ## 2, x1 ## 2, x1 ## 2; \
|
||||
vpslld $3, x2 ## 2, x4 ## 2; \
|
||||
vpsrld $(32 - 3), x2 ## 2, x2 ## 2; \
|
||||
vpor x4 ## 2, x2 ## 2, x2 ## 2; \
|
||||
vpxor x2 ## 2, x1 ## 2, x1 ## 2; \
|
||||
vpslld $1, x1 ## 1, x4 ## 1; \
|
||||
vpsrld $(32 - 1), x1 ## 1, x1 ## 1; \
|
||||
vpor x4 ## 1, x1 ## 1, x1 ## 1; \
|
||||
vpslld $3, x0 ## 1, x4 ## 1; \
|
||||
vpxor x2 ## 1, x3 ## 1, x3 ## 1; \
|
||||
vpxor x4 ## 1, x3 ## 1, x3 ## 1; \
|
||||
get_key(i, 1, RK1); \
|
||||
vpslld $1, x1 ## 2, x4 ## 2; \
|
||||
vpsrld $(32 - 1), x1 ## 2, x1 ## 2; \
|
||||
vpor x4 ## 2, x1 ## 2, x1 ## 2; \
|
||||
vpslld $3, x0 ## 2, x4 ## 2; \
|
||||
vpxor x2 ## 2, x3 ## 2, x3 ## 2; \
|
||||
vpxor x4 ## 2, x3 ## 2, x3 ## 2; \
|
||||
get_key(i, 3, RK3); \
|
||||
vpslld $7, x3 ## 1, x4 ## 1; \
|
||||
vpsrld $(32 - 7), x3 ## 1, x3 ## 1; \
|
||||
vpor x4 ## 1, x3 ## 1, x3 ## 1; \
|
||||
vpslld $7, x1 ## 1, x4 ## 1; \
|
||||
vpxor x1 ## 1, x0 ## 1, x0 ## 1; \
|
||||
vpxor x3 ## 1, x0 ## 1, x0 ## 1; \
|
||||
vpxor x3 ## 1, x2 ## 1, x2 ## 1; \
|
||||
vpxor x4 ## 1, x2 ## 1, x2 ## 1; \
|
||||
get_key(i, 0, RK0); \
|
||||
vpslld $7, x3 ## 2, x4 ## 2; \
|
||||
vpsrld $(32 - 7), x3 ## 2, x3 ## 2; \
|
||||
vpor x4 ## 2, x3 ## 2, x3 ## 2; \
|
||||
vpslld $7, x1 ## 2, x4 ## 2; \
|
||||
vpxor x1 ## 2, x0 ## 2, x0 ## 2; \
|
||||
vpxor x3 ## 2, x0 ## 2, x0 ## 2; \
|
||||
vpxor x3 ## 2, x2 ## 2, x2 ## 2; \
|
||||
vpxor x4 ## 2, x2 ## 2, x2 ## 2; \
|
||||
get_key(i, 2, RK2); \
|
||||
vpxor RK1, x1 ## 1, x1 ## 1; \
|
||||
vpxor RK3, x3 ## 1, x3 ## 1; \
|
||||
vpslld $5, x0 ## 1, x4 ## 1; \
|
||||
vpsrld $(32 - 5), x0 ## 1, x0 ## 1; \
|
||||
vpor x4 ## 1, x0 ## 1, x0 ## 1; \
|
||||
vpslld $22, x2 ## 1, x4 ## 1; \
|
||||
vpsrld $(32 - 22), x2 ## 1, x2 ## 1; \
|
||||
vpor x4 ## 1, x2 ## 1, x2 ## 1; \
|
||||
vpxor RK0, x0 ## 1, x0 ## 1; \
|
||||
vpxor RK2, x2 ## 1, x2 ## 1; \
|
||||
vpxor RK1, x1 ## 2, x1 ## 2; \
|
||||
vpxor RK3, x3 ## 2, x3 ## 2; \
|
||||
vpslld $5, x0 ## 2, x4 ## 2; \
|
||||
vpsrld $(32 - 5), x0 ## 2, x0 ## 2; \
|
||||
vpor x4 ## 2, x0 ## 2, x0 ## 2; \
|
||||
vpslld $22, x2 ## 2, x4 ## 2; \
|
||||
vpsrld $(32 - 22), x2 ## 2, x2 ## 2; \
|
||||
vpor x4 ## 2, x2 ## 2, x2 ## 2; \
|
||||
vpxor RK0, x0 ## 2, x0 ## 2; \
|
||||
vpxor RK2, x2 ## 2, x2 ## 2;
|
||||
|
||||
#define KL2(x0, x1, x2, x3, x4, i) \
|
||||
vpxor RK0, x0 ## 1, x0 ## 1; \
|
||||
vpxor RK2, x2 ## 1, x2 ## 1; \
|
||||
vpsrld $5, x0 ## 1, x4 ## 1; \
|
||||
vpslld $(32 - 5), x0 ## 1, x0 ## 1; \
|
||||
vpor x4 ## 1, x0 ## 1, x0 ## 1; \
|
||||
vpxor RK3, x3 ## 1, x3 ## 1; \
|
||||
vpxor RK1, x1 ## 1, x1 ## 1; \
|
||||
vpsrld $22, x2 ## 1, x4 ## 1; \
|
||||
vpslld $(32 - 22), x2 ## 1, x2 ## 1; \
|
||||
vpor x4 ## 1, x2 ## 1, x2 ## 1; \
|
||||
vpxor x3 ## 1, x2 ## 1, x2 ## 1; \
|
||||
vpxor RK0, x0 ## 2, x0 ## 2; \
|
||||
vpxor RK2, x2 ## 2, x2 ## 2; \
|
||||
vpsrld $5, x0 ## 2, x4 ## 2; \
|
||||
vpslld $(32 - 5), x0 ## 2, x0 ## 2; \
|
||||
vpor x4 ## 2, x0 ## 2, x0 ## 2; \
|
||||
vpxor RK3, x3 ## 2, x3 ## 2; \
|
||||
vpxor RK1, x1 ## 2, x1 ## 2; \
|
||||
vpsrld $22, x2 ## 2, x4 ## 2; \
|
||||
vpslld $(32 - 22), x2 ## 2, x2 ## 2; \
|
||||
vpor x4 ## 2, x2 ## 2, x2 ## 2; \
|
||||
vpxor x3 ## 2, x2 ## 2, x2 ## 2; \
|
||||
vpxor x3 ## 1, x0 ## 1, x0 ## 1; \
|
||||
vpslld $7, x1 ## 1, x4 ## 1; \
|
||||
vpxor x1 ## 1, x0 ## 1, x0 ## 1; \
|
||||
vpxor x4 ## 1, x2 ## 1, x2 ## 1; \
|
||||
vpsrld $1, x1 ## 1, x4 ## 1; \
|
||||
vpslld $(32 - 1), x1 ## 1, x1 ## 1; \
|
||||
vpor x4 ## 1, x1 ## 1, x1 ## 1; \
|
||||
vpxor x3 ## 2, x0 ## 2, x0 ## 2; \
|
||||
vpslld $7, x1 ## 2, x4 ## 2; \
|
||||
vpxor x1 ## 2, x0 ## 2, x0 ## 2; \
|
||||
vpxor x4 ## 2, x2 ## 2, x2 ## 2; \
|
||||
vpsrld $1, x1 ## 2, x4 ## 2; \
|
||||
vpslld $(32 - 1), x1 ## 2, x1 ## 2; \
|
||||
vpor x4 ## 2, x1 ## 2, x1 ## 2; \
|
||||
vpsrld $7, x3 ## 1, x4 ## 1; \
|
||||
vpslld $(32 - 7), x3 ## 1, x3 ## 1; \
|
||||
vpor x4 ## 1, x3 ## 1, x3 ## 1; \
|
||||
vpxor x0 ## 1, x1 ## 1, x1 ## 1; \
|
||||
vpslld $3, x0 ## 1, x4 ## 1; \
|
||||
vpxor x4 ## 1, x3 ## 1, x3 ## 1; \
|
||||
vpsrld $7, x3 ## 2, x4 ## 2; \
|
||||
vpslld $(32 - 7), x3 ## 2, x3 ## 2; \
|
||||
vpor x4 ## 2, x3 ## 2, x3 ## 2; \
|
||||
vpxor x0 ## 2, x1 ## 2, x1 ## 2; \
|
||||
vpslld $3, x0 ## 2, x4 ## 2; \
|
||||
vpxor x4 ## 2, x3 ## 2, x3 ## 2; \
|
||||
vpsrld $13, x0 ## 1, x4 ## 1; \
|
||||
vpslld $(32 - 13), x0 ## 1, x0 ## 1; \
|
||||
vpor x4 ## 1, x0 ## 1, x0 ## 1; \
|
||||
vpxor x2 ## 1, x1 ## 1, x1 ## 1; \
|
||||
vpxor x2 ## 1, x3 ## 1, x3 ## 1; \
|
||||
vpsrld $3, x2 ## 1, x4 ## 1; \
|
||||
vpslld $(32 - 3), x2 ## 1, x2 ## 1; \
|
||||
vpor x4 ## 1, x2 ## 1, x2 ## 1; \
|
||||
vpsrld $13, x0 ## 2, x4 ## 2; \
|
||||
vpslld $(32 - 13), x0 ## 2, x0 ## 2; \
|
||||
vpor x4 ## 2, x0 ## 2, x0 ## 2; \
|
||||
vpxor x2 ## 2, x1 ## 2, x1 ## 2; \
|
||||
vpxor x2 ## 2, x3 ## 2, x3 ## 2; \
|
||||
vpsrld $3, x2 ## 2, x4 ## 2; \
|
||||
vpslld $(32 - 3), x2 ## 2, x2 ## 2; \
|
||||
vpor x4 ## 2, x2 ## 2, x2 ## 2;
|
||||
|
||||
#define S(SBOX, x0, x1, x2, x3, x4) \
|
||||
SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
|
||||
SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
|
||||
SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
|
||||
SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);
|
||||
|
||||
#define SP(SBOX, x0, x1, x2, x3, x4, i) \
|
||||
get_key(i, 0, RK0); \
|
||||
SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
|
||||
get_key(i, 2, RK2); \
|
||||
SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
|
||||
get_key(i, 3, RK3); \
|
||||
SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
|
||||
get_key(i, 1, RK1); \
|
||||
SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
|
||||
|
||||
#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
|
||||
vpunpckldq x1, x0, t0; \
|
||||
vpunpckhdq x1, x0, t2; \
|
||||
vpunpckldq x3, x2, t1; \
|
||||
vpunpckhdq x3, x2, x3; \
|
||||
\
|
||||
vpunpcklqdq t1, t0, x0; \
|
||||
vpunpckhqdq t1, t0, x1; \
|
||||
vpunpcklqdq x3, t2, x2; \
|
||||
vpunpckhqdq x3, t2, x3;
|
||||
|
||||
#define read_blocks(x0, x1, x2, x3, t0, t1, t2) \
|
||||
transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
|
||||
|
||||
#define write_blocks(x0, x1, x2, x3, t0, t1, t2) \
|
||||
transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
|
||||
|
||||
.align 8
|
||||
__serpent_enc_blk16:
|
||||
/* input:
|
||||
* %rdi: ctx, CTX
|
||||
* RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: plaintext
|
||||
* output:
|
||||
* RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: ciphertext
|
||||
*/
|
||||
|
||||
vpcmpeqd RNOT, RNOT, RNOT;
|
||||
|
||||
read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
|
||||
read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);
|
||||
|
||||
K2(RA, RB, RC, RD, RE, 0);
|
||||
S(S0, RA, RB, RC, RD, RE); LK2(RC, RB, RD, RA, RE, 1);
|
||||
S(S1, RC, RB, RD, RA, RE); LK2(RE, RD, RA, RC, RB, 2);
|
||||
S(S2, RE, RD, RA, RC, RB); LK2(RB, RD, RE, RC, RA, 3);
|
||||
S(S3, RB, RD, RE, RC, RA); LK2(RC, RA, RD, RB, RE, 4);
|
||||
S(S4, RC, RA, RD, RB, RE); LK2(RA, RD, RB, RE, RC, 5);
|
||||
S(S5, RA, RD, RB, RE, RC); LK2(RC, RA, RD, RE, RB, 6);
|
||||
S(S6, RC, RA, RD, RE, RB); LK2(RD, RB, RA, RE, RC, 7);
|
||||
S(S7, RD, RB, RA, RE, RC); LK2(RC, RA, RE, RD, RB, 8);
|
||||
S(S0, RC, RA, RE, RD, RB); LK2(RE, RA, RD, RC, RB, 9);
|
||||
S(S1, RE, RA, RD, RC, RB); LK2(RB, RD, RC, RE, RA, 10);
|
||||
S(S2, RB, RD, RC, RE, RA); LK2(RA, RD, RB, RE, RC, 11);
|
||||
S(S3, RA, RD, RB, RE, RC); LK2(RE, RC, RD, RA, RB, 12);
|
||||
S(S4, RE, RC, RD, RA, RB); LK2(RC, RD, RA, RB, RE, 13);
|
||||
S(S5, RC, RD, RA, RB, RE); LK2(RE, RC, RD, RB, RA, 14);
|
||||
S(S6, RE, RC, RD, RB, RA); LK2(RD, RA, RC, RB, RE, 15);
|
||||
S(S7, RD, RA, RC, RB, RE); LK2(RE, RC, RB, RD, RA, 16);
|
||||
S(S0, RE, RC, RB, RD, RA); LK2(RB, RC, RD, RE, RA, 17);
|
||||
S(S1, RB, RC, RD, RE, RA); LK2(RA, RD, RE, RB, RC, 18);
|
||||
S(S2, RA, RD, RE, RB, RC); LK2(RC, RD, RA, RB, RE, 19);
|
||||
S(S3, RC, RD, RA, RB, RE); LK2(RB, RE, RD, RC, RA, 20);
|
||||
S(S4, RB, RE, RD, RC, RA); LK2(RE, RD, RC, RA, RB, 21);
|
||||
S(S5, RE, RD, RC, RA, RB); LK2(RB, RE, RD, RA, RC, 22);
|
||||
S(S6, RB, RE, RD, RA, RC); LK2(RD, RC, RE, RA, RB, 23);
|
||||
S(S7, RD, RC, RE, RA, RB); LK2(RB, RE, RA, RD, RC, 24);
|
||||
S(S0, RB, RE, RA, RD, RC); LK2(RA, RE, RD, RB, RC, 25);
|
||||
S(S1, RA, RE, RD, RB, RC); LK2(RC, RD, RB, RA, RE, 26);
|
||||
S(S2, RC, RD, RB, RA, RE); LK2(RE, RD, RC, RA, RB, 27);
|
||||
S(S3, RE, RD, RC, RA, RB); LK2(RA, RB, RD, RE, RC, 28);
|
||||
S(S4, RA, RB, RD, RE, RC); LK2(RB, RD, RE, RC, RA, 29);
|
||||
S(S5, RB, RD, RE, RC, RA); LK2(RA, RB, RD, RC, RE, 30);
|
||||
S(S6, RA, RB, RD, RC, RE); LK2(RD, RE, RB, RC, RA, 31);
|
||||
S(S7, RD, RE, RB, RC, RA); K2(RA, RB, RC, RD, RE, 32);
|
||||
|
||||
write_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
|
||||
write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);
|
||||
|
||||
ret;
|
||||
ENDPROC(__serpent_enc_blk16)
|
||||
|
||||
.align 8
|
||||
__serpent_dec_blk16:
|
||||
/* input:
|
||||
* %rdi: ctx, CTX
|
||||
* RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: ciphertext
|
||||
* output:
|
||||
* RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2: plaintext
|
||||
*/
|
||||
|
||||
vpcmpeqd RNOT, RNOT, RNOT;
|
||||
|
||||
read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
|
||||
read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);
|
||||
|
||||
K2(RA, RB, RC, RD, RE, 32);
|
||||
SP(SI7, RA, RB, RC, RD, RE, 31); KL2(RB, RD, RA, RE, RC, 31);
|
||||
SP(SI6, RB, RD, RA, RE, RC, 30); KL2(RA, RC, RE, RB, RD, 30);
|
||||
SP(SI5, RA, RC, RE, RB, RD, 29); KL2(RC, RD, RA, RE, RB, 29);
|
||||
SP(SI4, RC, RD, RA, RE, RB, 28); KL2(RC, RA, RB, RE, RD, 28);
|
||||
SP(SI3, RC, RA, RB, RE, RD, 27); KL2(RB, RC, RD, RE, RA, 27);
|
||||
SP(SI2, RB, RC, RD, RE, RA, 26); KL2(RC, RA, RE, RD, RB, 26);
|
||||
SP(SI1, RC, RA, RE, RD, RB, 25); KL2(RB, RA, RE, RD, RC, 25);
|
||||
SP(SI0, RB, RA, RE, RD, RC, 24); KL2(RE, RC, RA, RB, RD, 24);
|
||||
SP(SI7, RE, RC, RA, RB, RD, 23); KL2(RC, RB, RE, RD, RA, 23);
|
||||
SP(SI6, RC, RB, RE, RD, RA, 22); KL2(RE, RA, RD, RC, RB, 22);
|
||||
SP(SI5, RE, RA, RD, RC, RB, 21); KL2(RA, RB, RE, RD, RC, 21);
|
||||
SP(SI4, RA, RB, RE, RD, RC, 20); KL2(RA, RE, RC, RD, RB, 20);
|
||||
SP(SI3, RA, RE, RC, RD, RB, 19); KL2(RC, RA, RB, RD, RE, 19);
|
||||
SP(SI2, RC, RA, RB, RD, RE, 18); KL2(RA, RE, RD, RB, RC, 18);
|
||||
SP(SI1, RA, RE, RD, RB, RC, 17); KL2(RC, RE, RD, RB, RA, 17);
|
||||
SP(SI0, RC, RE, RD, RB, RA, 16); KL2(RD, RA, RE, RC, RB, 16);
|
||||
SP(SI7, RD, RA, RE, RC, RB, 15); KL2(RA, RC, RD, RB, RE, 15);
|
||||
SP(SI6, RA, RC, RD, RB, RE, 14); KL2(RD, RE, RB, RA, RC, 14);
|
||||
SP(SI5, RD, RE, RB, RA, RC, 13); KL2(RE, RC, RD, RB, RA, 13);
|
||||
SP(SI4, RE, RC, RD, RB, RA, 12); KL2(RE, RD, RA, RB, RC, 12);
|
||||
SP(SI3, RE, RD, RA, RB, RC, 11); KL2(RA, RE, RC, RB, RD, 11);
|
||||
SP(SI2, RA, RE, RC, RB, RD, 10); KL2(RE, RD, RB, RC, RA, 10);
|
||||
SP(SI1, RE, RD, RB, RC, RA, 9); KL2(RA, RD, RB, RC, RE, 9);
|
||||
SP(SI0, RA, RD, RB, RC, RE, 8); KL2(RB, RE, RD, RA, RC, 8);
|
||||
SP(SI7, RB, RE, RD, RA, RC, 7); KL2(RE, RA, RB, RC, RD, 7);
|
||||
SP(SI6, RE, RA, RB, RC, RD, 6); KL2(RB, RD, RC, RE, RA, 6);
|
||||
SP(SI5, RB, RD, RC, RE, RA, 5); KL2(RD, RA, RB, RC, RE, 5);
|
||||
SP(SI4, RD, RA, RB, RC, RE, 4); KL2(RD, RB, RE, RC, RA, 4);
|
||||
SP(SI3, RD, RB, RE, RC, RA, 3); KL2(RE, RD, RA, RC, RB, 3);
|
||||
SP(SI2, RE, RD, RA, RC, RB, 2); KL2(RD, RB, RC, RA, RE, 2);
|
||||
SP(SI1, RD, RB, RC, RA, RE, 1); KL2(RE, RB, RC, RA, RD, 1);
|
||||
S(SI0, RE, RB, RC, RA, RD); K2(RC, RD, RB, RE, RA, 0);
|
||||
|
||||
write_blocks(RC1, RD1, RB1, RE1, RK0, RK1, RK2);
|
||||
write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2);
|
||||
|
||||
ret;
|
||||
ENDPROC(__serpent_dec_blk16)
|
||||
|
||||
ENTRY(serpent_ecb_enc_16way)
|
||||
/* input:
|
||||
* %rdi: ctx, CTX
|
||||
* %rsi: dst
|
||||
* %rdx: src
|
||||
*/
|
||||
|
||||
vzeroupper;
|
||||
|
||||
load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
|
||||
|
||||
call __serpent_enc_blk16;
|
||||
|
||||
store_16way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
|
||||
|
||||
vzeroupper;
|
||||
|
||||
ret;
|
||||
ENDPROC(serpent_ecb_enc_16way)
|
||||
|
||||
ENTRY(serpent_ecb_dec_16way)
|
||||
/* input:
|
||||
* %rdi: ctx, CTX
|
||||
* %rsi: dst
|
||||
* %rdx: src
|
||||
*/
|
||||
|
||||
vzeroupper;
|
||||
|
||||
load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
|
||||
|
||||
call __serpent_dec_blk16;
|
||||
|
||||
store_16way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);
|
||||
|
||||
vzeroupper;
|
||||
|
||||
ret;
|
||||
ENDPROC(serpent_ecb_dec_16way)
|
||||
|
||||
ENTRY(serpent_cbc_dec_16way)
|
||||
/* input:
|
||||
* %rdi: ctx, CTX
|
||||
* %rsi: dst
|
||||
* %rdx: src
|
||||
*/
|
||||
|
||||
vzeroupper;
|
||||
|
||||
load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
|
||||
|
||||
call __serpent_dec_blk16;
|
||||
|
||||
store_cbc_16way(%rdx, %rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2,
|
||||
RK0);
|
||||
|
||||
vzeroupper;
|
||||
|
||||
ret;
|
||||
ENDPROC(serpent_cbc_dec_16way)
|
||||
|
||||
ENTRY(serpent_ctr_16way)
|
||||
/* input:
|
||||
* %rdi: ctx, CTX
|
||||
* %rsi: dst (16 blocks)
|
||||
* %rdx: src (16 blocks)
|
||||
* %rcx: iv (little endian, 128bit)
|
||||
*/
|
||||
|
||||
vzeroupper;
|
||||
|
||||
load_ctr_16way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
|
||||
RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT,
|
||||
tp);
|
||||
|
||||
call __serpent_enc_blk16;
|
||||
|
||||
store_ctr_16way(%rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
|
||||
|
||||
vzeroupper;
|
||||
|
||||
ret;
|
||||
ENDPROC(serpent_ctr_16way)
|
||||
|
||||
ENTRY(serpent_xts_enc_16way)
|
||||
/* input:
|
||||
* %rdi: ctx, CTX
|
||||
* %rsi: dst (16 blocks)
|
||||
* %rdx: src (16 blocks)
|
||||
* %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
|
||||
*/
|
||||
|
||||
vzeroupper;
|
||||
|
||||
load_xts_16way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
|
||||
RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT,
|
||||
.Lxts_gf128mul_and_shl1_mask_0,
|
||||
.Lxts_gf128mul_and_shl1_mask_1);
|
||||
|
||||
call __serpent_enc_blk16;
|
||||
|
||||
store_xts_16way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
|
||||
|
||||
vzeroupper;
|
||||
|
||||
ret;
|
||||
ENDPROC(serpent_xts_enc_16way)
|
||||
|
||||
ENTRY(serpent_xts_dec_16way)
|
||||
/* input:
|
||||
* %rdi: ctx, CTX
|
||||
* %rsi: dst (16 blocks)
|
||||
* %rdx: src (16 blocks)
|
||||
* %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
|
||||
*/
|
||||
|
||||
vzeroupper;
|
||||
|
||||
load_xts_16way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
|
||||
RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT,
|
||||
.Lxts_gf128mul_and_shl1_mask_0,
|
||||
.Lxts_gf128mul_and_shl1_mask_1);
|
||||
|
||||
call __serpent_dec_blk16;
|
||||
|
||||
store_xts_16way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);
|
||||
|
||||
vzeroupper;
|
||||
|
||||
ret;
|
||||
ENDPROC(serpent_xts_dec_16way)
|
562
arch/x86/crypto/serpent_avx2_glue.c
Normal file
562
arch/x86/crypto/serpent_avx2_glue.c
Normal file
@ -0,0 +1,562 @@
|
||||
/*
|
||||
* Glue Code for x86_64/AVX2 assembler optimized version of Serpent
|
||||
*
|
||||
* Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/crypto.h>
|
||||
#include <linux/err.h>
|
||||
#include <crypto/algapi.h>
|
||||
#include <crypto/ctr.h>
|
||||
#include <crypto/lrw.h>
|
||||
#include <crypto/xts.h>
|
||||
#include <crypto/serpent.h>
|
||||
#include <asm/xcr.h>
|
||||
#include <asm/xsave.h>
|
||||
#include <asm/crypto/serpent-avx.h>
|
||||
#include <asm/crypto/ablk_helper.h>
|
||||
#include <asm/crypto/glue_helper.h>
|
||||
|
||||
#define SERPENT_AVX2_PARALLEL_BLOCKS 16
|
||||
|
||||
/* 16-way AVX2 parallel cipher functions */
|
||||
asmlinkage void serpent_ecb_enc_16way(struct serpent_ctx *ctx, u8 *dst,
|
||||
const u8 *src);
|
||||
asmlinkage void serpent_ecb_dec_16way(struct serpent_ctx *ctx, u8 *dst,
|
||||
const u8 *src);
|
||||
asmlinkage void serpent_cbc_dec_16way(void *ctx, u128 *dst, const u128 *src);
|
||||
|
||||
asmlinkage void serpent_ctr_16way(void *ctx, u128 *dst, const u128 *src,
|
||||
le128 *iv);
|
||||
asmlinkage void serpent_xts_enc_16way(struct serpent_ctx *ctx, u8 *dst,
|
||||
const u8 *src, le128 *iv);
|
||||
asmlinkage void serpent_xts_dec_16way(struct serpent_ctx *ctx, u8 *dst,
|
||||
const u8 *src, le128 *iv);
|
||||
|
||||
static const struct common_glue_ctx serpent_enc = {
|
||||
.num_funcs = 3,
|
||||
.fpu_blocks_limit = 8,
|
||||
|
||||
.funcs = { {
|
||||
.num_blocks = 16,
|
||||
.fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_enc_16way) }
|
||||
}, {
|
||||
.num_blocks = 8,
|
||||
.fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_enc_8way_avx) }
|
||||
}, {
|
||||
.num_blocks = 1,
|
||||
.fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_encrypt) }
|
||||
} }
|
||||
};
|
||||
|
||||
static const struct common_glue_ctx serpent_ctr = {
|
||||
.num_funcs = 3,
|
||||
.fpu_blocks_limit = 8,
|
||||
|
||||
.funcs = { {
|
||||
.num_blocks = 16,
|
||||
.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_ctr_16way) }
|
||||
}, {
|
||||
.num_blocks = 8,
|
||||
.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_ctr_8way_avx) }
|
||||
}, {
|
||||
.num_blocks = 1,
|
||||
.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(__serpent_crypt_ctr) }
|
||||
} }
|
||||
};
|
||||
|
||||
static const struct common_glue_ctx serpent_enc_xts = {
|
||||
.num_funcs = 3,
|
||||
.fpu_blocks_limit = 8,
|
||||
|
||||
.funcs = { {
|
||||
.num_blocks = 16,
|
||||
.fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_enc_16way) }
|
||||
}, {
|
||||
.num_blocks = 8,
|
||||
.fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_enc_8way_avx) }
|
||||
}, {
|
||||
.num_blocks = 1,
|
||||
.fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_enc) }
|
||||
} }
|
||||
};
|
||||
|
||||
static const struct common_glue_ctx serpent_dec = {
|
||||
.num_funcs = 3,
|
||||
.fpu_blocks_limit = 8,
|
||||
|
||||
.funcs = { {
|
||||
.num_blocks = 16,
|
||||
.fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_dec_16way) }
|
||||
}, {
|
||||
.num_blocks = 8,
|
||||
.fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_dec_8way_avx) }
|
||||
}, {
|
||||
.num_blocks = 1,
|
||||
.fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_decrypt) }
|
||||
} }
|
||||
};
|
||||
|
||||
static const struct common_glue_ctx serpent_dec_cbc = {
|
||||
.num_funcs = 3,
|
||||
.fpu_blocks_limit = 8,
|
||||
|
||||
.funcs = { {
|
||||
.num_blocks = 16,
|
||||
.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(serpent_cbc_dec_16way) }
|
||||
}, {
|
||||
.num_blocks = 8,
|
||||
.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(serpent_cbc_dec_8way_avx) }
|
||||
}, {
|
||||
.num_blocks = 1,
|
||||
.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__serpent_decrypt) }
|
||||
} }
|
||||
};
|
||||
|
||||
static const struct common_glue_ctx serpent_dec_xts = {
|
||||
.num_funcs = 3,
|
||||
.fpu_blocks_limit = 8,
|
||||
|
||||
.funcs = { {
|
||||
.num_blocks = 16,
|
||||
.fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_dec_16way) }
|
||||
}, {
|
||||
.num_blocks = 8,
|
||||
.fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_dec_8way_avx) }
|
||||
}, {
|
||||
.num_blocks = 1,
|
||||
.fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_dec) }
|
||||
} }
|
||||
};
|
||||
|
||||
static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
|
||||
struct scatterlist *src, unsigned int nbytes)
|
||||
{
|
||||
return glue_ecb_crypt_128bit(&serpent_enc, desc, dst, src, nbytes);
|
||||
}
|
||||
|
||||
static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
|
||||
struct scatterlist *src, unsigned int nbytes)
|
||||
{
|
||||
return glue_ecb_crypt_128bit(&serpent_dec, desc, dst, src, nbytes);
|
||||
}
|
||||
|
||||
static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
|
||||
struct scatterlist *src, unsigned int nbytes)
|
||||
{
|
||||
return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(__serpent_encrypt), desc,
|
||||
dst, src, nbytes);
|
||||
}
|
||||
|
||||
static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
|
||||
struct scatterlist *src, unsigned int nbytes)
|
||||
{
|
||||
return glue_cbc_decrypt_128bit(&serpent_dec_cbc, desc, dst, src,
|
||||
nbytes);
|
||||
}
|
||||
|
||||
static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
|
||||
struct scatterlist *src, unsigned int nbytes)
|
||||
{
|
||||
return glue_ctr_crypt_128bit(&serpent_ctr, desc, dst, src, nbytes);
|
||||
}
|
||||
|
||||
static inline bool serpent_fpu_begin(bool fpu_enabled, unsigned int nbytes)
|
||||
{
|
||||
/* since reusing AVX functions, starts using FPU at 8 parallel blocks */
|
||||
return glue_fpu_begin(SERPENT_BLOCK_SIZE, 8, NULL, fpu_enabled, nbytes);
|
||||
}
|
||||
|
||||
static inline void serpent_fpu_end(bool fpu_enabled)
|
||||
{
|
||||
glue_fpu_end(fpu_enabled);
|
||||
}
|
||||
|
||||
struct crypt_priv {
|
||||
struct serpent_ctx *ctx;
|
||||
bool fpu_enabled;
|
||||
};
|
||||
|
||||
static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
|
||||
{
|
||||
const unsigned int bsize = SERPENT_BLOCK_SIZE;
|
||||
struct crypt_priv *ctx = priv;
|
||||
int i;
|
||||
|
||||
ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
|
||||
|
||||
if (nbytes >= SERPENT_AVX2_PARALLEL_BLOCKS * bsize) {
|
||||
serpent_ecb_enc_16way(ctx->ctx, srcdst, srcdst);
|
||||
srcdst += bsize * SERPENT_AVX2_PARALLEL_BLOCKS;
|
||||
nbytes -= bsize * SERPENT_AVX2_PARALLEL_BLOCKS;
|
||||
}
|
||||
|
||||
while (nbytes >= SERPENT_PARALLEL_BLOCKS * bsize) {
|
||||
serpent_ecb_enc_8way_avx(ctx->ctx, srcdst, srcdst);
|
||||
srcdst += bsize * SERPENT_PARALLEL_BLOCKS;
|
||||
nbytes -= bsize * SERPENT_PARALLEL_BLOCKS;
|
||||
}
|
||||
|
||||
for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
|
||||
__serpent_encrypt(ctx->ctx, srcdst, srcdst);
|
||||
}
|
||||
|
||||
static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
|
||||
{
|
||||
const unsigned int bsize = SERPENT_BLOCK_SIZE;
|
||||
struct crypt_priv *ctx = priv;
|
||||
int i;
|
||||
|
||||
ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
|
||||
|
||||
if (nbytes >= SERPENT_AVX2_PARALLEL_BLOCKS * bsize) {
|
||||
serpent_ecb_dec_16way(ctx->ctx, srcdst, srcdst);
|
||||
srcdst += bsize * SERPENT_AVX2_PARALLEL_BLOCKS;
|
||||
nbytes -= bsize * SERPENT_AVX2_PARALLEL_BLOCKS;
|
||||
}
|
||||
|
||||
while (nbytes >= SERPENT_PARALLEL_BLOCKS * bsize) {
|
||||
serpent_ecb_dec_8way_avx(ctx->ctx, srcdst, srcdst);
|
||||
srcdst += bsize * SERPENT_PARALLEL_BLOCKS;
|
||||
nbytes -= bsize * SERPENT_PARALLEL_BLOCKS;
|
||||
}
|
||||
|
||||
for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
|
||||
__serpent_decrypt(ctx->ctx, srcdst, srcdst);
|
||||
}
|
||||
|
||||
static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
|
||||
struct scatterlist *src, unsigned int nbytes)
|
||||
{
|
||||
struct serpent_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
|
||||
be128 buf[SERPENT_AVX2_PARALLEL_BLOCKS];
|
||||
struct crypt_priv crypt_ctx = {
|
||||
.ctx = &ctx->serpent_ctx,
|
||||
.fpu_enabled = false,
|
||||
};
|
||||
struct lrw_crypt_req req = {
|
||||
.tbuf = buf,
|
||||
.tbuflen = sizeof(buf),
|
||||
|
||||
.table_ctx = &ctx->lrw_table,
|
||||
.crypt_ctx = &crypt_ctx,
|
||||
.crypt_fn = encrypt_callback,
|
||||
};
|
||||
int ret;
|
||||
|
||||
desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
|
||||
ret = lrw_crypt(desc, dst, src, nbytes, &req);
|
||||
serpent_fpu_end(crypt_ctx.fpu_enabled);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
|
||||
struct scatterlist *src, unsigned int nbytes)
|
||||
{
|
||||
struct serpent_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
|
||||
be128 buf[SERPENT_AVX2_PARALLEL_BLOCKS];
|
||||
struct crypt_priv crypt_ctx = {
|
||||
.ctx = &ctx->serpent_ctx,
|
||||
.fpu_enabled = false,
|
||||
};
|
||||
struct lrw_crypt_req req = {
|
||||
.tbuf = buf,
|
||||
.tbuflen = sizeof(buf),
|
||||
|
||||
.table_ctx = &ctx->lrw_table,
|
||||
.crypt_ctx = &crypt_ctx,
|
||||
.crypt_fn = decrypt_callback,
|
||||
};
|
||||
int ret;
|
||||
|
||||
desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
|
||||
ret = lrw_crypt(desc, dst, src, nbytes, &req);
|
||||
serpent_fpu_end(crypt_ctx.fpu_enabled);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
|
||||
struct scatterlist *src, unsigned int nbytes)
|
||||
{
|
||||
struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
|
||||
|
||||
return glue_xts_crypt_128bit(&serpent_enc_xts, desc, dst, src, nbytes,
|
||||
XTS_TWEAK_CAST(__serpent_encrypt),
|
||||
&ctx->tweak_ctx, &ctx->crypt_ctx);
|
||||
}
|
||||
|
||||
static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
|
||||
struct scatterlist *src, unsigned int nbytes)
|
||||
{
|
||||
struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
|
||||
|
||||
return glue_xts_crypt_128bit(&serpent_dec_xts, desc, dst, src, nbytes,
|
||||
XTS_TWEAK_CAST(__serpent_encrypt),
|
||||
&ctx->tweak_ctx, &ctx->crypt_ctx);
|
||||
}
|
||||
|
||||
static struct crypto_alg srp_algs[10] = { {
|
||||
.cra_name = "__ecb-serpent-avx2",
|
||||
.cra_driver_name = "__driver-ecb-serpent-avx2",
|
||||
.cra_priority = 0,
|
||||
.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
|
||||
.cra_blocksize = SERPENT_BLOCK_SIZE,
|
||||
.cra_ctxsize = sizeof(struct serpent_ctx),
|
||||
.cra_alignmask = 0,
|
||||
.cra_type = &crypto_blkcipher_type,
|
||||
.cra_module = THIS_MODULE,
|
||||
.cra_list = LIST_HEAD_INIT(srp_algs[0].cra_list),
|
||||
.cra_u = {
|
||||
.blkcipher = {
|
||||
.min_keysize = SERPENT_MIN_KEY_SIZE,
|
||||
.max_keysize = SERPENT_MAX_KEY_SIZE,
|
||||
.setkey = serpent_setkey,
|
||||
.encrypt = ecb_encrypt,
|
||||
.decrypt = ecb_decrypt,
|
||||
},
|
||||
},
|
||||
}, {
|
||||
.cra_name = "__cbc-serpent-avx2",
|
||||
.cra_driver_name = "__driver-cbc-serpent-avx2",
|
||||
.cra_priority = 0,
|
||||
.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
|
||||
.cra_blocksize = SERPENT_BLOCK_SIZE,
|
||||
.cra_ctxsize = sizeof(struct serpent_ctx),
|
||||
.cra_alignmask = 0,
|
||||
.cra_type = &crypto_blkcipher_type,
|
||||
.cra_module = THIS_MODULE,
|
||||
.cra_list = LIST_HEAD_INIT(srp_algs[1].cra_list),
|
||||
.cra_u = {
|
||||
.blkcipher = {
|
||||
.min_keysize = SERPENT_MIN_KEY_SIZE,
|
||||
.max_keysize = SERPENT_MAX_KEY_SIZE,
|
||||
.setkey = serpent_setkey,
|
||||
.encrypt = cbc_encrypt,
|
||||
.decrypt = cbc_decrypt,
|
||||
},
|
||||
},
|
||||
}, {
|
||||
.cra_name = "__ctr-serpent-avx2",
|
||||
.cra_driver_name = "__driver-ctr-serpent-avx2",
|
||||
.cra_priority = 0,
|
||||
.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
|
||||
.cra_blocksize = 1,
|
||||
.cra_ctxsize = sizeof(struct serpent_ctx),
|
||||
.cra_alignmask = 0,
|
||||
.cra_type = &crypto_blkcipher_type,
|
||||
.cra_module = THIS_MODULE,
|
||||
.cra_list = LIST_HEAD_INIT(srp_algs[2].cra_list),
|
||||
.cra_u = {
|
||||
.blkcipher = {
|
||||
.min_keysize = SERPENT_MIN_KEY_SIZE,
|
||||
.max_keysize = SERPENT_MAX_KEY_SIZE,
|
||||
.ivsize = SERPENT_BLOCK_SIZE,
|
||||
.setkey = serpent_setkey,
|
||||
.encrypt = ctr_crypt,
|
||||
.decrypt = ctr_crypt,
|
||||
},
|
||||
},
|
||||
}, {
|
||||
.cra_name = "__lrw-serpent-avx2",
|
||||
.cra_driver_name = "__driver-lrw-serpent-avx2",
|
||||
.cra_priority = 0,
|
||||
.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
|
||||
.cra_blocksize = SERPENT_BLOCK_SIZE,
|
||||
.cra_ctxsize = sizeof(struct serpent_lrw_ctx),
|
||||
.cra_alignmask = 0,
|
||||
.cra_type = &crypto_blkcipher_type,
|
||||
.cra_module = THIS_MODULE,
|
||||
.cra_list = LIST_HEAD_INIT(srp_algs[3].cra_list),
|
||||
.cra_exit = lrw_serpent_exit_tfm,
|
||||
.cra_u = {
|
||||
.blkcipher = {
|
||||
.min_keysize = SERPENT_MIN_KEY_SIZE +
|
||||
SERPENT_BLOCK_SIZE,
|
||||
.max_keysize = SERPENT_MAX_KEY_SIZE +
|
||||
SERPENT_BLOCK_SIZE,
|
||||
.ivsize = SERPENT_BLOCK_SIZE,
|
||||
.setkey = lrw_serpent_setkey,
|
||||
.encrypt = lrw_encrypt,
|
||||
.decrypt = lrw_decrypt,
|
||||
},
|
||||
},
|
||||
}, {
|
||||
.cra_name = "__xts-serpent-avx2",
|
||||
.cra_driver_name = "__driver-xts-serpent-avx2",
|
||||
.cra_priority = 0,
|
||||
.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
|
||||
.cra_blocksize = SERPENT_BLOCK_SIZE,
|
||||
.cra_ctxsize = sizeof(struct serpent_xts_ctx),
|
||||
.cra_alignmask = 0,
|
||||
.cra_type = &crypto_blkcipher_type,
|
||||
.cra_module = THIS_MODULE,
|
||||
.cra_list = LIST_HEAD_INIT(srp_algs[4].cra_list),
|
||||
.cra_u = {
|
||||
.blkcipher = {
|
||||
.min_keysize = SERPENT_MIN_KEY_SIZE * 2,
|
||||
.max_keysize = SERPENT_MAX_KEY_SIZE * 2,
|
||||
.ivsize = SERPENT_BLOCK_SIZE,
|
||||
.setkey = xts_serpent_setkey,
|
||||
.encrypt = xts_encrypt,
|
||||
.decrypt = xts_decrypt,
|
||||
},
|
||||
},
|
||||
}, {
|
||||
.cra_name = "ecb(serpent)",
|
||||
.cra_driver_name = "ecb-serpent-avx2",
|
||||
.cra_priority = 600,
|
||||
.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
|
||||
.cra_blocksize = SERPENT_BLOCK_SIZE,
|
||||
.cra_ctxsize = sizeof(struct async_helper_ctx),
|
||||
.cra_alignmask = 0,
|
||||
.cra_type = &crypto_ablkcipher_type,
|
||||
.cra_module = THIS_MODULE,
|
||||
.cra_list = LIST_HEAD_INIT(srp_algs[5].cra_list),
|
||||
.cra_init = ablk_init,
|
||||
.cra_exit = ablk_exit,
|
||||
.cra_u = {
|
||||
.ablkcipher = {
|
||||
.min_keysize = SERPENT_MIN_KEY_SIZE,
|
||||
.max_keysize = SERPENT_MAX_KEY_SIZE,
|
||||
.setkey = ablk_set_key,
|
||||
.encrypt = ablk_encrypt,
|
||||
.decrypt = ablk_decrypt,
|
||||
},
|
||||
},
|
||||
}, {
|
||||
.cra_name = "cbc(serpent)",
|
||||
.cra_driver_name = "cbc-serpent-avx2",
|
||||
.cra_priority = 600,
|
||||
.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
|
||||
.cra_blocksize = SERPENT_BLOCK_SIZE,
|
||||
.cra_ctxsize = sizeof(struct async_helper_ctx),
|
||||
.cra_alignmask = 0,
|
||||
.cra_type = &crypto_ablkcipher_type,
|
||||
.cra_module = THIS_MODULE,
|
||||
.cra_list = LIST_HEAD_INIT(srp_algs[6].cra_list),
|
||||
.cra_init = ablk_init,
|
||||
.cra_exit = ablk_exit,
|
||||
.cra_u = {
|
||||
.ablkcipher = {
|
||||
.min_keysize = SERPENT_MIN_KEY_SIZE,
|
||||
.max_keysize = SERPENT_MAX_KEY_SIZE,
|
||||
.ivsize = SERPENT_BLOCK_SIZE,
|
||||
.setkey = ablk_set_key,
|
||||
.encrypt = __ablk_encrypt,
|
||||
.decrypt = ablk_decrypt,
|
||||
},
|
||||
},
|
||||
}, {
|
||||
.cra_name = "ctr(serpent)",
|
||||
.cra_driver_name = "ctr-serpent-avx2",
|
||||
.cra_priority = 600,
|
||||
.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
|
||||
.cra_blocksize = 1,
|
||||
.cra_ctxsize = sizeof(struct async_helper_ctx),
|
||||
.cra_alignmask = 0,
|
||||
.cra_type = &crypto_ablkcipher_type,
|
||||
.cra_module = THIS_MODULE,
|
||||
.cra_list = LIST_HEAD_INIT(srp_algs[7].cra_list),
|
||||
.cra_init = ablk_init,
|
||||
.cra_exit = ablk_exit,
|
||||
.cra_u = {
|
||||
.ablkcipher = {
|
||||
.min_keysize = SERPENT_MIN_KEY_SIZE,
|
||||
.max_keysize = SERPENT_MAX_KEY_SIZE,
|
||||
.ivsize = SERPENT_BLOCK_SIZE,
|
||||
.setkey = ablk_set_key,
|
||||
.encrypt = ablk_encrypt,
|
||||
.decrypt = ablk_encrypt,
|
||||
.geniv = "chainiv",
|
||||
},
|
||||
},
|
||||
}, {
|
||||
.cra_name = "lrw(serpent)",
|
||||
.cra_driver_name = "lrw-serpent-avx2",
|
||||
.cra_priority = 600,
|
||||
.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
|
||||
.cra_blocksize = SERPENT_BLOCK_SIZE,
|
||||
.cra_ctxsize = sizeof(struct async_helper_ctx),
|
||||
.cra_alignmask = 0,
|
||||
.cra_type = &crypto_ablkcipher_type,
|
||||
.cra_module = THIS_MODULE,
|
||||
.cra_list = LIST_HEAD_INIT(srp_algs[8].cra_list),
|
||||
.cra_init = ablk_init,
|
||||
.cra_exit = ablk_exit,
|
||||
.cra_u = {
|
||||
.ablkcipher = {
|
||||
.min_keysize = SERPENT_MIN_KEY_SIZE +
|
||||
SERPENT_BLOCK_SIZE,
|
||||
.max_keysize = SERPENT_MAX_KEY_SIZE +
|
||||
SERPENT_BLOCK_SIZE,
|
||||
.ivsize = SERPENT_BLOCK_SIZE,
|
||||
.setkey = ablk_set_key,
|
||||
.encrypt = ablk_encrypt,
|
||||
.decrypt = ablk_decrypt,
|
||||
},
|
||||
},
|
||||
}, {
|
||||
.cra_name = "xts(serpent)",
|
||||
.cra_driver_name = "xts-serpent-avx2",
|
||||
.cra_priority = 600,
|
||||
.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
|
||||
.cra_blocksize = SERPENT_BLOCK_SIZE,
|
||||
.cra_ctxsize = sizeof(struct async_helper_ctx),
|
||||
.cra_alignmask = 0,
|
||||
.cra_type = &crypto_ablkcipher_type,
|
||||
.cra_module = THIS_MODULE,
|
||||
.cra_list = LIST_HEAD_INIT(srp_algs[9].cra_list),
|
||||
.cra_init = ablk_init,
|
||||
.cra_exit = ablk_exit,
|
||||
.cra_u = {
|
||||
.ablkcipher = {
|
||||
.min_keysize = SERPENT_MIN_KEY_SIZE * 2,
|
||||
.max_keysize = SERPENT_MAX_KEY_SIZE * 2,
|
||||
.ivsize = SERPENT_BLOCK_SIZE,
|
||||
.setkey = ablk_set_key,
|
||||
.encrypt = ablk_encrypt,
|
||||
.decrypt = ablk_decrypt,
|
||||
},
|
||||
},
|
||||
} };
|
||||
|
||||
static int __init init(void)
|
||||
{
|
||||
u64 xcr0;
|
||||
|
||||
if (!cpu_has_avx2 || !cpu_has_osxsave) {
|
||||
pr_info("AVX2 instructions are not detected.\n");
|
||||
return -ENODEV;
|
||||
}
|
||||
|
||||
xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
|
||||
if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
|
||||
pr_info("AVX detected but unusable.\n");
|
||||
return -ENODEV;
|
||||
}
|
||||
|
||||
return crypto_register_algs(srp_algs, ARRAY_SIZE(srp_algs));
|
||||
}
|
||||
|
||||
static void __exit fini(void)
|
||||
{
|
||||
crypto_unregister_algs(srp_algs, ARRAY_SIZE(srp_algs));
|
||||
}
|
||||
|
||||
module_init(init);
|
||||
module_exit(fini);
|
||||
|
||||
MODULE_LICENSE("GPL");
|
||||
MODULE_DESCRIPTION("Serpent Cipher Algorithm, AVX2 optimized");
|
||||
MODULE_ALIAS("serpent");
|
||||
MODULE_ALIAS("serpent-asm");
|
@ -4,8 +4,7 @@
|
||||
* Copyright (C) 2012 Johannes Goetzfried
|
||||
* <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
|
||||
*
|
||||
* Glue code based on serpent_sse2_glue.c by:
|
||||
* Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
|
||||
* Copyright © 2011-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
@ -42,7 +41,32 @@
|
||||
#include <asm/crypto/ablk_helper.h>
|
||||
#include <asm/crypto/glue_helper.h>
|
||||
|
||||
static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
|
||||
/* 8-way parallel cipher functions */
|
||||
asmlinkage void serpent_ecb_enc_8way_avx(struct serpent_ctx *ctx, u8 *dst,
|
||||
const u8 *src);
|
||||
EXPORT_SYMBOL_GPL(serpent_ecb_enc_8way_avx);
|
||||
|
||||
asmlinkage void serpent_ecb_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst,
|
||||
const u8 *src);
|
||||
EXPORT_SYMBOL_GPL(serpent_ecb_dec_8way_avx);
|
||||
|
||||
asmlinkage void serpent_cbc_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst,
|
||||
const u8 *src);
|
||||
EXPORT_SYMBOL_GPL(serpent_cbc_dec_8way_avx);
|
||||
|
||||
asmlinkage void serpent_ctr_8way_avx(struct serpent_ctx *ctx, u8 *dst,
|
||||
const u8 *src, le128 *iv);
|
||||
EXPORT_SYMBOL_GPL(serpent_ctr_8way_avx);
|
||||
|
||||
asmlinkage void serpent_xts_enc_8way_avx(struct serpent_ctx *ctx, u8 *dst,
|
||||
const u8 *src, le128 *iv);
|
||||
EXPORT_SYMBOL_GPL(serpent_xts_enc_8way_avx);
|
||||
|
||||
asmlinkage void serpent_xts_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst,
|
||||
const u8 *src, le128 *iv);
|
||||
EXPORT_SYMBOL_GPL(serpent_xts_dec_8way_avx);
|
||||
|
||||
void __serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
|
||||
{
|
||||
be128 ctrblk;
|
||||
|
||||
@ -52,6 +76,22 @@ static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
|
||||
__serpent_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
|
||||
u128_xor(dst, src, (u128 *)&ctrblk);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(__serpent_crypt_ctr);
|
||||
|
||||
void serpent_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv)
|
||||
{
|
||||
glue_xts_crypt_128bit_one(ctx, dst, src, iv,
|
||||
GLUE_FUNC_CAST(__serpent_encrypt));
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(serpent_xts_enc);
|
||||
|
||||
void serpent_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv)
|
||||
{
|
||||
glue_xts_crypt_128bit_one(ctx, dst, src, iv,
|
||||
GLUE_FUNC_CAST(__serpent_decrypt));
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(serpent_xts_dec);
|
||||
|
||||
|
||||
static const struct common_glue_ctx serpent_enc = {
|
||||
.num_funcs = 2,
|
||||
@ -75,7 +115,20 @@ static const struct common_glue_ctx serpent_ctr = {
|
||||
.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_ctr_8way_avx) }
|
||||
}, {
|
||||
.num_blocks = 1,
|
||||
.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_crypt_ctr) }
|
||||
.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(__serpent_crypt_ctr) }
|
||||
} }
|
||||
};
|
||||
|
||||
static const struct common_glue_ctx serpent_enc_xts = {
|
||||
.num_funcs = 2,
|
||||
.fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
|
||||
|
||||
.funcs = { {
|
||||
.num_blocks = SERPENT_PARALLEL_BLOCKS,
|
||||
.fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_enc_8way_avx) }
|
||||
}, {
|
||||
.num_blocks = 1,
|
||||
.fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_enc) }
|
||||
} }
|
||||
};
|
||||
|
||||
@ -105,6 +158,19 @@ static const struct common_glue_ctx serpent_dec_cbc = {
|
||||
} }
|
||||
};
|
||||
|
||||
static const struct common_glue_ctx serpent_dec_xts = {
|
||||
.num_funcs = 2,
|
||||
.fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
|
||||
|
||||
.funcs = { {
|
||||
.num_blocks = SERPENT_PARALLEL_BLOCKS,
|
||||
.fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_dec_8way_avx) }
|
||||
}, {
|
||||
.num_blocks = 1,
|
||||
.fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_dec) }
|
||||
} }
|
||||
};
|
||||
|
||||
static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
|
||||
struct scatterlist *src, unsigned int nbytes)
|
||||
{
|
||||
@ -187,13 +253,8 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
|
||||
__serpent_decrypt(ctx->ctx, srcdst, srcdst);
|
||||
}
|
||||
|
||||
struct serpent_lrw_ctx {
|
||||
struct lrw_table_ctx lrw_table;
|
||||
struct serpent_ctx serpent_ctx;
|
||||
};
|
||||
|
||||
static int lrw_serpent_setkey(struct crypto_tfm *tfm, const u8 *key,
|
||||
unsigned int keylen)
|
||||
int lrw_serpent_setkey(struct crypto_tfm *tfm, const u8 *key,
|
||||
unsigned int keylen)
|
||||
{
|
||||
struct serpent_lrw_ctx *ctx = crypto_tfm_ctx(tfm);
|
||||
int err;
|
||||
@ -206,6 +267,7 @@ static int lrw_serpent_setkey(struct crypto_tfm *tfm, const u8 *key,
|
||||
return lrw_init_table(&ctx->lrw_table, key + keylen -
|
||||
SERPENT_BLOCK_SIZE);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(lrw_serpent_setkey);
|
||||
|
||||
static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
|
||||
struct scatterlist *src, unsigned int nbytes)
|
||||
@ -259,20 +321,16 @@ static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void lrw_exit_tfm(struct crypto_tfm *tfm)
|
||||
void lrw_serpent_exit_tfm(struct crypto_tfm *tfm)
|
||||
{
|
||||
struct serpent_lrw_ctx *ctx = crypto_tfm_ctx(tfm);
|
||||
|
||||
lrw_free_table(&ctx->lrw_table);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(lrw_serpent_exit_tfm);
|
||||
|
||||
struct serpent_xts_ctx {
|
||||
struct serpent_ctx tweak_ctx;
|
||||
struct serpent_ctx crypt_ctx;
|
||||
};
|
||||
|
||||
static int xts_serpent_setkey(struct crypto_tfm *tfm, const u8 *key,
|
||||
unsigned int keylen)
|
||||
int xts_serpent_setkey(struct crypto_tfm *tfm, const u8 *key,
|
||||
unsigned int keylen)
|
||||
{
|
||||
struct serpent_xts_ctx *ctx = crypto_tfm_ctx(tfm);
|
||||
u32 *flags = &tfm->crt_flags;
|
||||
@ -294,59 +352,26 @@ static int xts_serpent_setkey(struct crypto_tfm *tfm, const u8 *key,
|
||||
/* second half of xts-key is for tweak */
|
||||
return __serpent_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(xts_serpent_setkey);
|
||||
|
||||
static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
|
||||
struct scatterlist *src, unsigned int nbytes)
|
||||
{
|
||||
struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
|
||||
be128 buf[SERPENT_PARALLEL_BLOCKS];
|
||||
struct crypt_priv crypt_ctx = {
|
||||
.ctx = &ctx->crypt_ctx,
|
||||
.fpu_enabled = false,
|
||||
};
|
||||
struct xts_crypt_req req = {
|
||||
.tbuf = buf,
|
||||
.tbuflen = sizeof(buf),
|
||||
|
||||
.tweak_ctx = &ctx->tweak_ctx,
|
||||
.tweak_fn = XTS_TWEAK_CAST(__serpent_encrypt),
|
||||
.crypt_ctx = &crypt_ctx,
|
||||
.crypt_fn = encrypt_callback,
|
||||
};
|
||||
int ret;
|
||||
|
||||
desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
|
||||
ret = xts_crypt(desc, dst, src, nbytes, &req);
|
||||
serpent_fpu_end(crypt_ctx.fpu_enabled);
|
||||
|
||||
return ret;
|
||||
return glue_xts_crypt_128bit(&serpent_enc_xts, desc, dst, src, nbytes,
|
||||
XTS_TWEAK_CAST(__serpent_encrypt),
|
||||
&ctx->tweak_ctx, &ctx->crypt_ctx);
|
||||
}
|
||||
|
||||
static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
|
||||
struct scatterlist *src, unsigned int nbytes)
|
||||
{
|
||||
struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
|
||||
be128 buf[SERPENT_PARALLEL_BLOCKS];
|
||||
struct crypt_priv crypt_ctx = {
|
||||
.ctx = &ctx->crypt_ctx,
|
||||
.fpu_enabled = false,
|
||||
};
|
||||
struct xts_crypt_req req = {
|
||||
.tbuf = buf,
|
||||
.tbuflen = sizeof(buf),
|
||||
|
||||
.tweak_ctx = &ctx->tweak_ctx,
|
||||
.tweak_fn = XTS_TWEAK_CAST(__serpent_encrypt),
|
||||
.crypt_ctx = &crypt_ctx,
|
||||
.crypt_fn = decrypt_callback,
|
||||
};
|
||||
int ret;
|
||||
|
||||
desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
|
||||
ret = xts_crypt(desc, dst, src, nbytes, &req);
|
||||
serpent_fpu_end(crypt_ctx.fpu_enabled);
|
||||
|
||||
return ret;
|
||||
return glue_xts_crypt_128bit(&serpent_dec_xts, desc, dst, src, nbytes,
|
||||
XTS_TWEAK_CAST(__serpent_encrypt),
|
||||
&ctx->tweak_ctx, &ctx->crypt_ctx);
|
||||
}
|
||||
|
||||
static struct crypto_alg serpent_algs[10] = { {
|
||||
@ -417,7 +442,7 @@ static struct crypto_alg serpent_algs[10] = { {
|
||||
.cra_alignmask = 0,
|
||||
.cra_type = &crypto_blkcipher_type,
|
||||
.cra_module = THIS_MODULE,
|
||||
.cra_exit = lrw_exit_tfm,
|
||||
.cra_exit = lrw_serpent_exit_tfm,
|
||||
.cra_u = {
|
||||
.blkcipher = {
|
||||
.min_keysize = SERPENT_MIN_KEY_SIZE +
|
||||
|
496
arch/x86/crypto/sha256-avx-asm.S
Normal file
496
arch/x86/crypto/sha256-avx-asm.S
Normal file
@ -0,0 +1,496 @@
|
||||
########################################################################
|
||||
# Implement fast SHA-256 with AVX1 instructions. (x86_64)
|
||||
#
|
||||
# Copyright (C) 2013 Intel Corporation.
|
||||
#
|
||||
# Authors:
|
||||
# James Guilford <james.guilford@intel.com>
|
||||
# Kirk Yap <kirk.s.yap@intel.com>
|
||||
# Tim Chen <tim.c.chen@linux.intel.com>
|
||||
#
|
||||
# This software is available to you under a choice of one of two
|
||||
# licenses. You may choose to be licensed under the terms of the GNU
|
||||
# General Public License (GPL) Version 2, available from the file
|
||||
# COPYING in the main directory of this source tree, or the
|
||||
# OpenIB.org BSD license below:
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or
|
||||
# without modification, are permitted provided that the following
|
||||
# conditions are met:
|
||||
#
|
||||
# - Redistributions of source code must retain the above
|
||||
# copyright notice, this list of conditions and the following
|
||||
# disclaimer.
|
||||
#
|
||||
# - Redistributions in binary form must reproduce the above
|
||||
# copyright notice, this list of conditions and the following
|
||||
# disclaimer in the documentation and/or other materials
|
||||
# provided with the distribution.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
||||
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# SOFTWARE.
|
||||
########################################################################
|
||||
#
|
||||
# This code is described in an Intel White-Paper:
|
||||
# "Fast SHA-256 Implementations on Intel Architecture Processors"
|
||||
#
|
||||
# To find it, surf to http://www.intel.com/p/en_US/embedded
|
||||
# and search for that title.
|
||||
#
|
||||
########################################################################
|
||||
# This code schedules 1 block at a time, with 4 lanes per block
|
||||
########################################################################
|
||||
|
||||
#ifdef CONFIG_AS_AVX
|
||||
#include <linux/linkage.h>
|
||||
|
||||
## assume buffers not aligned
|
||||
#define VMOVDQ vmovdqu
|
||||
|
||||
################################ Define Macros
|
||||
|
||||
# addm [mem], reg
|
||||
# Add reg to mem using reg-mem add and store
|
||||
.macro addm p1 p2
|
||||
add \p1, \p2
|
||||
mov \p2, \p1
|
||||
.endm
|
||||
|
||||
|
||||
.macro MY_ROR p1 p2
|
||||
shld $(32-(\p1)), \p2, \p2
|
||||
.endm
|
||||
|
||||
################################
|
||||
|
||||
# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
|
||||
# Load xmm with mem and byte swap each dword
|
||||
.macro COPY_XMM_AND_BSWAP p1 p2 p3
|
||||
VMOVDQ \p2, \p1
|
||||
vpshufb \p3, \p1, \p1
|
||||
.endm
|
||||
|
||||
################################
|
||||
|
||||
X0 = %xmm4
|
||||
X1 = %xmm5
|
||||
X2 = %xmm6
|
||||
X3 = %xmm7
|
||||
|
||||
XTMP0 = %xmm0
|
||||
XTMP1 = %xmm1
|
||||
XTMP2 = %xmm2
|
||||
XTMP3 = %xmm3
|
||||
XTMP4 = %xmm8
|
||||
XFER = %xmm9
|
||||
XTMP5 = %xmm11
|
||||
|
||||
SHUF_00BA = %xmm10 # shuffle xBxA -> 00BA
|
||||
SHUF_DC00 = %xmm12 # shuffle xDxC -> DC00
|
||||
BYTE_FLIP_MASK = %xmm13
|
||||
|
||||
NUM_BLKS = %rdx # 3rd arg
|
||||
CTX = %rsi # 2nd arg
|
||||
INP = %rdi # 1st arg
|
||||
|
||||
SRND = %rdi # clobbers INP
|
||||
c = %ecx
|
||||
d = %r8d
|
||||
e = %edx
|
||||
TBL = %rbp
|
||||
a = %eax
|
||||
b = %ebx
|
||||
|
||||
f = %r9d
|
||||
g = %r10d
|
||||
h = %r11d
|
||||
|
||||
y0 = %r13d
|
||||
y1 = %r14d
|
||||
y2 = %r15d
|
||||
|
||||
|
||||
_INP_END_SIZE = 8
|
||||
_INP_SIZE = 8
|
||||
_XFER_SIZE = 8
|
||||
_XMM_SAVE_SIZE = 0
|
||||
|
||||
_INP_END = 0
|
||||
_INP = _INP_END + _INP_END_SIZE
|
||||
_XFER = _INP + _INP_SIZE
|
||||
_XMM_SAVE = _XFER + _XFER_SIZE
|
||||
STACK_SIZE = _XMM_SAVE + _XMM_SAVE_SIZE
|
||||
|
||||
# rotate_Xs
|
||||
# Rotate values of symbols X0...X3
|
||||
.macro rotate_Xs
|
||||
X_ = X0
|
||||
X0 = X1
|
||||
X1 = X2
|
||||
X2 = X3
|
||||
X3 = X_
|
||||
.endm
|
||||
|
||||
# ROTATE_ARGS
|
||||
# Rotate values of symbols a...h
|
||||
.macro ROTATE_ARGS
|
||||
TMP_ = h
|
||||
h = g
|
||||
g = f
|
||||
f = e
|
||||
e = d
|
||||
d = c
|
||||
c = b
|
||||
b = a
|
||||
a = TMP_
|
||||
.endm
|
||||
|
||||
.macro FOUR_ROUNDS_AND_SCHED
|
||||
## compute s0 four at a time and s1 two at a time
|
||||
## compute W[-16] + W[-7] 4 at a time
|
||||
|
||||
mov e, y0 # y0 = e
|
||||
MY_ROR (25-11), y0 # y0 = e >> (25-11)
|
||||
mov a, y1 # y1 = a
|
||||
vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7]
|
||||
MY_ROR (22-13), y1 # y1 = a >> (22-13)
|
||||
xor e, y0 # y0 = e ^ (e >> (25-11))
|
||||
mov f, y2 # y2 = f
|
||||
MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
|
||||
xor a, y1 # y1 = a ^ (a >> (22-13)
|
||||
xor g, y2 # y2 = f^g
|
||||
vpaddd X0, XTMP0, XTMP0 # XTMP0 = W[-7] + W[-16]
|
||||
xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
|
||||
and e, y2 # y2 = (f^g)&e
|
||||
MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
|
||||
## compute s0
|
||||
vpalignr $4, X0, X1, XTMP1 # XTMP1 = W[-15]
|
||||
xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
|
||||
MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
|
||||
xor g, y2 # y2 = CH = ((f^g)&e)^g
|
||||
MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
|
||||
add y0, y2 # y2 = S1 + CH
|
||||
add _XFER(%rsp), y2 # y2 = k + w + S1 + CH
|
||||
mov a, y0 # y0 = a
|
||||
add y2, h # h = h + S1 + CH + k + w
|
||||
mov a, y2 # y2 = a
|
||||
vpsrld $7, XTMP1, XTMP2
|
||||
or c, y0 # y0 = a|c
|
||||
add h, d # d = d + h + S1 + CH + k + w
|
||||
and c, y2 # y2 = a&c
|
||||
vpslld $(32-7), XTMP1, XTMP3
|
||||
and b, y0 # y0 = (a|c)&b
|
||||
add y1, h # h = h + S1 + CH + k + w + S0
|
||||
vpor XTMP2, XTMP3, XTMP3 # XTMP1 = W[-15] MY_ROR 7
|
||||
or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c)
|
||||
add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
|
||||
ROTATE_ARGS
|
||||
mov e, y0 # y0 = e
|
||||
mov a, y1 # y1 = a
|
||||
MY_ROR (25-11), y0 # y0 = e >> (25-11)
|
||||
xor e, y0 # y0 = e ^ (e >> (25-11))
|
||||
mov f, y2 # y2 = f
|
||||
MY_ROR (22-13), y1 # y1 = a >> (22-13)
|
||||
vpsrld $18, XTMP1, XTMP2 #
|
||||
xor a, y1 # y1 = a ^ (a >> (22-13)
|
||||
MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
|
||||
xor g, y2 # y2 = f^g
|
||||
vpsrld $3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3
|
||||
MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
|
||||
xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
|
||||
and e, y2 # y2 = (f^g)&e
|
||||
MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
|
||||
vpslld $(32-18), XTMP1, XTMP1
|
||||
xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
|
||||
xor g, y2 # y2 = CH = ((f^g)&e)^g
|
||||
vpxor XTMP1, XTMP3, XTMP3 #
|
||||
add y0, y2 # y2 = S1 + CH
|
||||
add (1*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
|
||||
MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
|
||||
vpxor XTMP2, XTMP3, XTMP3 # XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR
|
||||
mov a, y0 # y0 = a
|
||||
add y2, h # h = h + S1 + CH + k + w
|
||||
mov a, y2 # y2 = a
|
||||
vpxor XTMP4, XTMP3, XTMP1 # XTMP1 = s0
|
||||
or c, y0 # y0 = a|c
|
||||
add h, d # d = d + h + S1 + CH + k + w
|
||||
and c, y2 # y2 = a&c
|
||||
## compute low s1
|
||||
vpshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA}
|
||||
and b, y0 # y0 = (a|c)&b
|
||||
add y1, h # h = h + S1 + CH + k + w + S0
|
||||
vpaddd XTMP1, XTMP0, XTMP0 # XTMP0 = W[-16] + W[-7] + s0
|
||||
or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c)
|
||||
add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
|
||||
ROTATE_ARGS
|
||||
mov e, y0 # y0 = e
|
||||
mov a, y1 # y1 = a
|
||||
MY_ROR (25-11), y0 # y0 = e >> (25-11)
|
||||
xor e, y0 # y0 = e ^ (e >> (25-11))
|
||||
MY_ROR (22-13), y1 # y1 = a >> (22-13)
|
||||
mov f, y2 # y2 = f
|
||||
xor a, y1 # y1 = a ^ (a >> (22-13)
|
||||
MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
|
||||
vpsrld $10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA}
|
||||
xor g, y2 # y2 = f^g
|
||||
vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] MY_ROR 19 {xBxA}
|
||||
xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
|
||||
and e, y2 # y2 = (f^g)&e
|
||||
vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] MY_ROR 17 {xBxA}
|
||||
MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
|
||||
xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
|
||||
xor g, y2 # y2 = CH = ((f^g)&e)^g
|
||||
MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
|
||||
vpxor XTMP3, XTMP2, XTMP2 #
|
||||
add y0, y2 # y2 = S1 + CH
|
||||
MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
|
||||
add (2*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
|
||||
vpxor XTMP2, XTMP4, XTMP4 # XTMP4 = s1 {xBxA}
|
||||
mov a, y0 # y0 = a
|
||||
add y2, h # h = h + S1 + CH + k + w
|
||||
mov a, y2 # y2 = a
|
||||
vpshufb SHUF_00BA, XTMP4, XTMP4 # XTMP4 = s1 {00BA}
|
||||
or c, y0 # y0 = a|c
|
||||
add h, d # d = d + h + S1 + CH + k + w
|
||||
and c, y2 # y2 = a&c
|
||||
vpaddd XTMP4, XTMP0, XTMP0 # XTMP0 = {..., ..., W[1], W[0]}
|
||||
and b, y0 # y0 = (a|c)&b
|
||||
add y1, h # h = h + S1 + CH + k + w + S0
|
||||
## compute high s1
|
||||
vpshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
|
||||
or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c)
|
||||
add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
|
||||
ROTATE_ARGS
|
||||
mov e, y0 # y0 = e
|
||||
MY_ROR (25-11), y0 # y0 = e >> (25-11)
|
||||
mov a, y1 # y1 = a
|
||||
MY_ROR (22-13), y1 # y1 = a >> (22-13)
|
||||
xor e, y0 # y0 = e ^ (e >> (25-11))
|
||||
mov f, y2 # y2 = f
|
||||
MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
|
||||
vpsrld $10, XTMP2, XTMP5 # XTMP5 = W[-2] >> 10 {DDCC}
|
||||
xor a, y1 # y1 = a ^ (a >> (22-13)
|
||||
xor g, y2 # y2 = f^g
|
||||
vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] MY_ROR 19 {xDxC}
|
||||
xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
|
||||
and e, y2 # y2 = (f^g)&e
|
||||
MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
|
||||
vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] MY_ROR 17 {xDxC}
|
||||
xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
|
||||
MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
|
||||
xor g, y2 # y2 = CH = ((f^g)&e)^g
|
||||
vpxor XTMP3, XTMP2, XTMP2
|
||||
MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
|
||||
add y0, y2 # y2 = S1 + CH
|
||||
add (3*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
|
||||
vpxor XTMP2, XTMP5, XTMP5 # XTMP5 = s1 {xDxC}
|
||||
mov a, y0 # y0 = a
|
||||
add y2, h # h = h + S1 + CH + k + w
|
||||
mov a, y2 # y2 = a
|
||||
vpshufb SHUF_DC00, XTMP5, XTMP5 # XTMP5 = s1 {DC00}
|
||||
or c, y0 # y0 = a|c
|
||||
add h, d # d = d + h + S1 + CH + k + w
|
||||
and c, y2 # y2 = a&c
|
||||
vpaddd XTMP0, XTMP5, X0 # X0 = {W[3], W[2], W[1], W[0]}
|
||||
and b, y0 # y0 = (a|c)&b
|
||||
add y1, h # h = h + S1 + CH + k + w + S0
|
||||
or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c)
|
||||
add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
|
||||
ROTATE_ARGS
|
||||
rotate_Xs
|
||||
.endm
|
||||
|
||||
## input is [rsp + _XFER + %1 * 4]
|
||||
.macro DO_ROUND round
|
||||
mov e, y0 # y0 = e
|
||||
MY_ROR (25-11), y0 # y0 = e >> (25-11)
|
||||
mov a, y1 # y1 = a
|
||||
xor e, y0 # y0 = e ^ (e >> (25-11))
|
||||
MY_ROR (22-13), y1 # y1 = a >> (22-13)
|
||||
mov f, y2 # y2 = f
|
||||
xor a, y1 # y1 = a ^ (a >> (22-13)
|
||||
MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
|
||||
xor g, y2 # y2 = f^g
|
||||
xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
|
||||
MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
|
||||
and e, y2 # y2 = (f^g)&e
|
||||
xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
|
||||
MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
|
||||
xor g, y2 # y2 = CH = ((f^g)&e)^g
|
||||
add y0, y2 # y2 = S1 + CH
|
||||
MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
|
||||
offset = \round * 4 + _XFER #
|
||||
add offset(%rsp), y2 # y2 = k + w + S1 + CH
|
||||
mov a, y0 # y0 = a
|
||||
add y2, h # h = h + S1 + CH + k + w
|
||||
mov a, y2 # y2 = a
|
||||
or c, y0 # y0 = a|c
|
||||
add h, d # d = d + h + S1 + CH + k + w
|
||||
and c, y2 # y2 = a&c
|
||||
and b, y0 # y0 = (a|c)&b
|
||||
add y1, h # h = h + S1 + CH + k + w + S0
|
||||
or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c)
|
||||
add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
|
||||
ROTATE_ARGS
|
||||
.endm
|
||||
|
||||
########################################################################
|
||||
## void sha256_transform_avx(void *input_data, UINT32 digest[8], UINT64 num_blks)
|
||||
## arg 1 : pointer to input data
|
||||
## arg 2 : pointer to digest
|
||||
## arg 3 : Num blocks
|
||||
########################################################################
|
||||
.text
|
||||
ENTRY(sha256_transform_avx)
|
||||
.align 32
|
||||
pushq %rbx
|
||||
pushq %rbp
|
||||
pushq %r13
|
||||
pushq %r14
|
||||
pushq %r15
|
||||
pushq %r12
|
||||
|
||||
mov %rsp, %r12
|
||||
subq $STACK_SIZE, %rsp # allocate stack space
|
||||
and $~15, %rsp # align stack pointer
|
||||
|
||||
shl $6, NUM_BLKS # convert to bytes
|
||||
jz done_hash
|
||||
add INP, NUM_BLKS # pointer to end of data
|
||||
mov NUM_BLKS, _INP_END(%rsp)
|
||||
|
||||
## load initial digest
|
||||
mov 4*0(CTX), a
|
||||
mov 4*1(CTX), b
|
||||
mov 4*2(CTX), c
|
||||
mov 4*3(CTX), d
|
||||
mov 4*4(CTX), e
|
||||
mov 4*5(CTX), f
|
||||
mov 4*6(CTX), g
|
||||
mov 4*7(CTX), h
|
||||
|
||||
vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
|
||||
vmovdqa _SHUF_00BA(%rip), SHUF_00BA
|
||||
vmovdqa _SHUF_DC00(%rip), SHUF_DC00
|
||||
loop0:
|
||||
lea K256(%rip), TBL
|
||||
|
||||
## byte swap first 16 dwords
|
||||
COPY_XMM_AND_BSWAP X0, 0*16(INP), BYTE_FLIP_MASK
|
||||
COPY_XMM_AND_BSWAP X1, 1*16(INP), BYTE_FLIP_MASK
|
||||
COPY_XMM_AND_BSWAP X2, 2*16(INP), BYTE_FLIP_MASK
|
||||
COPY_XMM_AND_BSWAP X3, 3*16(INP), BYTE_FLIP_MASK
|
||||
|
||||
mov INP, _INP(%rsp)
|
||||
|
||||
## schedule 48 input dwords, by doing 3 rounds of 16 each
|
||||
mov $3, SRND
|
||||
.align 16
|
||||
loop1:
|
||||
vpaddd (TBL), X0, XFER
|
||||
vmovdqa XFER, _XFER(%rsp)
|
||||
FOUR_ROUNDS_AND_SCHED
|
||||
|
||||
vpaddd 1*16(TBL), X0, XFER
|
||||
vmovdqa XFER, _XFER(%rsp)
|
||||
FOUR_ROUNDS_AND_SCHED
|
||||
|
||||
vpaddd 2*16(TBL), X0, XFER
|
||||
vmovdqa XFER, _XFER(%rsp)
|
||||
FOUR_ROUNDS_AND_SCHED
|
||||
|
||||
vpaddd 3*16(TBL), X0, XFER
|
||||
vmovdqa XFER, _XFER(%rsp)
|
||||
add $4*16, TBL
|
||||
FOUR_ROUNDS_AND_SCHED
|
||||
|
||||
sub $1, SRND
|
||||
jne loop1
|
||||
|
||||
mov $2, SRND
|
||||
loop2:
|
||||
vpaddd (TBL), X0, XFER
|
||||
vmovdqa XFER, _XFER(%rsp)
|
||||
DO_ROUND 0
|
||||
DO_ROUND 1
|
||||
DO_ROUND 2
|
||||
DO_ROUND 3
|
||||
|
||||
vpaddd 1*16(TBL), X1, XFER
|
||||
vmovdqa XFER, _XFER(%rsp)
|
||||
add $2*16, TBL
|
||||
DO_ROUND 0
|
||||
DO_ROUND 1
|
||||
DO_ROUND 2
|
||||
DO_ROUND 3
|
||||
|
||||
vmovdqa X2, X0
|
||||
vmovdqa X3, X1
|
||||
|
||||
sub $1, SRND
|
||||
jne loop2
|
||||
|
||||
addm (4*0)(CTX),a
|
||||
addm (4*1)(CTX),b
|
||||
addm (4*2)(CTX),c
|
||||
addm (4*3)(CTX),d
|
||||
addm (4*4)(CTX),e
|
||||
addm (4*5)(CTX),f
|
||||
addm (4*6)(CTX),g
|
||||
addm (4*7)(CTX),h
|
||||
|
||||
mov _INP(%rsp), INP
|
||||
add $64, INP
|
||||
cmp _INP_END(%rsp), INP
|
||||
jne loop0
|
||||
|
||||
done_hash:
|
||||
|
||||
mov %r12, %rsp
|
||||
|
||||
popq %r12
|
||||
popq %r15
|
||||
popq %r14
|
||||
popq %r13
|
||||
popq %rbp
|
||||
popq %rbx
|
||||
ret
|
||||
ENDPROC(sha256_transform_avx)
|
||||
|
||||
.data
|
||||
.align 64
|
||||
K256:
|
||||
.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
|
||||
.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
|
||||
.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
|
||||
.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
|
||||
.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
|
||||
.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
|
||||
.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
|
||||
.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
|
||||
.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
|
||||
.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
|
||||
.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
|
||||
.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
|
||||
.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
|
||||
.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
|
||||
.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
|
||||
.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
|
||||
|
||||
PSHUFFLE_BYTE_FLIP_MASK:
|
||||
.octa 0x0c0d0e0f08090a0b0405060700010203
|
||||
|
||||
# shuffle xBxA -> 00BA
|
||||
_SHUF_00BA:
|
||||
.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100
|
||||
|
||||
# shuffle xDxC -> DC00
|
||||
_SHUF_DC00:
|
||||
.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF
|
||||
#endif
|
772
arch/x86/crypto/sha256-avx2-asm.S
Normal file
772
arch/x86/crypto/sha256-avx2-asm.S
Normal file
@ -0,0 +1,772 @@
|
||||
########################################################################
|
||||
# Implement fast SHA-256 with AVX2 instructions. (x86_64)
|
||||
#
|
||||
# Copyright (C) 2013 Intel Corporation.
|
||||
#
|
||||
# Authors:
|
||||
# James Guilford <james.guilford@intel.com>
|
||||
# Kirk Yap <kirk.s.yap@intel.com>
|
||||
# Tim Chen <tim.c.chen@linux.intel.com>
|
||||
#
|
||||
# This software is available to you under a choice of one of two
|
||||
# licenses. You may choose to be licensed under the terms of the GNU
|
||||
# General Public License (GPL) Version 2, available from the file
|
||||
# COPYING in the main directory of this source tree, or the
|
||||
# OpenIB.org BSD license below:
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or
|
||||
# without modification, are permitted provided that the following
|
||||
# conditions are met:
|
||||
#
|
||||
# - Redistributions of source code must retain the above
|
||||
# copyright notice, this list of conditions and the following
|
||||
# disclaimer.
|
||||
#
|
||||
# - Redistributions in binary form must reproduce the above
|
||||
# copyright notice, this list of conditions and the following
|
||||
# disclaimer in the documentation and/or other materials
|
||||
# provided with the distribution.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
||||
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# SOFTWARE.
|
||||
#
|
||||
########################################################################
|
||||
#
|
||||
# This code is described in an Intel White-Paper:
|
||||
# "Fast SHA-256 Implementations on Intel Architecture Processors"
|
||||
#
|
||||
# To find it, surf to http://www.intel.com/p/en_US/embedded
|
||||
# and search for that title.
|
||||
#
|
||||
########################################################################
|
||||
# This code schedules 2 blocks at a time, with 4 lanes per block
|
||||
########################################################################
|
||||
|
||||
#ifdef CONFIG_AS_AVX2
|
||||
#include <linux/linkage.h>
|
||||
|
||||
## assume buffers not aligned
|
||||
#define VMOVDQ vmovdqu
|
||||
|
||||
################################ Define Macros
|
||||
|
||||
# addm [mem], reg
|
||||
# Add reg to mem using reg-mem add and store
|
||||
.macro addm p1 p2
|
||||
add \p1, \p2
|
||||
mov \p2, \p1
|
||||
.endm
|
||||
|
||||
################################
|
||||
|
||||
X0 = %ymm4
|
||||
X1 = %ymm5
|
||||
X2 = %ymm6
|
||||
X3 = %ymm7
|
||||
|
||||
# XMM versions of above
|
||||
XWORD0 = %xmm4
|
||||
XWORD1 = %xmm5
|
||||
XWORD2 = %xmm6
|
||||
XWORD3 = %xmm7
|
||||
|
||||
XTMP0 = %ymm0
|
||||
XTMP1 = %ymm1
|
||||
XTMP2 = %ymm2
|
||||
XTMP3 = %ymm3
|
||||
XTMP4 = %ymm8
|
||||
XFER = %ymm9
|
||||
XTMP5 = %ymm11
|
||||
|
||||
SHUF_00BA = %ymm10 # shuffle xBxA -> 00BA
|
||||
SHUF_DC00 = %ymm12 # shuffle xDxC -> DC00
|
||||
BYTE_FLIP_MASK = %ymm13
|
||||
|
||||
X_BYTE_FLIP_MASK = %xmm13 # XMM version of BYTE_FLIP_MASK
|
||||
|
||||
NUM_BLKS = %rdx # 3rd arg
|
||||
CTX = %rsi # 2nd arg
|
||||
INP = %rdi # 1st arg
|
||||
c = %ecx
|
||||
d = %r8d
|
||||
e = %edx # clobbers NUM_BLKS
|
||||
y3 = %edi # clobbers INP
|
||||
|
||||
|
||||
TBL = %rbp
|
||||
SRND = CTX # SRND is same register as CTX
|
||||
|
||||
a = %eax
|
||||
b = %ebx
|
||||
f = %r9d
|
||||
g = %r10d
|
||||
h = %r11d
|
||||
old_h = %r11d
|
||||
|
||||
T1 = %r12d
|
||||
y0 = %r13d
|
||||
y1 = %r14d
|
||||
y2 = %r15d
|
||||
|
||||
|
||||
_XFER_SIZE = 2*64*4 # 2 blocks, 64 rounds, 4 bytes/round
|
||||
_XMM_SAVE_SIZE = 0
|
||||
_INP_END_SIZE = 8
|
||||
_INP_SIZE = 8
|
||||
_CTX_SIZE = 8
|
||||
_RSP_SIZE = 8
|
||||
|
||||
_XFER = 0
|
||||
_XMM_SAVE = _XFER + _XFER_SIZE
|
||||
_INP_END = _XMM_SAVE + _XMM_SAVE_SIZE
|
||||
_INP = _INP_END + _INP_END_SIZE
|
||||
_CTX = _INP + _INP_SIZE
|
||||
_RSP = _CTX + _CTX_SIZE
|
||||
STACK_SIZE = _RSP + _RSP_SIZE
|
||||
|
||||
# rotate_Xs
|
||||
# Rotate values of symbols X0...X3
|
||||
.macro rotate_Xs
|
||||
X_ = X0
|
||||
X0 = X1
|
||||
X1 = X2
|
||||
X2 = X3
|
||||
X3 = X_
|
||||
.endm
|
||||
|
||||
# ROTATE_ARGS
|
||||
# Rotate values of symbols a...h
|
||||
.macro ROTATE_ARGS
|
||||
old_h = h
|
||||
TMP_ = h
|
||||
h = g
|
||||
g = f
|
||||
f = e
|
||||
e = d
|
||||
d = c
|
||||
c = b
|
||||
b = a
|
||||
a = TMP_
|
||||
.endm
|
||||
|
||||
.macro FOUR_ROUNDS_AND_SCHED disp
|
||||
################################### RND N + 0 ############################
|
||||
|
||||
mov a, y3 # y3 = a # MAJA
|
||||
rorx $25, e, y0 # y0 = e >> 25 # S1A
|
||||
rorx $11, e, y1 # y1 = e >> 11 # S1B
|
||||
|
||||
addl \disp(%rsp, SRND), h # h = k + w + h # --
|
||||
or c, y3 # y3 = a|c # MAJA
|
||||
vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7]
|
||||
mov f, y2 # y2 = f # CH
|
||||
rorx $13, a, T1 # T1 = a >> 13 # S0B
|
||||
|
||||
xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
|
||||
xor g, y2 # y2 = f^g # CH
|
||||
vpaddd X0, XTMP0, XTMP0 # XTMP0 = W[-7] + W[-16]# y1 = (e >> 6)# S1
|
||||
rorx $6, e, y1 # y1 = (e >> 6) # S1
|
||||
|
||||
and e, y2 # y2 = (f^g)&e # CH
|
||||
xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
|
||||
rorx $22, a, y1 # y1 = a >> 22 # S0A
|
||||
add h, d # d = k + w + h + d # --
|
||||
|
||||
and b, y3 # y3 = (a|c)&b # MAJA
|
||||
vpalignr $4, X0, X1, XTMP1 # XTMP1 = W[-15]
|
||||
xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
|
||||
rorx $2, a, T1 # T1 = (a >> 2) # S0
|
||||
|
||||
xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
|
||||
vpsrld $7, XTMP1, XTMP2
|
||||
xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
|
||||
mov a, T1 # T1 = a # MAJB
|
||||
and c, T1 # T1 = a&c # MAJB
|
||||
|
||||
add y0, y2 # y2 = S1 + CH # --
|
||||
vpslld $(32-7), XTMP1, XTMP3
|
||||
or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
|
||||
add y1, h # h = k + w + h + S0 # --
|
||||
|
||||
add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
|
||||
vpor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7
|
||||
|
||||
vpsrld $18, XTMP1, XTMP2
|
||||
add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
|
||||
add y3, h # h = t1 + S0 + MAJ # --
|
||||
|
||||
|
||||
ROTATE_ARGS
|
||||
|
||||
################################### RND N + 1 ############################
|
||||
|
||||
mov a, y3 # y3 = a # MAJA
|
||||
rorx $25, e, y0 # y0 = e >> 25 # S1A
|
||||
rorx $11, e, y1 # y1 = e >> 11 # S1B
|
||||
offset = \disp + 1*4
|
||||
addl offset(%rsp, SRND), h # h = k + w + h # --
|
||||
or c, y3 # y3 = a|c # MAJA
|
||||
|
||||
|
||||
vpsrld $3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3
|
||||
mov f, y2 # y2 = f # CH
|
||||
rorx $13, a, T1 # T1 = a >> 13 # S0B
|
||||
xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
|
||||
xor g, y2 # y2 = f^g # CH
|
||||
|
||||
|
||||
rorx $6, e, y1 # y1 = (e >> 6) # S1
|
||||
xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
|
||||
rorx $22, a, y1 # y1 = a >> 22 # S0A
|
||||
and e, y2 # y2 = (f^g)&e # CH
|
||||
add h, d # d = k + w + h + d # --
|
||||
|
||||
vpslld $(32-18), XTMP1, XTMP1
|
||||
and b, y3 # y3 = (a|c)&b # MAJA
|
||||
xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
|
||||
|
||||
vpxor XTMP1, XTMP3, XTMP3
|
||||
rorx $2, a, T1 # T1 = (a >> 2) # S0
|
||||
xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
|
||||
|
||||
vpxor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
|
||||
xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
|
||||
mov a, T1 # T1 = a # MAJB
|
||||
and c, T1 # T1 = a&c # MAJB
|
||||
add y0, y2 # y2 = S1 + CH # --
|
||||
|
||||
vpxor XTMP4, XTMP3, XTMP1 # XTMP1 = s0
|
||||
vpshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA}
|
||||
or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
|
||||
add y1, h # h = k + w + h + S0 # --
|
||||
|
||||
vpaddd XTMP1, XTMP0, XTMP0 # XTMP0 = W[-16] + W[-7] + s0
|
||||
add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
|
||||
add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
|
||||
add y3, h # h = t1 + S0 + MAJ # --
|
||||
|
||||
vpsrld $10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA}
|
||||
|
||||
|
||||
ROTATE_ARGS
|
||||
|
||||
################################### RND N + 2 ############################
|
||||
|
||||
mov a, y3 # y3 = a # MAJA
|
||||
rorx $25, e, y0 # y0 = e >> 25 # S1A
|
||||
offset = \disp + 2*4
|
||||
addl offset(%rsp, SRND), h # h = k + w + h # --
|
||||
|
||||
vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA}
|
||||
rorx $11, e, y1 # y1 = e >> 11 # S1B
|
||||
or c, y3 # y3 = a|c # MAJA
|
||||
mov f, y2 # y2 = f # CH
|
||||
xor g, y2 # y2 = f^g # CH
|
||||
|
||||
rorx $13, a, T1 # T1 = a >> 13 # S0B
|
||||
xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
|
||||
vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 {xBxA}
|
||||
and e, y2 # y2 = (f^g)&e # CH
|
||||
|
||||
rorx $6, e, y1 # y1 = (e >> 6) # S1
|
||||
vpxor XTMP3, XTMP2, XTMP2
|
||||
add h, d # d = k + w + h + d # --
|
||||
and b, y3 # y3 = (a|c)&b # MAJA
|
||||
|
||||
xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
|
||||
rorx $22, a, y1 # y1 = a >> 22 # S0A
|
||||
vpxor XTMP2, XTMP4, XTMP4 # XTMP4 = s1 {xBxA}
|
||||
xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
|
||||
|
||||
vpshufb SHUF_00BA, XTMP4, XTMP4 # XTMP4 = s1 {00BA}
|
||||
xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
|
||||
rorx $2, a ,T1 # T1 = (a >> 2) # S0
|
||||
vpaddd XTMP4, XTMP0, XTMP0 # XTMP0 = {..., ..., W[1], W[0]}
|
||||
|
||||
xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
|
||||
mov a, T1 # T1 = a # MAJB
|
||||
and c, T1 # T1 = a&c # MAJB
|
||||
add y0, y2 # y2 = S1 + CH # --
|
||||
vpshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
|
||||
|
||||
or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
|
||||
add y1,h # h = k + w + h + S0 # --
|
||||
add y2,d # d = k + w + h + d + S1 + CH = d + t1 # --
|
||||
add y2,h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
|
||||
|
||||
add y3,h # h = t1 + S0 + MAJ # --
|
||||
|
||||
|
||||
ROTATE_ARGS
|
||||
|
||||
################################### RND N + 3 ############################
|
||||
|
||||
mov a, y3 # y3 = a # MAJA
|
||||
rorx $25, e, y0 # y0 = e >> 25 # S1A
|
||||
rorx $11, e, y1 # y1 = e >> 11 # S1B
|
||||
offset = \disp + 3*4
|
||||
addl offset(%rsp, SRND), h # h = k + w + h # --
|
||||
or c, y3 # y3 = a|c # MAJA
|
||||
|
||||
|
||||
vpsrld $10, XTMP2, XTMP5 # XTMP5 = W[-2] >> 10 {DDCC}
|
||||
mov f, y2 # y2 = f # CH
|
||||
rorx $13, a, T1 # T1 = a >> 13 # S0B
|
||||
xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
|
||||
xor g, y2 # y2 = f^g # CH
|
||||
|
||||
|
||||
vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xDxC}
|
||||
rorx $6, e, y1 # y1 = (e >> 6) # S1
|
||||
and e, y2 # y2 = (f^g)&e # CH
|
||||
add h, d # d = k + w + h + d # --
|
||||
and b, y3 # y3 = (a|c)&b # MAJA
|
||||
|
||||
vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 {xDxC}
|
||||
xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
|
||||
xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
|
||||
|
||||
vpxor XTMP3, XTMP2, XTMP2
|
||||
rorx $22, a, y1 # y1 = a >> 22 # S0A
|
||||
add y0, y2 # y2 = S1 + CH # --
|
||||
|
||||
vpxor XTMP2, XTMP5, XTMP5 # XTMP5 = s1 {xDxC}
|
||||
xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
|
||||
add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
|
||||
|
||||
rorx $2, a, T1 # T1 = (a >> 2) # S0
|
||||
vpshufb SHUF_DC00, XTMP5, XTMP5 # XTMP5 = s1 {DC00}
|
||||
|
||||
vpaddd XTMP0, XTMP5, X0 # X0 = {W[3], W[2], W[1], W[0]}
|
||||
xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
|
||||
mov a, T1 # T1 = a # MAJB
|
||||
and c, T1 # T1 = a&c # MAJB
|
||||
or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
|
||||
|
||||
add y1, h # h = k + w + h + S0 # --
|
||||
add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
|
||||
add y3, h # h = t1 + S0 + MAJ # --
|
||||
|
||||
ROTATE_ARGS
|
||||
rotate_Xs
|
||||
.endm
|
||||
|
||||
.macro DO_4ROUNDS disp
|
||||
################################### RND N + 0 ###########################
|
||||
|
||||
mov f, y2 # y2 = f # CH
|
||||
rorx $25, e, y0 # y0 = e >> 25 # S1A
|
||||
rorx $11, e, y1 # y1 = e >> 11 # S1B
|
||||
xor g, y2 # y2 = f^g # CH
|
||||
|
||||
xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
|
||||
rorx $6, e, y1 # y1 = (e >> 6) # S1
|
||||
and e, y2 # y2 = (f^g)&e # CH
|
||||
|
||||
xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
|
||||
rorx $13, a, T1 # T1 = a >> 13 # S0B
|
||||
xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
|
||||
rorx $22, a, y1 # y1 = a >> 22 # S0A
|
||||
mov a, y3 # y3 = a # MAJA
|
||||
|
||||
xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
|
||||
rorx $2, a, T1 # T1 = (a >> 2) # S0
|
||||
addl \disp(%rsp, SRND), h # h = k + w + h # --
|
||||
or c, y3 # y3 = a|c # MAJA
|
||||
|
||||
xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
|
||||
mov a, T1 # T1 = a # MAJB
|
||||
and b, y3 # y3 = (a|c)&b # MAJA
|
||||
and c, T1 # T1 = a&c # MAJB
|
||||
add y0, y2 # y2 = S1 + CH # --
|
||||
|
||||
|
||||
add h, d # d = k + w + h + d # --
|
||||
or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
|
||||
add y1, h # h = k + w + h + S0 # --
|
||||
add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
|
||||
|
||||
ROTATE_ARGS
|
||||
|
||||
################################### RND N + 1 ###########################
|
||||
|
||||
add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
|
||||
mov f, y2 # y2 = f # CH
|
||||
rorx $25, e, y0 # y0 = e >> 25 # S1A
|
||||
rorx $11, e, y1 # y1 = e >> 11 # S1B
|
||||
xor g, y2 # y2 = f^g # CH
|
||||
|
||||
xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
|
||||
rorx $6, e, y1 # y1 = (e >> 6) # S1
|
||||
and e, y2 # y2 = (f^g)&e # CH
|
||||
add y3, old_h # h = t1 + S0 + MAJ # --
|
||||
|
||||
xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
|
||||
rorx $13, a, T1 # T1 = a >> 13 # S0B
|
||||
xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
|
||||
rorx $22, a, y1 # y1 = a >> 22 # S0A
|
||||
mov a, y3 # y3 = a # MAJA
|
||||
|
||||
xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
|
||||
rorx $2, a, T1 # T1 = (a >> 2) # S0
|
||||
offset = 4*1 + \disp
|
||||
addl offset(%rsp, SRND), h # h = k + w + h # --
|
||||
or c, y3 # y3 = a|c # MAJA
|
||||
|
||||
xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
|
||||
mov a, T1 # T1 = a # MAJB
|
||||
and b, y3 # y3 = (a|c)&b # MAJA
|
||||
and c, T1 # T1 = a&c # MAJB
|
||||
add y0, y2 # y2 = S1 + CH # --
|
||||
|
||||
|
||||
add h, d # d = k + w + h + d # --
|
||||
or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
|
||||
add y1, h # h = k + w + h + S0 # --
|
||||
|
||||
add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
|
||||
|
||||
ROTATE_ARGS
|
||||
|
||||
################################### RND N + 2 ##############################
|
||||
|
||||
add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
|
||||
mov f, y2 # y2 = f # CH
|
||||
rorx $25, e, y0 # y0 = e >> 25 # S1A
|
||||
rorx $11, e, y1 # y1 = e >> 11 # S1B
|
||||
xor g, y2 # y2 = f^g # CH
|
||||
|
||||
xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
|
||||
rorx $6, e, y1 # y1 = (e >> 6) # S1
|
||||
and e, y2 # y2 = (f^g)&e # CH
|
||||
add y3, old_h # h = t1 + S0 + MAJ # --
|
||||
|
||||
xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
|
||||
rorx $13, a, T1 # T1 = a >> 13 # S0B
|
||||
xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
|
||||
rorx $22, a, y1 # y1 = a >> 22 # S0A
|
||||
mov a, y3 # y3 = a # MAJA
|
||||
|
||||
xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
|
||||
rorx $2, a, T1 # T1 = (a >> 2) # S0
|
||||
offset = 4*2 + \disp
|
||||
addl offset(%rsp, SRND), h # h = k + w + h # --
|
||||
or c, y3 # y3 = a|c # MAJA
|
||||
|
||||
xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
|
||||
mov a, T1 # T1 = a # MAJB
|
||||
and b, y3 # y3 = (a|c)&b # MAJA
|
||||
and c, T1 # T1 = a&c # MAJB
|
||||
add y0, y2 # y2 = S1 + CH # --
|
||||
|
||||
|
||||
add h, d # d = k + w + h + d # --
|
||||
or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
|
||||
add y1, h # h = k + w + h + S0 # --
|
||||
|
||||
add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
|
||||
|
||||
ROTATE_ARGS
|
||||
|
||||
################################### RND N + 3 ###########################
|
||||
|
||||
add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
|
||||
mov f, y2 # y2 = f # CH
|
||||
rorx $25, e, y0 # y0 = e >> 25 # S1A
|
||||
rorx $11, e, y1 # y1 = e >> 11 # S1B
|
||||
xor g, y2 # y2 = f^g # CH
|
||||
|
||||
xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
|
||||
rorx $6, e, y1 # y1 = (e >> 6) # S1
|
||||
and e, y2 # y2 = (f^g)&e # CH
|
||||
add y3, old_h # h = t1 + S0 + MAJ # --
|
||||
|
||||
xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
|
||||
rorx $13, a, T1 # T1 = a >> 13 # S0B
|
||||
xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
|
||||
rorx $22, a, y1 # y1 = a >> 22 # S0A
|
||||
mov a, y3 # y3 = a # MAJA
|
||||
|
||||
xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
|
||||
rorx $2, a, T1 # T1 = (a >> 2) # S0
|
||||
offset = 4*3 + \disp
|
||||
addl offset(%rsp, SRND), h # h = k + w + h # --
|
||||
or c, y3 # y3 = a|c # MAJA
|
||||
|
||||
xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
|
||||
mov a, T1 # T1 = a # MAJB
|
||||
and b, y3 # y3 = (a|c)&b # MAJA
|
||||
and c, T1 # T1 = a&c # MAJB
|
||||
add y0, y2 # y2 = S1 + CH # --
|
||||
|
||||
|
||||
add h, d # d = k + w + h + d # --
|
||||
or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
|
||||
add y1, h # h = k + w + h + S0 # --
|
||||
|
||||
add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
|
||||
|
||||
|
||||
add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
|
||||
|
||||
add y3, h # h = t1 + S0 + MAJ # --
|
||||
|
||||
ROTATE_ARGS
|
||||
|
||||
.endm
|
||||
|
||||
########################################################################
|
||||
## void sha256_transform_rorx(void *input_data, UINT32 digest[8], UINT64 num_blks)
|
||||
## arg 1 : pointer to input data
|
||||
## arg 2 : pointer to digest
|
||||
## arg 3 : Num blocks
|
||||
########################################################################
|
||||
.text
|
||||
ENTRY(sha256_transform_rorx)
|
||||
.align 32
|
||||
pushq %rbx
|
||||
pushq %rbp
|
||||
pushq %r12
|
||||
pushq %r13
|
||||
pushq %r14
|
||||
pushq %r15
|
||||
|
||||
mov %rsp, %rax
|
||||
subq $STACK_SIZE, %rsp
|
||||
and $-32, %rsp # align rsp to 32 byte boundary
|
||||
mov %rax, _RSP(%rsp)
|
||||
|
||||
|
||||
shl $6, NUM_BLKS # convert to bytes
|
||||
jz done_hash
|
||||
lea -64(INP, NUM_BLKS), NUM_BLKS # pointer to last block
|
||||
mov NUM_BLKS, _INP_END(%rsp)
|
||||
|
||||
cmp NUM_BLKS, INP
|
||||
je only_one_block
|
||||
|
||||
## load initial digest
|
||||
mov (CTX), a
|
||||
mov 4*1(CTX), b
|
||||
mov 4*2(CTX), c
|
||||
mov 4*3(CTX), d
|
||||
mov 4*4(CTX), e
|
||||
mov 4*5(CTX), f
|
||||
mov 4*6(CTX), g
|
||||
mov 4*7(CTX), h
|
||||
|
||||
vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
|
||||
vmovdqa _SHUF_00BA(%rip), SHUF_00BA
|
||||
vmovdqa _SHUF_DC00(%rip), SHUF_DC00
|
||||
|
||||
mov CTX, _CTX(%rsp)
|
||||
|
||||
loop0:
|
||||
lea K256(%rip), TBL
|
||||
|
||||
## Load first 16 dwords from two blocks
|
||||
VMOVDQ 0*32(INP),XTMP0
|
||||
VMOVDQ 1*32(INP),XTMP1
|
||||
VMOVDQ 2*32(INP),XTMP2
|
||||
VMOVDQ 3*32(INP),XTMP3
|
||||
|
||||
## byte swap data
|
||||
vpshufb BYTE_FLIP_MASK, XTMP0, XTMP0
|
||||
vpshufb BYTE_FLIP_MASK, XTMP1, XTMP1
|
||||
vpshufb BYTE_FLIP_MASK, XTMP2, XTMP2
|
||||
vpshufb BYTE_FLIP_MASK, XTMP3, XTMP3
|
||||
|
||||
## transpose data into high/low halves
|
||||
vperm2i128 $0x20, XTMP2, XTMP0, X0
|
||||
vperm2i128 $0x31, XTMP2, XTMP0, X1
|
||||
vperm2i128 $0x20, XTMP3, XTMP1, X2
|
||||
vperm2i128 $0x31, XTMP3, XTMP1, X3
|
||||
|
||||
last_block_enter:
|
||||
add $64, INP
|
||||
mov INP, _INP(%rsp)
|
||||
|
||||
## schedule 48 input dwords, by doing 3 rounds of 12 each
|
||||
xor SRND, SRND
|
||||
|
||||
.align 16
|
||||
loop1:
|
||||
vpaddd 0*32(TBL, SRND), X0, XFER
|
||||
vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
|
||||
FOUR_ROUNDS_AND_SCHED _XFER + 0*32
|
||||
|
||||
vpaddd 1*32(TBL, SRND), X0, XFER
|
||||
vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
|
||||
FOUR_ROUNDS_AND_SCHED _XFER + 1*32
|
||||
|
||||
vpaddd 2*32(TBL, SRND), X0, XFER
|
||||
vmovdqa XFER, 2*32+_XFER(%rsp, SRND)
|
||||
FOUR_ROUNDS_AND_SCHED _XFER + 2*32
|
||||
|
||||
vpaddd 3*32(TBL, SRND), X0, XFER
|
||||
vmovdqa XFER, 3*32+_XFER(%rsp, SRND)
|
||||
FOUR_ROUNDS_AND_SCHED _XFER + 3*32
|
||||
|
||||
add $4*32, SRND
|
||||
cmp $3*4*32, SRND
|
||||
jb loop1
|
||||
|
||||
loop2:
|
||||
## Do last 16 rounds with no scheduling
|
||||
vpaddd 0*32(TBL, SRND), X0, XFER
|
||||
vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
|
||||
DO_4ROUNDS _XFER + 0*32
|
||||
vpaddd 1*32(TBL, SRND), X1, XFER
|
||||
vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
|
||||
DO_4ROUNDS _XFER + 1*32
|
||||
add $2*32, SRND
|
||||
|
||||
vmovdqa X2, X0
|
||||
vmovdqa X3, X1
|
||||
|
||||
cmp $4*4*32, SRND
|
||||
jb loop2
|
||||
|
||||
mov _CTX(%rsp), CTX
|
||||
mov _INP(%rsp), INP
|
||||
|
||||
addm (4*0)(CTX),a
|
||||
addm (4*1)(CTX),b
|
||||
addm (4*2)(CTX),c
|
||||
addm (4*3)(CTX),d
|
||||
addm (4*4)(CTX),e
|
||||
addm (4*5)(CTX),f
|
||||
addm (4*6)(CTX),g
|
||||
addm (4*7)(CTX),h
|
||||
|
||||
cmp _INP_END(%rsp), INP
|
||||
ja done_hash
|
||||
|
||||
#### Do second block using previously scheduled results
|
||||
xor SRND, SRND
|
||||
.align 16
|
||||
loop3:
|
||||
DO_4ROUNDS _XFER + 0*32 + 16
|
||||
DO_4ROUNDS _XFER + 1*32 + 16
|
||||
add $2*32, SRND
|
||||
cmp $4*4*32, SRND
|
||||
jb loop3
|
||||
|
||||
mov _CTX(%rsp), CTX
|
||||
mov _INP(%rsp), INP
|
||||
add $64, INP
|
||||
|
||||
addm (4*0)(CTX),a
|
||||
addm (4*1)(CTX),b
|
||||
addm (4*2)(CTX),c
|
||||
addm (4*3)(CTX),d
|
||||
addm (4*4)(CTX),e
|
||||
addm (4*5)(CTX),f
|
||||
addm (4*6)(CTX),g
|
||||
addm (4*7)(CTX),h
|
||||
|
||||
cmp _INP_END(%rsp), INP
|
||||
jb loop0
|
||||
ja done_hash
|
||||
|
||||
do_last_block:
|
||||
#### do last block
|
||||
lea K256(%rip), TBL
|
||||
|
||||
VMOVDQ 0*16(INP),XWORD0
|
||||
VMOVDQ 1*16(INP),XWORD1
|
||||
VMOVDQ 2*16(INP),XWORD2
|
||||
VMOVDQ 3*16(INP),XWORD3
|
||||
|
||||
vpshufb X_BYTE_FLIP_MASK, XWORD0, XWORD0
|
||||
vpshufb X_BYTE_FLIP_MASK, XWORD1, XWORD1
|
||||
vpshufb X_BYTE_FLIP_MASK, XWORD2, XWORD2
|
||||
vpshufb X_BYTE_FLIP_MASK, XWORD3, XWORD3
|
||||
|
||||
jmp last_block_enter
|
||||
|
||||
only_one_block:
|
||||
|
||||
## load initial digest
|
||||
mov (4*0)(CTX),a
|
||||
mov (4*1)(CTX),b
|
||||
mov (4*2)(CTX),c
|
||||
mov (4*3)(CTX),d
|
||||
mov (4*4)(CTX),e
|
||||
mov (4*5)(CTX),f
|
||||
mov (4*6)(CTX),g
|
||||
mov (4*7)(CTX),h
|
||||
|
||||
vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
|
||||
vmovdqa _SHUF_00BA(%rip), SHUF_00BA
|
||||
vmovdqa _SHUF_DC00(%rip), SHUF_DC00
|
||||
|
||||
mov CTX, _CTX(%rsp)
|
||||
jmp do_last_block
|
||||
|
||||
done_hash:
|
||||
|
||||
mov _RSP(%rsp), %rsp
|
||||
|
||||
popq %r15
|
||||
popq %r14
|
||||
popq %r13
|
||||
popq %r12
|
||||
popq %rbp
|
||||
popq %rbx
|
||||
ret
|
||||
ENDPROC(sha256_transform_rorx)
|
||||
|
||||
.data
|
||||
.align 64
|
||||
K256:
|
||||
.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
|
||||
.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
|
||||
.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
|
||||
.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
|
||||
.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
|
||||
.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
|
||||
.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
|
||||
.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
|
||||
.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
|
||||
.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
|
||||
.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
|
||||
.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
|
||||
.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
|
||||
.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
|
||||
.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
|
||||
.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
|
||||
.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
|
||||
.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
|
||||
.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
|
||||
.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
|
||||
.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
|
||||
.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
|
||||
.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
|
||||
.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
|
||||
.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
|
||||
.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
|
||||
.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
|
||||
.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
|
||||
.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
|
||||
.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
|
||||
.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
|
||||
.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
|
||||
|
||||
PSHUFFLE_BYTE_FLIP_MASK:
|
||||
.octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203
|
||||
|
||||
# shuffle xBxA -> 00BA
|
||||
_SHUF_00BA:
|
||||
.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100
|
||||
|
||||
# shuffle xDxC -> DC00
|
||||
_SHUF_DC00:
|
||||
.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF
|
||||
#endif
|
506
arch/x86/crypto/sha256-ssse3-asm.S
Normal file
506
arch/x86/crypto/sha256-ssse3-asm.S
Normal file
@ -0,0 +1,506 @@
|
||||
########################################################################
|
||||
# Implement fast SHA-256 with SSSE3 instructions. (x86_64)
|
||||
#
|
||||
# Copyright (C) 2013 Intel Corporation.
|
||||
#
|
||||
# Authors:
|
||||
# James Guilford <james.guilford@intel.com>
|
||||
# Kirk Yap <kirk.s.yap@intel.com>
|
||||
# Tim Chen <tim.c.chen@linux.intel.com>
|
||||
#
|
||||
# This software is available to you under a choice of one of two
|
||||
# licenses. You may choose to be licensed under the terms of the GNU
|
||||
# General Public License (GPL) Version 2, available from the file
|
||||
# COPYING in the main directory of this source tree, or the
|
||||
# OpenIB.org BSD license below:
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or
|
||||
# without modification, are permitted provided that the following
|
||||
# conditions are met:
|
||||
#
|
||||
# - Redistributions of source code must retain the above
|
||||
# copyright notice, this list of conditions and the following
|
||||
# disclaimer.
|
||||
#
|
||||
# - Redistributions in binary form must reproduce the above
|
||||
# copyright notice, this list of conditions and the following
|
||||
# disclaimer in the documentation and/or other materials
|
||||
# provided with the distribution.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
||||
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# SOFTWARE.
|
||||
#
|
||||
########################################################################
|
||||
#
|
||||
# This code is described in an Intel White-Paper:
|
||||
# "Fast SHA-256 Implementations on Intel Architecture Processors"
|
||||
#
|
||||
# To find it, surf to http://www.intel.com/p/en_US/embedded
|
||||
# and search for that title.
|
||||
#
|
||||
########################################################################
|
||||
|
||||
#include <linux/linkage.h>
|
||||
|
||||
## assume buffers not aligned
|
||||
#define MOVDQ movdqu
|
||||
|
||||
################################ Define Macros
|
||||
|
||||
# addm [mem], reg
|
||||
# Add reg to mem using reg-mem add and store
|
||||
.macro addm p1 p2
|
||||
add \p1, \p2
|
||||
mov \p2, \p1
|
||||
.endm
|
||||
|
||||
################################
|
||||
|
||||
# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
|
||||
# Load xmm with mem and byte swap each dword
|
||||
.macro COPY_XMM_AND_BSWAP p1 p2 p3
|
||||
MOVDQ \p2, \p1
|
||||
pshufb \p3, \p1
|
||||
.endm
|
||||
|
||||
################################
|
||||
|
||||
X0 = %xmm4
|
||||
X1 = %xmm5
|
||||
X2 = %xmm6
|
||||
X3 = %xmm7
|
||||
|
||||
XTMP0 = %xmm0
|
||||
XTMP1 = %xmm1
|
||||
XTMP2 = %xmm2
|
||||
XTMP3 = %xmm3
|
||||
XTMP4 = %xmm8
|
||||
XFER = %xmm9
|
||||
|
||||
SHUF_00BA = %xmm10 # shuffle xBxA -> 00BA
|
||||
SHUF_DC00 = %xmm11 # shuffle xDxC -> DC00
|
||||
BYTE_FLIP_MASK = %xmm12
|
||||
|
||||
NUM_BLKS = %rdx # 3rd arg
|
||||
CTX = %rsi # 2nd arg
|
||||
INP = %rdi # 1st arg
|
||||
|
||||
SRND = %rdi # clobbers INP
|
||||
c = %ecx
|
||||
d = %r8d
|
||||
e = %edx
|
||||
TBL = %rbp
|
||||
a = %eax
|
||||
b = %ebx
|
||||
|
||||
f = %r9d
|
||||
g = %r10d
|
||||
h = %r11d
|
||||
|
||||
y0 = %r13d
|
||||
y1 = %r14d
|
||||
y2 = %r15d
|
||||
|
||||
|
||||
|
||||
_INP_END_SIZE = 8
|
||||
_INP_SIZE = 8
|
||||
_XFER_SIZE = 8
|
||||
_XMM_SAVE_SIZE = 0
|
||||
|
||||
_INP_END = 0
|
||||
_INP = _INP_END + _INP_END_SIZE
|
||||
_XFER = _INP + _INP_SIZE
|
||||
_XMM_SAVE = _XFER + _XFER_SIZE
|
||||
STACK_SIZE = _XMM_SAVE + _XMM_SAVE_SIZE
|
||||
|
||||
# rotate_Xs
|
||||
# Rotate values of symbols X0...X3
|
||||
.macro rotate_Xs
|
||||
X_ = X0
|
||||
X0 = X1
|
||||
X1 = X2
|
||||
X2 = X3
|
||||
X3 = X_
|
||||
.endm
|
||||
|
||||
# ROTATE_ARGS
|
||||
# Rotate values of symbols a...h
|
||||
.macro ROTATE_ARGS
|
||||
TMP_ = h
|
||||
h = g
|
||||
g = f
|
||||
f = e
|
||||
e = d
|
||||
d = c
|
||||
c = b
|
||||
b = a
|
||||
a = TMP_
|
||||
.endm
|
||||
|
||||
.macro FOUR_ROUNDS_AND_SCHED
|
||||
## compute s0 four at a time and s1 two at a time
|
||||
## compute W[-16] + W[-7] 4 at a time
|
||||
movdqa X3, XTMP0
|
||||
mov e, y0 # y0 = e
|
||||
ror $(25-11), y0 # y0 = e >> (25-11)
|
||||
mov a, y1 # y1 = a
|
||||
palignr $4, X2, XTMP0 # XTMP0 = W[-7]
|
||||
ror $(22-13), y1 # y1 = a >> (22-13)
|
||||
xor e, y0 # y0 = e ^ (e >> (25-11))
|
||||
mov f, y2 # y2 = f
|
||||
ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
|
||||
movdqa X1, XTMP1
|
||||
xor a, y1 # y1 = a ^ (a >> (22-13)
|
||||
xor g, y2 # y2 = f^g
|
||||
paddd X0, XTMP0 # XTMP0 = W[-7] + W[-16]
|
||||
xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
|
||||
and e, y2 # y2 = (f^g)&e
|
||||
ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
|
||||
## compute s0
|
||||
palignr $4, X0, XTMP1 # XTMP1 = W[-15]
|
||||
xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
|
||||
ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
|
||||
xor g, y2 # y2 = CH = ((f^g)&e)^g
|
||||
movdqa XTMP1, XTMP2 # XTMP2 = W[-15]
|
||||
ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
|
||||
add y0, y2 # y2 = S1 + CH
|
||||
add _XFER(%rsp) , y2 # y2 = k + w + S1 + CH
|
||||
movdqa XTMP1, XTMP3 # XTMP3 = W[-15]
|
||||
mov a, y0 # y0 = a
|
||||
add y2, h # h = h + S1 + CH + k + w
|
||||
mov a, y2 # y2 = a
|
||||
pslld $(32-7), XTMP1 #
|
||||
or c, y0 # y0 = a|c
|
||||
add h, d # d = d + h + S1 + CH + k + w
|
||||
and c, y2 # y2 = a&c
|
||||
psrld $7, XTMP2 #
|
||||
and b, y0 # y0 = (a|c)&b
|
||||
add y1, h # h = h + S1 + CH + k + w + S0
|
||||
por XTMP2, XTMP1 # XTMP1 = W[-15] ror 7
|
||||
or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c)
|
||||
add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
|
||||
#
|
||||
ROTATE_ARGS #
|
||||
movdqa XTMP3, XTMP2 # XTMP2 = W[-15]
|
||||
mov e, y0 # y0 = e
|
||||
mov a, y1 # y1 = a
|
||||
movdqa XTMP3, XTMP4 # XTMP4 = W[-15]
|
||||
ror $(25-11), y0 # y0 = e >> (25-11)
|
||||
xor e, y0 # y0 = e ^ (e >> (25-11))
|
||||
mov f, y2 # y2 = f
|
||||
ror $(22-13), y1 # y1 = a >> (22-13)
|
||||
pslld $(32-18), XTMP3 #
|
||||
xor a, y1 # y1 = a ^ (a >> (22-13)
|
||||
ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
|
||||
xor g, y2 # y2 = f^g
|
||||
psrld $18, XTMP2 #
|
||||
ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
|
||||
xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
|
||||
and e, y2 # y2 = (f^g)&e
|
||||
ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
|
||||
pxor XTMP3, XTMP1
|
||||
xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
|
||||
xor g, y2 # y2 = CH = ((f^g)&e)^g
|
||||
psrld $3, XTMP4 # XTMP4 = W[-15] >> 3
|
||||
add y0, y2 # y2 = S1 + CH
|
||||
add (1*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
|
||||
ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
|
||||
pxor XTMP2, XTMP1 # XTMP1 = W[-15] ror 7 ^ W[-15] ror 18
|
||||
mov a, y0 # y0 = a
|
||||
add y2, h # h = h + S1 + CH + k + w
|
||||
mov a, y2 # y2 = a
|
||||
pxor XTMP4, XTMP1 # XTMP1 = s0
|
||||
or c, y0 # y0 = a|c
|
||||
add h, d # d = d + h + S1 + CH + k + w
|
||||
and c, y2 # y2 = a&c
|
||||
## compute low s1
|
||||
pshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA}
|
||||
and b, y0 # y0 = (a|c)&b
|
||||
add y1, h # h = h + S1 + CH + k + w + S0
|
||||
paddd XTMP1, XTMP0 # XTMP0 = W[-16] + W[-7] + s0
|
||||
or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c)
|
||||
add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
|
||||
|
||||
ROTATE_ARGS
|
||||
movdqa XTMP2, XTMP3 # XTMP3 = W[-2] {BBAA}
|
||||
mov e, y0 # y0 = e
|
||||
mov a, y1 # y1 = a
|
||||
ror $(25-11), y0 # y0 = e >> (25-11)
|
||||
movdqa XTMP2, XTMP4 # XTMP4 = W[-2] {BBAA}
|
||||
xor e, y0 # y0 = e ^ (e >> (25-11))
|
||||
ror $(22-13), y1 # y1 = a >> (22-13)
|
||||
mov f, y2 # y2 = f
|
||||
xor a, y1 # y1 = a ^ (a >> (22-13)
|
||||
ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
|
||||
psrlq $17, XTMP2 # XTMP2 = W[-2] ror 17 {xBxA}
|
||||
xor g, y2 # y2 = f^g
|
||||
psrlq $19, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA}
|
||||
xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
|
||||
and e, y2 # y2 = (f^g)&e
|
||||
psrld $10, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA}
|
||||
ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
|
||||
xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
|
||||
xor g, y2 # y2 = CH = ((f^g)&e)^g
|
||||
ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
|
||||
pxor XTMP3, XTMP2
|
||||
add y0, y2 # y2 = S1 + CH
|
||||
ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
|
||||
add (2*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
|
||||
pxor XTMP2, XTMP4 # XTMP4 = s1 {xBxA}
|
||||
mov a, y0 # y0 = a
|
||||
add y2, h # h = h + S1 + CH + k + w
|
||||
mov a, y2 # y2 = a
|
||||
pshufb SHUF_00BA, XTMP4 # XTMP4 = s1 {00BA}
|
||||
or c, y0 # y0 = a|c
|
||||
add h, d # d = d + h + S1 + CH + k + w
|
||||
and c, y2 # y2 = a&c
|
||||
paddd XTMP4, XTMP0 # XTMP0 = {..., ..., W[1], W[0]}
|
||||
and b, y0 # y0 = (a|c)&b
|
||||
add y1, h # h = h + S1 + CH + k + w + S0
|
||||
## compute high s1
|
||||
pshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {BBAA}
|
||||
or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c)
|
||||
add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
|
||||
#
|
||||
ROTATE_ARGS #
|
||||
movdqa XTMP2, XTMP3 # XTMP3 = W[-2] {DDCC}
|
||||
mov e, y0 # y0 = e
|
||||
ror $(25-11), y0 # y0 = e >> (25-11)
|
||||
mov a, y1 # y1 = a
|
||||
movdqa XTMP2, X0 # X0 = W[-2] {DDCC}
|
||||
ror $(22-13), y1 # y1 = a >> (22-13)
|
||||
xor e, y0 # y0 = e ^ (e >> (25-11))
|
||||
mov f, y2 # y2 = f
|
||||
ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
|
||||
psrlq $17, XTMP2 # XTMP2 = W[-2] ror 17 {xDxC}
|
||||
xor a, y1 # y1 = a ^ (a >> (22-13)
|
||||
xor g, y2 # y2 = f^g
|
||||
psrlq $19, XTMP3 # XTMP3 = W[-2] ror 19 {xDxC}
|
||||
xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25
|
||||
and e, y2 # y2 = (f^g)&e
|
||||
ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
|
||||
psrld $10, X0 # X0 = W[-2] >> 10 {DDCC}
|
||||
xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22
|
||||
ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>2
|
||||
xor g, y2 # y2 = CH = ((f^g)&e)^g
|
||||
pxor XTMP3, XTMP2 #
|
||||
ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>2
|
||||
add y0, y2 # y2 = S1 + CH
|
||||
add (3*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
|
||||
pxor XTMP2, X0 # X0 = s1 {xDxC}
|
||||
mov a, y0 # y0 = a
|
||||
add y2, h # h = h + S1 + CH + k + w
|
||||
mov a, y2 # y2 = a
|
||||
pshufb SHUF_DC00, X0 # X0 = s1 {DC00}
|
||||
or c, y0 # y0 = a|c
|
||||
add h, d # d = d + h + S1 + CH + k + w
|
||||
and c, y2 # y2 = a&c
|
||||
paddd XTMP0, X0 # X0 = {W[3], W[2], W[1], W[0]}
|
||||
and b, y0 # y0 = (a|c)&b
|
||||
add y1, h # h = h + S1 + CH + k + w + S0
|
||||
or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c)
|
||||
add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
|
||||
|
||||
ROTATE_ARGS
|
||||
rotate_Xs
|
||||
.endm
|
||||
|
||||
## input is [rsp + _XFER + %1 * 4]
|
||||
.macro DO_ROUND round
|
||||
mov e, y0 # y0 = e
|
||||
ror $(25-11), y0 # y0 = e >> (25-11)
|
||||
mov a, y1 # y1 = a
|
||||
xor e, y0 # y0 = e ^ (e >> (25-11))
|
||||
ror $(22-13), y1 # y1 = a >> (22-13)
|
||||
mov f, y2 # y2 = f
|
||||
xor a, y1 # y1 = a ^ (a >> (22-13)
|
||||
ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
|
||||
xor g, y2 # y2 = f^g
|
||||
xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
|
||||
ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
|
||||
and e, y2 # y2 = (f^g)&e
|
||||
xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
|
||||
ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
|
||||
xor g, y2 # y2 = CH = ((f^g)&e)^g
|
||||
add y0, y2 # y2 = S1 + CH
|
||||
ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
|
||||
offset = \round * 4 + _XFER
|
||||
add offset(%rsp), y2 # y2 = k + w + S1 + CH
|
||||
mov a, y0 # y0 = a
|
||||
add y2, h # h = h + S1 + CH + k + w
|
||||
mov a, y2 # y2 = a
|
||||
or c, y0 # y0 = a|c
|
||||
add h, d # d = d + h + S1 + CH + k + w
|
||||
and c, y2 # y2 = a&c
|
||||
and b, y0 # y0 = (a|c)&b
|
||||
add y1, h # h = h + S1 + CH + k + w + S0
|
||||
or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c)
|
||||
add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
|
||||
ROTATE_ARGS
|
||||
.endm
|
||||
|
||||
########################################################################
|
||||
## void sha256_transform_ssse3(void *input_data, UINT32 digest[8], UINT64 num_blks)
|
||||
## arg 1 : pointer to input data
|
||||
## arg 2 : pointer to digest
|
||||
## arg 3 : Num blocks
|
||||
########################################################################
|
||||
.text
|
||||
ENTRY(sha256_transform_ssse3)
|
||||
.align 32
|
||||
pushq %rbx
|
||||
pushq %rbp
|
||||
pushq %r13
|
||||
pushq %r14
|
||||
pushq %r15
|
||||
pushq %r12
|
||||
|
||||
mov %rsp, %r12
|
||||
subq $STACK_SIZE, %rsp
|
||||
and $~15, %rsp
|
||||
|
||||
shl $6, NUM_BLKS # convert to bytes
|
||||
jz done_hash
|
||||
add INP, NUM_BLKS
|
||||
mov NUM_BLKS, _INP_END(%rsp) # pointer to end of data
|
||||
|
||||
## load initial digest
|
||||
mov 4*0(CTX), a
|
||||
mov 4*1(CTX), b
|
||||
mov 4*2(CTX), c
|
||||
mov 4*3(CTX), d
|
||||
mov 4*4(CTX), e
|
||||
mov 4*5(CTX), f
|
||||
mov 4*6(CTX), g
|
||||
mov 4*7(CTX), h
|
||||
|
||||
movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
|
||||
movdqa _SHUF_00BA(%rip), SHUF_00BA
|
||||
movdqa _SHUF_DC00(%rip), SHUF_DC00
|
||||
|
||||
loop0:
|
||||
lea K256(%rip), TBL
|
||||
|
||||
## byte swap first 16 dwords
|
||||
COPY_XMM_AND_BSWAP X0, 0*16(INP), BYTE_FLIP_MASK
|
||||
COPY_XMM_AND_BSWAP X1, 1*16(INP), BYTE_FLIP_MASK
|
||||
COPY_XMM_AND_BSWAP X2, 2*16(INP), BYTE_FLIP_MASK
|
||||
COPY_XMM_AND_BSWAP X3, 3*16(INP), BYTE_FLIP_MASK
|
||||
|
||||
mov INP, _INP(%rsp)
|
||||
|
||||
## schedule 48 input dwords, by doing 3 rounds of 16 each
|
||||
mov $3, SRND
|
||||
.align 16
|
||||
loop1:
|
||||
movdqa (TBL), XFER
|
||||
paddd X0, XFER
|
||||
movdqa XFER, _XFER(%rsp)
|
||||
FOUR_ROUNDS_AND_SCHED
|
||||
|
||||
movdqa 1*16(TBL), XFER
|
||||
paddd X0, XFER
|
||||
movdqa XFER, _XFER(%rsp)
|
||||
FOUR_ROUNDS_AND_SCHED
|
||||
|
||||
movdqa 2*16(TBL), XFER
|
||||
paddd X0, XFER
|
||||
movdqa XFER, _XFER(%rsp)
|
||||
FOUR_ROUNDS_AND_SCHED
|
||||
|
||||
movdqa 3*16(TBL), XFER
|
||||
paddd X0, XFER
|
||||
movdqa XFER, _XFER(%rsp)
|
||||
add $4*16, TBL
|
||||
FOUR_ROUNDS_AND_SCHED
|
||||
|
||||
sub $1, SRND
|
||||
jne loop1
|
||||
|
||||
mov $2, SRND
|
||||
loop2:
|
||||
paddd (TBL), X0
|
||||
movdqa X0, _XFER(%rsp)
|
||||
DO_ROUND 0
|
||||
DO_ROUND 1
|
||||
DO_ROUND 2
|
||||
DO_ROUND 3
|
||||
paddd 1*16(TBL), X1
|
||||
movdqa X1, _XFER(%rsp)
|
||||
add $2*16, TBL
|
||||
DO_ROUND 0
|
||||
DO_ROUND 1
|
||||
DO_ROUND 2
|
||||
DO_ROUND 3
|
||||
|
||||
movdqa X2, X0
|
||||
movdqa X3, X1
|
||||
|
||||
sub $1, SRND
|
||||
jne loop2
|
||||
|
||||
addm (4*0)(CTX),a
|
||||
addm (4*1)(CTX),b
|
||||
addm (4*2)(CTX),c
|
||||
addm (4*3)(CTX),d
|
||||
addm (4*4)(CTX),e
|
||||
addm (4*5)(CTX),f
|
||||
addm (4*6)(CTX),g
|
||||
addm (4*7)(CTX),h
|
||||
|
||||
mov _INP(%rsp), INP
|
||||
add $64, INP
|
||||
cmp _INP_END(%rsp), INP
|
||||
jne loop0
|
||||
|
||||
done_hash:
|
||||
|
||||
mov %r12, %rsp
|
||||
|
||||
popq %r12
|
||||
popq %r15
|
||||
popq %r14
|
||||
popq %r13
|
||||
popq %rbp
|
||||
popq %rbx
|
||||
|
||||
ret
|
||||
ENDPROC(sha256_transform_ssse3)
|
||||
|
||||
.data
|
||||
.align 64
|
||||
K256:
|
||||
.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
|
||||
.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
|
||||
.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
|
||||
.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
|
||||
.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
|
||||
.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
|
||||
.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
|
||||
.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
|
||||
.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
|
||||
.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
|
||||
.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
|
||||
.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
|
||||
.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
|
||||
.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
|
||||
.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
|
||||
.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
|
||||
|
||||
PSHUFFLE_BYTE_FLIP_MASK:
|
||||
.octa 0x0c0d0e0f08090a0b0405060700010203
|
||||
|
||||
# shuffle xBxA -> 00BA
|
||||
_SHUF_00BA:
|
||||
.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100
|
||||
|
||||
# shuffle xDxC -> DC00
|
||||
_SHUF_DC00:
|
||||
.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF
|
275
arch/x86/crypto/sha256_ssse3_glue.c
Normal file
275
arch/x86/crypto/sha256_ssse3_glue.c
Normal file
@ -0,0 +1,275 @@
|
||||
/*
|
||||
* Cryptographic API.
|
||||
*
|
||||
* Glue code for the SHA256 Secure Hash Algorithm assembler
|
||||
* implementation using supplemental SSE3 / AVX / AVX2 instructions.
|
||||
*
|
||||
* This file is based on sha256_generic.c
|
||||
*
|
||||
* Copyright (C) 2013 Intel Corporation.
|
||||
*
|
||||
* Author:
|
||||
* Tim Chen <tim.c.chen@linux.intel.com>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License as published by the Free
|
||||
* Software Foundation; either version 2 of the License, or (at your option)
|
||||
* any later version.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
||||
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
* SOFTWARE.
|
||||
*/
|
||||
|
||||
|
||||
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
|
||||
|
||||
#include <crypto/internal/hash.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/cryptohash.h>
|
||||
#include <linux/types.h>
|
||||
#include <crypto/sha.h>
|
||||
#include <asm/byteorder.h>
|
||||
#include <asm/i387.h>
|
||||
#include <asm/xcr.h>
|
||||
#include <asm/xsave.h>
|
||||
#include <linux/string.h>
|
||||
|
||||
asmlinkage void sha256_transform_ssse3(const char *data, u32 *digest,
|
||||
u64 rounds);
|
||||
#ifdef CONFIG_AS_AVX
|
||||
asmlinkage void sha256_transform_avx(const char *data, u32 *digest,
|
||||
u64 rounds);
|
||||
#endif
|
||||
#ifdef CONFIG_AS_AVX2
|
||||
asmlinkage void sha256_transform_rorx(const char *data, u32 *digest,
|
||||
u64 rounds);
|
||||
#endif
|
||||
|
||||
static asmlinkage void (*sha256_transform_asm)(const char *, u32 *, u64);
|
||||
|
||||
|
||||
static int sha256_ssse3_init(struct shash_desc *desc)
|
||||
{
|
||||
struct sha256_state *sctx = shash_desc_ctx(desc);
|
||||
|
||||
sctx->state[0] = SHA256_H0;
|
||||
sctx->state[1] = SHA256_H1;
|
||||
sctx->state[2] = SHA256_H2;
|
||||
sctx->state[3] = SHA256_H3;
|
||||
sctx->state[4] = SHA256_H4;
|
||||
sctx->state[5] = SHA256_H5;
|
||||
sctx->state[6] = SHA256_H6;
|
||||
sctx->state[7] = SHA256_H7;
|
||||
sctx->count = 0;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int __sha256_ssse3_update(struct shash_desc *desc, const u8 *data,
|
||||
unsigned int len, unsigned int partial)
|
||||
{
|
||||
struct sha256_state *sctx = shash_desc_ctx(desc);
|
||||
unsigned int done = 0;
|
||||
|
||||
sctx->count += len;
|
||||
|
||||
if (partial) {
|
||||
done = SHA256_BLOCK_SIZE - partial;
|
||||
memcpy(sctx->buf + partial, data, done);
|
||||
sha256_transform_asm(sctx->buf, sctx->state, 1);
|
||||
}
|
||||
|
||||
if (len - done >= SHA256_BLOCK_SIZE) {
|
||||
const unsigned int rounds = (len - done) / SHA256_BLOCK_SIZE;
|
||||
|
||||
sha256_transform_asm(data + done, sctx->state, (u64) rounds);
|
||||
|
||||
done += rounds * SHA256_BLOCK_SIZE;
|
||||
}
|
||||
|
||||
memcpy(sctx->buf, data + done, len - done);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int sha256_ssse3_update(struct shash_desc *desc, const u8 *data,
|
||||
unsigned int len)
|
||||
{
|
||||
struct sha256_state *sctx = shash_desc_ctx(desc);
|
||||
unsigned int partial = sctx->count % SHA256_BLOCK_SIZE;
|
||||
int res;
|
||||
|
||||
/* Handle the fast case right here */
|
||||
if (partial + len < SHA256_BLOCK_SIZE) {
|
||||
sctx->count += len;
|
||||
memcpy(sctx->buf + partial, data, len);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (!irq_fpu_usable()) {
|
||||
res = crypto_sha256_update(desc, data, len);
|
||||
} else {
|
||||
kernel_fpu_begin();
|
||||
res = __sha256_ssse3_update(desc, data, len, partial);
|
||||
kernel_fpu_end();
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
|
||||
/* Add padding and return the message digest. */
|
||||
static int sha256_ssse3_final(struct shash_desc *desc, u8 *out)
|
||||
{
|
||||
struct sha256_state *sctx = shash_desc_ctx(desc);
|
||||
unsigned int i, index, padlen;
|
||||
__be32 *dst = (__be32 *)out;
|
||||
__be64 bits;
|
||||
static const u8 padding[SHA256_BLOCK_SIZE] = { 0x80, };
|
||||
|
||||
bits = cpu_to_be64(sctx->count << 3);
|
||||
|
||||
/* Pad out to 56 mod 64 and append length */
|
||||
index = sctx->count % SHA256_BLOCK_SIZE;
|
||||
padlen = (index < 56) ? (56 - index) : ((SHA256_BLOCK_SIZE+56)-index);
|
||||
|
||||
if (!irq_fpu_usable()) {
|
||||
crypto_sha256_update(desc, padding, padlen);
|
||||
crypto_sha256_update(desc, (const u8 *)&bits, sizeof(bits));
|
||||
} else {
|
||||
kernel_fpu_begin();
|
||||
/* We need to fill a whole block for __sha256_ssse3_update() */
|
||||
if (padlen <= 56) {
|
||||
sctx->count += padlen;
|
||||
memcpy(sctx->buf + index, padding, padlen);
|
||||
} else {
|
||||
__sha256_ssse3_update(desc, padding, padlen, index);
|
||||
}
|
||||
__sha256_ssse3_update(desc, (const u8 *)&bits,
|
||||
sizeof(bits), 56);
|
||||
kernel_fpu_end();
|
||||
}
|
||||
|
||||
/* Store state in digest */
|
||||
for (i = 0; i < 8; i++)
|
||||
dst[i] = cpu_to_be32(sctx->state[i]);
|
||||
|
||||
/* Wipe context */
|
||||
memset(sctx, 0, sizeof(*sctx));
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int sha256_ssse3_export(struct shash_desc *desc, void *out)
|
||||
{
|
||||
struct sha256_state *sctx = shash_desc_ctx(desc);
|
||||
|
||||
memcpy(out, sctx, sizeof(*sctx));
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int sha256_ssse3_import(struct shash_desc *desc, const void *in)
|
||||
{
|
||||
struct sha256_state *sctx = shash_desc_ctx(desc);
|
||||
|
||||
memcpy(sctx, in, sizeof(*sctx));
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct shash_alg alg = {
|
||||
.digestsize = SHA256_DIGEST_SIZE,
|
||||
.init = sha256_ssse3_init,
|
||||
.update = sha256_ssse3_update,
|
||||
.final = sha256_ssse3_final,
|
||||
.export = sha256_ssse3_export,
|
||||
.import = sha256_ssse3_import,
|
||||
.descsize = sizeof(struct sha256_state),
|
||||
.statesize = sizeof(struct sha256_state),
|
||||
.base = {
|
||||
.cra_name = "sha256",
|
||||
.cra_driver_name = "sha256-ssse3",
|
||||
.cra_priority = 150,
|
||||
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
|
||||
.cra_blocksize = SHA256_BLOCK_SIZE,
|
||||
.cra_module = THIS_MODULE,
|
||||
}
|
||||
};
|
||||
|
||||
#ifdef CONFIG_AS_AVX
|
||||
static bool __init avx_usable(void)
|
||||
{
|
||||
u64 xcr0;
|
||||
|
||||
if (!cpu_has_avx || !cpu_has_osxsave)
|
||||
return false;
|
||||
|
||||
xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
|
||||
if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
|
||||
pr_info("AVX detected but unusable.\n");
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
#endif
|
||||
|
||||
static int __init sha256_ssse3_mod_init(void)
|
||||
{
|
||||
/* test for SSE3 first */
|
||||
if (cpu_has_ssse3)
|
||||
sha256_transform_asm = sha256_transform_ssse3;
|
||||
|
||||
#ifdef CONFIG_AS_AVX
|
||||
/* allow AVX to override SSSE3, it's a little faster */
|
||||
if (avx_usable()) {
|
||||
#ifdef CONFIG_AS_AVX2
|
||||
if (boot_cpu_has(X86_FEATURE_AVX2))
|
||||
sha256_transform_asm = sha256_transform_rorx;
|
||||
else
|
||||
#endif
|
||||
sha256_transform_asm = sha256_transform_avx;
|
||||
}
|
||||
#endif
|
||||
|
||||
if (sha256_transform_asm) {
|
||||
#ifdef CONFIG_AS_AVX
|
||||
if (sha256_transform_asm == sha256_transform_avx)
|
||||
pr_info("Using AVX optimized SHA-256 implementation\n");
|
||||
#ifdef CONFIG_AS_AVX2
|
||||
else if (sha256_transform_asm == sha256_transform_rorx)
|
||||
pr_info("Using AVX2 optimized SHA-256 implementation\n");
|
||||
#endif
|
||||
else
|
||||
#endif
|
||||
pr_info("Using SSSE3 optimized SHA-256 implementation\n");
|
||||
return crypto_register_shash(&alg);
|
||||
}
|
||||
pr_info("Neither AVX nor SSSE3 is available/usable.\n");
|
||||
|
||||
return -ENODEV;
|
||||
}
|
||||
|
||||
static void __exit sha256_ssse3_mod_fini(void)
|
||||
{
|
||||
crypto_unregister_shash(&alg);
|
||||
}
|
||||
|
||||
module_init(sha256_ssse3_mod_init);
|
||||
module_exit(sha256_ssse3_mod_fini);
|
||||
|
||||
MODULE_LICENSE("GPL");
|
||||
MODULE_DESCRIPTION("SHA256 Secure Hash Algorithm, Supplemental SSE3 accelerated");
|
||||
|
||||
MODULE_ALIAS("sha256");
|
423
arch/x86/crypto/sha512-avx-asm.S
Normal file
423
arch/x86/crypto/sha512-avx-asm.S
Normal file
@ -0,0 +1,423 @@
|
||||
########################################################################
|
||||
# Implement fast SHA-512 with AVX instructions. (x86_64)
|
||||
#
|
||||
# Copyright (C) 2013 Intel Corporation.
|
||||
#
|
||||
# Authors:
|
||||
# James Guilford <james.guilford@intel.com>
|
||||
# Kirk Yap <kirk.s.yap@intel.com>
|
||||
# David Cote <david.m.cote@intel.com>
|
||||
# Tim Chen <tim.c.chen@linux.intel.com>
|
||||
#
|
||||
# This software is available to you under a choice of one of two
|
||||
# licenses. You may choose to be licensed under the terms of the GNU
|
||||
# General Public License (GPL) Version 2, available from the file
|
||||
# COPYING in the main directory of this source tree, or the
|
||||
# OpenIB.org BSD license below:
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or
|
||||
# without modification, are permitted provided that the following
|
||||
# conditions are met:
|
||||
#
|
||||
# - Redistributions of source code must retain the above
|
||||
# copyright notice, this list of conditions and the following
|
||||
# disclaimer.
|
||||
#
|
||||
# - Redistributions in binary form must reproduce the above
|
||||
# copyright notice, this list of conditions and the following
|
||||
# disclaimer in the documentation and/or other materials
|
||||
# provided with the distribution.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
||||
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# SOFTWARE.
|
||||
#
|
||||
########################################################################
|
||||
#
|
||||
# This code is described in an Intel White-Paper:
|
||||
# "Fast SHA-512 Implementations on Intel Architecture Processors"
|
||||
#
|
||||
# To find it, surf to http://www.intel.com/p/en_US/embedded
|
||||
# and search for that title.
|
||||
#
|
||||
########################################################################
|
||||
|
||||
#ifdef CONFIG_AS_AVX
|
||||
#include <linux/linkage.h>
|
||||
|
||||
.text
|
||||
|
||||
# Virtual Registers
|
||||
# ARG1
|
||||
msg = %rdi
|
||||
# ARG2
|
||||
digest = %rsi
|
||||
# ARG3
|
||||
msglen = %rdx
|
||||
T1 = %rcx
|
||||
T2 = %r8
|
||||
a_64 = %r9
|
||||
b_64 = %r10
|
||||
c_64 = %r11
|
||||
d_64 = %r12
|
||||
e_64 = %r13
|
||||
f_64 = %r14
|
||||
g_64 = %r15
|
||||
h_64 = %rbx
|
||||
tmp0 = %rax
|
||||
|
||||
# Local variables (stack frame)
|
||||
|
||||
# Message Schedule
|
||||
W_SIZE = 80*8
|
||||
# W[t] + K[t] | W[t+1] + K[t+1]
|
||||
WK_SIZE = 2*8
|
||||
RSPSAVE_SIZE = 1*8
|
||||
GPRSAVE_SIZE = 5*8
|
||||
|
||||
frame_W = 0
|
||||
frame_WK = frame_W + W_SIZE
|
||||
frame_RSPSAVE = frame_WK + WK_SIZE
|
||||
frame_GPRSAVE = frame_RSPSAVE + RSPSAVE_SIZE
|
||||
frame_size = frame_GPRSAVE + GPRSAVE_SIZE
|
||||
|
||||
# Useful QWORD "arrays" for simpler memory references
|
||||
# MSG, DIGEST, K_t, W_t are arrays
|
||||
# WK_2(t) points to 1 of 2 qwords at frame.WK depdending on t being odd/even
|
||||
|
||||
# Input message (arg1)
|
||||
#define MSG(i) 8*i(msg)
|
||||
|
||||
# Output Digest (arg2)
|
||||
#define DIGEST(i) 8*i(digest)
|
||||
|
||||
# SHA Constants (static mem)
|
||||
#define K_t(i) 8*i+K512(%rip)
|
||||
|
||||
# Message Schedule (stack frame)
|
||||
#define W_t(i) 8*i+frame_W(%rsp)
|
||||
|
||||
# W[t]+K[t] (stack frame)
|
||||
#define WK_2(i) 8*((i%2))+frame_WK(%rsp)
|
||||
|
||||
.macro RotateState
|
||||
# Rotate symbols a..h right
|
||||
TMP = h_64
|
||||
h_64 = g_64
|
||||
g_64 = f_64
|
||||
f_64 = e_64
|
||||
e_64 = d_64
|
||||
d_64 = c_64
|
||||
c_64 = b_64
|
||||
b_64 = a_64
|
||||
a_64 = TMP
|
||||
.endm
|
||||
|
||||
.macro RORQ p1 p2
|
||||
# shld is faster than ror on Sandybridge
|
||||
shld $(64-\p2), \p1, \p1
|
||||
.endm
|
||||
|
||||
.macro SHA512_Round rnd
|
||||
# Compute Round %%t
|
||||
mov f_64, T1 # T1 = f
|
||||
mov e_64, tmp0 # tmp = e
|
||||
xor g_64, T1 # T1 = f ^ g
|
||||
RORQ tmp0, 23 # 41 # tmp = e ror 23
|
||||
and e_64, T1 # T1 = (f ^ g) & e
|
||||
xor e_64, tmp0 # tmp = (e ror 23) ^ e
|
||||
xor g_64, T1 # T1 = ((f ^ g) & e) ^ g = CH(e,f,g)
|
||||
idx = \rnd
|
||||
add WK_2(idx), T1 # W[t] + K[t] from message scheduler
|
||||
RORQ tmp0, 4 # 18 # tmp = ((e ror 23) ^ e) ror 4
|
||||
xor e_64, tmp0 # tmp = (((e ror 23) ^ e) ror 4) ^ e
|
||||
mov a_64, T2 # T2 = a
|
||||
add h_64, T1 # T1 = CH(e,f,g) + W[t] + K[t] + h
|
||||
RORQ tmp0, 14 # 14 # tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e)
|
||||
add tmp0, T1 # T1 = CH(e,f,g) + W[t] + K[t] + S1(e)
|
||||
mov a_64, tmp0 # tmp = a
|
||||
xor c_64, T2 # T2 = a ^ c
|
||||
and c_64, tmp0 # tmp = a & c
|
||||
and b_64, T2 # T2 = (a ^ c) & b
|
||||
xor tmp0, T2 # T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c)
|
||||
mov a_64, tmp0 # tmp = a
|
||||
RORQ tmp0, 5 # 39 # tmp = a ror 5
|
||||
xor a_64, tmp0 # tmp = (a ror 5) ^ a
|
||||
add T1, d_64 # e(next_state) = d + T1
|
||||
RORQ tmp0, 6 # 34 # tmp = ((a ror 5) ^ a) ror 6
|
||||
xor a_64, tmp0 # tmp = (((a ror 5) ^ a) ror 6) ^ a
|
||||
lea (T1, T2), h_64 # a(next_state) = T1 + Maj(a,b,c)
|
||||
RORQ tmp0, 28 # 28 # tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a)
|
||||
add tmp0, h_64 # a(next_state) = T1 + Maj(a,b,c) S0(a)
|
||||
RotateState
|
||||
.endm
|
||||
|
||||
.macro SHA512_2Sched_2Round_avx rnd
|
||||
# Compute rounds t-2 and t-1
|
||||
# Compute message schedule QWORDS t and t+1
|
||||
|
||||
# Two rounds are computed based on the values for K[t-2]+W[t-2] and
|
||||
# K[t-1]+W[t-1] which were previously stored at WK_2 by the message
|
||||
# scheduler.
|
||||
# The two new schedule QWORDS are stored at [W_t(t)] and [W_t(t+1)].
|
||||
# They are then added to their respective SHA512 constants at
|
||||
# [K_t(t)] and [K_t(t+1)] and stored at dqword [WK_2(t)]
|
||||
# For brievity, the comments following vectored instructions only refer to
|
||||
# the first of a pair of QWORDS.
|
||||
# Eg. XMM4=W[t-2] really means XMM4={W[t-2]|W[t-1]}
|
||||
# The computation of the message schedule and the rounds are tightly
|
||||
# stitched to take advantage of instruction-level parallelism.
|
||||
|
||||
idx = \rnd - 2
|
||||
vmovdqa W_t(idx), %xmm4 # XMM4 = W[t-2]
|
||||
idx = \rnd - 15
|
||||
vmovdqu W_t(idx), %xmm5 # XMM5 = W[t-15]
|
||||
mov f_64, T1
|
||||
vpsrlq $61, %xmm4, %xmm0 # XMM0 = W[t-2]>>61
|
||||
mov e_64, tmp0
|
||||
vpsrlq $1, %xmm5, %xmm6 # XMM6 = W[t-15]>>1
|
||||
xor g_64, T1
|
||||
RORQ tmp0, 23 # 41
|
||||
vpsrlq $19, %xmm4, %xmm1 # XMM1 = W[t-2]>>19
|
||||
and e_64, T1
|
||||
xor e_64, tmp0
|
||||
vpxor %xmm1, %xmm0, %xmm0 # XMM0 = W[t-2]>>61 ^ W[t-2]>>19
|
||||
xor g_64, T1
|
||||
idx = \rnd
|
||||
add WK_2(idx), T1#
|
||||
vpsrlq $8, %xmm5, %xmm7 # XMM7 = W[t-15]>>8
|
||||
RORQ tmp0, 4 # 18
|
||||
vpsrlq $6, %xmm4, %xmm2 # XMM2 = W[t-2]>>6
|
||||
xor e_64, tmp0
|
||||
mov a_64, T2
|
||||
add h_64, T1
|
||||
vpxor %xmm7, %xmm6, %xmm6 # XMM6 = W[t-15]>>1 ^ W[t-15]>>8
|
||||
RORQ tmp0, 14 # 14
|
||||
add tmp0, T1
|
||||
vpsrlq $7, %xmm5, %xmm8 # XMM8 = W[t-15]>>7
|
||||
mov a_64, tmp0
|
||||
xor c_64, T2
|
||||
vpsllq $(64-61), %xmm4, %xmm3 # XMM3 = W[t-2]<<3
|
||||
and c_64, tmp0
|
||||
and b_64, T2
|
||||
vpxor %xmm3, %xmm2, %xmm2 # XMM2 = W[t-2]>>6 ^ W[t-2]<<3
|
||||
xor tmp0, T2
|
||||
mov a_64, tmp0
|
||||
vpsllq $(64-1), %xmm5, %xmm9 # XMM9 = W[t-15]<<63
|
||||
RORQ tmp0, 5 # 39
|
||||
vpxor %xmm9, %xmm8, %xmm8 # XMM8 = W[t-15]>>7 ^ W[t-15]<<63
|
||||
xor a_64, tmp0
|
||||
add T1, d_64
|
||||
RORQ tmp0, 6 # 34
|
||||
xor a_64, tmp0
|
||||
vpxor %xmm8, %xmm6, %xmm6 # XMM6 = W[t-15]>>1 ^ W[t-15]>>8 ^
|
||||
# W[t-15]>>7 ^ W[t-15]<<63
|
||||
lea (T1, T2), h_64
|
||||
RORQ tmp0, 28 # 28
|
||||
vpsllq $(64-19), %xmm4, %xmm4 # XMM4 = W[t-2]<<25
|
||||
add tmp0, h_64
|
||||
RotateState
|
||||
vpxor %xmm4, %xmm0, %xmm0 # XMM0 = W[t-2]>>61 ^ W[t-2]>>19 ^
|
||||
# W[t-2]<<25
|
||||
mov f_64, T1
|
||||
vpxor %xmm2, %xmm0, %xmm0 # XMM0 = s1(W[t-2])
|
||||
mov e_64, tmp0
|
||||
xor g_64, T1
|
||||
idx = \rnd - 16
|
||||
vpaddq W_t(idx), %xmm0, %xmm0 # XMM0 = s1(W[t-2]) + W[t-16]
|
||||
idx = \rnd - 7
|
||||
vmovdqu W_t(idx), %xmm1 # XMM1 = W[t-7]
|
||||
RORQ tmp0, 23 # 41
|
||||
and e_64, T1
|
||||
xor e_64, tmp0
|
||||
xor g_64, T1
|
||||
vpsllq $(64-8), %xmm5, %xmm5 # XMM5 = W[t-15]<<56
|
||||
idx = \rnd + 1
|
||||
add WK_2(idx), T1
|
||||
vpxor %xmm5, %xmm6, %xmm6 # XMM6 = s0(W[t-15])
|
||||
RORQ tmp0, 4 # 18
|
||||
vpaddq %xmm6, %xmm0, %xmm0 # XMM0 = s1(W[t-2]) + W[t-16] + s0(W[t-15])
|
||||
xor e_64, tmp0
|
||||
vpaddq %xmm1, %xmm0, %xmm0 # XMM0 = W[t] = s1(W[t-2]) + W[t-7] +
|
||||
# s0(W[t-15]) + W[t-16]
|
||||
mov a_64, T2
|
||||
add h_64, T1
|
||||
RORQ tmp0, 14 # 14
|
||||
add tmp0, T1
|
||||
idx = \rnd
|
||||
vmovdqa %xmm0, W_t(idx) # Store W[t]
|
||||
vpaddq K_t(idx), %xmm0, %xmm0 # Compute W[t]+K[t]
|
||||
vmovdqa %xmm0, WK_2(idx) # Store W[t]+K[t] for next rounds
|
||||
mov a_64, tmp0
|
||||
xor c_64, T2
|
||||
and c_64, tmp0
|
||||
and b_64, T2
|
||||
xor tmp0, T2
|
||||
mov a_64, tmp0
|
||||
RORQ tmp0, 5 # 39
|
||||
xor a_64, tmp0
|
||||
add T1, d_64
|
||||
RORQ tmp0, 6 # 34
|
||||
xor a_64, tmp0
|
||||
lea (T1, T2), h_64
|
||||
RORQ tmp0, 28 # 28
|
||||
add tmp0, h_64
|
||||
RotateState
|
||||
.endm
|
||||
|
||||
########################################################################
|
||||
# void sha512_transform_avx(const void* M, void* D, u64 L)
|
||||
# Purpose: Updates the SHA512 digest stored at D with the message stored in M.
|
||||
# The size of the message pointed to by M must be an integer multiple of SHA512
|
||||
# message blocks.
|
||||
# L is the message length in SHA512 blocks
|
||||
########################################################################
|
||||
ENTRY(sha512_transform_avx)
|
||||
cmp $0, msglen
|
||||
je nowork
|
||||
|
||||
# Allocate Stack Space
|
||||
mov %rsp, %rax
|
||||
sub $frame_size, %rsp
|
||||
and $~(0x20 - 1), %rsp
|
||||
mov %rax, frame_RSPSAVE(%rsp)
|
||||
|
||||
# Save GPRs
|
||||
mov %rbx, frame_GPRSAVE(%rsp)
|
||||
mov %r12, frame_GPRSAVE +8*1(%rsp)
|
||||
mov %r13, frame_GPRSAVE +8*2(%rsp)
|
||||
mov %r14, frame_GPRSAVE +8*3(%rsp)
|
||||
mov %r15, frame_GPRSAVE +8*4(%rsp)
|
||||
|
||||
updateblock:
|
||||
|
||||
# Load state variables
|
||||
mov DIGEST(0), a_64
|
||||
mov DIGEST(1), b_64
|
||||
mov DIGEST(2), c_64
|
||||
mov DIGEST(3), d_64
|
||||
mov DIGEST(4), e_64
|
||||
mov DIGEST(5), f_64
|
||||
mov DIGEST(6), g_64
|
||||
mov DIGEST(7), h_64
|
||||
|
||||
t = 0
|
||||
.rept 80/2 + 1
|
||||
# (80 rounds) / (2 rounds/iteration) + (1 iteration)
|
||||
# +1 iteration because the scheduler leads hashing by 1 iteration
|
||||
.if t < 2
|
||||
# BSWAP 2 QWORDS
|
||||
vmovdqa XMM_QWORD_BSWAP(%rip), %xmm1
|
||||
vmovdqu MSG(t), %xmm0
|
||||
vpshufb %xmm1, %xmm0, %xmm0 # BSWAP
|
||||
vmovdqa %xmm0, W_t(t) # Store Scheduled Pair
|
||||
vpaddq K_t(t), %xmm0, %xmm0 # Compute W[t]+K[t]
|
||||
vmovdqa %xmm0, WK_2(t) # Store into WK for rounds
|
||||
.elseif t < 16
|
||||
# BSWAP 2 QWORDS# Compute 2 Rounds
|
||||
vmovdqu MSG(t), %xmm0
|
||||
vpshufb %xmm1, %xmm0, %xmm0 # BSWAP
|
||||
SHA512_Round t-2 # Round t-2
|
||||
vmovdqa %xmm0, W_t(t) # Store Scheduled Pair
|
||||
vpaddq K_t(t), %xmm0, %xmm0 # Compute W[t]+K[t]
|
||||
SHA512_Round t-1 # Round t-1
|
||||
vmovdqa %xmm0, WK_2(t)# Store W[t]+K[t] into WK
|
||||
.elseif t < 79
|
||||
# Schedule 2 QWORDS# Compute 2 Rounds
|
||||
SHA512_2Sched_2Round_avx t
|
||||
.else
|
||||
# Compute 2 Rounds
|
||||
SHA512_Round t-2
|
||||
SHA512_Round t-1
|
||||
.endif
|
||||
t = t+2
|
||||
.endr
|
||||
|
||||
# Update digest
|
||||
add a_64, DIGEST(0)
|
||||
add b_64, DIGEST(1)
|
||||
add c_64, DIGEST(2)
|
||||
add d_64, DIGEST(3)
|
||||
add e_64, DIGEST(4)
|
||||
add f_64, DIGEST(5)
|
||||
add g_64, DIGEST(6)
|
||||
add h_64, DIGEST(7)
|
||||
|
||||
# Advance to next message block
|
||||
add $16*8, msg
|
||||
dec msglen
|
||||
jnz updateblock
|
||||
|
||||
# Restore GPRs
|
||||
mov frame_GPRSAVE(%rsp), %rbx
|
||||
mov frame_GPRSAVE +8*1(%rsp), %r12
|
||||
mov frame_GPRSAVE +8*2(%rsp), %r13
|
||||
mov frame_GPRSAVE +8*3(%rsp), %r14
|
||||
mov frame_GPRSAVE +8*4(%rsp), %r15
|
||||
|
||||
# Restore Stack Pointer
|
||||
mov frame_RSPSAVE(%rsp), %rsp
|
||||
|
||||
nowork:
|
||||
ret
|
||||
ENDPROC(sha512_transform_avx)
|
||||
|
||||
########################################################################
|
||||
### Binary Data
|
||||
|
||||
.data
|
||||
|
||||
.align 16
|
||||
|
||||
# Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
|
||||
XMM_QWORD_BSWAP:
|
||||
.octa 0x08090a0b0c0d0e0f0001020304050607
|
||||
|
||||
# K[t] used in SHA512 hashing
|
||||
K512:
|
||||
.quad 0x428a2f98d728ae22,0x7137449123ef65cd
|
||||
.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
|
||||
.quad 0x3956c25bf348b538,0x59f111f1b605d019
|
||||
.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
|
||||
.quad 0xd807aa98a3030242,0x12835b0145706fbe
|
||||
.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
|
||||
.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
|
||||
.quad 0x9bdc06a725c71235,0xc19bf174cf692694
|
||||
.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
|
||||
.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
|
||||
.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
|
||||
.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
|
||||
.quad 0x983e5152ee66dfab,0xa831c66d2db43210
|
||||
.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
|
||||
.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
|
||||
.quad 0x06ca6351e003826f,0x142929670a0e6e70
|
||||
.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
|
||||
.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
|
||||
.quad 0x650a73548baf63de,0x766a0abb3c77b2a8
|
||||
.quad 0x81c2c92e47edaee6,0x92722c851482353b
|
||||
.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
|
||||
.quad 0xc24b8b70d0f89791,0xc76c51a30654be30
|
||||
.quad 0xd192e819d6ef5218,0xd69906245565a910
|
||||
.quad 0xf40e35855771202a,0x106aa07032bbd1b8
|
||||
.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
|
||||
.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
|
||||
.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
|
||||
.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
|
||||
.quad 0x748f82ee5defb2fc,0x78a5636f43172f60
|
||||
.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
|
||||
.quad 0x90befffa23631e28,0xa4506cebde82bde9
|
||||
.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
|
||||
.quad 0xca273eceea26619c,0xd186b8c721c0c207
|
||||
.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
|
||||
.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
|
||||
.quad 0x113f9804bef90dae,0x1b710b35131c471b
|
||||
.quad 0x28db77f523047d84,0x32caab7b40c72493
|
||||
.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
|
||||
.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
|
||||
.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
|
||||
#endif
|
743
arch/x86/crypto/sha512-avx2-asm.S
Normal file
743
arch/x86/crypto/sha512-avx2-asm.S
Normal file
@ -0,0 +1,743 @@
|
||||
########################################################################
|
||||
# Implement fast SHA-512 with AVX2 instructions. (x86_64)
|
||||
#
|
||||
# Copyright (C) 2013 Intel Corporation.
|
||||
#
|
||||
# Authors:
|
||||
# James Guilford <james.guilford@intel.com>
|
||||
# Kirk Yap <kirk.s.yap@intel.com>
|
||||
# David Cote <david.m.cote@intel.com>
|
||||
# Tim Chen <tim.c.chen@linux.intel.com>
|
||||
#
|
||||
# This software is available to you under a choice of one of two
|
||||
# licenses. You may choose to be licensed under the terms of the GNU
|
||||
# General Public License (GPL) Version 2, available from the file
|
||||
# COPYING in the main directory of this source tree, or the
|
||||
# OpenIB.org BSD license below:
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or
|
||||
# without modification, are permitted provided that the following
|
||||
# conditions are met:
|
||||
#
|
||||
# - Redistributions of source code must retain the above
|
||||
# copyright notice, this list of conditions and the following
|
||||
# disclaimer.
|
||||
#
|
||||
# - Redistributions in binary form must reproduce the above
|
||||
# copyright notice, this list of conditions and the following
|
||||
# disclaimer in the documentation and/or other materials
|
||||
# provided with the distribution.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
||||
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# SOFTWARE.
|
||||
#
|
||||
########################################################################
|
||||
#
|
||||
# This code is described in an Intel White-Paper:
|
||||
# "Fast SHA-512 Implementations on Intel Architecture Processors"
|
||||
#
|
||||
# To find it, surf to http://www.intel.com/p/en_US/embedded
|
||||
# and search for that title.
|
||||
#
|
||||
########################################################################
|
||||
# This code schedules 1 blocks at a time, with 4 lanes per block
|
||||
########################################################################
|
||||
|
||||
#ifdef CONFIG_AS_AVX2
|
||||
#include <linux/linkage.h>
|
||||
|
||||
.text
|
||||
|
||||
# Virtual Registers
|
||||
Y_0 = %ymm4
|
||||
Y_1 = %ymm5
|
||||
Y_2 = %ymm6
|
||||
Y_3 = %ymm7
|
||||
|
||||
YTMP0 = %ymm0
|
||||
YTMP1 = %ymm1
|
||||
YTMP2 = %ymm2
|
||||
YTMP3 = %ymm3
|
||||
YTMP4 = %ymm8
|
||||
XFER = YTMP0
|
||||
|
||||
BYTE_FLIP_MASK = %ymm9
|
||||
|
||||
# 1st arg
|
||||
INP = %rdi
|
||||
# 2nd arg
|
||||
CTX = %rsi
|
||||
# 3rd arg
|
||||
NUM_BLKS = %rdx
|
||||
|
||||
c = %rcx
|
||||
d = %r8
|
||||
e = %rdx
|
||||
y3 = %rdi
|
||||
|
||||
TBL = %rbp
|
||||
|
||||
a = %rax
|
||||
b = %rbx
|
||||
|
||||
f = %r9
|
||||
g = %r10
|
||||
h = %r11
|
||||
old_h = %r11
|
||||
|
||||
T1 = %r12
|
||||
y0 = %r13
|
||||
y1 = %r14
|
||||
y2 = %r15
|
||||
|
||||
y4 = %r12
|
||||
|
||||
# Local variables (stack frame)
|
||||
XFER_SIZE = 4*8
|
||||
SRND_SIZE = 1*8
|
||||
INP_SIZE = 1*8
|
||||
INPEND_SIZE = 1*8
|
||||
RSPSAVE_SIZE = 1*8
|
||||
GPRSAVE_SIZE = 6*8
|
||||
|
||||
frame_XFER = 0
|
||||
frame_SRND = frame_XFER + XFER_SIZE
|
||||
frame_INP = frame_SRND + SRND_SIZE
|
||||
frame_INPEND = frame_INP + INP_SIZE
|
||||
frame_RSPSAVE = frame_INPEND + INPEND_SIZE
|
||||
frame_GPRSAVE = frame_RSPSAVE + RSPSAVE_SIZE
|
||||
frame_size = frame_GPRSAVE + GPRSAVE_SIZE
|
||||
|
||||
## assume buffers not aligned
|
||||
#define VMOVDQ vmovdqu
|
||||
|
||||
# addm [mem], reg
|
||||
# Add reg to mem using reg-mem add and store
|
||||
.macro addm p1 p2
|
||||
add \p1, \p2
|
||||
mov \p2, \p1
|
||||
.endm
|
||||
|
||||
|
||||
# COPY_YMM_AND_BSWAP ymm, [mem], byte_flip_mask
|
||||
# Load ymm with mem and byte swap each dword
|
||||
.macro COPY_YMM_AND_BSWAP p1 p2 p3
|
||||
VMOVDQ \p2, \p1
|
||||
vpshufb \p3, \p1, \p1
|
||||
.endm
|
||||
# rotate_Ys
|
||||
# Rotate values of symbols Y0...Y3
|
||||
.macro rotate_Ys
|
||||
Y_ = Y_0
|
||||
Y_0 = Y_1
|
||||
Y_1 = Y_2
|
||||
Y_2 = Y_3
|
||||
Y_3 = Y_
|
||||
.endm
|
||||
|
||||
# RotateState
|
||||
.macro RotateState
|
||||
# Rotate symbols a..h right
|
||||
old_h = h
|
||||
TMP_ = h
|
||||
h = g
|
||||
g = f
|
||||
f = e
|
||||
e = d
|
||||
d = c
|
||||
c = b
|
||||
b = a
|
||||
a = TMP_
|
||||
.endm
|
||||
|
||||
# macro MY_VPALIGNR YDST, YSRC1, YSRC2, RVAL
|
||||
# YDST = {YSRC1, YSRC2} >> RVAL*8
|
||||
.macro MY_VPALIGNR YDST YSRC1 YSRC2 RVAL
|
||||
vperm2f128 $0x3, \YSRC2, \YSRC1, \YDST # YDST = {YS1_LO, YS2_HI}
|
||||
vpalignr $\RVAL, \YSRC2, \YDST, \YDST # YDST = {YDS1, YS2} >> RVAL*8
|
||||
.endm
|
||||
|
||||
.macro FOUR_ROUNDS_AND_SCHED
|
||||
################################### RND N + 0 #########################################
|
||||
|
||||
# Extract w[t-7]
|
||||
MY_VPALIGNR YTMP0, Y_3, Y_2, 8 # YTMP0 = W[-7]
|
||||
# Calculate w[t-16] + w[t-7]
|
||||
vpaddq Y_0, YTMP0, YTMP0 # YTMP0 = W[-7] + W[-16]
|
||||
# Extract w[t-15]
|
||||
MY_VPALIGNR YTMP1, Y_1, Y_0, 8 # YTMP1 = W[-15]
|
||||
|
||||
# Calculate sigma0
|
||||
|
||||
# Calculate w[t-15] ror 1
|
||||
vpsrlq $1, YTMP1, YTMP2
|
||||
vpsllq $(64-1), YTMP1, YTMP3
|
||||
vpor YTMP2, YTMP3, YTMP3 # YTMP3 = W[-15] ror 1
|
||||
# Calculate w[t-15] shr 7
|
||||
vpsrlq $7, YTMP1, YTMP4 # YTMP4 = W[-15] >> 7
|
||||
|
||||
mov a, y3 # y3 = a # MAJA
|
||||
rorx $41, e, y0 # y0 = e >> 41 # S1A
|
||||
rorx $18, e, y1 # y1 = e >> 18 # S1B
|
||||
add frame_XFER(%rsp),h # h = k + w + h # --
|
||||
or c, y3 # y3 = a|c # MAJA
|
||||
mov f, y2 # y2 = f # CH
|
||||
rorx $34, a, T1 # T1 = a >> 34 # S0B
|
||||
|
||||
xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
|
||||
xor g, y2 # y2 = f^g # CH
|
||||
rorx $14, e, y1 # y1 = (e >> 14) # S1
|
||||
|
||||
and e, y2 # y2 = (f^g)&e # CH
|
||||
xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
|
||||
rorx $39, a, y1 # y1 = a >> 39 # S0A
|
||||
add h, d # d = k + w + h + d # --
|
||||
|
||||
and b, y3 # y3 = (a|c)&b # MAJA
|
||||
xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
|
||||
rorx $28, a, T1 # T1 = (a >> 28) # S0
|
||||
|
||||
xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
|
||||
xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
|
||||
mov a, T1 # T1 = a # MAJB
|
||||
and c, T1 # T1 = a&c # MAJB
|
||||
|
||||
add y0, y2 # y2 = S1 + CH # --
|
||||
or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
|
||||
add y1, h # h = k + w + h + S0 # --
|
||||
|
||||
add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
|
||||
|
||||
add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
|
||||
add y3, h # h = t1 + S0 + MAJ # --
|
||||
|
||||
RotateState
|
||||
|
||||
################################### RND N + 1 #########################################
|
||||
|
||||
# Calculate w[t-15] ror 8
|
||||
vpsrlq $8, YTMP1, YTMP2
|
||||
vpsllq $(64-8), YTMP1, YTMP1
|
||||
vpor YTMP2, YTMP1, YTMP1 # YTMP1 = W[-15] ror 8
|
||||
# XOR the three components
|
||||
vpxor YTMP4, YTMP3, YTMP3 # YTMP3 = W[-15] ror 1 ^ W[-15] >> 7
|
||||
vpxor YTMP1, YTMP3, YTMP1 # YTMP1 = s0
|
||||
|
||||
|
||||
# Add three components, w[t-16], w[t-7] and sigma0
|
||||
vpaddq YTMP1, YTMP0, YTMP0 # YTMP0 = W[-16] + W[-7] + s0
|
||||
# Move to appropriate lanes for calculating w[16] and w[17]
|
||||
vperm2f128 $0x0, YTMP0, YTMP0, Y_0 # Y_0 = W[-16] + W[-7] + s0 {BABA}
|
||||
# Move to appropriate lanes for calculating w[18] and w[19]
|
||||
vpand MASK_YMM_LO(%rip), YTMP0, YTMP0 # YTMP0 = W[-16] + W[-7] + s0 {DC00}
|
||||
|
||||
# Calculate w[16] and w[17] in both 128 bit lanes
|
||||
|
||||
# Calculate sigma1 for w[16] and w[17] on both 128 bit lanes
|
||||
vperm2f128 $0x11, Y_3, Y_3, YTMP2 # YTMP2 = W[-2] {BABA}
|
||||
vpsrlq $6, YTMP2, YTMP4 # YTMP4 = W[-2] >> 6 {BABA}
|
||||
|
||||
|
||||
mov a, y3 # y3 = a # MAJA
|
||||
rorx $41, e, y0 # y0 = e >> 41 # S1A
|
||||
rorx $18, e, y1 # y1 = e >> 18 # S1B
|
||||
add 1*8+frame_XFER(%rsp), h # h = k + w + h # --
|
||||
or c, y3 # y3 = a|c # MAJA
|
||||
|
||||
|
||||
mov f, y2 # y2 = f # CH
|
||||
rorx $34, a, T1 # T1 = a >> 34 # S0B
|
||||
xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
|
||||
xor g, y2 # y2 = f^g # CH
|
||||
|
||||
|
||||
rorx $14, e, y1 # y1 = (e >> 14) # S1
|
||||
xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
|
||||
rorx $39, a, y1 # y1 = a >> 39 # S0A
|
||||
and e, y2 # y2 = (f^g)&e # CH
|
||||
add h, d # d = k + w + h + d # --
|
||||
|
||||
and b, y3 # y3 = (a|c)&b # MAJA
|
||||
xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
|
||||
|
||||
rorx $28, a, T1 # T1 = (a >> 28) # S0
|
||||
xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
|
||||
|
||||
xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
|
||||
mov a, T1 # T1 = a # MAJB
|
||||
and c, T1 # T1 = a&c # MAJB
|
||||
add y0, y2 # y2 = S1 + CH # --
|
||||
|
||||
or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
|
||||
add y1, h # h = k + w + h + S0 # --
|
||||
|
||||
add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
|
||||
add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
|
||||
add y3, h # h = t1 + S0 + MAJ # --
|
||||
|
||||
RotateState
|
||||
|
||||
|
||||
################################### RND N + 2 #########################################
|
||||
|
||||
vpsrlq $19, YTMP2, YTMP3 # YTMP3 = W[-2] >> 19 {BABA}
|
||||
vpsllq $(64-19), YTMP2, YTMP1 # YTMP1 = W[-2] << 19 {BABA}
|
||||
vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 19 {BABA}
|
||||
vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {BABA}
|
||||
vpsrlq $61, YTMP2, YTMP3 # YTMP3 = W[-2] >> 61 {BABA}
|
||||
vpsllq $(64-61), YTMP2, YTMP1 # YTMP1 = W[-2] << 61 {BABA}
|
||||
vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 61 {BABA}
|
||||
vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = s1 = (W[-2] ror 19) ^
|
||||
# (W[-2] ror 61) ^ (W[-2] >> 6) {BABA}
|
||||
|
||||
# Add sigma1 to the other compunents to get w[16] and w[17]
|
||||
vpaddq YTMP4, Y_0, Y_0 # Y_0 = {W[1], W[0], W[1], W[0]}
|
||||
|
||||
# Calculate sigma1 for w[18] and w[19] for upper 128 bit lane
|
||||
vpsrlq $6, Y_0, YTMP4 # YTMP4 = W[-2] >> 6 {DC--}
|
||||
|
||||
mov a, y3 # y3 = a # MAJA
|
||||
rorx $41, e, y0 # y0 = e >> 41 # S1A
|
||||
add 2*8+frame_XFER(%rsp), h # h = k + w + h # --
|
||||
|
||||
rorx $18, e, y1 # y1 = e >> 18 # S1B
|
||||
or c, y3 # y3 = a|c # MAJA
|
||||
mov f, y2 # y2 = f # CH
|
||||
xor g, y2 # y2 = f^g # CH
|
||||
|
||||
rorx $34, a, T1 # T1 = a >> 34 # S0B
|
||||
xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
|
||||
and e, y2 # y2 = (f^g)&e # CH
|
||||
|
||||
rorx $14, e, y1 # y1 = (e >> 14) # S1
|
||||
add h, d # d = k + w + h + d # --
|
||||
and b, y3 # y3 = (a|c)&b # MAJA
|
||||
|
||||
xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
|
||||
rorx $39, a, y1 # y1 = a >> 39 # S0A
|
||||
xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
|
||||
|
||||
xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
|
||||
rorx $28, a, T1 # T1 = (a >> 28) # S0
|
||||
|
||||
xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
|
||||
mov a, T1 # T1 = a # MAJB
|
||||
and c, T1 # T1 = a&c # MAJB
|
||||
add y0, y2 # y2 = S1 + CH # --
|
||||
|
||||
or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
|
||||
add y1, h # h = k + w + h + S0 # --
|
||||
add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
|
||||
add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
|
||||
|
||||
add y3, h # h = t1 + S0 + MAJ # --
|
||||
|
||||
RotateState
|
||||
|
||||
################################### RND N + 3 #########################################
|
||||
|
||||
vpsrlq $19, Y_0, YTMP3 # YTMP3 = W[-2] >> 19 {DC--}
|
||||
vpsllq $(64-19), Y_0, YTMP1 # YTMP1 = W[-2] << 19 {DC--}
|
||||
vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 19 {DC--}
|
||||
vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {DC--}
|
||||
vpsrlq $61, Y_0, YTMP3 # YTMP3 = W[-2] >> 61 {DC--}
|
||||
vpsllq $(64-61), Y_0, YTMP1 # YTMP1 = W[-2] << 61 {DC--}
|
||||
vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 61 {DC--}
|
||||
vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = s1 = (W[-2] ror 19) ^
|
||||
# (W[-2] ror 61) ^ (W[-2] >> 6) {DC--}
|
||||
|
||||
# Add the sigma0 + w[t-7] + w[t-16] for w[18] and w[19]
|
||||
# to newly calculated sigma1 to get w[18] and w[19]
|
||||
vpaddq YTMP4, YTMP0, YTMP2 # YTMP2 = {W[3], W[2], --, --}
|
||||
|
||||
# Form w[19, w[18], w17], w[16]
|
||||
vpblendd $0xF0, YTMP2, Y_0, Y_0 # Y_0 = {W[3], W[2], W[1], W[0]}
|
||||
|
||||
mov a, y3 # y3 = a # MAJA
|
||||
rorx $41, e, y0 # y0 = e >> 41 # S1A
|
||||
rorx $18, e, y1 # y1 = e >> 18 # S1B
|
||||
add 3*8+frame_XFER(%rsp), h # h = k + w + h # --
|
||||
or c, y3 # y3 = a|c # MAJA
|
||||
|
||||
|
||||
mov f, y2 # y2 = f # CH
|
||||
rorx $34, a, T1 # T1 = a >> 34 # S0B
|
||||
xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
|
||||
xor g, y2 # y2 = f^g # CH
|
||||
|
||||
|
||||
rorx $14, e, y1 # y1 = (e >> 14) # S1
|
||||
and e, y2 # y2 = (f^g)&e # CH
|
||||
add h, d # d = k + w + h + d # --
|
||||
and b, y3 # y3 = (a|c)&b # MAJA
|
||||
|
||||
xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
|
||||
xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
|
||||
|
||||
rorx $39, a, y1 # y1 = a >> 39 # S0A
|
||||
add y0, y2 # y2 = S1 + CH # --
|
||||
|
||||
xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
|
||||
add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
|
||||
|
||||
rorx $28, a, T1 # T1 = (a >> 28) # S0
|
||||
|
||||
xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
|
||||
mov a, T1 # T1 = a # MAJB
|
||||
and c, T1 # T1 = a&c # MAJB
|
||||
or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
|
||||
|
||||
add y1, h # h = k + w + h + S0 # --
|
||||
add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
|
||||
add y3, h # h = t1 + S0 + MAJ # --
|
||||
|
||||
RotateState
|
||||
|
||||
rotate_Ys
|
||||
.endm
|
||||
|
||||
.macro DO_4ROUNDS
|
||||
|
||||
################################### RND N + 0 #########################################
|
||||
|
||||
mov f, y2 # y2 = f # CH
|
||||
rorx $41, e, y0 # y0 = e >> 41 # S1A
|
||||
rorx $18, e, y1 # y1 = e >> 18 # S1B
|
||||
xor g, y2 # y2 = f^g # CH
|
||||
|
||||
xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
|
||||
rorx $14, e, y1 # y1 = (e >> 14) # S1
|
||||
and e, y2 # y2 = (f^g)&e # CH
|
||||
|
||||
xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
|
||||
rorx $34, a, T1 # T1 = a >> 34 # S0B
|
||||
xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
|
||||
rorx $39, a, y1 # y1 = a >> 39 # S0A
|
||||
mov a, y3 # y3 = a # MAJA
|
||||
|
||||
xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
|
||||
rorx $28, a, T1 # T1 = (a >> 28) # S0
|
||||
add frame_XFER(%rsp), h # h = k + w + h # --
|
||||
or c, y3 # y3 = a|c # MAJA
|
||||
|
||||
xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
|
||||
mov a, T1 # T1 = a # MAJB
|
||||
and b, y3 # y3 = (a|c)&b # MAJA
|
||||
and c, T1 # T1 = a&c # MAJB
|
||||
add y0, y2 # y2 = S1 + CH # --
|
||||
|
||||
add h, d # d = k + w + h + d # --
|
||||
or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
|
||||
add y1, h # h = k + w + h + S0 # --
|
||||
|
||||
add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
|
||||
|
||||
RotateState
|
||||
|
||||
################################### RND N + 1 #########################################
|
||||
|
||||
add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
|
||||
mov f, y2 # y2 = f # CH
|
||||
rorx $41, e, y0 # y0 = e >> 41 # S1A
|
||||
rorx $18, e, y1 # y1 = e >> 18 # S1B
|
||||
xor g, y2 # y2 = f^g # CH
|
||||
|
||||
xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
|
||||
rorx $14, e, y1 # y1 = (e >> 14) # S1
|
||||
and e, y2 # y2 = (f^g)&e # CH
|
||||
add y3, old_h # h = t1 + S0 + MAJ # --
|
||||
|
||||
xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
|
||||
rorx $34, a, T1 # T1 = a >> 34 # S0B
|
||||
xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
|
||||
rorx $39, a, y1 # y1 = a >> 39 # S0A
|
||||
mov a, y3 # y3 = a # MAJA
|
||||
|
||||
xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
|
||||
rorx $28, a, T1 # T1 = (a >> 28) # S0
|
||||
add 8*1+frame_XFER(%rsp), h # h = k + w + h # --
|
||||
or c, y3 # y3 = a|c # MAJA
|
||||
|
||||
xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
|
||||
mov a, T1 # T1 = a # MAJB
|
||||
and b, y3 # y3 = (a|c)&b # MAJA
|
||||
and c, T1 # T1 = a&c # MAJB
|
||||
add y0, y2 # y2 = S1 + CH # --
|
||||
|
||||
add h, d # d = k + w + h + d # --
|
||||
or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
|
||||
add y1, h # h = k + w + h + S0 # --
|
||||
|
||||
add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
|
||||
|
||||
RotateState
|
||||
|
||||
################################### RND N + 2 #########################################
|
||||
|
||||
add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
|
||||
mov f, y2 # y2 = f # CH
|
||||
rorx $41, e, y0 # y0 = e >> 41 # S1A
|
||||
rorx $18, e, y1 # y1 = e >> 18 # S1B
|
||||
xor g, y2 # y2 = f^g # CH
|
||||
|
||||
xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
|
||||
rorx $14, e, y1 # y1 = (e >> 14) # S1
|
||||
and e, y2 # y2 = (f^g)&e # CH
|
||||
add y3, old_h # h = t1 + S0 + MAJ # --
|
||||
|
||||
xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
|
||||
rorx $34, a, T1 # T1 = a >> 34 # S0B
|
||||
xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
|
||||
rorx $39, a, y1 # y1 = a >> 39 # S0A
|
||||
mov a, y3 # y3 = a # MAJA
|
||||
|
||||
xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
|
||||
rorx $28, a, T1 # T1 = (a >> 28) # S0
|
||||
add 8*2+frame_XFER(%rsp), h # h = k + w + h # --
|
||||
or c, y3 # y3 = a|c # MAJA
|
||||
|
||||
xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
|
||||
mov a, T1 # T1 = a # MAJB
|
||||
and b, y3 # y3 = (a|c)&b # MAJA
|
||||
and c, T1 # T1 = a&c # MAJB
|
||||
add y0, y2 # y2 = S1 + CH # --
|
||||
|
||||
add h, d # d = k + w + h + d # --
|
||||
or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
|
||||
add y1, h # h = k + w + h + S0 # --
|
||||
|
||||
add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
|
||||
|
||||
RotateState
|
||||
|
||||
################################### RND N + 3 #########################################
|
||||
|
||||
add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
|
||||
mov f, y2 # y2 = f # CH
|
||||
rorx $41, e, y0 # y0 = e >> 41 # S1A
|
||||
rorx $18, e, y1 # y1 = e >> 18 # S1B
|
||||
xor g, y2 # y2 = f^g # CH
|
||||
|
||||
xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
|
||||
rorx $14, e, y1 # y1 = (e >> 14) # S1
|
||||
and e, y2 # y2 = (f^g)&e # CH
|
||||
add y3, old_h # h = t1 + S0 + MAJ # --
|
||||
|
||||
xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
|
||||
rorx $34, a, T1 # T1 = a >> 34 # S0B
|
||||
xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
|
||||
rorx $39, a, y1 # y1 = a >> 39 # S0A
|
||||
mov a, y3 # y3 = a # MAJA
|
||||
|
||||
xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
|
||||
rorx $28, a, T1 # T1 = (a >> 28) # S0
|
||||
add 8*3+frame_XFER(%rsp), h # h = k + w + h # --
|
||||
or c, y3 # y3 = a|c # MAJA
|
||||
|
||||
xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
|
||||
mov a, T1 # T1 = a # MAJB
|
||||
and b, y3 # y3 = (a|c)&b # MAJA
|
||||
and c, T1 # T1 = a&c # MAJB
|
||||
add y0, y2 # y2 = S1 + CH # --
|
||||
|
||||
|
||||
add h, d # d = k + w + h + d # --
|
||||
or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
|
||||
add y1, h # h = k + w + h + S0 # --
|
||||
|
||||
add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
|
||||
|
||||
add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
|
||||
|
||||
add y3, h # h = t1 + S0 + MAJ # --
|
||||
|
||||
RotateState
|
||||
|
||||
.endm
|
||||
|
||||
########################################################################
|
||||
# void sha512_transform_rorx(const void* M, void* D, uint64_t L)#
|
||||
# Purpose: Updates the SHA512 digest stored at D with the message stored in M.
|
||||
# The size of the message pointed to by M must be an integer multiple of SHA512
|
||||
# message blocks.
|
||||
# L is the message length in SHA512 blocks
|
||||
########################################################################
|
||||
ENTRY(sha512_transform_rorx)
|
||||
# Allocate Stack Space
|
||||
mov %rsp, %rax
|
||||
sub $frame_size, %rsp
|
||||
and $~(0x20 - 1), %rsp
|
||||
mov %rax, frame_RSPSAVE(%rsp)
|
||||
|
||||
# Save GPRs
|
||||
mov %rbp, frame_GPRSAVE(%rsp)
|
||||
mov %rbx, 8*1+frame_GPRSAVE(%rsp)
|
||||
mov %r12, 8*2+frame_GPRSAVE(%rsp)
|
||||
mov %r13, 8*3+frame_GPRSAVE(%rsp)
|
||||
mov %r14, 8*4+frame_GPRSAVE(%rsp)
|
||||
mov %r15, 8*5+frame_GPRSAVE(%rsp)
|
||||
|
||||
shl $7, NUM_BLKS # convert to bytes
|
||||
jz done_hash
|
||||
add INP, NUM_BLKS # pointer to end of data
|
||||
mov NUM_BLKS, frame_INPEND(%rsp)
|
||||
|
||||
## load initial digest
|
||||
mov 8*0(CTX),a
|
||||
mov 8*1(CTX),b
|
||||
mov 8*2(CTX),c
|
||||
mov 8*3(CTX),d
|
||||
mov 8*4(CTX),e
|
||||
mov 8*5(CTX),f
|
||||
mov 8*6(CTX),g
|
||||
mov 8*7(CTX),h
|
||||
|
||||
vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
|
||||
|
||||
loop0:
|
||||
lea K512(%rip), TBL
|
||||
|
||||
## byte swap first 16 dwords
|
||||
COPY_YMM_AND_BSWAP Y_0, (INP), BYTE_FLIP_MASK
|
||||
COPY_YMM_AND_BSWAP Y_1, 1*32(INP), BYTE_FLIP_MASK
|
||||
COPY_YMM_AND_BSWAP Y_2, 2*32(INP), BYTE_FLIP_MASK
|
||||
COPY_YMM_AND_BSWAP Y_3, 3*32(INP), BYTE_FLIP_MASK
|
||||
|
||||
mov INP, frame_INP(%rsp)
|
||||
|
||||
## schedule 64 input dwords, by doing 12 rounds of 4 each
|
||||
movq $4, frame_SRND(%rsp)
|
||||
|
||||
.align 16
|
||||
loop1:
|
||||
vpaddq (TBL), Y_0, XFER
|
||||
vmovdqa XFER, frame_XFER(%rsp)
|
||||
FOUR_ROUNDS_AND_SCHED
|
||||
|
||||
vpaddq 1*32(TBL), Y_0, XFER
|
||||
vmovdqa XFER, frame_XFER(%rsp)
|
||||
FOUR_ROUNDS_AND_SCHED
|
||||
|
||||
vpaddq 2*32(TBL), Y_0, XFER
|
||||
vmovdqa XFER, frame_XFER(%rsp)
|
||||
FOUR_ROUNDS_AND_SCHED
|
||||
|
||||
vpaddq 3*32(TBL), Y_0, XFER
|
||||
vmovdqa XFER, frame_XFER(%rsp)
|
||||
add $(4*32), TBL
|
||||
FOUR_ROUNDS_AND_SCHED
|
||||
|
||||
subq $1, frame_SRND(%rsp)
|
||||
jne loop1
|
||||
|
||||
movq $2, frame_SRND(%rsp)
|
||||
loop2:
|
||||
vpaddq (TBL), Y_0, XFER
|
||||
vmovdqa XFER, frame_XFER(%rsp)
|
||||
DO_4ROUNDS
|
||||
vpaddq 1*32(TBL), Y_1, XFER
|
||||
vmovdqa XFER, frame_XFER(%rsp)
|
||||
add $(2*32), TBL
|
||||
DO_4ROUNDS
|
||||
|
||||
vmovdqa Y_2, Y_0
|
||||
vmovdqa Y_3, Y_1
|
||||
|
||||
subq $1, frame_SRND(%rsp)
|
||||
jne loop2
|
||||
|
||||
addm 8*0(CTX),a
|
||||
addm 8*1(CTX),b
|
||||
addm 8*2(CTX),c
|
||||
addm 8*3(CTX),d
|
||||
addm 8*4(CTX),e
|
||||
addm 8*5(CTX),f
|
||||
addm 8*6(CTX),g
|
||||
addm 8*7(CTX),h
|
||||
|
||||
mov frame_INP(%rsp), INP
|
||||
add $128, INP
|
||||
cmp frame_INPEND(%rsp), INP
|
||||
jne loop0
|
||||
|
||||
done_hash:
|
||||
|
||||
# Restore GPRs
|
||||
mov frame_GPRSAVE(%rsp) ,%rbp
|
||||
mov 8*1+frame_GPRSAVE(%rsp) ,%rbx
|
||||
mov 8*2+frame_GPRSAVE(%rsp) ,%r12
|
||||
mov 8*3+frame_GPRSAVE(%rsp) ,%r13
|
||||
mov 8*4+frame_GPRSAVE(%rsp) ,%r14
|
||||
mov 8*5+frame_GPRSAVE(%rsp) ,%r15
|
||||
|
||||
# Restore Stack Pointer
|
||||
mov frame_RSPSAVE(%rsp), %rsp
|
||||
ret
|
||||
ENDPROC(sha512_transform_rorx)
|
||||
|
||||
########################################################################
|
||||
### Binary Data
|
||||
|
||||
.data
|
||||
|
||||
.align 64
|
||||
# K[t] used in SHA512 hashing
|
||||
K512:
|
||||
.quad 0x428a2f98d728ae22,0x7137449123ef65cd
|
||||
.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
|
||||
.quad 0x3956c25bf348b538,0x59f111f1b605d019
|
||||
.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
|
||||
.quad 0xd807aa98a3030242,0x12835b0145706fbe
|
||||
.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
|
||||
.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
|
||||
.quad 0x9bdc06a725c71235,0xc19bf174cf692694
|
||||
.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
|
||||
.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
|
||||
.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
|
||||
.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
|
||||
.quad 0x983e5152ee66dfab,0xa831c66d2db43210
|
||||
.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
|
||||
.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
|
||||
.quad 0x06ca6351e003826f,0x142929670a0e6e70
|
||||
.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
|
||||
.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
|
||||
.quad 0x650a73548baf63de,0x766a0abb3c77b2a8
|
||||
.quad 0x81c2c92e47edaee6,0x92722c851482353b
|
||||
.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
|
||||
.quad 0xc24b8b70d0f89791,0xc76c51a30654be30
|
||||
.quad 0xd192e819d6ef5218,0xd69906245565a910
|
||||
.quad 0xf40e35855771202a,0x106aa07032bbd1b8
|
||||
.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
|
||||
.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
|
||||
.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
|
||||
.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
|
||||
.quad 0x748f82ee5defb2fc,0x78a5636f43172f60
|
||||
.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
|
||||
.quad 0x90befffa23631e28,0xa4506cebde82bde9
|
||||
.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
|
||||
.quad 0xca273eceea26619c,0xd186b8c721c0c207
|
||||
.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
|
||||
.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
|
||||
.quad 0x113f9804bef90dae,0x1b710b35131c471b
|
||||
.quad 0x28db77f523047d84,0x32caab7b40c72493
|
||||
.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
|
||||
.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
|
||||
.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
|
||||
|
||||
.align 32
|
||||
|
||||
# Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
|
||||
PSHUFFLE_BYTE_FLIP_MASK:
|
||||
.octa 0x08090a0b0c0d0e0f0001020304050607
|
||||
.octa 0x18191a1b1c1d1e1f1011121314151617
|
||||
|
||||
MASK_YMM_LO:
|
||||
.octa 0x00000000000000000000000000000000
|
||||
.octa 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF
|
||||
#endif
|
421
arch/x86/crypto/sha512-ssse3-asm.S
Normal file
421
arch/x86/crypto/sha512-ssse3-asm.S
Normal file
@ -0,0 +1,421 @@
|
||||
########################################################################
|
||||
# Implement fast SHA-512 with SSSE3 instructions. (x86_64)
|
||||
#
|
||||
# Copyright (C) 2013 Intel Corporation.
|
||||
#
|
||||
# Authors:
|
||||
# James Guilford <james.guilford@intel.com>
|
||||
# Kirk Yap <kirk.s.yap@intel.com>
|
||||
# David Cote <david.m.cote@intel.com>
|
||||
# Tim Chen <tim.c.chen@linux.intel.com>
|
||||
#
|
||||
# This software is available to you under a choice of one of two
|
||||
# licenses. You may choose to be licensed under the terms of the GNU
|
||||
# General Public License (GPL) Version 2, available from the file
|
||||
# COPYING in the main directory of this source tree, or the
|
||||
# OpenIB.org BSD license below:
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or
|
||||
# without modification, are permitted provided that the following
|
||||
# conditions are met:
|
||||
#
|
||||
# - Redistributions of source code must retain the above
|
||||
# copyright notice, this list of conditions and the following
|
||||
# disclaimer.
|
||||
#
|
||||
# - Redistributions in binary form must reproduce the above
|
||||
# copyright notice, this list of conditions and the following
|
||||
# disclaimer in the documentation and/or other materials
|
||||
# provided with the distribution.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
||||
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# SOFTWARE.
|
||||
#
|
||||
########################################################################
|
||||
#
|
||||
# This code is described in an Intel White-Paper:
|
||||
# "Fast SHA-512 Implementations on Intel Architecture Processors"
|
||||
#
|
||||
# To find it, surf to http://www.intel.com/p/en_US/embedded
|
||||
# and search for that title.
|
||||
#
|
||||
########################################################################
|
||||
|
||||
#include <linux/linkage.h>
|
||||
|
||||
.text
|
||||
|
||||
# Virtual Registers
|
||||
# ARG1
|
||||
msg = %rdi
|
||||
# ARG2
|
||||
digest = %rsi
|
||||
# ARG3
|
||||
msglen = %rdx
|
||||
T1 = %rcx
|
||||
T2 = %r8
|
||||
a_64 = %r9
|
||||
b_64 = %r10
|
||||
c_64 = %r11
|
||||
d_64 = %r12
|
||||
e_64 = %r13
|
||||
f_64 = %r14
|
||||
g_64 = %r15
|
||||
h_64 = %rbx
|
||||
tmp0 = %rax
|
||||
|
||||
# Local variables (stack frame)
|
||||
|
||||
W_SIZE = 80*8
|
||||
WK_SIZE = 2*8
|
||||
RSPSAVE_SIZE = 1*8
|
||||
GPRSAVE_SIZE = 5*8
|
||||
|
||||
frame_W = 0
|
||||
frame_WK = frame_W + W_SIZE
|
||||
frame_RSPSAVE = frame_WK + WK_SIZE
|
||||
frame_GPRSAVE = frame_RSPSAVE + RSPSAVE_SIZE
|
||||
frame_size = frame_GPRSAVE + GPRSAVE_SIZE
|
||||
|
||||
# Useful QWORD "arrays" for simpler memory references
|
||||
# MSG, DIGEST, K_t, W_t are arrays
|
||||
# WK_2(t) points to 1 of 2 qwords at frame.WK depdending on t being odd/even
|
||||
|
||||
# Input message (arg1)
|
||||
#define MSG(i) 8*i(msg)
|
||||
|
||||
# Output Digest (arg2)
|
||||
#define DIGEST(i) 8*i(digest)
|
||||
|
||||
# SHA Constants (static mem)
|
||||
#define K_t(i) 8*i+K512(%rip)
|
||||
|
||||
# Message Schedule (stack frame)
|
||||
#define W_t(i) 8*i+frame_W(%rsp)
|
||||
|
||||
# W[t]+K[t] (stack frame)
|
||||
#define WK_2(i) 8*((i%2))+frame_WK(%rsp)
|
||||
|
||||
.macro RotateState
|
||||
# Rotate symbols a..h right
|
||||
TMP = h_64
|
||||
h_64 = g_64
|
||||
g_64 = f_64
|
||||
f_64 = e_64
|
||||
e_64 = d_64
|
||||
d_64 = c_64
|
||||
c_64 = b_64
|
||||
b_64 = a_64
|
||||
a_64 = TMP
|
||||
.endm
|
||||
|
||||
.macro SHA512_Round rnd
|
||||
|
||||
# Compute Round %%t
|
||||
mov f_64, T1 # T1 = f
|
||||
mov e_64, tmp0 # tmp = e
|
||||
xor g_64, T1 # T1 = f ^ g
|
||||
ror $23, tmp0 # 41 # tmp = e ror 23
|
||||
and e_64, T1 # T1 = (f ^ g) & e
|
||||
xor e_64, tmp0 # tmp = (e ror 23) ^ e
|
||||
xor g_64, T1 # T1 = ((f ^ g) & e) ^ g = CH(e,f,g)
|
||||
idx = \rnd
|
||||
add WK_2(idx), T1 # W[t] + K[t] from message scheduler
|
||||
ror $4, tmp0 # 18 # tmp = ((e ror 23) ^ e) ror 4
|
||||
xor e_64, tmp0 # tmp = (((e ror 23) ^ e) ror 4) ^ e
|
||||
mov a_64, T2 # T2 = a
|
||||
add h_64, T1 # T1 = CH(e,f,g) + W[t] + K[t] + h
|
||||
ror $14, tmp0 # 14 # tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e)
|
||||
add tmp0, T1 # T1 = CH(e,f,g) + W[t] + K[t] + S1(e)
|
||||
mov a_64, tmp0 # tmp = a
|
||||
xor c_64, T2 # T2 = a ^ c
|
||||
and c_64, tmp0 # tmp = a & c
|
||||
and b_64, T2 # T2 = (a ^ c) & b
|
||||
xor tmp0, T2 # T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c)
|
||||
mov a_64, tmp0 # tmp = a
|
||||
ror $5, tmp0 # 39 # tmp = a ror 5
|
||||
xor a_64, tmp0 # tmp = (a ror 5) ^ a
|
||||
add T1, d_64 # e(next_state) = d + T1
|
||||
ror $6, tmp0 # 34 # tmp = ((a ror 5) ^ a) ror 6
|
||||
xor a_64, tmp0 # tmp = (((a ror 5) ^ a) ror 6) ^ a
|
||||
lea (T1, T2), h_64 # a(next_state) = T1 + Maj(a,b,c)
|
||||
ror $28, tmp0 # 28 # tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a)
|
||||
add tmp0, h_64 # a(next_state) = T1 + Maj(a,b,c) S0(a)
|
||||
RotateState
|
||||
.endm
|
||||
|
||||
.macro SHA512_2Sched_2Round_sse rnd
|
||||
|
||||
# Compute rounds t-2 and t-1
|
||||
# Compute message schedule QWORDS t and t+1
|
||||
|
||||
# Two rounds are computed based on the values for K[t-2]+W[t-2] and
|
||||
# K[t-1]+W[t-1] which were previously stored at WK_2 by the message
|
||||
# scheduler.
|
||||
# The two new schedule QWORDS are stored at [W_t(%%t)] and [W_t(%%t+1)].
|
||||
# They are then added to their respective SHA512 constants at
|
||||
# [K_t(%%t)] and [K_t(%%t+1)] and stored at dqword [WK_2(%%t)]
|
||||
# For brievity, the comments following vectored instructions only refer to
|
||||
# the first of a pair of QWORDS.
|
||||
# Eg. XMM2=W[t-2] really means XMM2={W[t-2]|W[t-1]}
|
||||
# The computation of the message schedule and the rounds are tightly
|
||||
# stitched to take advantage of instruction-level parallelism.
|
||||
# For clarity, integer instructions (for the rounds calculation) are indented
|
||||
# by one tab. Vectored instructions (for the message scheduler) are indented
|
||||
# by two tabs.
|
||||
|
||||
mov f_64, T1
|
||||
idx = \rnd -2
|
||||
movdqa W_t(idx), %xmm2 # XMM2 = W[t-2]
|
||||
xor g_64, T1
|
||||
and e_64, T1
|
||||
movdqa %xmm2, %xmm0 # XMM0 = W[t-2]
|
||||
xor g_64, T1
|
||||
idx = \rnd
|
||||
add WK_2(idx), T1
|
||||
idx = \rnd - 15
|
||||
movdqu W_t(idx), %xmm5 # XMM5 = W[t-15]
|
||||
mov e_64, tmp0
|
||||
ror $23, tmp0 # 41
|
||||
movdqa %xmm5, %xmm3 # XMM3 = W[t-15]
|
||||
xor e_64, tmp0
|
||||
ror $4, tmp0 # 18
|
||||
psrlq $61-19, %xmm0 # XMM0 = W[t-2] >> 42
|
||||
xor e_64, tmp0
|
||||
ror $14, tmp0 # 14
|
||||
psrlq $(8-7), %xmm3 # XMM3 = W[t-15] >> 1
|
||||
add tmp0, T1
|
||||
add h_64, T1
|
||||
pxor %xmm2, %xmm0 # XMM0 = (W[t-2] >> 42) ^ W[t-2]
|
||||
mov a_64, T2
|
||||
xor c_64, T2
|
||||
pxor %xmm5, %xmm3 # XMM3 = (W[t-15] >> 1) ^ W[t-15]
|
||||
and b_64, T2
|
||||
mov a_64, tmp0
|
||||
psrlq $(19-6), %xmm0 # XMM0 = ((W[t-2]>>42)^W[t-2])>>13
|
||||
and c_64, tmp0
|
||||
xor tmp0, T2
|
||||
psrlq $(7-1), %xmm3 # XMM3 = ((W[t-15]>>1)^W[t-15])>>6
|
||||
mov a_64, tmp0
|
||||
ror $5, tmp0 # 39
|
||||
pxor %xmm2, %xmm0 # XMM0 = (((W[t-2]>>42)^W[t-2])>>13)^W[t-2]
|
||||
xor a_64, tmp0
|
||||
ror $6, tmp0 # 34
|
||||
pxor %xmm5, %xmm3 # XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15]
|
||||
xor a_64, tmp0
|
||||
ror $28, tmp0 # 28
|
||||
psrlq $6, %xmm0 # XMM0 = ((((W[t-2]>>42)^W[t-2])>>13)^W[t-2])>>6
|
||||
add tmp0, T2
|
||||
add T1, d_64
|
||||
psrlq $1, %xmm3 # XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15]>>1
|
||||
lea (T1, T2), h_64
|
||||
RotateState
|
||||
movdqa %xmm2, %xmm1 # XMM1 = W[t-2]
|
||||
mov f_64, T1
|
||||
xor g_64, T1
|
||||
movdqa %xmm5, %xmm4 # XMM4 = W[t-15]
|
||||
and e_64, T1
|
||||
xor g_64, T1
|
||||
psllq $(64-19)-(64-61) , %xmm1 # XMM1 = W[t-2] << 42
|
||||
idx = \rnd + 1
|
||||
add WK_2(idx), T1
|
||||
mov e_64, tmp0
|
||||
psllq $(64-1)-(64-8), %xmm4 # XMM4 = W[t-15] << 7
|
||||
ror $23, tmp0 # 41
|
||||
xor e_64, tmp0
|
||||
pxor %xmm2, %xmm1 # XMM1 = (W[t-2] << 42)^W[t-2]
|
||||
ror $4, tmp0 # 18
|
||||
xor e_64, tmp0
|
||||
pxor %xmm5, %xmm4 # XMM4 = (W[t-15]<<7)^W[t-15]
|
||||
ror $14, tmp0 # 14
|
||||
add tmp0, T1
|
||||
psllq $(64-61), %xmm1 # XMM1 = ((W[t-2] << 42)^W[t-2])<<3
|
||||
add h_64, T1
|
||||
mov a_64, T2
|
||||
psllq $(64-8), %xmm4 # XMM4 = ((W[t-15]<<7)^W[t-15])<<56
|
||||
xor c_64, T2
|
||||
and b_64, T2
|
||||
pxor %xmm1, %xmm0 # XMM0 = s1(W[t-2])
|
||||
mov a_64, tmp0
|
||||
and c_64, tmp0
|
||||
idx = \rnd - 7
|
||||
movdqu W_t(idx), %xmm1 # XMM1 = W[t-7]
|
||||
xor tmp0, T2
|
||||
pxor %xmm4, %xmm3 # XMM3 = s0(W[t-15])
|
||||
mov a_64, tmp0
|
||||
paddq %xmm3, %xmm0 # XMM0 = s1(W[t-2]) + s0(W[t-15])
|
||||
ror $5, tmp0 # 39
|
||||
idx =\rnd-16
|
||||
paddq W_t(idx), %xmm0 # XMM0 = s1(W[t-2]) + s0(W[t-15]) + W[t-16]
|
||||
xor a_64, tmp0
|
||||
paddq %xmm1, %xmm0 # XMM0 = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16]
|
||||
ror $6, tmp0 # 34
|
||||
movdqa %xmm0, W_t(\rnd) # Store scheduled qwords
|
||||
xor a_64, tmp0
|
||||
paddq K_t(\rnd), %xmm0 # Compute W[t]+K[t]
|
||||
ror $28, tmp0 # 28
|
||||
idx = \rnd
|
||||
movdqa %xmm0, WK_2(idx) # Store W[t]+K[t] for next rounds
|
||||
add tmp0, T2
|
||||
add T1, d_64
|
||||
lea (T1, T2), h_64
|
||||
RotateState
|
||||
.endm
|
||||
|
||||
########################################################################
|
||||
# void sha512_transform_ssse3(const void* M, void* D, u64 L)#
|
||||
# Purpose: Updates the SHA512 digest stored at D with the message stored in M.
|
||||
# The size of the message pointed to by M must be an integer multiple of SHA512
|
||||
# message blocks.
|
||||
# L is the message length in SHA512 blocks.
|
||||
########################################################################
|
||||
ENTRY(sha512_transform_ssse3)
|
||||
|
||||
cmp $0, msglen
|
||||
je nowork
|
||||
|
||||
# Allocate Stack Space
|
||||
mov %rsp, %rax
|
||||
sub $frame_size, %rsp
|
||||
and $~(0x20 - 1), %rsp
|
||||
mov %rax, frame_RSPSAVE(%rsp)
|
||||
|
||||
# Save GPRs
|
||||
mov %rbx, frame_GPRSAVE(%rsp)
|
||||
mov %r12, frame_GPRSAVE +8*1(%rsp)
|
||||
mov %r13, frame_GPRSAVE +8*2(%rsp)
|
||||
mov %r14, frame_GPRSAVE +8*3(%rsp)
|
||||
mov %r15, frame_GPRSAVE +8*4(%rsp)
|
||||
|
||||
updateblock:
|
||||
|
||||
# Load state variables
|
||||
mov DIGEST(0), a_64
|
||||
mov DIGEST(1), b_64
|
||||
mov DIGEST(2), c_64
|
||||
mov DIGEST(3), d_64
|
||||
mov DIGEST(4), e_64
|
||||
mov DIGEST(5), f_64
|
||||
mov DIGEST(6), g_64
|
||||
mov DIGEST(7), h_64
|
||||
|
||||
t = 0
|
||||
.rept 80/2 + 1
|
||||
# (80 rounds) / (2 rounds/iteration) + (1 iteration)
|
||||
# +1 iteration because the scheduler leads hashing by 1 iteration
|
||||
.if t < 2
|
||||
# BSWAP 2 QWORDS
|
||||
movdqa XMM_QWORD_BSWAP(%rip), %xmm1
|
||||
movdqu MSG(t), %xmm0
|
||||
pshufb %xmm1, %xmm0 # BSWAP
|
||||
movdqa %xmm0, W_t(t) # Store Scheduled Pair
|
||||
paddq K_t(t), %xmm0 # Compute W[t]+K[t]
|
||||
movdqa %xmm0, WK_2(t) # Store into WK for rounds
|
||||
.elseif t < 16
|
||||
# BSWAP 2 QWORDS# Compute 2 Rounds
|
||||
movdqu MSG(t), %xmm0
|
||||
pshufb %xmm1, %xmm0 # BSWAP
|
||||
SHA512_Round t-2 # Round t-2
|
||||
movdqa %xmm0, W_t(t) # Store Scheduled Pair
|
||||
paddq K_t(t), %xmm0 # Compute W[t]+K[t]
|
||||
SHA512_Round t-1 # Round t-1
|
||||
movdqa %xmm0, WK_2(t) # Store W[t]+K[t] into WK
|
||||
.elseif t < 79
|
||||
# Schedule 2 QWORDS# Compute 2 Rounds
|
||||
SHA512_2Sched_2Round_sse t
|
||||
.else
|
||||
# Compute 2 Rounds
|
||||
SHA512_Round t-2
|
||||
SHA512_Round t-1
|
||||
.endif
|
||||
t = t+2
|
||||
.endr
|
||||
|
||||
# Update digest
|
||||
add a_64, DIGEST(0)
|
||||
add b_64, DIGEST(1)
|
||||
add c_64, DIGEST(2)
|
||||
add d_64, DIGEST(3)
|
||||
add e_64, DIGEST(4)
|
||||
add f_64, DIGEST(5)
|
||||
add g_64, DIGEST(6)
|
||||
add h_64, DIGEST(7)
|
||||
|
||||
# Advance to next message block
|
||||
add $16*8, msg
|
||||
dec msglen
|
||||
jnz updateblock
|
||||
|
||||
# Restore GPRs
|
||||
mov frame_GPRSAVE(%rsp), %rbx
|
||||
mov frame_GPRSAVE +8*1(%rsp), %r12
|
||||
mov frame_GPRSAVE +8*2(%rsp), %r13
|
||||
mov frame_GPRSAVE +8*3(%rsp), %r14
|
||||
mov frame_GPRSAVE +8*4(%rsp), %r15
|
||||
|
||||
# Restore Stack Pointer
|
||||
mov frame_RSPSAVE(%rsp), %rsp
|
||||
|
||||
nowork:
|
||||
ret
|
||||
ENDPROC(sha512_transform_ssse3)
|
||||
|
||||
########################################################################
|
||||
### Binary Data
|
||||
|
||||
.data
|
||||
|
||||
.align 16
|
||||
|
||||
# Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
|
||||
XMM_QWORD_BSWAP:
|
||||
.octa 0x08090a0b0c0d0e0f0001020304050607
|
||||
|
||||
# K[t] used in SHA512 hashing
|
||||
K512:
|
||||
.quad 0x428a2f98d728ae22,0x7137449123ef65cd
|
||||
.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
|
||||
.quad 0x3956c25bf348b538,0x59f111f1b605d019
|
||||
.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
|
||||
.quad 0xd807aa98a3030242,0x12835b0145706fbe
|
||||
.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
|
||||
.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
|
||||
.quad 0x9bdc06a725c71235,0xc19bf174cf692694
|
||||
.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
|
||||
.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
|
||||
.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
|
||||
.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
|
||||
.quad 0x983e5152ee66dfab,0xa831c66d2db43210
|
||||
.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
|
||||
.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
|
||||
.quad 0x06ca6351e003826f,0x142929670a0e6e70
|
||||
.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
|
||||
.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
|
||||
.quad 0x650a73548baf63de,0x766a0abb3c77b2a8
|
||||
.quad 0x81c2c92e47edaee6,0x92722c851482353b
|
||||
.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
|
||||
.quad 0xc24b8b70d0f89791,0xc76c51a30654be30
|
||||
.quad 0xd192e819d6ef5218,0xd69906245565a910
|
||||
.quad 0xf40e35855771202a,0x106aa07032bbd1b8
|
||||
.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
|
||||
.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
|
||||
.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
|
||||
.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
|
||||
.quad 0x748f82ee5defb2fc,0x78a5636f43172f60
|
||||
.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
|
||||
.quad 0x90befffa23631e28,0xa4506cebde82bde9
|
||||
.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
|
||||
.quad 0xca273eceea26619c,0xd186b8c721c0c207
|
||||
.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
|
||||
.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
|
||||
.quad 0x113f9804bef90dae,0x1b710b35131c471b
|
||||
.quad 0x28db77f523047d84,0x32caab7b40c72493
|
||||
.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
|
||||
.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
|
||||
.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
|
282
arch/x86/crypto/sha512_ssse3_glue.c
Normal file
282
arch/x86/crypto/sha512_ssse3_glue.c
Normal file
@ -0,0 +1,282 @@
|
||||
/*
|
||||
* Cryptographic API.
|
||||
*
|
||||
* Glue code for the SHA512 Secure Hash Algorithm assembler
|
||||
* implementation using supplemental SSE3 / AVX / AVX2 instructions.
|
||||
*
|
||||
* This file is based on sha512_generic.c
|
||||
*
|
||||
* Copyright (C) 2013 Intel Corporation
|
||||
* Author: Tim Chen <tim.c.chen@linux.intel.com>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License as published by the Free
|
||||
* Software Foundation; either version 2 of the License, or (at your option)
|
||||
* any later version.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
||||
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
* SOFTWARE.
|
||||
*
|
||||
*/
|
||||
|
||||
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
|
||||
|
||||
#include <crypto/internal/hash.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/cryptohash.h>
|
||||
#include <linux/types.h>
|
||||
#include <crypto/sha.h>
|
||||
#include <asm/byteorder.h>
|
||||
#include <asm/i387.h>
|
||||
#include <asm/xcr.h>
|
||||
#include <asm/xsave.h>
|
||||
|
||||
#include <linux/string.h>
|
||||
|
||||
asmlinkage void sha512_transform_ssse3(const char *data, u64 *digest,
|
||||
u64 rounds);
|
||||
#ifdef CONFIG_AS_AVX
|
||||
asmlinkage void sha512_transform_avx(const char *data, u64 *digest,
|
||||
u64 rounds);
|
||||
#endif
|
||||
#ifdef CONFIG_AS_AVX2
|
||||
asmlinkage void sha512_transform_rorx(const char *data, u64 *digest,
|
||||
u64 rounds);
|
||||
#endif
|
||||
|
||||
static asmlinkage void (*sha512_transform_asm)(const char *, u64 *, u64);
|
||||
|
||||
|
||||
static int sha512_ssse3_init(struct shash_desc *desc)
|
||||
{
|
||||
struct sha512_state *sctx = shash_desc_ctx(desc);
|
||||
|
||||
sctx->state[0] = SHA512_H0;
|
||||
sctx->state[1] = SHA512_H1;
|
||||
sctx->state[2] = SHA512_H2;
|
||||
sctx->state[3] = SHA512_H3;
|
||||
sctx->state[4] = SHA512_H4;
|
||||
sctx->state[5] = SHA512_H5;
|
||||
sctx->state[6] = SHA512_H6;
|
||||
sctx->state[7] = SHA512_H7;
|
||||
sctx->count[0] = sctx->count[1] = 0;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int __sha512_ssse3_update(struct shash_desc *desc, const u8 *data,
|
||||
unsigned int len, unsigned int partial)
|
||||
{
|
||||
struct sha512_state *sctx = shash_desc_ctx(desc);
|
||||
unsigned int done = 0;
|
||||
|
||||
sctx->count[0] += len;
|
||||
if (sctx->count[0] < len)
|
||||
sctx->count[1]++;
|
||||
|
||||
if (partial) {
|
||||
done = SHA512_BLOCK_SIZE - partial;
|
||||
memcpy(sctx->buf + partial, data, done);
|
||||
sha512_transform_asm(sctx->buf, sctx->state, 1);
|
||||
}
|
||||
|
||||
if (len - done >= SHA512_BLOCK_SIZE) {
|
||||
const unsigned int rounds = (len - done) / SHA512_BLOCK_SIZE;
|
||||
|
||||
sha512_transform_asm(data + done, sctx->state, (u64) rounds);
|
||||
|
||||
done += rounds * SHA512_BLOCK_SIZE;
|
||||
}
|
||||
|
||||
memcpy(sctx->buf, data + done, len - done);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int sha512_ssse3_update(struct shash_desc *desc, const u8 *data,
|
||||
unsigned int len)
|
||||
{
|
||||
struct sha512_state *sctx = shash_desc_ctx(desc);
|
||||
unsigned int partial = sctx->count[0] % SHA512_BLOCK_SIZE;
|
||||
int res;
|
||||
|
||||
/* Handle the fast case right here */
|
||||
if (partial + len < SHA512_BLOCK_SIZE) {
|
||||
sctx->count[0] += len;
|
||||
if (sctx->count[0] < len)
|
||||
sctx->count[1]++;
|
||||
memcpy(sctx->buf + partial, data, len);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (!irq_fpu_usable()) {
|
||||
res = crypto_sha512_update(desc, data, len);
|
||||
} else {
|
||||
kernel_fpu_begin();
|
||||
res = __sha512_ssse3_update(desc, data, len, partial);
|
||||
kernel_fpu_end();
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
|
||||
/* Add padding and return the message digest. */
|
||||
static int sha512_ssse3_final(struct shash_desc *desc, u8 *out)
|
||||
{
|
||||
struct sha512_state *sctx = shash_desc_ctx(desc);
|
||||
unsigned int i, index, padlen;
|
||||
__be64 *dst = (__be64 *)out;
|
||||
__be64 bits[2];
|
||||
static const u8 padding[SHA512_BLOCK_SIZE] = { 0x80, };
|
||||
|
||||
/* save number of bits */
|
||||
bits[1] = cpu_to_be64(sctx->count[0] << 3);
|
||||
bits[0] = cpu_to_be64(sctx->count[1] << 3) | sctx->count[0] >> 61;
|
||||
|
||||
/* Pad out to 112 mod 128 and append length */
|
||||
index = sctx->count[0] & 0x7f;
|
||||
padlen = (index < 112) ? (112 - index) : ((128+112) - index);
|
||||
|
||||
if (!irq_fpu_usable()) {
|
||||
crypto_sha512_update(desc, padding, padlen);
|
||||
crypto_sha512_update(desc, (const u8 *)&bits, sizeof(bits));
|
||||
} else {
|
||||
kernel_fpu_begin();
|
||||
/* We need to fill a whole block for __sha512_ssse3_update() */
|
||||
if (padlen <= 112) {
|
||||
sctx->count[0] += padlen;
|
||||
if (sctx->count[0] < padlen)
|
||||
sctx->count[1]++;
|
||||
memcpy(sctx->buf + index, padding, padlen);
|
||||
} else {
|
||||
__sha512_ssse3_update(desc, padding, padlen, index);
|
||||
}
|
||||
__sha512_ssse3_update(desc, (const u8 *)&bits,
|
||||
sizeof(bits), 112);
|
||||
kernel_fpu_end();
|
||||
}
|
||||
|
||||
/* Store state in digest */
|
||||
for (i = 0; i < 8; i++)
|
||||
dst[i] = cpu_to_be64(sctx->state[i]);
|
||||
|
||||
/* Wipe context */
|
||||
memset(sctx, 0, sizeof(*sctx));
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int sha512_ssse3_export(struct shash_desc *desc, void *out)
|
||||
{
|
||||
struct sha512_state *sctx = shash_desc_ctx(desc);
|
||||
|
||||
memcpy(out, sctx, sizeof(*sctx));
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int sha512_ssse3_import(struct shash_desc *desc, const void *in)
|
||||
{
|
||||
struct sha512_state *sctx = shash_desc_ctx(desc);
|
||||
|
||||
memcpy(sctx, in, sizeof(*sctx));
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct shash_alg alg = {
|
||||
.digestsize = SHA512_DIGEST_SIZE,
|
||||
.init = sha512_ssse3_init,
|
||||
.update = sha512_ssse3_update,
|
||||
.final = sha512_ssse3_final,
|
||||
.export = sha512_ssse3_export,
|
||||
.import = sha512_ssse3_import,
|
||||
.descsize = sizeof(struct sha512_state),
|
||||
.statesize = sizeof(struct sha512_state),
|
||||
.base = {
|
||||
.cra_name = "sha512",
|
||||
.cra_driver_name = "sha512-ssse3",
|
||||
.cra_priority = 150,
|
||||
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
|
||||
.cra_blocksize = SHA512_BLOCK_SIZE,
|
||||
.cra_module = THIS_MODULE,
|
||||
}
|
||||
};
|
||||
|
||||
#ifdef CONFIG_AS_AVX
|
||||
static bool __init avx_usable(void)
|
||||
{
|
||||
u64 xcr0;
|
||||
|
||||
if (!cpu_has_avx || !cpu_has_osxsave)
|
||||
return false;
|
||||
|
||||
xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
|
||||
if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
|
||||
pr_info("AVX detected but unusable.\n");
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
#endif
|
||||
|
||||
static int __init sha512_ssse3_mod_init(void)
|
||||
{
|
||||
/* test for SSE3 first */
|
||||
if (cpu_has_ssse3)
|
||||
sha512_transform_asm = sha512_transform_ssse3;
|
||||
|
||||
#ifdef CONFIG_AS_AVX
|
||||
/* allow AVX to override SSSE3, it's a little faster */
|
||||
if (avx_usable()) {
|
||||
#ifdef CONFIG_AS_AVX2
|
||||
if (boot_cpu_has(X86_FEATURE_AVX2))
|
||||
sha512_transform_asm = sha512_transform_rorx;
|
||||
else
|
||||
#endif
|
||||
sha512_transform_asm = sha512_transform_avx;
|
||||
}
|
||||
#endif
|
||||
|
||||
if (sha512_transform_asm) {
|
||||
#ifdef CONFIG_AS_AVX
|
||||
if (sha512_transform_asm == sha512_transform_avx)
|
||||
pr_info("Using AVX optimized SHA-512 implementation\n");
|
||||
#ifdef CONFIG_AS_AVX2
|
||||
else if (sha512_transform_asm == sha512_transform_rorx)
|
||||
pr_info("Using AVX2 optimized SHA-512 implementation\n");
|
||||
#endif
|
||||
else
|
||||
#endif
|
||||
pr_info("Using SSSE3 optimized SHA-512 implementation\n");
|
||||
return crypto_register_shash(&alg);
|
||||
}
|
||||
pr_info("Neither AVX nor SSSE3 is available/usable.\n");
|
||||
|
||||
return -ENODEV;
|
||||
}
|
||||
|
||||
static void __exit sha512_ssse3_mod_fini(void)
|
||||
{
|
||||
crypto_unregister_shash(&alg);
|
||||
}
|
||||
|
||||
module_init(sha512_ssse3_mod_init);
|
||||
module_exit(sha512_ssse3_mod_fini);
|
||||
|
||||
MODULE_LICENSE("GPL");
|
||||
MODULE_DESCRIPTION("SHA512 Secure Hash Algorithm, Supplemental SSE3 accelerated");
|
||||
|
||||
MODULE_ALIAS("sha512");
|
@ -4,7 +4,7 @@
|
||||
* Copyright (C) 2012 Johannes Goetzfried
|
||||
* <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
|
||||
*
|
||||
* Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
|
||||
* Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
@ -33,6 +33,8 @@
|
||||
|
||||
.Lbswap128_mask:
|
||||
.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
|
||||
.Lxts_gf128mul_and_shl1_mask:
|
||||
.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
|
||||
|
||||
.text
|
||||
|
||||
@ -408,3 +410,47 @@ ENTRY(twofish_ctr_8way)
|
||||
|
||||
ret;
|
||||
ENDPROC(twofish_ctr_8way)
|
||||
|
||||
ENTRY(twofish_xts_enc_8way)
|
||||
/* input:
|
||||
* %rdi: ctx, CTX
|
||||
* %rsi: dst
|
||||
* %rdx: src
|
||||
* %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
|
||||
*/
|
||||
|
||||
movq %rsi, %r11;
|
||||
|
||||
/* regs <= src, dst <= IVs, regs <= regs xor IVs */
|
||||
load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
|
||||
RX0, RX1, RY0, .Lxts_gf128mul_and_shl1_mask);
|
||||
|
||||
call __twofish_enc_blk8;
|
||||
|
||||
/* dst <= regs xor IVs(in dst) */
|
||||
store_xts_8way(%r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);
|
||||
|
||||
ret;
|
||||
ENDPROC(twofish_xts_enc_8way)
|
||||
|
||||
ENTRY(twofish_xts_dec_8way)
|
||||
/* input:
|
||||
* %rdi: ctx, CTX
|
||||
* %rsi: dst
|
||||
* %rdx: src
|
||||
* %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
|
||||
*/
|
||||
|
||||
movq %rsi, %r11;
|
||||
|
||||
/* regs <= src, dst <= IVs, regs <= regs xor IVs */
|
||||
load_xts_8way(%rcx, %rdx, %rsi, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2,
|
||||
RX0, RX1, RY0, .Lxts_gf128mul_and_shl1_mask);
|
||||
|
||||
call __twofish_dec_blk8;
|
||||
|
||||
/* dst <= regs xor IVs(in dst) */
|
||||
store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
|
||||
|
||||
ret;
|
||||
ENDPROC(twofish_xts_dec_8way)
|
||||
|
600
arch/x86/crypto/twofish-avx2-asm_64.S
Normal file
600
arch/x86/crypto/twofish-avx2-asm_64.S
Normal file
@ -0,0 +1,600 @@
|
||||
/*
|
||||
* x86_64/AVX2 assembler optimized version of Twofish
|
||||
*
|
||||
* Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
*/
|
||||
|
||||
#include <linux/linkage.h>
|
||||
#include "glue_helper-asm-avx2.S"
|
||||
|
||||
.file "twofish-avx2-asm_64.S"
|
||||
|
||||
.data
|
||||
.align 16
|
||||
|
||||
.Lvpshufb_mask0:
|
||||
.long 0x80808000
|
||||
.long 0x80808004
|
||||
.long 0x80808008
|
||||
.long 0x8080800c
|
||||
|
||||
.Lbswap128_mask:
|
||||
.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
|
||||
.Lxts_gf128mul_and_shl1_mask_0:
|
||||
.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
|
||||
.Lxts_gf128mul_and_shl1_mask_1:
|
||||
.byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0
|
||||
|
||||
.text
|
||||
|
||||
/* structure of crypto context */
|
||||
#define s0 0
|
||||
#define s1 1024
|
||||
#define s2 2048
|
||||
#define s3 3072
|
||||
#define w 4096
|
||||
#define k 4128
|
||||
|
||||
/* register macros */
|
||||
#define CTX %rdi
|
||||
|
||||
#define RS0 CTX
|
||||
#define RS1 %r8
|
||||
#define RS2 %r9
|
||||
#define RS3 %r10
|
||||
#define RK %r11
|
||||
#define RW %rax
|
||||
#define RROUND %r12
|
||||
#define RROUNDd %r12d
|
||||
|
||||
#define RA0 %ymm8
|
||||
#define RB0 %ymm9
|
||||
#define RC0 %ymm10
|
||||
#define RD0 %ymm11
|
||||
#define RA1 %ymm12
|
||||
#define RB1 %ymm13
|
||||
#define RC1 %ymm14
|
||||
#define RD1 %ymm15
|
||||
|
||||
/* temp regs */
|
||||
#define RX0 %ymm0
|
||||
#define RY0 %ymm1
|
||||
#define RX1 %ymm2
|
||||
#define RY1 %ymm3
|
||||
#define RT0 %ymm4
|
||||
#define RIDX %ymm5
|
||||
|
||||
#define RX0x %xmm0
|
||||
#define RY0x %xmm1
|
||||
#define RX1x %xmm2
|
||||
#define RY1x %xmm3
|
||||
#define RT0x %xmm4
|
||||
|
||||
/* vpgatherdd mask and '-1' */
|
||||
#define RNOT %ymm6
|
||||
|
||||
/* byte mask, (-1 >> 24) */
|
||||
#define RBYTE %ymm7
|
||||
|
||||
/**********************************************************************
|
||||
16-way AVX2 twofish
|
||||
**********************************************************************/
|
||||
#define init_round_constants() \
|
||||
vpcmpeqd RNOT, RNOT, RNOT; \
|
||||
vpsrld $24, RNOT, RBYTE; \
|
||||
leaq k(CTX), RK; \
|
||||
leaq w(CTX), RW; \
|
||||
leaq s1(CTX), RS1; \
|
||||
leaq s2(CTX), RS2; \
|
||||
leaq s3(CTX), RS3; \
|
||||
|
||||
#define g16(ab, rs0, rs1, rs2, rs3, xy) \
|
||||
vpand RBYTE, ab ## 0, RIDX; \
|
||||
vpgatherdd RNOT, (rs0, RIDX, 4), xy ## 0; \
|
||||
vpcmpeqd RNOT, RNOT, RNOT; \
|
||||
\
|
||||
vpand RBYTE, ab ## 1, RIDX; \
|
||||
vpgatherdd RNOT, (rs0, RIDX, 4), xy ## 1; \
|
||||
vpcmpeqd RNOT, RNOT, RNOT; \
|
||||
\
|
||||
vpsrld $8, ab ## 0, RIDX; \
|
||||
vpand RBYTE, RIDX, RIDX; \
|
||||
vpgatherdd RNOT, (rs1, RIDX, 4), RT0; \
|
||||
vpcmpeqd RNOT, RNOT, RNOT; \
|
||||
vpxor RT0, xy ## 0, xy ## 0; \
|
||||
\
|
||||
vpsrld $8, ab ## 1, RIDX; \
|
||||
vpand RBYTE, RIDX, RIDX; \
|
||||
vpgatherdd RNOT, (rs1, RIDX, 4), RT0; \
|
||||
vpcmpeqd RNOT, RNOT, RNOT; \
|
||||
vpxor RT0, xy ## 1, xy ## 1; \
|
||||
\
|
||||
vpsrld $16, ab ## 0, RIDX; \
|
||||
vpand RBYTE, RIDX, RIDX; \
|
||||
vpgatherdd RNOT, (rs2, RIDX, 4), RT0; \
|
||||
vpcmpeqd RNOT, RNOT, RNOT; \
|
||||
vpxor RT0, xy ## 0, xy ## 0; \
|
||||
\
|
||||
vpsrld $16, ab ## 1, RIDX; \
|
||||
vpand RBYTE, RIDX, RIDX; \
|
||||
vpgatherdd RNOT, (rs2, RIDX, 4), RT0; \
|
||||
vpcmpeqd RNOT, RNOT, RNOT; \
|
||||
vpxor RT0, xy ## 1, xy ## 1; \
|
||||
\
|
||||
vpsrld $24, ab ## 0, RIDX; \
|
||||
vpgatherdd RNOT, (rs3, RIDX, 4), RT0; \
|
||||
vpcmpeqd RNOT, RNOT, RNOT; \
|
||||
vpxor RT0, xy ## 0, xy ## 0; \
|
||||
\
|
||||
vpsrld $24, ab ## 1, RIDX; \
|
||||
vpgatherdd RNOT, (rs3, RIDX, 4), RT0; \
|
||||
vpcmpeqd RNOT, RNOT, RNOT; \
|
||||
vpxor RT0, xy ## 1, xy ## 1;
|
||||
|
||||
#define g1_16(a, x) \
|
||||
g16(a, RS0, RS1, RS2, RS3, x);
|
||||
|
||||
#define g2_16(b, y) \
|
||||
g16(b, RS1, RS2, RS3, RS0, y);
|
||||
|
||||
#define encrypt_round_end16(a, b, c, d, nk) \
|
||||
vpaddd RY0, RX0, RX0; \
|
||||
vpaddd RX0, RY0, RY0; \
|
||||
vpbroadcastd nk(RK,RROUND,8), RT0; \
|
||||
vpaddd RT0, RX0, RX0; \
|
||||
vpbroadcastd 4+nk(RK,RROUND,8), RT0; \
|
||||
vpaddd RT0, RY0, RY0; \
|
||||
\
|
||||
vpxor RY0, d ## 0, d ## 0; \
|
||||
\
|
||||
vpxor RX0, c ## 0, c ## 0; \
|
||||
vpsrld $1, c ## 0, RT0; \
|
||||
vpslld $31, c ## 0, c ## 0; \
|
||||
vpor RT0, c ## 0, c ## 0; \
|
||||
\
|
||||
vpaddd RY1, RX1, RX1; \
|
||||
vpaddd RX1, RY1, RY1; \
|
||||
vpbroadcastd nk(RK,RROUND,8), RT0; \
|
||||
vpaddd RT0, RX1, RX1; \
|
||||
vpbroadcastd 4+nk(RK,RROUND,8), RT0; \
|
||||
vpaddd RT0, RY1, RY1; \
|
||||
\
|
||||
vpxor RY1, d ## 1, d ## 1; \
|
||||
\
|
||||
vpxor RX1, c ## 1, c ## 1; \
|
||||
vpsrld $1, c ## 1, RT0; \
|
||||
vpslld $31, c ## 1, c ## 1; \
|
||||
vpor RT0, c ## 1, c ## 1; \
|
||||
|
||||
#define encrypt_round16(a, b, c, d, nk) \
|
||||
g2_16(b, RY); \
|
||||
\
|
||||
vpslld $1, b ## 0, RT0; \
|
||||
vpsrld $31, b ## 0, b ## 0; \
|
||||
vpor RT0, b ## 0, b ## 0; \
|
||||
\
|
||||
vpslld $1, b ## 1, RT0; \
|
||||
vpsrld $31, b ## 1, b ## 1; \
|
||||
vpor RT0, b ## 1, b ## 1; \
|
||||
\
|
||||
g1_16(a, RX); \
|
||||
\
|
||||
encrypt_round_end16(a, b, c, d, nk);
|
||||
|
||||
#define encrypt_round_first16(a, b, c, d, nk) \
|
||||
vpslld $1, d ## 0, RT0; \
|
||||
vpsrld $31, d ## 0, d ## 0; \
|
||||
vpor RT0, d ## 0, d ## 0; \
|
||||
\
|
||||
vpslld $1, d ## 1, RT0; \
|
||||
vpsrld $31, d ## 1, d ## 1; \
|
||||
vpor RT0, d ## 1, d ## 1; \
|
||||
\
|
||||
encrypt_round16(a, b, c, d, nk);
|
||||
|
||||
#define encrypt_round_last16(a, b, c, d, nk) \
|
||||
g2_16(b, RY); \
|
||||
\
|
||||
g1_16(a, RX); \
|
||||
\
|
||||
encrypt_round_end16(a, b, c, d, nk);
|
||||
|
||||
#define decrypt_round_end16(a, b, c, d, nk) \
|
||||
vpaddd RY0, RX0, RX0; \
|
||||
vpaddd RX0, RY0, RY0; \
|
||||
vpbroadcastd nk(RK,RROUND,8), RT0; \
|
||||
vpaddd RT0, RX0, RX0; \
|
||||
vpbroadcastd 4+nk(RK,RROUND,8), RT0; \
|
||||
vpaddd RT0, RY0, RY0; \
|
||||
\
|
||||
vpxor RX0, c ## 0, c ## 0; \
|
||||
\
|
||||
vpxor RY0, d ## 0, d ## 0; \
|
||||
vpsrld $1, d ## 0, RT0; \
|
||||
vpslld $31, d ## 0, d ## 0; \
|
||||
vpor RT0, d ## 0, d ## 0; \
|
||||
\
|
||||
vpaddd RY1, RX1, RX1; \
|
||||
vpaddd RX1, RY1, RY1; \
|
||||
vpbroadcastd nk(RK,RROUND,8), RT0; \
|
||||
vpaddd RT0, RX1, RX1; \
|
||||
vpbroadcastd 4+nk(RK,RROUND,8), RT0; \
|
||||
vpaddd RT0, RY1, RY1; \
|
||||
\
|
||||
vpxor RX1, c ## 1, c ## 1; \
|
||||
\
|
||||
vpxor RY1, d ## 1, d ## 1; \
|
||||
vpsrld $1, d ## 1, RT0; \
|
||||
vpslld $31, d ## 1, d ## 1; \
|
||||
vpor RT0, d ## 1, d ## 1;
|
||||
|
||||
#define decrypt_round16(a, b, c, d, nk) \
|
||||
g1_16(a, RX); \
|
||||
\
|
||||
vpslld $1, a ## 0, RT0; \
|
||||
vpsrld $31, a ## 0, a ## 0; \
|
||||
vpor RT0, a ## 0, a ## 0; \
|
||||
\
|
||||
vpslld $1, a ## 1, RT0; \
|
||||
vpsrld $31, a ## 1, a ## 1; \
|
||||
vpor RT0, a ## 1, a ## 1; \
|
||||
\
|
||||
g2_16(b, RY); \
|
||||
\
|
||||
decrypt_round_end16(a, b, c, d, nk);
|
||||
|
||||
#define decrypt_round_first16(a, b, c, d, nk) \
|
||||
vpslld $1, c ## 0, RT0; \
|
||||
vpsrld $31, c ## 0, c ## 0; \
|
||||
vpor RT0, c ## 0, c ## 0; \
|
||||
\
|
||||
vpslld $1, c ## 1, RT0; \
|
||||
vpsrld $31, c ## 1, c ## 1; \
|
||||
vpor RT0, c ## 1, c ## 1; \
|
||||
\
|
||||
decrypt_round16(a, b, c, d, nk)
|
||||
|
||||
#define decrypt_round_last16(a, b, c, d, nk) \
|
||||
g1_16(a, RX); \
|
||||
\
|
||||
g2_16(b, RY); \
|
||||
\
|
||||
decrypt_round_end16(a, b, c, d, nk);
|
||||
|
||||
#define encrypt_cycle16() \
|
||||
encrypt_round16(RA, RB, RC, RD, 0); \
|
||||
encrypt_round16(RC, RD, RA, RB, 8);
|
||||
|
||||
#define encrypt_cycle_first16() \
|
||||
encrypt_round_first16(RA, RB, RC, RD, 0); \
|
||||
encrypt_round16(RC, RD, RA, RB, 8);
|
||||
|
||||
#define encrypt_cycle_last16() \
|
||||
encrypt_round16(RA, RB, RC, RD, 0); \
|
||||
encrypt_round_last16(RC, RD, RA, RB, 8);
|
||||
|
||||
#define decrypt_cycle16(n) \
|
||||
decrypt_round16(RC, RD, RA, RB, 8); \
|
||||
decrypt_round16(RA, RB, RC, RD, 0);
|
||||
|
||||
#define decrypt_cycle_first16(n) \
|
||||
decrypt_round_first16(RC, RD, RA, RB, 8); \
|
||||
decrypt_round16(RA, RB, RC, RD, 0);
|
||||
|
||||
#define decrypt_cycle_last16(n) \
|
||||
decrypt_round16(RC, RD, RA, RB, 8); \
|
||||
decrypt_round_last16(RA, RB, RC, RD, 0);
|
||||
|
||||
#define transpose_4x4(x0,x1,x2,x3,t1,t2) \
|
||||
vpunpckhdq x1, x0, t2; \
|
||||
vpunpckldq x1, x0, x0; \
|
||||
\
|
||||
vpunpckldq x3, x2, t1; \
|
||||
vpunpckhdq x3, x2, x2; \
|
||||
\
|
||||
vpunpckhqdq t1, x0, x1; \
|
||||
vpunpcklqdq t1, x0, x0; \
|
||||
\
|
||||
vpunpckhqdq x2, t2, x3; \
|
||||
vpunpcklqdq x2, t2, x2;
|
||||
|
||||
#define read_blocks8(offs,a,b,c,d) \
|
||||
transpose_4x4(a, b, c, d, RX0, RY0);
|
||||
|
||||
#define write_blocks8(offs,a,b,c,d) \
|
||||
transpose_4x4(a, b, c, d, RX0, RY0);
|
||||
|
||||
#define inpack_enc8(a,b,c,d) \
|
||||
vpbroadcastd 4*0(RW), RT0; \
|
||||
vpxor RT0, a, a; \
|
||||
\
|
||||
vpbroadcastd 4*1(RW), RT0; \
|
||||
vpxor RT0, b, b; \
|
||||
\
|
||||
vpbroadcastd 4*2(RW), RT0; \
|
||||
vpxor RT0, c, c; \
|
||||
\
|
||||
vpbroadcastd 4*3(RW), RT0; \
|
||||
vpxor RT0, d, d;
|
||||
|
||||
#define outunpack_enc8(a,b,c,d) \
|
||||
vpbroadcastd 4*4(RW), RX0; \
|
||||
vpbroadcastd 4*5(RW), RY0; \
|
||||
vpxor RX0, c, RX0; \
|
||||
vpxor RY0, d, RY0; \
|
||||
\
|
||||
vpbroadcastd 4*6(RW), RT0; \
|
||||
vpxor RT0, a, c; \
|
||||
vpbroadcastd 4*7(RW), RT0; \
|
||||
vpxor RT0, b, d; \
|
||||
\
|
||||
vmovdqa RX0, a; \
|
||||
vmovdqa RY0, b;
|
||||
|
||||
#define inpack_dec8(a,b,c,d) \
|
||||
vpbroadcastd 4*4(RW), RX0; \
|
||||
vpbroadcastd 4*5(RW), RY0; \
|
||||
vpxor RX0, a, RX0; \
|
||||
vpxor RY0, b, RY0; \
|
||||
\
|
||||
vpbroadcastd 4*6(RW), RT0; \
|
||||
vpxor RT0, c, a; \
|
||||
vpbroadcastd 4*7(RW), RT0; \
|
||||
vpxor RT0, d, b; \
|
||||
\
|
||||
vmovdqa RX0, c; \
|
||||
vmovdqa RY0, d;
|
||||
|
||||
#define outunpack_dec8(a,b,c,d) \
|
||||
vpbroadcastd 4*0(RW), RT0; \
|
||||
vpxor RT0, a, a; \
|
||||
\
|
||||
vpbroadcastd 4*1(RW), RT0; \
|
||||
vpxor RT0, b, b; \
|
||||
\
|
||||
vpbroadcastd 4*2(RW), RT0; \
|
||||
vpxor RT0, c, c; \
|
||||
\
|
||||
vpbroadcastd 4*3(RW), RT0; \
|
||||
vpxor RT0, d, d;
|
||||
|
||||
#define read_blocks16(a,b,c,d) \
|
||||
read_blocks8(0, a ## 0, b ## 0, c ## 0, d ## 0); \
|
||||
read_blocks8(8, a ## 1, b ## 1, c ## 1, d ## 1);
|
||||
|
||||
#define write_blocks16(a,b,c,d) \
|
||||
write_blocks8(0, a ## 0, b ## 0, c ## 0, d ## 0); \
|
||||
write_blocks8(8, a ## 1, b ## 1, c ## 1, d ## 1);
|
||||
|
||||
#define xor_blocks16(a,b,c,d) \
|
||||
xor_blocks8(0, a ## 0, b ## 0, c ## 0, d ## 0); \
|
||||
xor_blocks8(8, a ## 1, b ## 1, c ## 1, d ## 1);
|
||||
|
||||
#define inpack_enc16(a,b,c,d) \
|
||||
inpack_enc8(a ## 0, b ## 0, c ## 0, d ## 0); \
|
||||
inpack_enc8(a ## 1, b ## 1, c ## 1, d ## 1);
|
||||
|
||||
#define outunpack_enc16(a,b,c,d) \
|
||||
outunpack_enc8(a ## 0, b ## 0, c ## 0, d ## 0); \
|
||||
outunpack_enc8(a ## 1, b ## 1, c ## 1, d ## 1);
|
||||
|
||||
#define inpack_dec16(a,b,c,d) \
|
||||
inpack_dec8(a ## 0, b ## 0, c ## 0, d ## 0); \
|
||||
inpack_dec8(a ## 1, b ## 1, c ## 1, d ## 1);
|
||||
|
||||
#define outunpack_dec16(a,b,c,d) \
|
||||
outunpack_dec8(a ## 0, b ## 0, c ## 0, d ## 0); \
|
||||
outunpack_dec8(a ## 1, b ## 1, c ## 1, d ## 1);
|
||||
|
||||
.align 8
|
||||
__twofish_enc_blk16:
|
||||
/* input:
|
||||
* %rdi: ctx, CTX
|
||||
* RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: plaintext
|
||||
* output:
|
||||
* RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: ciphertext
|
||||
*/
|
||||
init_round_constants();
|
||||
|
||||
read_blocks16(RA, RB, RC, RD);
|
||||
inpack_enc16(RA, RB, RC, RD);
|
||||
|
||||
xorl RROUNDd, RROUNDd;
|
||||
encrypt_cycle_first16();
|
||||
movl $2, RROUNDd;
|
||||
|
||||
.align 4
|
||||
.L__enc_loop:
|
||||
encrypt_cycle16();
|
||||
|
||||
addl $2, RROUNDd;
|
||||
cmpl $14, RROUNDd;
|
||||
jne .L__enc_loop;
|
||||
|
||||
encrypt_cycle_last16();
|
||||
|
||||
outunpack_enc16(RA, RB, RC, RD);
|
||||
write_blocks16(RA, RB, RC, RD);
|
||||
|
||||
ret;
|
||||
ENDPROC(__twofish_enc_blk16)
|
||||
|
||||
.align 8
|
||||
__twofish_dec_blk16:
|
||||
/* input:
|
||||
* %rdi: ctx, CTX
|
||||
* RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: ciphertext
|
||||
* output:
|
||||
* RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: plaintext
|
||||
*/
|
||||
init_round_constants();
|
||||
|
||||
read_blocks16(RA, RB, RC, RD);
|
||||
inpack_dec16(RA, RB, RC, RD);
|
||||
|
||||
movl $14, RROUNDd;
|
||||
decrypt_cycle_first16();
|
||||
movl $12, RROUNDd;
|
||||
|
||||
.align 4
|
||||
.L__dec_loop:
|
||||
decrypt_cycle16();
|
||||
|
||||
addl $-2, RROUNDd;
|
||||
jnz .L__dec_loop;
|
||||
|
||||
decrypt_cycle_last16();
|
||||
|
||||
outunpack_dec16(RA, RB, RC, RD);
|
||||
write_blocks16(RA, RB, RC, RD);
|
||||
|
||||
ret;
|
||||
ENDPROC(__twofish_dec_blk16)
|
||||
|
||||
ENTRY(twofish_ecb_enc_16way)
|
||||
/* input:
|
||||
* %rdi: ctx, CTX
|
||||
* %rsi: dst
|
||||
* %rdx: src
|
||||
*/
|
||||
|
||||
vzeroupper;
|
||||
pushq %r12;
|
||||
|
||||
load_16way(%rdx, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);
|
||||
|
||||
call __twofish_enc_blk16;
|
||||
|
||||
store_16way(%rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);
|
||||
|
||||
popq %r12;
|
||||
vzeroupper;
|
||||
|
||||
ret;
|
||||
ENDPROC(twofish_ecb_enc_16way)
|
||||
|
||||
ENTRY(twofish_ecb_dec_16way)
|
||||
/* input:
|
||||
* %rdi: ctx, CTX
|
||||
* %rsi: dst
|
||||
* %rdx: src
|
||||
*/
|
||||
|
||||
vzeroupper;
|
||||
pushq %r12;
|
||||
|
||||
load_16way(%rdx, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);
|
||||
|
||||
call __twofish_dec_blk16;
|
||||
|
||||
store_16way(%rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);
|
||||
|
||||
popq %r12;
|
||||
vzeroupper;
|
||||
|
||||
ret;
|
||||
ENDPROC(twofish_ecb_dec_16way)
|
||||
|
||||
ENTRY(twofish_cbc_dec_16way)
|
||||
/* input:
|
||||
* %rdi: ctx, CTX
|
||||
* %rsi: dst
|
||||
* %rdx: src
|
||||
*/
|
||||
|
||||
vzeroupper;
|
||||
pushq %r12;
|
||||
|
||||
load_16way(%rdx, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);
|
||||
|
||||
call __twofish_dec_blk16;
|
||||
|
||||
store_cbc_16way(%rdx, %rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1,
|
||||
RX0);
|
||||
|
||||
popq %r12;
|
||||
vzeroupper;
|
||||
|
||||
ret;
|
||||
ENDPROC(twofish_cbc_dec_16way)
|
||||
|
||||
ENTRY(twofish_ctr_16way)
|
||||
/* input:
|
||||
* %rdi: ctx, CTX
|
||||
* %rsi: dst (16 blocks)
|
||||
* %rdx: src (16 blocks)
|
||||
* %rcx: iv (little endian, 128bit)
|
||||
*/
|
||||
|
||||
vzeroupper;
|
||||
pushq %r12;
|
||||
|
||||
load_ctr_16way(%rcx, .Lbswap128_mask, RA0, RB0, RC0, RD0, RA1, RB1, RC1,
|
||||
RD1, RX0, RX0x, RX1, RX1x, RY0, RY0x, RY1, RY1x, RNOT,
|
||||
RBYTE);
|
||||
|
||||
call __twofish_enc_blk16;
|
||||
|
||||
store_ctr_16way(%rdx, %rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);
|
||||
|
||||
popq %r12;
|
||||
vzeroupper;
|
||||
|
||||
ret;
|
||||
ENDPROC(twofish_ctr_16way)
|
||||
|
||||
.align 8
|
||||
twofish_xts_crypt_16way:
|
||||
/* input:
|
||||
* %rdi: ctx, CTX
|
||||
* %rsi: dst (16 blocks)
|
||||
* %rdx: src (16 blocks)
|
||||
* %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
|
||||
* %r8: pointer to __twofish_enc_blk16 or __twofish_dec_blk16
|
||||
*/
|
||||
|
||||
vzeroupper;
|
||||
pushq %r12;
|
||||
|
||||
load_xts_16way(%rcx, %rdx, %rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1,
|
||||
RD1, RX0, RX0x, RX1, RX1x, RY0, RY0x, RY1, RY1x, RNOT,
|
||||
.Lxts_gf128mul_and_shl1_mask_0,
|
||||
.Lxts_gf128mul_and_shl1_mask_1);
|
||||
|
||||
call *%r8;
|
||||
|
||||
store_xts_16way(%rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);
|
||||
|
||||
popq %r12;
|
||||
vzeroupper;
|
||||
|
||||
ret;
|
||||
ENDPROC(twofish_xts_crypt_16way)
|
||||
|
||||
ENTRY(twofish_xts_enc_16way)
|
||||
/* input:
|
||||
* %rdi: ctx, CTX
|
||||
* %rsi: dst (16 blocks)
|
||||
* %rdx: src (16 blocks)
|
||||
* %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
|
||||
*/
|
||||
leaq __twofish_enc_blk16, %r8;
|
||||
jmp twofish_xts_crypt_16way;
|
||||
ENDPROC(twofish_xts_enc_16way)
|
||||
|
||||
ENTRY(twofish_xts_dec_16way)
|
||||
/* input:
|
||||
* %rdi: ctx, CTX
|
||||
* %rsi: dst (16 blocks)
|
||||
* %rdx: src (16 blocks)
|
||||
* %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
|
||||
*/
|
||||
leaq __twofish_dec_blk16, %r8;
|
||||
jmp twofish_xts_crypt_16way;
|
||||
ENDPROC(twofish_xts_dec_16way)
|
584
arch/x86/crypto/twofish_avx2_glue.c
Normal file
584
arch/x86/crypto/twofish_avx2_glue.c
Normal file
@ -0,0 +1,584 @@
|
||||
/*
|
||||
* Glue Code for x86_64/AVX2 assembler optimized version of Twofish
|
||||
*
|
||||
* Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/crypto.h>
|
||||
#include <linux/err.h>
|
||||
#include <crypto/algapi.h>
|
||||
#include <crypto/ctr.h>
|
||||
#include <crypto/twofish.h>
|
||||
#include <crypto/lrw.h>
|
||||
#include <crypto/xts.h>
|
||||
#include <asm/xcr.h>
|
||||
#include <asm/xsave.h>
|
||||
#include <asm/crypto/twofish.h>
|
||||
#include <asm/crypto/ablk_helper.h>
|
||||
#include <asm/crypto/glue_helper.h>
|
||||
#include <crypto/scatterwalk.h>
|
||||
|
||||
#define TF_AVX2_PARALLEL_BLOCKS 16
|
||||
|
||||
/* 16-way AVX2 parallel cipher functions */
|
||||
asmlinkage void twofish_ecb_enc_16way(struct twofish_ctx *ctx, u8 *dst,
|
||||
const u8 *src);
|
||||
asmlinkage void twofish_ecb_dec_16way(struct twofish_ctx *ctx, u8 *dst,
|
||||
const u8 *src);
|
||||
asmlinkage void twofish_cbc_dec_16way(void *ctx, u128 *dst, const u128 *src);
|
||||
|
||||
asmlinkage void twofish_ctr_16way(void *ctx, u128 *dst, const u128 *src,
|
||||
le128 *iv);
|
||||
|
||||
asmlinkage void twofish_xts_enc_16way(struct twofish_ctx *ctx, u8 *dst,
|
||||
const u8 *src, le128 *iv);
|
||||
asmlinkage void twofish_xts_dec_16way(struct twofish_ctx *ctx, u8 *dst,
|
||||
const u8 *src, le128 *iv);
|
||||
|
||||
static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst,
|
||||
const u8 *src)
|
||||
{
|
||||
__twofish_enc_blk_3way(ctx, dst, src, false);
|
||||
}
|
||||
|
||||
static const struct common_glue_ctx twofish_enc = {
|
||||
.num_funcs = 4,
|
||||
.fpu_blocks_limit = 8,
|
||||
|
||||
.funcs = { {
|
||||
.num_blocks = 16,
|
||||
.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_enc_16way) }
|
||||
}, {
|
||||
.num_blocks = 8,
|
||||
.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_enc_8way) }
|
||||
}, {
|
||||
.num_blocks = 3,
|
||||
.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_3way) }
|
||||
}, {
|
||||
.num_blocks = 1,
|
||||
.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk) }
|
||||
} }
|
||||
};
|
||||
|
||||
static const struct common_glue_ctx twofish_ctr = {
|
||||
.num_funcs = 4,
|
||||
.fpu_blocks_limit = 8,
|
||||
|
||||
.funcs = { {
|
||||
.num_blocks = 16,
|
||||
.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_ctr_16way) }
|
||||
}, {
|
||||
.num_blocks = 8,
|
||||
.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_ctr_8way) }
|
||||
}, {
|
||||
.num_blocks = 3,
|
||||
.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr_3way) }
|
||||
}, {
|
||||
.num_blocks = 1,
|
||||
.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr) }
|
||||
} }
|
||||
};
|
||||
|
||||
static const struct common_glue_ctx twofish_enc_xts = {
|
||||
.num_funcs = 3,
|
||||
.fpu_blocks_limit = 8,
|
||||
|
||||
.funcs = { {
|
||||
.num_blocks = 16,
|
||||
.fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_enc_16way) }
|
||||
}, {
|
||||
.num_blocks = 8,
|
||||
.fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_enc_8way) }
|
||||
}, {
|
||||
.num_blocks = 1,
|
||||
.fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_enc) }
|
||||
} }
|
||||
};
|
||||
|
||||
static const struct common_glue_ctx twofish_dec = {
|
||||
.num_funcs = 4,
|
||||
.fpu_blocks_limit = 8,
|
||||
|
||||
.funcs = { {
|
||||
.num_blocks = 16,
|
||||
.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_dec_16way) }
|
||||
}, {
|
||||
.num_blocks = 8,
|
||||
.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_dec_8way) }
|
||||
}, {
|
||||
.num_blocks = 3,
|
||||
.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk_3way) }
|
||||
}, {
|
||||
.num_blocks = 1,
|
||||
.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk) }
|
||||
} }
|
||||
};
|
||||
|
||||
static const struct common_glue_ctx twofish_dec_cbc = {
|
||||
.num_funcs = 4,
|
||||
.fpu_blocks_limit = 8,
|
||||
|
||||
.funcs = { {
|
||||
.num_blocks = 16,
|
||||
.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_cbc_dec_16way) }
|
||||
}, {
|
||||
.num_blocks = 8,
|
||||
.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_cbc_dec_8way) }
|
||||
}, {
|
||||
.num_blocks = 3,
|
||||
.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk_cbc_3way) }
|
||||
}, {
|
||||
.num_blocks = 1,
|
||||
.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk) }
|
||||
} }
|
||||
};
|
||||
|
||||
static const struct common_glue_ctx twofish_dec_xts = {
|
||||
.num_funcs = 3,
|
||||
.fpu_blocks_limit = 8,
|
||||
|
||||
.funcs = { {
|
||||
.num_blocks = 16,
|
||||
.fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_dec_16way) }
|
||||
}, {
|
||||
.num_blocks = 8,
|
||||
.fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_dec_8way) }
|
||||
}, {
|
||||
.num_blocks = 1,
|
||||
.fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_dec) }
|
||||
} }
|
||||
};
|
||||
|
||||
static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
|
||||
struct scatterlist *src, unsigned int nbytes)
|
||||
{
|
||||
return glue_ecb_crypt_128bit(&twofish_enc, desc, dst, src, nbytes);
|
||||
}
|
||||
|
||||
static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
|
||||
struct scatterlist *src, unsigned int nbytes)
|
||||
{
|
||||
return glue_ecb_crypt_128bit(&twofish_dec, desc, dst, src, nbytes);
|
||||
}
|
||||
|
||||
static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
|
||||
struct scatterlist *src, unsigned int nbytes)
|
||||
{
|
||||
return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(twofish_enc_blk), desc,
|
||||
dst, src, nbytes);
|
||||
}
|
||||
|
||||
static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
|
||||
struct scatterlist *src, unsigned int nbytes)
|
||||
{
|
||||
return glue_cbc_decrypt_128bit(&twofish_dec_cbc, desc, dst, src,
|
||||
nbytes);
|
||||
}
|
||||
|
||||
static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
|
||||
struct scatterlist *src, unsigned int nbytes)
|
||||
{
|
||||
return glue_ctr_crypt_128bit(&twofish_ctr, desc, dst, src, nbytes);
|
||||
}
|
||||
|
||||
static inline bool twofish_fpu_begin(bool fpu_enabled, unsigned int nbytes)
|
||||
{
|
||||
/* since reusing AVX functions, starts using FPU at 8 parallel blocks */
|
||||
return glue_fpu_begin(TF_BLOCK_SIZE, 8, NULL, fpu_enabled, nbytes);
|
||||
}
|
||||
|
||||
static inline void twofish_fpu_end(bool fpu_enabled)
|
||||
{
|
||||
glue_fpu_end(fpu_enabled);
|
||||
}
|
||||
|
||||
struct crypt_priv {
|
||||
struct twofish_ctx *ctx;
|
||||
bool fpu_enabled;
|
||||
};
|
||||
|
||||
static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
|
||||
{
|
||||
const unsigned int bsize = TF_BLOCK_SIZE;
|
||||
struct crypt_priv *ctx = priv;
|
||||
int i;
|
||||
|
||||
ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes);
|
||||
|
||||
while (nbytes >= TF_AVX2_PARALLEL_BLOCKS * bsize) {
|
||||
twofish_ecb_enc_16way(ctx->ctx, srcdst, srcdst);
|
||||
srcdst += bsize * TF_AVX2_PARALLEL_BLOCKS;
|
||||
nbytes -= bsize * TF_AVX2_PARALLEL_BLOCKS;
|
||||
}
|
||||
|
||||
while (nbytes >= 8 * bsize) {
|
||||
twofish_ecb_enc_8way(ctx->ctx, srcdst, srcdst);
|
||||
srcdst += bsize * 8;
|
||||
nbytes -= bsize * 8;
|
||||
}
|
||||
|
||||
while (nbytes >= 3 * bsize) {
|
||||
twofish_enc_blk_3way(ctx->ctx, srcdst, srcdst);
|
||||
srcdst += bsize * 3;
|
||||
nbytes -= bsize * 3;
|
||||
}
|
||||
|
||||
for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
|
||||
twofish_enc_blk(ctx->ctx, srcdst, srcdst);
|
||||
}
|
||||
|
||||
static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
|
||||
{
|
||||
const unsigned int bsize = TF_BLOCK_SIZE;
|
||||
struct crypt_priv *ctx = priv;
|
||||
int i;
|
||||
|
||||
ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes);
|
||||
|
||||
while (nbytes >= TF_AVX2_PARALLEL_BLOCKS * bsize) {
|
||||
twofish_ecb_dec_16way(ctx->ctx, srcdst, srcdst);
|
||||
srcdst += bsize * TF_AVX2_PARALLEL_BLOCKS;
|
||||
nbytes -= bsize * TF_AVX2_PARALLEL_BLOCKS;
|
||||
}
|
||||
|
||||
while (nbytes >= 8 * bsize) {
|
||||
twofish_ecb_dec_8way(ctx->ctx, srcdst, srcdst);
|
||||
srcdst += bsize * 8;
|
||||
nbytes -= bsize * 8;
|
||||
}
|
||||
|
||||
while (nbytes >= 3 * bsize) {
|
||||
twofish_dec_blk_3way(ctx->ctx, srcdst, srcdst);
|
||||
srcdst += bsize * 3;
|
||||
nbytes -= bsize * 3;
|
||||
}
|
||||
|
||||
for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
|
||||
twofish_dec_blk(ctx->ctx, srcdst, srcdst);
|
||||
}
|
||||
|
||||
static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
|
||||
struct scatterlist *src, unsigned int nbytes)
|
||||
{
|
||||
struct twofish_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
|
||||
be128 buf[TF_AVX2_PARALLEL_BLOCKS];
|
||||
struct crypt_priv crypt_ctx = {
|
||||
.ctx = &ctx->twofish_ctx,
|
||||
.fpu_enabled = false,
|
||||
};
|
||||
struct lrw_crypt_req req = {
|
||||
.tbuf = buf,
|
||||
.tbuflen = sizeof(buf),
|
||||
|
||||
.table_ctx = &ctx->lrw_table,
|
||||
.crypt_ctx = &crypt_ctx,
|
||||
.crypt_fn = encrypt_callback,
|
||||
};
|
||||
int ret;
|
||||
|
||||
desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
|
||||
ret = lrw_crypt(desc, dst, src, nbytes, &req);
|
||||
twofish_fpu_end(crypt_ctx.fpu_enabled);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
|
||||
struct scatterlist *src, unsigned int nbytes)
|
||||
{
|
||||
struct twofish_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
|
||||
be128 buf[TF_AVX2_PARALLEL_BLOCKS];
|
||||
struct crypt_priv crypt_ctx = {
|
||||
.ctx = &ctx->twofish_ctx,
|
||||
.fpu_enabled = false,
|
||||
};
|
||||
struct lrw_crypt_req req = {
|
||||
.tbuf = buf,
|
||||
.tbuflen = sizeof(buf),
|
||||
|
||||
.table_ctx = &ctx->lrw_table,
|
||||
.crypt_ctx = &crypt_ctx,
|
||||
.crypt_fn = decrypt_callback,
|
||||
};
|
||||
int ret;
|
||||
|
||||
desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
|
||||
ret = lrw_crypt(desc, dst, src, nbytes, &req);
|
||||
twofish_fpu_end(crypt_ctx.fpu_enabled);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
|
||||
struct scatterlist *src, unsigned int nbytes)
|
||||
{
|
||||
struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
|
||||
|
||||
return glue_xts_crypt_128bit(&twofish_enc_xts, desc, dst, src, nbytes,
|
||||
XTS_TWEAK_CAST(twofish_enc_blk),
|
||||
&ctx->tweak_ctx, &ctx->crypt_ctx);
|
||||
}
|
||||
|
||||
static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
|
||||
struct scatterlist *src, unsigned int nbytes)
|
||||
{
|
||||
struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
|
||||
|
||||
return glue_xts_crypt_128bit(&twofish_dec_xts, desc, dst, src, nbytes,
|
||||
XTS_TWEAK_CAST(twofish_enc_blk),
|
||||
&ctx->tweak_ctx, &ctx->crypt_ctx);
|
||||
}
|
||||
|
||||
static struct crypto_alg tf_algs[10] = { {
|
||||
.cra_name = "__ecb-twofish-avx2",
|
||||
.cra_driver_name = "__driver-ecb-twofish-avx2",
|
||||
.cra_priority = 0,
|
||||
.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
|
||||
.cra_blocksize = TF_BLOCK_SIZE,
|
||||
.cra_ctxsize = sizeof(struct twofish_ctx),
|
||||
.cra_alignmask = 0,
|
||||
.cra_type = &crypto_blkcipher_type,
|
||||
.cra_module = THIS_MODULE,
|
||||
.cra_u = {
|
||||
.blkcipher = {
|
||||
.min_keysize = TF_MIN_KEY_SIZE,
|
||||
.max_keysize = TF_MAX_KEY_SIZE,
|
||||
.setkey = twofish_setkey,
|
||||
.encrypt = ecb_encrypt,
|
||||
.decrypt = ecb_decrypt,
|
||||
},
|
||||
},
|
||||
}, {
|
||||
.cra_name = "__cbc-twofish-avx2",
|
||||
.cra_driver_name = "__driver-cbc-twofish-avx2",
|
||||
.cra_priority = 0,
|
||||
.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
|
||||
.cra_blocksize = TF_BLOCK_SIZE,
|
||||
.cra_ctxsize = sizeof(struct twofish_ctx),
|
||||
.cra_alignmask = 0,
|
||||
.cra_type = &crypto_blkcipher_type,
|
||||
.cra_module = THIS_MODULE,
|
||||
.cra_u = {
|
||||
.blkcipher = {
|
||||
.min_keysize = TF_MIN_KEY_SIZE,
|
||||
.max_keysize = TF_MAX_KEY_SIZE,
|
||||
.setkey = twofish_setkey,
|
||||
.encrypt = cbc_encrypt,
|
||||
.decrypt = cbc_decrypt,
|
||||
},
|
||||
},
|
||||
}, {
|
||||
.cra_name = "__ctr-twofish-avx2",
|
||||
.cra_driver_name = "__driver-ctr-twofish-avx2",
|
||||
.cra_priority = 0,
|
||||
.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
|
||||
.cra_blocksize = 1,
|
||||
.cra_ctxsize = sizeof(struct twofish_ctx),
|
||||
.cra_alignmask = 0,
|
||||
.cra_type = &crypto_blkcipher_type,
|
||||
.cra_module = THIS_MODULE,
|
||||
.cra_u = {
|
||||
.blkcipher = {
|
||||
.min_keysize = TF_MIN_KEY_SIZE,
|
||||
.max_keysize = TF_MAX_KEY_SIZE,
|
||||
.ivsize = TF_BLOCK_SIZE,
|
||||
.setkey = twofish_setkey,
|
||||
.encrypt = ctr_crypt,
|
||||
.decrypt = ctr_crypt,
|
||||
},
|
||||
},
|
||||
}, {
|
||||
.cra_name = "__lrw-twofish-avx2",
|
||||
.cra_driver_name = "__driver-lrw-twofish-avx2",
|
||||
.cra_priority = 0,
|
||||
.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
|
||||
.cra_blocksize = TF_BLOCK_SIZE,
|
||||
.cra_ctxsize = sizeof(struct twofish_lrw_ctx),
|
||||
.cra_alignmask = 0,
|
||||
.cra_type = &crypto_blkcipher_type,
|
||||
.cra_module = THIS_MODULE,
|
||||
.cra_exit = lrw_twofish_exit_tfm,
|
||||
.cra_u = {
|
||||
.blkcipher = {
|
||||
.min_keysize = TF_MIN_KEY_SIZE +
|
||||
TF_BLOCK_SIZE,
|
||||
.max_keysize = TF_MAX_KEY_SIZE +
|
||||
TF_BLOCK_SIZE,
|
||||
.ivsize = TF_BLOCK_SIZE,
|
||||
.setkey = lrw_twofish_setkey,
|
||||
.encrypt = lrw_encrypt,
|
||||
.decrypt = lrw_decrypt,
|
||||
},
|
||||
},
|
||||
}, {
|
||||
.cra_name = "__xts-twofish-avx2",
|
||||
.cra_driver_name = "__driver-xts-twofish-avx2",
|
||||
.cra_priority = 0,
|
||||
.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
|
||||
.cra_blocksize = TF_BLOCK_SIZE,
|
||||
.cra_ctxsize = sizeof(struct twofish_xts_ctx),
|
||||
.cra_alignmask = 0,
|
||||
.cra_type = &crypto_blkcipher_type,
|
||||
.cra_module = THIS_MODULE,
|
||||
.cra_u = {
|
||||
.blkcipher = {
|
||||
.min_keysize = TF_MIN_KEY_SIZE * 2,
|
||||
.max_keysize = TF_MAX_KEY_SIZE * 2,
|
||||
.ivsize = TF_BLOCK_SIZE,
|
||||
.setkey = xts_twofish_setkey,
|
||||
.encrypt = xts_encrypt,
|
||||
.decrypt = xts_decrypt,
|
||||
},
|
||||
},
|
||||
}, {
|
||||
.cra_name = "ecb(twofish)",
|
||||
.cra_driver_name = "ecb-twofish-avx2",
|
||||
.cra_priority = 500,
|
||||
.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
|
||||
.cra_blocksize = TF_BLOCK_SIZE,
|
||||
.cra_ctxsize = sizeof(struct async_helper_ctx),
|
||||
.cra_alignmask = 0,
|
||||
.cra_type = &crypto_ablkcipher_type,
|
||||
.cra_module = THIS_MODULE,
|
||||
.cra_init = ablk_init,
|
||||
.cra_exit = ablk_exit,
|
||||
.cra_u = {
|
||||
.ablkcipher = {
|
||||
.min_keysize = TF_MIN_KEY_SIZE,
|
||||
.max_keysize = TF_MAX_KEY_SIZE,
|
||||
.setkey = ablk_set_key,
|
||||
.encrypt = ablk_encrypt,
|
||||
.decrypt = ablk_decrypt,
|
||||
},
|
||||
},
|
||||
}, {
|
||||
.cra_name = "cbc(twofish)",
|
||||
.cra_driver_name = "cbc-twofish-avx2",
|
||||
.cra_priority = 500,
|
||||
.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
|
||||
.cra_blocksize = TF_BLOCK_SIZE,
|
||||
.cra_ctxsize = sizeof(struct async_helper_ctx),
|
||||
.cra_alignmask = 0,
|
||||
.cra_type = &crypto_ablkcipher_type,
|
||||
.cra_module = THIS_MODULE,
|
||||
.cra_init = ablk_init,
|
||||
.cra_exit = ablk_exit,
|
||||
.cra_u = {
|
||||
.ablkcipher = {
|
||||
.min_keysize = TF_MIN_KEY_SIZE,
|
||||
.max_keysize = TF_MAX_KEY_SIZE,
|
||||
.ivsize = TF_BLOCK_SIZE,
|
||||
.setkey = ablk_set_key,
|
||||
.encrypt = __ablk_encrypt,
|
||||
.decrypt = ablk_decrypt,
|
||||
},
|
||||
},
|
||||
}, {
|
||||
.cra_name = "ctr(twofish)",
|
||||
.cra_driver_name = "ctr-twofish-avx2",
|
||||
.cra_priority = 500,
|
||||
.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
|
||||
.cra_blocksize = 1,
|
||||
.cra_ctxsize = sizeof(struct async_helper_ctx),
|
||||
.cra_alignmask = 0,
|
||||
.cra_type = &crypto_ablkcipher_type,
|
||||
.cra_module = THIS_MODULE,
|
||||
.cra_init = ablk_init,
|
||||
.cra_exit = ablk_exit,
|
||||
.cra_u = {
|
||||
.ablkcipher = {
|
||||
.min_keysize = TF_MIN_KEY_SIZE,
|
||||
.max_keysize = TF_MAX_KEY_SIZE,
|
||||
.ivsize = TF_BLOCK_SIZE,
|
||||
.setkey = ablk_set_key,
|
||||
.encrypt = ablk_encrypt,
|
||||
.decrypt = ablk_encrypt,
|
||||
.geniv = "chainiv",
|
||||
},
|
||||
},
|
||||
}, {
|
||||
.cra_name = "lrw(twofish)",
|
||||
.cra_driver_name = "lrw-twofish-avx2",
|
||||
.cra_priority = 500,
|
||||
.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
|
||||
.cra_blocksize = TF_BLOCK_SIZE,
|
||||
.cra_ctxsize = sizeof(struct async_helper_ctx),
|
||||
.cra_alignmask = 0,
|
||||
.cra_type = &crypto_ablkcipher_type,
|
||||
.cra_module = THIS_MODULE,
|
||||
.cra_init = ablk_init,
|
||||
.cra_exit = ablk_exit,
|
||||
.cra_u = {
|
||||
.ablkcipher = {
|
||||
.min_keysize = TF_MIN_KEY_SIZE +
|
||||
TF_BLOCK_SIZE,
|
||||
.max_keysize = TF_MAX_KEY_SIZE +
|
||||
TF_BLOCK_SIZE,
|
||||
.ivsize = TF_BLOCK_SIZE,
|
||||
.setkey = ablk_set_key,
|
||||
.encrypt = ablk_encrypt,
|
||||
.decrypt = ablk_decrypt,
|
||||
},
|
||||
},
|
||||
}, {
|
||||
.cra_name = "xts(twofish)",
|
||||
.cra_driver_name = "xts-twofish-avx2",
|
||||
.cra_priority = 500,
|
||||
.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
|
||||
.cra_blocksize = TF_BLOCK_SIZE,
|
||||
.cra_ctxsize = sizeof(struct async_helper_ctx),
|
||||
.cra_alignmask = 0,
|
||||
.cra_type = &crypto_ablkcipher_type,
|
||||
.cra_module = THIS_MODULE,
|
||||
.cra_init = ablk_init,
|
||||
.cra_exit = ablk_exit,
|
||||
.cra_u = {
|
||||
.ablkcipher = {
|
||||
.min_keysize = TF_MIN_KEY_SIZE * 2,
|
||||
.max_keysize = TF_MAX_KEY_SIZE * 2,
|
||||
.ivsize = TF_BLOCK_SIZE,
|
||||
.setkey = ablk_set_key,
|
||||
.encrypt = ablk_encrypt,
|
||||
.decrypt = ablk_decrypt,
|
||||
},
|
||||
},
|
||||
} };
|
||||
|
||||
static int __init init(void)
|
||||
{
|
||||
u64 xcr0;
|
||||
|
||||
if (!cpu_has_avx2 || !cpu_has_osxsave) {
|
||||
pr_info("AVX2 instructions are not detected.\n");
|
||||
return -ENODEV;
|
||||
}
|
||||
|
||||
xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
|
||||
if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
|
||||
pr_info("AVX2 detected but unusable.\n");
|
||||
return -ENODEV;
|
||||
}
|
||||
|
||||
return crypto_register_algs(tf_algs, ARRAY_SIZE(tf_algs));
|
||||
}
|
||||
|
||||
static void __exit fini(void)
|
||||
{
|
||||
crypto_unregister_algs(tf_algs, ARRAY_SIZE(tf_algs));
|
||||
}
|
||||
|
||||
module_init(init);
|
||||
module_exit(fini);
|
||||
|
||||
MODULE_LICENSE("GPL");
|
||||
MODULE_DESCRIPTION("Twofish Cipher Algorithm, AVX2 optimized");
|
||||
MODULE_ALIAS("twofish");
|
||||
MODULE_ALIAS("twofish-asm");
|
@ -4,6 +4,8 @@
|
||||
* Copyright (C) 2012 Johannes Goetzfried
|
||||
* <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
|
||||
*
|
||||
* Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
@ -48,13 +50,26 @@
|
||||
/* 8-way parallel cipher functions */
|
||||
asmlinkage void twofish_ecb_enc_8way(struct twofish_ctx *ctx, u8 *dst,
|
||||
const u8 *src);
|
||||
EXPORT_SYMBOL_GPL(twofish_ecb_enc_8way);
|
||||
|
||||
asmlinkage void twofish_ecb_dec_8way(struct twofish_ctx *ctx, u8 *dst,
|
||||
const u8 *src);
|
||||
EXPORT_SYMBOL_GPL(twofish_ecb_dec_8way);
|
||||
|
||||
asmlinkage void twofish_cbc_dec_8way(struct twofish_ctx *ctx, u8 *dst,
|
||||
const u8 *src);
|
||||
EXPORT_SYMBOL_GPL(twofish_cbc_dec_8way);
|
||||
|
||||
asmlinkage void twofish_ctr_8way(struct twofish_ctx *ctx, u8 *dst,
|
||||
const u8 *src, le128 *iv);
|
||||
EXPORT_SYMBOL_GPL(twofish_ctr_8way);
|
||||
|
||||
asmlinkage void twofish_xts_enc_8way(struct twofish_ctx *ctx, u8 *dst,
|
||||
const u8 *src, le128 *iv);
|
||||
EXPORT_SYMBOL_GPL(twofish_xts_enc_8way);
|
||||
asmlinkage void twofish_xts_dec_8way(struct twofish_ctx *ctx, u8 *dst,
|
||||
const u8 *src, le128 *iv);
|
||||
EXPORT_SYMBOL_GPL(twofish_xts_dec_8way);
|
||||
|
||||
static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst,
|
||||
const u8 *src)
|
||||
@ -62,6 +77,20 @@ static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst,
|
||||
__twofish_enc_blk_3way(ctx, dst, src, false);
|
||||
}
|
||||
|
||||
void twofish_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv)
|
||||
{
|
||||
glue_xts_crypt_128bit_one(ctx, dst, src, iv,
|
||||
GLUE_FUNC_CAST(twofish_enc_blk));
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(twofish_xts_enc);
|
||||
|
||||
void twofish_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv)
|
||||
{
|
||||
glue_xts_crypt_128bit_one(ctx, dst, src, iv,
|
||||
GLUE_FUNC_CAST(twofish_dec_blk));
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(twofish_xts_dec);
|
||||
|
||||
|
||||
static const struct common_glue_ctx twofish_enc = {
|
||||
.num_funcs = 3,
|
||||
@ -95,6 +124,19 @@ static const struct common_glue_ctx twofish_ctr = {
|
||||
} }
|
||||
};
|
||||
|
||||
static const struct common_glue_ctx twofish_enc_xts = {
|
||||
.num_funcs = 2,
|
||||
.fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS,
|
||||
|
||||
.funcs = { {
|
||||
.num_blocks = TWOFISH_PARALLEL_BLOCKS,
|
||||
.fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_enc_8way) }
|
||||
}, {
|
||||
.num_blocks = 1,
|
||||
.fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_enc) }
|
||||
} }
|
||||
};
|
||||
|
||||
static const struct common_glue_ctx twofish_dec = {
|
||||
.num_funcs = 3,
|
||||
.fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS,
|
||||
@ -127,6 +169,19 @@ static const struct common_glue_ctx twofish_dec_cbc = {
|
||||
} }
|
||||
};
|
||||
|
||||
static const struct common_glue_ctx twofish_dec_xts = {
|
||||
.num_funcs = 2,
|
||||
.fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS,
|
||||
|
||||
.funcs = { {
|
||||
.num_blocks = TWOFISH_PARALLEL_BLOCKS,
|
||||
.fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_dec_8way) }
|
||||
}, {
|
||||
.num_blocks = 1,
|
||||
.fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_dec) }
|
||||
} }
|
||||
};
|
||||
|
||||
static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
|
||||
struct scatterlist *src, unsigned int nbytes)
|
||||
{
|
||||
@ -275,54 +330,20 @@ static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
|
||||
struct scatterlist *src, unsigned int nbytes)
|
||||
{
|
||||
struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
|
||||
be128 buf[TWOFISH_PARALLEL_BLOCKS];
|
||||
struct crypt_priv crypt_ctx = {
|
||||
.ctx = &ctx->crypt_ctx,
|
||||
.fpu_enabled = false,
|
||||
};
|
||||
struct xts_crypt_req req = {
|
||||
.tbuf = buf,
|
||||
.tbuflen = sizeof(buf),
|
||||
|
||||
.tweak_ctx = &ctx->tweak_ctx,
|
||||
.tweak_fn = XTS_TWEAK_CAST(twofish_enc_blk),
|
||||
.crypt_ctx = &crypt_ctx,
|
||||
.crypt_fn = encrypt_callback,
|
||||
};
|
||||
int ret;
|
||||
|
||||
desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
|
||||
ret = xts_crypt(desc, dst, src, nbytes, &req);
|
||||
twofish_fpu_end(crypt_ctx.fpu_enabled);
|
||||
|
||||
return ret;
|
||||
return glue_xts_crypt_128bit(&twofish_enc_xts, desc, dst, src, nbytes,
|
||||
XTS_TWEAK_CAST(twofish_enc_blk),
|
||||
&ctx->tweak_ctx, &ctx->crypt_ctx);
|
||||
}
|
||||
|
||||
static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
|
||||
struct scatterlist *src, unsigned int nbytes)
|
||||
{
|
||||
struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
|
||||
be128 buf[TWOFISH_PARALLEL_BLOCKS];
|
||||
struct crypt_priv crypt_ctx = {
|
||||
.ctx = &ctx->crypt_ctx,
|
||||
.fpu_enabled = false,
|
||||
};
|
||||
struct xts_crypt_req req = {
|
||||
.tbuf = buf,
|
||||
.tbuflen = sizeof(buf),
|
||||
|
||||
.tweak_ctx = &ctx->tweak_ctx,
|
||||
.tweak_fn = XTS_TWEAK_CAST(twofish_enc_blk),
|
||||
.crypt_ctx = &crypt_ctx,
|
||||
.crypt_fn = decrypt_callback,
|
||||
};
|
||||
int ret;
|
||||
|
||||
desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
|
||||
ret = xts_crypt(desc, dst, src, nbytes, &req);
|
||||
twofish_fpu_end(crypt_ctx.fpu_enabled);
|
||||
|
||||
return ret;
|
||||
return glue_xts_crypt_128bit(&twofish_dec_xts, desc, dst, src, nbytes,
|
||||
XTS_TWEAK_CAST(twofish_enc_blk),
|
||||
&ctx->tweak_ctx, &ctx->crypt_ctx);
|
||||
}
|
||||
|
||||
static struct crypto_alg twofish_algs[10] = { {
|
||||
|
@ -293,6 +293,7 @@ extern const char * const x86_power_flags[32];
|
||||
#define cpu_has_ssse3 boot_cpu_has(X86_FEATURE_SSSE3)
|
||||
#define cpu_has_aes boot_cpu_has(X86_FEATURE_AES)
|
||||
#define cpu_has_avx boot_cpu_has(X86_FEATURE_AVX)
|
||||
#define cpu_has_avx2 boot_cpu_has(X86_FEATURE_AVX2)
|
||||
#define cpu_has_ht boot_cpu_has(X86_FEATURE_HT)
|
||||
#define cpu_has_mp boot_cpu_has(X86_FEATURE_MP)
|
||||
#define cpu_has_nx boot_cpu_has(X86_FEATURE_NX)
|
||||
|
43
arch/x86/include/asm/crypto/blowfish.h
Normal file
43
arch/x86/include/asm/crypto/blowfish.h
Normal file
@ -0,0 +1,43 @@
|
||||
#ifndef ASM_X86_BLOWFISH_H
|
||||
#define ASM_X86_BLOWFISH_H
|
||||
|
||||
#include <linux/crypto.h>
|
||||
#include <crypto/blowfish.h>
|
||||
|
||||
#define BF_PARALLEL_BLOCKS 4
|
||||
|
||||
/* regular block cipher functions */
|
||||
asmlinkage void __blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src,
|
||||
bool xor);
|
||||
asmlinkage void blowfish_dec_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src);
|
||||
|
||||
/* 4-way parallel cipher functions */
|
||||
asmlinkage void __blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst,
|
||||
const u8 *src, bool xor);
|
||||
asmlinkage void blowfish_dec_blk_4way(struct bf_ctx *ctx, u8 *dst,
|
||||
const u8 *src);
|
||||
|
||||
static inline void blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src)
|
||||
{
|
||||
__blowfish_enc_blk(ctx, dst, src, false);
|
||||
}
|
||||
|
||||
static inline void blowfish_enc_blk_xor(struct bf_ctx *ctx, u8 *dst,
|
||||
const u8 *src)
|
||||
{
|
||||
__blowfish_enc_blk(ctx, dst, src, true);
|
||||
}
|
||||
|
||||
static inline void blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst,
|
||||
const u8 *src)
|
||||
{
|
||||
__blowfish_enc_blk_4way(ctx, dst, src, false);
|
||||
}
|
||||
|
||||
static inline void blowfish_enc_blk_xor_4way(struct bf_ctx *ctx, u8 *dst,
|
||||
const u8 *src)
|
||||
{
|
||||
__blowfish_enc_blk_4way(ctx, dst, src, true);
|
||||
}
|
||||
|
||||
#endif
|
@ -48,6 +48,22 @@ asmlinkage void __camellia_enc_blk_2way(struct camellia_ctx *ctx, u8 *dst,
|
||||
asmlinkage void camellia_dec_blk_2way(struct camellia_ctx *ctx, u8 *dst,
|
||||
const u8 *src);
|
||||
|
||||
/* 16-way parallel cipher functions (avx/aes-ni) */
|
||||
asmlinkage void camellia_ecb_enc_16way(struct camellia_ctx *ctx, u8 *dst,
|
||||
const u8 *src);
|
||||
asmlinkage void camellia_ecb_dec_16way(struct camellia_ctx *ctx, u8 *dst,
|
||||
const u8 *src);
|
||||
|
||||
asmlinkage void camellia_cbc_dec_16way(struct camellia_ctx *ctx, u8 *dst,
|
||||
const u8 *src);
|
||||
asmlinkage void camellia_ctr_16way(struct camellia_ctx *ctx, u8 *dst,
|
||||
const u8 *src, le128 *iv);
|
||||
|
||||
asmlinkage void camellia_xts_enc_16way(struct camellia_ctx *ctx, u8 *dst,
|
||||
const u8 *src, le128 *iv);
|
||||
asmlinkage void camellia_xts_dec_16way(struct camellia_ctx *ctx, u8 *dst,
|
||||
const u8 *src, le128 *iv);
|
||||
|
||||
static inline void camellia_enc_blk(struct camellia_ctx *ctx, u8 *dst,
|
||||
const u8 *src)
|
||||
{
|
||||
@ -79,4 +95,7 @@ extern void camellia_crypt_ctr(void *ctx, u128 *dst, const u128 *src,
|
||||
extern void camellia_crypt_ctr_2way(void *ctx, u128 *dst, const u128 *src,
|
||||
le128 *iv);
|
||||
|
||||
extern void camellia_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv);
|
||||
extern void camellia_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv);
|
||||
|
||||
#endif /* ASM_X86_CAMELLIA_H */
|
||||
|
@ -14,10 +14,13 @@ typedef void (*common_glue_func_t)(void *ctx, u8 *dst, const u8 *src);
|
||||
typedef void (*common_glue_cbc_func_t)(void *ctx, u128 *dst, const u128 *src);
|
||||
typedef void (*common_glue_ctr_func_t)(void *ctx, u128 *dst, const u128 *src,
|
||||
le128 *iv);
|
||||
typedef void (*common_glue_xts_func_t)(void *ctx, u128 *dst, const u128 *src,
|
||||
le128 *iv);
|
||||
|
||||
#define GLUE_FUNC_CAST(fn) ((common_glue_func_t)(fn))
|
||||
#define GLUE_CBC_FUNC_CAST(fn) ((common_glue_cbc_func_t)(fn))
|
||||
#define GLUE_CTR_FUNC_CAST(fn) ((common_glue_ctr_func_t)(fn))
|
||||
#define GLUE_XTS_FUNC_CAST(fn) ((common_glue_xts_func_t)(fn))
|
||||
|
||||
struct common_glue_func_entry {
|
||||
unsigned int num_blocks; /* number of blocks that @fn will process */
|
||||
@ -25,6 +28,7 @@ struct common_glue_func_entry {
|
||||
common_glue_func_t ecb;
|
||||
common_glue_cbc_func_t cbc;
|
||||
common_glue_ctr_func_t ctr;
|
||||
common_glue_xts_func_t xts;
|
||||
} fn_u;
|
||||
};
|
||||
|
||||
@ -96,6 +100,16 @@ static inline void le128_inc(le128 *i)
|
||||
i->b = cpu_to_le64(b);
|
||||
}
|
||||
|
||||
static inline void le128_gf128mul_x_ble(le128 *dst, const le128 *src)
|
||||
{
|
||||
u64 a = le64_to_cpu(src->a);
|
||||
u64 b = le64_to_cpu(src->b);
|
||||
u64 _tt = ((s64)a >> 63) & 0x87;
|
||||
|
||||
dst->a = cpu_to_le64((a << 1) ^ (b >> 63));
|
||||
dst->b = cpu_to_le64((b << 1) ^ _tt);
|
||||
}
|
||||
|
||||
extern int glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
|
||||
struct blkcipher_desc *desc,
|
||||
struct scatterlist *dst,
|
||||
@ -118,4 +132,14 @@ extern int glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx,
|
||||
struct scatterlist *dst,
|
||||
struct scatterlist *src, unsigned int nbytes);
|
||||
|
||||
extern int glue_xts_crypt_128bit(const struct common_glue_ctx *gctx,
|
||||
struct blkcipher_desc *desc,
|
||||
struct scatterlist *dst,
|
||||
struct scatterlist *src, unsigned int nbytes,
|
||||
common_glue_func_t tweak_fn, void *tweak_ctx,
|
||||
void *crypt_ctx);
|
||||
|
||||
extern void glue_xts_crypt_128bit_one(void *ctx, u128 *dst, const u128 *src,
|
||||
le128 *iv, common_glue_func_t fn);
|
||||
|
||||
#endif /* _CRYPTO_GLUE_HELPER_H */
|
||||
|
@ -6,6 +6,16 @@
|
||||
|
||||
#define SERPENT_PARALLEL_BLOCKS 8
|
||||
|
||||
struct serpent_lrw_ctx {
|
||||
struct lrw_table_ctx lrw_table;
|
||||
struct serpent_ctx serpent_ctx;
|
||||
};
|
||||
|
||||
struct serpent_xts_ctx {
|
||||
struct serpent_ctx tweak_ctx;
|
||||
struct serpent_ctx crypt_ctx;
|
||||
};
|
||||
|
||||
asmlinkage void serpent_ecb_enc_8way_avx(struct serpent_ctx *ctx, u8 *dst,
|
||||
const u8 *src);
|
||||
asmlinkage void serpent_ecb_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst,
|
||||
@ -16,4 +26,23 @@ asmlinkage void serpent_cbc_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst,
|
||||
asmlinkage void serpent_ctr_8way_avx(struct serpent_ctx *ctx, u8 *dst,
|
||||
const u8 *src, le128 *iv);
|
||||
|
||||
asmlinkage void serpent_xts_enc_8way_avx(struct serpent_ctx *ctx, u8 *dst,
|
||||
const u8 *src, le128 *iv);
|
||||
asmlinkage void serpent_xts_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst,
|
||||
const u8 *src, le128 *iv);
|
||||
|
||||
extern void __serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src,
|
||||
le128 *iv);
|
||||
|
||||
extern void serpent_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv);
|
||||
extern void serpent_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv);
|
||||
|
||||
extern int lrw_serpent_setkey(struct crypto_tfm *tfm, const u8 *key,
|
||||
unsigned int keylen);
|
||||
|
||||
extern void lrw_serpent_exit_tfm(struct crypto_tfm *tfm);
|
||||
|
||||
extern int xts_serpent_setkey(struct crypto_tfm *tfm, const u8 *key,
|
||||
unsigned int keylen);
|
||||
|
||||
#endif
|
||||
|
@ -28,6 +28,20 @@ asmlinkage void __twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst,
|
||||
asmlinkage void twofish_dec_blk_3way(struct twofish_ctx *ctx, u8 *dst,
|
||||
const u8 *src);
|
||||
|
||||
/* 8-way parallel cipher functions */
|
||||
asmlinkage void twofish_ecb_enc_8way(struct twofish_ctx *ctx, u8 *dst,
|
||||
const u8 *src);
|
||||
asmlinkage void twofish_ecb_dec_8way(struct twofish_ctx *ctx, u8 *dst,
|
||||
const u8 *src);
|
||||
asmlinkage void twofish_cbc_dec_8way(struct twofish_ctx *ctx, u8 *dst,
|
||||
const u8 *src);
|
||||
asmlinkage void twofish_ctr_8way(struct twofish_ctx *ctx, u8 *dst,
|
||||
const u8 *src, le128 *iv);
|
||||
asmlinkage void twofish_xts_enc_8way(struct twofish_ctx *ctx, u8 *dst,
|
||||
const u8 *src, le128 *iv);
|
||||
asmlinkage void twofish_xts_dec_8way(struct twofish_ctx *ctx, u8 *dst,
|
||||
const u8 *src, le128 *iv);
|
||||
|
||||
/* helpers from twofish_x86_64-3way module */
|
||||
extern void twofish_dec_blk_cbc_3way(void *ctx, u128 *dst, const u128 *src);
|
||||
extern void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src,
|
||||
@ -43,4 +57,8 @@ extern void lrw_twofish_exit_tfm(struct crypto_tfm *tfm);
|
||||
extern int xts_twofish_setkey(struct crypto_tfm *tfm, const u8 *key,
|
||||
unsigned int keylen);
|
||||
|
||||
/* helpers from twofish-avx module */
|
||||
extern void twofish_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv);
|
||||
extern void twofish_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv);
|
||||
|
||||
#endif /* ASM_X86_TWOFISH_H */
|
||||
|
133
crypto/Kconfig
133
crypto/Kconfig
@ -198,6 +198,7 @@ config CRYPTO_GCM
|
||||
select CRYPTO_CTR
|
||||
select CRYPTO_AEAD
|
||||
select CRYPTO_GHASH
|
||||
select CRYPTO_NULL
|
||||
help
|
||||
Support for Galois/Counter Mode (GCM) and Galois Message
|
||||
Authentication Code (GMAC). Required for IPSec.
|
||||
@ -282,6 +283,17 @@ config CRYPTO_XTS
|
||||
|
||||
comment "Hash modes"
|
||||
|
||||
config CRYPTO_CMAC
|
||||
tristate "CMAC support"
|
||||
select CRYPTO_HASH
|
||||
select CRYPTO_MANAGER
|
||||
help
|
||||
Cipher-based Message Authentication Code (CMAC) specified by
|
||||
The National Institute of Standards and Technology (NIST).
|
||||
|
||||
https://tools.ietf.org/html/rfc4493
|
||||
http://csrc.nist.gov/publications/nistpubs/800-38B/SP_800-38B.pdf
|
||||
|
||||
config CRYPTO_HMAC
|
||||
tristate "HMAC support"
|
||||
select CRYPTO_HASH
|
||||
@ -322,19 +334,9 @@ config CRYPTO_CRC32C
|
||||
by iSCSI for header and data digests and by others.
|
||||
See Castagnoli93. Module will be crc32c.
|
||||
|
||||
config CRYPTO_CRC32C_X86_64
|
||||
bool
|
||||
depends on X86 && 64BIT
|
||||
select CRYPTO_HASH
|
||||
help
|
||||
In Intel processor with SSE4.2 supported, the processor will
|
||||
support CRC32C calculation using hardware accelerated CRC32
|
||||
instruction optimized with PCLMULQDQ instruction when available.
|
||||
|
||||
config CRYPTO_CRC32C_INTEL
|
||||
tristate "CRC32c INTEL hardware acceleration"
|
||||
depends on X86
|
||||
select CRYPTO_CRC32C_X86_64 if 64BIT
|
||||
select CRYPTO_HASH
|
||||
help
|
||||
In Intel processor with SSE4.2 supported, the processor will
|
||||
@ -480,6 +482,28 @@ config CRYPTO_SHA1_SSSE3
|
||||
using Supplemental SSE3 (SSSE3) instructions or Advanced Vector
|
||||
Extensions (AVX), when available.
|
||||
|
||||
config CRYPTO_SHA256_SSSE3
|
||||
tristate "SHA256 digest algorithm (SSSE3/AVX/AVX2)"
|
||||
depends on X86 && 64BIT
|
||||
select CRYPTO_SHA256
|
||||
select CRYPTO_HASH
|
||||
help
|
||||
SHA-256 secure hash standard (DFIPS 180-2) implemented
|
||||
using Supplemental SSE3 (SSSE3) instructions, or Advanced Vector
|
||||
Extensions version 1 (AVX1), or Advanced Vector Extensions
|
||||
version 2 (AVX2) instructions, when available.
|
||||
|
||||
config CRYPTO_SHA512_SSSE3
|
||||
tristate "SHA512 digest algorithm (SSSE3/AVX/AVX2)"
|
||||
depends on X86 && 64BIT
|
||||
select CRYPTO_SHA512
|
||||
select CRYPTO_HASH
|
||||
help
|
||||
SHA-512 secure hash standard (DFIPS 180-2) implemented
|
||||
using Supplemental SSE3 (SSSE3) instructions, or Advanced Vector
|
||||
Extensions version 1 (AVX1), or Advanced Vector Extensions
|
||||
version 2 (AVX2) instructions, when available.
|
||||
|
||||
config CRYPTO_SHA1_SPARC64
|
||||
tristate "SHA1 digest algorithm (SPARC64)"
|
||||
depends on SPARC64
|
||||
@ -654,6 +678,7 @@ config CRYPTO_AES_NI_INTEL
|
||||
select CRYPTO_CRYPTD
|
||||
select CRYPTO_ABLK_HELPER_X86
|
||||
select CRYPTO_ALGAPI
|
||||
select CRYPTO_GLUE_HELPER_X86 if 64BIT
|
||||
select CRYPTO_LRW
|
||||
select CRYPTO_XTS
|
||||
help
|
||||
@ -795,6 +820,24 @@ config CRYPTO_BLOWFISH_X86_64
|
||||
See also:
|
||||
<http://www.schneier.com/blowfish.html>
|
||||
|
||||
config CRYPTO_BLOWFISH_AVX2_X86_64
|
||||
tristate "Blowfish cipher algorithm (x86_64/AVX2)"
|
||||
depends on X86 && 64BIT
|
||||
select CRYPTO_ALGAPI
|
||||
select CRYPTO_CRYPTD
|
||||
select CRYPTO_ABLK_HELPER_X86
|
||||
select CRYPTO_BLOWFISH_COMMON
|
||||
select CRYPTO_BLOWFISH_X86_64
|
||||
help
|
||||
Blowfish cipher algorithm (x86_64/AVX2), by Bruce Schneier.
|
||||
|
||||
This is a variable key length cipher which can use keys from 32
|
||||
bits to 448 bits in length. It's fast, simple and specifically
|
||||
designed for use on "large microprocessors".
|
||||
|
||||
See also:
|
||||
<http://www.schneier.com/blowfish.html>
|
||||
|
||||
config CRYPTO_CAMELLIA
|
||||
tristate "Camellia cipher algorithms"
|
||||
depends on CRYPTO
|
||||
@ -851,6 +894,29 @@ config CRYPTO_CAMELLIA_AESNI_AVX_X86_64
|
||||
See also:
|
||||
<https://info.isl.ntt.co.jp/crypt/eng/camellia/index_s.html>
|
||||
|
||||
config CRYPTO_CAMELLIA_AESNI_AVX2_X86_64
|
||||
tristate "Camellia cipher algorithm (x86_64/AES-NI/AVX2)"
|
||||
depends on X86 && 64BIT
|
||||
depends on CRYPTO
|
||||
select CRYPTO_ALGAPI
|
||||
select CRYPTO_CRYPTD
|
||||
select CRYPTO_ABLK_HELPER_X86
|
||||
select CRYPTO_GLUE_HELPER_X86
|
||||
select CRYPTO_CAMELLIA_X86_64
|
||||
select CRYPTO_CAMELLIA_AESNI_AVX_X86_64
|
||||
select CRYPTO_LRW
|
||||
select CRYPTO_XTS
|
||||
help
|
||||
Camellia cipher algorithm module (x86_64/AES-NI/AVX2).
|
||||
|
||||
Camellia is a symmetric key block cipher developed jointly
|
||||
at NTT and Mitsubishi Electric Corporation.
|
||||
|
||||
The Camellia specifies three key sizes: 128, 192 and 256 bits.
|
||||
|
||||
See also:
|
||||
<https://info.isl.ntt.co.jp/crypt/eng/camellia/index_s.html>
|
||||
|
||||
config CRYPTO_CAMELLIA_SPARC64
|
||||
tristate "Camellia cipher algorithm (SPARC64)"
|
||||
depends on SPARC64
|
||||
@ -1088,6 +1154,29 @@ config CRYPTO_SERPENT_AVX_X86_64
|
||||
See also:
|
||||
<http://www.cl.cam.ac.uk/~rja14/serpent.html>
|
||||
|
||||
config CRYPTO_SERPENT_AVX2_X86_64
|
||||
tristate "Serpent cipher algorithm (x86_64/AVX2)"
|
||||
depends on X86 && 64BIT
|
||||
select CRYPTO_ALGAPI
|
||||
select CRYPTO_CRYPTD
|
||||
select CRYPTO_ABLK_HELPER_X86
|
||||
select CRYPTO_GLUE_HELPER_X86
|
||||
select CRYPTO_SERPENT
|
||||
select CRYPTO_SERPENT_AVX_X86_64
|
||||
select CRYPTO_LRW
|
||||
select CRYPTO_XTS
|
||||
help
|
||||
Serpent cipher algorithm, by Anderson, Biham & Knudsen.
|
||||
|
||||
Keys are allowed to be from 0 to 256 bits in length, in steps
|
||||
of 8 bits.
|
||||
|
||||
This module provides Serpent cipher algorithm that processes 16
|
||||
blocks parallel using AVX2 instruction set.
|
||||
|
||||
See also:
|
||||
<http://www.cl.cam.ac.uk/~rja14/serpent.html>
|
||||
|
||||
config CRYPTO_TEA
|
||||
tristate "TEA, XTEA and XETA cipher algorithms"
|
||||
select CRYPTO_ALGAPI
|
||||
@ -1207,6 +1296,30 @@ config CRYPTO_TWOFISH_AVX_X86_64
|
||||
See also:
|
||||
<http://www.schneier.com/twofish.html>
|
||||
|
||||
config CRYPTO_TWOFISH_AVX2_X86_64
|
||||
tristate "Twofish cipher algorithm (x86_64/AVX2)"
|
||||
depends on X86 && 64BIT
|
||||
select CRYPTO_ALGAPI
|
||||
select CRYPTO_CRYPTD
|
||||
select CRYPTO_ABLK_HELPER_X86
|
||||
select CRYPTO_GLUE_HELPER_X86
|
||||
select CRYPTO_TWOFISH_COMMON
|
||||
select CRYPTO_TWOFISH_X86_64
|
||||
select CRYPTO_TWOFISH_X86_64_3WAY
|
||||
select CRYPTO_TWOFISH_AVX_X86_64
|
||||
select CRYPTO_LRW
|
||||
select CRYPTO_XTS
|
||||
help
|
||||
Twofish cipher algorithm (x86_64/AVX2).
|
||||
|
||||
Twofish was submitted as an AES (Advanced Encryption Standard)
|
||||
candidate cipher by researchers at CounterPane Systems. It is a
|
||||
16 round block cipher supporting key sizes of 128, 192, and 256
|
||||
bits.
|
||||
|
||||
See also:
|
||||
<http://www.schneier.com/twofish.html>
|
||||
|
||||
comment "Compression"
|
||||
|
||||
config CRYPTO_DEFLATE
|
||||
|
@ -32,6 +32,7 @@ cryptomgr-y := algboss.o testmgr.o
|
||||
|
||||
obj-$(CONFIG_CRYPTO_MANAGER2) += cryptomgr.o
|
||||
obj-$(CONFIG_CRYPTO_USER) += crypto_user.o
|
||||
obj-$(CONFIG_CRYPTO_CMAC) += cmac.o
|
||||
obj-$(CONFIG_CRYPTO_HMAC) += hmac.o
|
||||
obj-$(CONFIG_CRYPTO_VMAC) += vmac.o
|
||||
obj-$(CONFIG_CRYPTO_XCBC) += xcbc.o
|
||||
|
315
crypto/cmac.c
Normal file
315
crypto/cmac.c
Normal file
@ -0,0 +1,315 @@
|
||||
/*
|
||||
* CMAC: Cipher Block Mode for Authentication
|
||||
*
|
||||
* Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
|
||||
*
|
||||
* Based on work by:
|
||||
* Copyright © 2013 Tom St Denis <tstdenis@elliptictech.com>
|
||||
* Based on crypto/xcbc.c:
|
||||
* Copyright © 2006 USAGI/WIDE Project,
|
||||
* Author: Kazunori Miyazawa <miyazawa@linux-ipv6.org>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
*/
|
||||
|
||||
#include <crypto/internal/hash.h>
|
||||
#include <linux/err.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/module.h>
|
||||
|
||||
/*
|
||||
* +------------------------
|
||||
* | <parent tfm>
|
||||
* +------------------------
|
||||
* | cmac_tfm_ctx
|
||||
* +------------------------
|
||||
* | consts (block size * 2)
|
||||
* +------------------------
|
||||
*/
|
||||
struct cmac_tfm_ctx {
|
||||
struct crypto_cipher *child;
|
||||
u8 ctx[];
|
||||
};
|
||||
|
||||
/*
|
||||
* +------------------------
|
||||
* | <shash desc>
|
||||
* +------------------------
|
||||
* | cmac_desc_ctx
|
||||
* +------------------------
|
||||
* | odds (block size)
|
||||
* +------------------------
|
||||
* | prev (block size)
|
||||
* +------------------------
|
||||
*/
|
||||
struct cmac_desc_ctx {
|
||||
unsigned int len;
|
||||
u8 ctx[];
|
||||
};
|
||||
|
||||
static int crypto_cmac_digest_setkey(struct crypto_shash *parent,
|
||||
const u8 *inkey, unsigned int keylen)
|
||||
{
|
||||
unsigned long alignmask = crypto_shash_alignmask(parent);
|
||||
struct cmac_tfm_ctx *ctx = crypto_shash_ctx(parent);
|
||||
unsigned int bs = crypto_shash_blocksize(parent);
|
||||
__be64 *consts = PTR_ALIGN((void *)ctx->ctx, alignmask + 1);
|
||||
u64 _const[2];
|
||||
int i, err = 0;
|
||||
u8 msb_mask, gfmask;
|
||||
|
||||
err = crypto_cipher_setkey(ctx->child, inkey, keylen);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
/* encrypt the zero block */
|
||||
memset(consts, 0, bs);
|
||||
crypto_cipher_encrypt_one(ctx->child, (u8 *)consts, (u8 *)consts);
|
||||
|
||||
switch (bs) {
|
||||
case 16:
|
||||
gfmask = 0x87;
|
||||
_const[0] = be64_to_cpu(consts[1]);
|
||||
_const[1] = be64_to_cpu(consts[0]);
|
||||
|
||||
/* gf(2^128) multiply zero-ciphertext with u and u^2 */
|
||||
for (i = 0; i < 4; i += 2) {
|
||||
msb_mask = ((s64)_const[1] >> 63) & gfmask;
|
||||
_const[1] = (_const[1] << 1) | (_const[0] >> 63);
|
||||
_const[0] = (_const[0] << 1) ^ msb_mask;
|
||||
|
||||
consts[i + 0] = cpu_to_be64(_const[1]);
|
||||
consts[i + 1] = cpu_to_be64(_const[0]);
|
||||
}
|
||||
|
||||
break;
|
||||
case 8:
|
||||
gfmask = 0x1B;
|
||||
_const[0] = be64_to_cpu(consts[0]);
|
||||
|
||||
/* gf(2^64) multiply zero-ciphertext with u and u^2 */
|
||||
for (i = 0; i < 2; i++) {
|
||||
msb_mask = ((s64)_const[0] >> 63) & gfmask;
|
||||
_const[0] = (_const[0] << 1) ^ msb_mask;
|
||||
|
||||
consts[i] = cpu_to_be64(_const[0]);
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int crypto_cmac_digest_init(struct shash_desc *pdesc)
|
||||
{
|
||||
unsigned long alignmask = crypto_shash_alignmask(pdesc->tfm);
|
||||
struct cmac_desc_ctx *ctx = shash_desc_ctx(pdesc);
|
||||
int bs = crypto_shash_blocksize(pdesc->tfm);
|
||||
u8 *prev = PTR_ALIGN((void *)ctx->ctx, alignmask + 1) + bs;
|
||||
|
||||
ctx->len = 0;
|
||||
memset(prev, 0, bs);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int crypto_cmac_digest_update(struct shash_desc *pdesc, const u8 *p,
|
||||
unsigned int len)
|
||||
{
|
||||
struct crypto_shash *parent = pdesc->tfm;
|
||||
unsigned long alignmask = crypto_shash_alignmask(parent);
|
||||
struct cmac_tfm_ctx *tctx = crypto_shash_ctx(parent);
|
||||
struct cmac_desc_ctx *ctx = shash_desc_ctx(pdesc);
|
||||
struct crypto_cipher *tfm = tctx->child;
|
||||
int bs = crypto_shash_blocksize(parent);
|
||||
u8 *odds = PTR_ALIGN((void *)ctx->ctx, alignmask + 1);
|
||||
u8 *prev = odds + bs;
|
||||
|
||||
/* checking the data can fill the block */
|
||||
if ((ctx->len + len) <= bs) {
|
||||
memcpy(odds + ctx->len, p, len);
|
||||
ctx->len += len;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* filling odds with new data and encrypting it */
|
||||
memcpy(odds + ctx->len, p, bs - ctx->len);
|
||||
len -= bs - ctx->len;
|
||||
p += bs - ctx->len;
|
||||
|
||||
crypto_xor(prev, odds, bs);
|
||||
crypto_cipher_encrypt_one(tfm, prev, prev);
|
||||
|
||||
/* clearing the length */
|
||||
ctx->len = 0;
|
||||
|
||||
/* encrypting the rest of data */
|
||||
while (len > bs) {
|
||||
crypto_xor(prev, p, bs);
|
||||
crypto_cipher_encrypt_one(tfm, prev, prev);
|
||||
p += bs;
|
||||
len -= bs;
|
||||
}
|
||||
|
||||
/* keeping the surplus of blocksize */
|
||||
if (len) {
|
||||
memcpy(odds, p, len);
|
||||
ctx->len = len;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int crypto_cmac_digest_final(struct shash_desc *pdesc, u8 *out)
|
||||
{
|
||||
struct crypto_shash *parent = pdesc->tfm;
|
||||
unsigned long alignmask = crypto_shash_alignmask(parent);
|
||||
struct cmac_tfm_ctx *tctx = crypto_shash_ctx(parent);
|
||||
struct cmac_desc_ctx *ctx = shash_desc_ctx(pdesc);
|
||||
struct crypto_cipher *tfm = tctx->child;
|
||||
int bs = crypto_shash_blocksize(parent);
|
||||
u8 *consts = PTR_ALIGN((void *)tctx->ctx, alignmask + 1);
|
||||
u8 *odds = PTR_ALIGN((void *)ctx->ctx, alignmask + 1);
|
||||
u8 *prev = odds + bs;
|
||||
unsigned int offset = 0;
|
||||
|
||||
if (ctx->len != bs) {
|
||||
unsigned int rlen;
|
||||
u8 *p = odds + ctx->len;
|
||||
|
||||
*p = 0x80;
|
||||
p++;
|
||||
|
||||
rlen = bs - ctx->len - 1;
|
||||
if (rlen)
|
||||
memset(p, 0, rlen);
|
||||
|
||||
offset += bs;
|
||||
}
|
||||
|
||||
crypto_xor(prev, odds, bs);
|
||||
crypto_xor(prev, consts + offset, bs);
|
||||
|
||||
crypto_cipher_encrypt_one(tfm, out, prev);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int cmac_init_tfm(struct crypto_tfm *tfm)
|
||||
{
|
||||
struct crypto_cipher *cipher;
|
||||
struct crypto_instance *inst = (void *)tfm->__crt_alg;
|
||||
struct crypto_spawn *spawn = crypto_instance_ctx(inst);
|
||||
struct cmac_tfm_ctx *ctx = crypto_tfm_ctx(tfm);
|
||||
|
||||
cipher = crypto_spawn_cipher(spawn);
|
||||
if (IS_ERR(cipher))
|
||||
return PTR_ERR(cipher);
|
||||
|
||||
ctx->child = cipher;
|
||||
|
||||
return 0;
|
||||
};
|
||||
|
||||
static void cmac_exit_tfm(struct crypto_tfm *tfm)
|
||||
{
|
||||
struct cmac_tfm_ctx *ctx = crypto_tfm_ctx(tfm);
|
||||
crypto_free_cipher(ctx->child);
|
||||
}
|
||||
|
||||
static int cmac_create(struct crypto_template *tmpl, struct rtattr **tb)
|
||||
{
|
||||
struct shash_instance *inst;
|
||||
struct crypto_alg *alg;
|
||||
unsigned long alignmask;
|
||||
int err;
|
||||
|
||||
err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_SHASH);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
alg = crypto_get_attr_alg(tb, CRYPTO_ALG_TYPE_CIPHER,
|
||||
CRYPTO_ALG_TYPE_MASK);
|
||||
if (IS_ERR(alg))
|
||||
return PTR_ERR(alg);
|
||||
|
||||
switch (alg->cra_blocksize) {
|
||||
case 16:
|
||||
case 8:
|
||||
break;
|
||||
default:
|
||||
goto out_put_alg;
|
||||
}
|
||||
|
||||
inst = shash_alloc_instance("cmac", alg);
|
||||
err = PTR_ERR(inst);
|
||||
if (IS_ERR(inst))
|
||||
goto out_put_alg;
|
||||
|
||||
err = crypto_init_spawn(shash_instance_ctx(inst), alg,
|
||||
shash_crypto_instance(inst),
|
||||
CRYPTO_ALG_TYPE_MASK);
|
||||
if (err)
|
||||
goto out_free_inst;
|
||||
|
||||
alignmask = alg->cra_alignmask | (sizeof(long) - 1);
|
||||
inst->alg.base.cra_alignmask = alignmask;
|
||||
inst->alg.base.cra_priority = alg->cra_priority;
|
||||
inst->alg.base.cra_blocksize = alg->cra_blocksize;
|
||||
|
||||
inst->alg.digestsize = alg->cra_blocksize;
|
||||
inst->alg.descsize =
|
||||
ALIGN(sizeof(struct cmac_desc_ctx), crypto_tfm_ctx_alignment())
|
||||
+ (alignmask & ~(crypto_tfm_ctx_alignment() - 1))
|
||||
+ alg->cra_blocksize * 2;
|
||||
|
||||
inst->alg.base.cra_ctxsize =
|
||||
ALIGN(sizeof(struct cmac_tfm_ctx), alignmask + 1)
|
||||
+ alg->cra_blocksize * 2;
|
||||
|
||||
inst->alg.base.cra_init = cmac_init_tfm;
|
||||
inst->alg.base.cra_exit = cmac_exit_tfm;
|
||||
|
||||
inst->alg.init = crypto_cmac_digest_init;
|
||||
inst->alg.update = crypto_cmac_digest_update;
|
||||
inst->alg.final = crypto_cmac_digest_final;
|
||||
inst->alg.setkey = crypto_cmac_digest_setkey;
|
||||
|
||||
err = shash_register_instance(tmpl, inst);
|
||||
if (err) {
|
||||
out_free_inst:
|
||||
shash_free_instance(shash_crypto_instance(inst));
|
||||
}
|
||||
|
||||
out_put_alg:
|
||||
crypto_mod_put(alg);
|
||||
return err;
|
||||
}
|
||||
|
||||
static struct crypto_template crypto_cmac_tmpl = {
|
||||
.name = "cmac",
|
||||
.create = cmac_create,
|
||||
.free = shash_free_instance,
|
||||
.module = THIS_MODULE,
|
||||
};
|
||||
|
||||
static int __init crypto_cmac_module_init(void)
|
||||
{
|
||||
return crypto_register_template(&crypto_cmac_tmpl);
|
||||
}
|
||||
|
||||
static void __exit crypto_cmac_module_exit(void)
|
||||
{
|
||||
crypto_unregister_template(&crypto_cmac_tmpl);
|
||||
}
|
||||
|
||||
module_init(crypto_cmac_module_init);
|
||||
module_exit(crypto_cmac_module_exit);
|
||||
|
||||
MODULE_LICENSE("GPL");
|
||||
MODULE_DESCRIPTION("CMAC keyed hash algorithm");
|
@ -440,7 +440,7 @@ static const struct nla_policy crypto_policy[CRYPTOCFGA_MAX+1] = {
|
||||
|
||||
#undef MSGSIZE
|
||||
|
||||
static struct crypto_link {
|
||||
static const struct crypto_link {
|
||||
int (*doit)(struct sk_buff *, struct nlmsghdr *, struct nlattr **);
|
||||
int (*dump)(struct sk_buff *, struct netlink_callback *);
|
||||
int (*done)(struct netlink_callback *);
|
||||
@ -456,7 +456,7 @@ static struct crypto_link {
|
||||
static int crypto_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
|
||||
{
|
||||
struct nlattr *attrs[CRYPTOCFGA_MAX+1];
|
||||
struct crypto_link *link;
|
||||
const struct crypto_link *link;
|
||||
int type, err;
|
||||
|
||||
type = nlh->nlmsg_type;
|
||||
|
116
crypto/gcm.c
116
crypto/gcm.c
@ -37,8 +37,14 @@ struct crypto_rfc4106_ctx {
|
||||
u8 nonce[4];
|
||||
};
|
||||
|
||||
struct crypto_rfc4543_instance_ctx {
|
||||
struct crypto_aead_spawn aead;
|
||||
struct crypto_skcipher_spawn null;
|
||||
};
|
||||
|
||||
struct crypto_rfc4543_ctx {
|
||||
struct crypto_aead *child;
|
||||
struct crypto_blkcipher *null;
|
||||
u8 nonce[4];
|
||||
};
|
||||
|
||||
@ -1094,21 +1100,36 @@ static int crypto_rfc4543_setauthsize(struct crypto_aead *parent,
|
||||
return crypto_aead_setauthsize(ctx->child, authsize);
|
||||
}
|
||||
|
||||
static void crypto_rfc4543_done(struct crypto_async_request *areq, int err)
|
||||
{
|
||||
struct aead_request *req = areq->data;
|
||||
struct crypto_aead *aead = crypto_aead_reqtfm(req);
|
||||
struct crypto_rfc4543_req_ctx *rctx = crypto_rfc4543_reqctx(req);
|
||||
|
||||
if (!err) {
|
||||
scatterwalk_map_and_copy(rctx->auth_tag, req->dst,
|
||||
req->cryptlen,
|
||||
crypto_aead_authsize(aead), 1);
|
||||
}
|
||||
|
||||
aead_request_complete(req, err);
|
||||
}
|
||||
|
||||
static struct aead_request *crypto_rfc4543_crypt(struct aead_request *req,
|
||||
int enc)
|
||||
bool enc)
|
||||
{
|
||||
struct crypto_aead *aead = crypto_aead_reqtfm(req);
|
||||
struct crypto_rfc4543_ctx *ctx = crypto_aead_ctx(aead);
|
||||
struct crypto_rfc4543_req_ctx *rctx = crypto_rfc4543_reqctx(req);
|
||||
struct aead_request *subreq = &rctx->subreq;
|
||||
struct scatterlist *dst = req->dst;
|
||||
struct scatterlist *src = req->src;
|
||||
struct scatterlist *cipher = rctx->cipher;
|
||||
struct scatterlist *payload = rctx->payload;
|
||||
struct scatterlist *assoc = rctx->assoc;
|
||||
unsigned int authsize = crypto_aead_authsize(aead);
|
||||
unsigned int assoclen = req->assoclen;
|
||||
struct page *dstp;
|
||||
u8 *vdst;
|
||||
struct page *srcp;
|
||||
u8 *vsrc;
|
||||
u8 *iv = PTR_ALIGN((u8 *)(rctx + 1) + crypto_aead_reqsize(ctx->child),
|
||||
crypto_aead_alignmask(ctx->child) + 1);
|
||||
|
||||
@ -1119,19 +1140,19 @@ static struct aead_request *crypto_rfc4543_crypt(struct aead_request *req,
|
||||
if (enc)
|
||||
memset(rctx->auth_tag, 0, authsize);
|
||||
else
|
||||
scatterwalk_map_and_copy(rctx->auth_tag, dst,
|
||||
scatterwalk_map_and_copy(rctx->auth_tag, src,
|
||||
req->cryptlen - authsize,
|
||||
authsize, 0);
|
||||
|
||||
sg_init_one(cipher, rctx->auth_tag, authsize);
|
||||
|
||||
/* construct the aad */
|
||||
dstp = sg_page(dst);
|
||||
vdst = PageHighMem(dstp) ? NULL : page_address(dstp) + dst->offset;
|
||||
srcp = sg_page(src);
|
||||
vsrc = PageHighMem(srcp) ? NULL : page_address(srcp) + src->offset;
|
||||
|
||||
sg_init_table(payload, 2);
|
||||
sg_set_buf(payload, req->iv, 8);
|
||||
scatterwalk_crypto_chain(payload, dst, vdst == req->iv + 8, 2);
|
||||
scatterwalk_crypto_chain(payload, src, vsrc == req->iv + 8, 2);
|
||||
assoclen += 8 + req->cryptlen - (enc ? 0 : authsize);
|
||||
|
||||
if (req->assoc->length == req->assoclen) {
|
||||
@ -1150,14 +1171,27 @@ static struct aead_request *crypto_rfc4543_crypt(struct aead_request *req,
|
||||
scatterwalk_crypto_chain(assoc, payload, 0, 2);
|
||||
|
||||
aead_request_set_tfm(subreq, ctx->child);
|
||||
aead_request_set_callback(subreq, req->base.flags, req->base.complete,
|
||||
req->base.data);
|
||||
aead_request_set_callback(subreq, req->base.flags, crypto_rfc4543_done,
|
||||
req);
|
||||
aead_request_set_crypt(subreq, cipher, cipher, enc ? 0 : authsize, iv);
|
||||
aead_request_set_assoc(subreq, assoc, assoclen);
|
||||
|
||||
return subreq;
|
||||
}
|
||||
|
||||
static int crypto_rfc4543_copy_src_to_dst(struct aead_request *req, bool enc)
|
||||
{
|
||||
struct crypto_aead *aead = crypto_aead_reqtfm(req);
|
||||
struct crypto_rfc4543_ctx *ctx = crypto_aead_ctx(aead);
|
||||
unsigned int authsize = crypto_aead_authsize(aead);
|
||||
unsigned int nbytes = req->cryptlen - (enc ? 0 : authsize);
|
||||
struct blkcipher_desc desc = {
|
||||
.tfm = ctx->null,
|
||||
};
|
||||
|
||||
return crypto_blkcipher_encrypt(&desc, req->dst, req->src, nbytes);
|
||||
}
|
||||
|
||||
static int crypto_rfc4543_encrypt(struct aead_request *req)
|
||||
{
|
||||
struct crypto_aead *aead = crypto_aead_reqtfm(req);
|
||||
@ -1165,7 +1199,13 @@ static int crypto_rfc4543_encrypt(struct aead_request *req)
|
||||
struct aead_request *subreq;
|
||||
int err;
|
||||
|
||||
subreq = crypto_rfc4543_crypt(req, 1);
|
||||
if (req->src != req->dst) {
|
||||
err = crypto_rfc4543_copy_src_to_dst(req, true);
|
||||
if (err)
|
||||
return err;
|
||||
}
|
||||
|
||||
subreq = crypto_rfc4543_crypt(req, true);
|
||||
err = crypto_aead_encrypt(subreq);
|
||||
if (err)
|
||||
return err;
|
||||
@ -1178,7 +1218,15 @@ static int crypto_rfc4543_encrypt(struct aead_request *req)
|
||||
|
||||
static int crypto_rfc4543_decrypt(struct aead_request *req)
|
||||
{
|
||||
req = crypto_rfc4543_crypt(req, 0);
|
||||
int err;
|
||||
|
||||
if (req->src != req->dst) {
|
||||
err = crypto_rfc4543_copy_src_to_dst(req, false);
|
||||
if (err)
|
||||
return err;
|
||||
}
|
||||
|
||||
req = crypto_rfc4543_crypt(req, false);
|
||||
|
||||
return crypto_aead_decrypt(req);
|
||||
}
|
||||
@ -1186,16 +1234,25 @@ static int crypto_rfc4543_decrypt(struct aead_request *req)
|
||||
static int crypto_rfc4543_init_tfm(struct crypto_tfm *tfm)
|
||||
{
|
||||
struct crypto_instance *inst = (void *)tfm->__crt_alg;
|
||||
struct crypto_aead_spawn *spawn = crypto_instance_ctx(inst);
|
||||
struct crypto_rfc4543_instance_ctx *ictx = crypto_instance_ctx(inst);
|
||||
struct crypto_aead_spawn *spawn = &ictx->aead;
|
||||
struct crypto_rfc4543_ctx *ctx = crypto_tfm_ctx(tfm);
|
||||
struct crypto_aead *aead;
|
||||
struct crypto_blkcipher *null;
|
||||
unsigned long align;
|
||||
int err = 0;
|
||||
|
||||
aead = crypto_spawn_aead(spawn);
|
||||
if (IS_ERR(aead))
|
||||
return PTR_ERR(aead);
|
||||
|
||||
null = crypto_spawn_blkcipher(&ictx->null.base);
|
||||
err = PTR_ERR(null);
|
||||
if (IS_ERR(null))
|
||||
goto err_free_aead;
|
||||
|
||||
ctx->child = aead;
|
||||
ctx->null = null;
|
||||
|
||||
align = crypto_aead_alignmask(aead);
|
||||
align &= ~(crypto_tfm_ctx_alignment() - 1);
|
||||
@ -1205,6 +1262,10 @@ static int crypto_rfc4543_init_tfm(struct crypto_tfm *tfm)
|
||||
align + 16;
|
||||
|
||||
return 0;
|
||||
|
||||
err_free_aead:
|
||||
crypto_free_aead(aead);
|
||||
return err;
|
||||
}
|
||||
|
||||
static void crypto_rfc4543_exit_tfm(struct crypto_tfm *tfm)
|
||||
@ -1212,6 +1273,7 @@ static void crypto_rfc4543_exit_tfm(struct crypto_tfm *tfm)
|
||||
struct crypto_rfc4543_ctx *ctx = crypto_tfm_ctx(tfm);
|
||||
|
||||
crypto_free_aead(ctx->child);
|
||||
crypto_free_blkcipher(ctx->null);
|
||||
}
|
||||
|
||||
static struct crypto_instance *crypto_rfc4543_alloc(struct rtattr **tb)
|
||||
@ -1220,6 +1282,7 @@ static struct crypto_instance *crypto_rfc4543_alloc(struct rtattr **tb)
|
||||
struct crypto_instance *inst;
|
||||
struct crypto_aead_spawn *spawn;
|
||||
struct crypto_alg *alg;
|
||||
struct crypto_rfc4543_instance_ctx *ctx;
|
||||
const char *ccm_name;
|
||||
int err;
|
||||
|
||||
@ -1234,11 +1297,12 @@ static struct crypto_instance *crypto_rfc4543_alloc(struct rtattr **tb)
|
||||
if (IS_ERR(ccm_name))
|
||||
return ERR_CAST(ccm_name);
|
||||
|
||||
inst = kzalloc(sizeof(*inst) + sizeof(*spawn), GFP_KERNEL);
|
||||
inst = kzalloc(sizeof(*inst) + sizeof(*ctx), GFP_KERNEL);
|
||||
if (!inst)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
|
||||
spawn = crypto_instance_ctx(inst);
|
||||
ctx = crypto_instance_ctx(inst);
|
||||
spawn = &ctx->aead;
|
||||
crypto_set_aead_spawn(spawn, inst);
|
||||
err = crypto_grab_aead(spawn, ccm_name, 0,
|
||||
crypto_requires_sync(algt->type, algt->mask));
|
||||
@ -1247,15 +1311,23 @@ static struct crypto_instance *crypto_rfc4543_alloc(struct rtattr **tb)
|
||||
|
||||
alg = crypto_aead_spawn_alg(spawn);
|
||||
|
||||
crypto_set_skcipher_spawn(&ctx->null, inst);
|
||||
err = crypto_grab_skcipher(&ctx->null, "ecb(cipher_null)", 0,
|
||||
CRYPTO_ALG_ASYNC);
|
||||
if (err)
|
||||
goto out_drop_alg;
|
||||
|
||||
crypto_skcipher_spawn_alg(&ctx->null);
|
||||
|
||||
err = -EINVAL;
|
||||
|
||||
/* We only support 16-byte blocks. */
|
||||
if (alg->cra_aead.ivsize != 16)
|
||||
goto out_drop_alg;
|
||||
goto out_drop_ecbnull;
|
||||
|
||||
/* Not a stream cipher? */
|
||||
if (alg->cra_blocksize != 1)
|
||||
goto out_drop_alg;
|
||||
goto out_drop_ecbnull;
|
||||
|
||||
err = -ENAMETOOLONG;
|
||||
if (snprintf(inst->alg.cra_name, CRYPTO_MAX_ALG_NAME,
|
||||
@ -1263,7 +1335,7 @@ static struct crypto_instance *crypto_rfc4543_alloc(struct rtattr **tb)
|
||||
snprintf(inst->alg.cra_driver_name, CRYPTO_MAX_ALG_NAME,
|
||||
"rfc4543(%s)", alg->cra_driver_name) >=
|
||||
CRYPTO_MAX_ALG_NAME)
|
||||
goto out_drop_alg;
|
||||
goto out_drop_ecbnull;
|
||||
|
||||
inst->alg.cra_flags = CRYPTO_ALG_TYPE_AEAD;
|
||||
inst->alg.cra_flags |= alg->cra_flags & CRYPTO_ALG_ASYNC;
|
||||
@ -1290,6 +1362,8 @@ static struct crypto_instance *crypto_rfc4543_alloc(struct rtattr **tb)
|
||||
out:
|
||||
return inst;
|
||||
|
||||
out_drop_ecbnull:
|
||||
crypto_drop_skcipher(&ctx->null);
|
||||
out_drop_alg:
|
||||
crypto_drop_aead(spawn);
|
||||
out_free_inst:
|
||||
@ -1300,7 +1374,11 @@ out_free_inst:
|
||||
|
||||
static void crypto_rfc4543_free(struct crypto_instance *inst)
|
||||
{
|
||||
crypto_drop_spawn(crypto_instance_ctx(inst));
|
||||
struct crypto_rfc4543_instance_ctx *ctx = crypto_instance_ctx(inst);
|
||||
|
||||
crypto_drop_aead(&ctx->aead);
|
||||
crypto_drop_skcipher(&ctx->null);
|
||||
|
||||
kfree(inst);
|
||||
}
|
||||
|
||||
|
@ -246,7 +246,7 @@ static int sha256_init(struct shash_desc *desc)
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int sha256_update(struct shash_desc *desc, const u8 *data,
|
||||
int crypto_sha256_update(struct shash_desc *desc, const u8 *data,
|
||||
unsigned int len)
|
||||
{
|
||||
struct sha256_state *sctx = shash_desc_ctx(desc);
|
||||
@ -277,6 +277,7 @@ static int sha256_update(struct shash_desc *desc, const u8 *data,
|
||||
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL(crypto_sha256_update);
|
||||
|
||||
static int sha256_final(struct shash_desc *desc, u8 *out)
|
||||
{
|
||||
@ -293,10 +294,10 @@ static int sha256_final(struct shash_desc *desc, u8 *out)
|
||||
/* Pad out to 56 mod 64. */
|
||||
index = sctx->count & 0x3f;
|
||||
pad_len = (index < 56) ? (56 - index) : ((64+56) - index);
|
||||
sha256_update(desc, padding, pad_len);
|
||||
crypto_sha256_update(desc, padding, pad_len);
|
||||
|
||||
/* Append length (before padding) */
|
||||
sha256_update(desc, (const u8 *)&bits, sizeof(bits));
|
||||
crypto_sha256_update(desc, (const u8 *)&bits, sizeof(bits));
|
||||
|
||||
/* Store state in digest */
|
||||
for (i = 0; i < 8; i++)
|
||||
@ -339,7 +340,7 @@ static int sha256_import(struct shash_desc *desc, const void *in)
|
||||
static struct shash_alg sha256_algs[2] = { {
|
||||
.digestsize = SHA256_DIGEST_SIZE,
|
||||
.init = sha256_init,
|
||||
.update = sha256_update,
|
||||
.update = crypto_sha256_update,
|
||||
.final = sha256_final,
|
||||
.export = sha256_export,
|
||||
.import = sha256_import,
|
||||
@ -355,7 +356,7 @@ static struct shash_alg sha256_algs[2] = { {
|
||||
}, {
|
||||
.digestsize = SHA224_DIGEST_SIZE,
|
||||
.init = sha224_init,
|
||||
.update = sha256_update,
|
||||
.update = crypto_sha256_update,
|
||||
.final = sha224_final,
|
||||
.descsize = sizeof(struct sha256_state),
|
||||
.base = {
|
||||
|
@ -163,8 +163,8 @@ sha384_init(struct shash_desc *desc)
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int
|
||||
sha512_update(struct shash_desc *desc, const u8 *data, unsigned int len)
|
||||
int crypto_sha512_update(struct shash_desc *desc, const u8 *data,
|
||||
unsigned int len)
|
||||
{
|
||||
struct sha512_state *sctx = shash_desc_ctx(desc);
|
||||
|
||||
@ -197,6 +197,7 @@ sha512_update(struct shash_desc *desc, const u8 *data, unsigned int len)
|
||||
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL(crypto_sha512_update);
|
||||
|
||||
static int
|
||||
sha512_final(struct shash_desc *desc, u8 *hash)
|
||||
@ -215,10 +216,10 @@ sha512_final(struct shash_desc *desc, u8 *hash)
|
||||
/* Pad out to 112 mod 128. */
|
||||
index = sctx->count[0] & 0x7f;
|
||||
pad_len = (index < 112) ? (112 - index) : ((128+112) - index);
|
||||
sha512_update(desc, padding, pad_len);
|
||||
crypto_sha512_update(desc, padding, pad_len);
|
||||
|
||||
/* Append length (before padding) */
|
||||
sha512_update(desc, (const u8 *)bits, sizeof(bits));
|
||||
crypto_sha512_update(desc, (const u8 *)bits, sizeof(bits));
|
||||
|
||||
/* Store state in digest */
|
||||
for (i = 0; i < 8; i++)
|
||||
@ -245,7 +246,7 @@ static int sha384_final(struct shash_desc *desc, u8 *hash)
|
||||
static struct shash_alg sha512_algs[2] = { {
|
||||
.digestsize = SHA512_DIGEST_SIZE,
|
||||
.init = sha512_init,
|
||||
.update = sha512_update,
|
||||
.update = crypto_sha512_update,
|
||||
.final = sha512_final,
|
||||
.descsize = sizeof(struct sha512_state),
|
||||
.base = {
|
||||
@ -257,7 +258,7 @@ static struct shash_alg sha512_algs[2] = { {
|
||||
}, {
|
||||
.digestsize = SHA384_DIGEST_SIZE,
|
||||
.init = sha384_init,
|
||||
.update = sha512_update,
|
||||
.update = crypto_sha512_update,
|
||||
.final = sha384_final,
|
||||
.descsize = sizeof(struct sha512_state),
|
||||
.base = {
|
||||
|
@ -1095,7 +1095,6 @@ static int do_test(int m)
|
||||
break;
|
||||
|
||||
case 28:
|
||||
|
||||
ret += tcrypt_test("tgr160");
|
||||
break;
|
||||
|
||||
@ -1118,6 +1117,7 @@ static int do_test(int m)
|
||||
ret += tcrypt_test("lrw(camellia)");
|
||||
ret += tcrypt_test("xts(camellia)");
|
||||
break;
|
||||
|
||||
case 33:
|
||||
ret += tcrypt_test("sha224");
|
||||
break;
|
||||
@ -1213,6 +1213,7 @@ static int do_test(int m)
|
||||
case 109:
|
||||
ret += tcrypt_test("vmac(aes)");
|
||||
break;
|
||||
|
||||
case 110:
|
||||
ret += tcrypt_test("hmac(crc32)");
|
||||
break;
|
||||
@ -1225,6 +1226,18 @@ static int do_test(int m)
|
||||
ret += tcrypt_test("rfc4106(gcm(aes))");
|
||||
break;
|
||||
|
||||
case 152:
|
||||
ret += tcrypt_test("rfc4543(gcm(aes))");
|
||||
break;
|
||||
|
||||
case 153:
|
||||
ret += tcrypt_test("cmac(aes)");
|
||||
break;
|
||||
|
||||
case 154:
|
||||
ret += tcrypt_test("cmac(des3_ede)");
|
||||
break;
|
||||
|
||||
case 200:
|
||||
test_cipher_speed("ecb(aes)", ENCRYPT, sec, NULL, 0,
|
||||
speed_template_16_24_32);
|
||||
@ -1755,6 +1768,21 @@ static int do_test(int m)
|
||||
speed_template_32_64);
|
||||
break;
|
||||
|
||||
case 509:
|
||||
test_acipher_speed("ecb(blowfish)", ENCRYPT, sec, NULL, 0,
|
||||
speed_template_8_32);
|
||||
test_acipher_speed("ecb(blowfish)", DECRYPT, sec, NULL, 0,
|
||||
speed_template_8_32);
|
||||
test_acipher_speed("cbc(blowfish)", ENCRYPT, sec, NULL, 0,
|
||||
speed_template_8_32);
|
||||
test_acipher_speed("cbc(blowfish)", DECRYPT, sec, NULL, 0,
|
||||
speed_template_8_32);
|
||||
test_acipher_speed("ctr(blowfish)", ENCRYPT, sec, NULL, 0,
|
||||
speed_template_8_32);
|
||||
test_acipher_speed("ctr(blowfish)", DECRYPT, sec, NULL, 0,
|
||||
speed_template_8_32);
|
||||
break;
|
||||
|
||||
case 1000:
|
||||
test_available();
|
||||
break;
|
||||
|
@ -1644,19 +1644,31 @@ static const struct alg_test_desc alg_test_descs[] = {
|
||||
}, {
|
||||
.alg = "__cbc-serpent-avx",
|
||||
.test = alg_test_null,
|
||||
}, {
|
||||
.alg = "__cbc-serpent-avx2",
|
||||
.test = alg_test_null,
|
||||
}, {
|
||||
.alg = "__cbc-serpent-sse2",
|
||||
.test = alg_test_null,
|
||||
}, {
|
||||
.alg = "__cbc-twofish-avx",
|
||||
.test = alg_test_null,
|
||||
}, {
|
||||
.alg = "__cbc-twofish-avx2",
|
||||
.test = alg_test_null,
|
||||
}, {
|
||||
.alg = "__driver-cbc-aes-aesni",
|
||||
.test = alg_test_null,
|
||||
.fips_allowed = 1,
|
||||
}, {
|
||||
.alg = "__driver-cbc-blowfish-avx2",
|
||||
.test = alg_test_null,
|
||||
}, {
|
||||
.alg = "__driver-cbc-camellia-aesni",
|
||||
.test = alg_test_null,
|
||||
}, {
|
||||
.alg = "__driver-cbc-camellia-aesni-avx2",
|
||||
.test = alg_test_null,
|
||||
}, {
|
||||
.alg = "__driver-cbc-cast5-avx",
|
||||
.test = alg_test_null,
|
||||
@ -1666,19 +1678,31 @@ static const struct alg_test_desc alg_test_descs[] = {
|
||||
}, {
|
||||
.alg = "__driver-cbc-serpent-avx",
|
||||
.test = alg_test_null,
|
||||
}, {
|
||||
.alg = "__driver-cbc-serpent-avx2",
|
||||
.test = alg_test_null,
|
||||
}, {
|
||||
.alg = "__driver-cbc-serpent-sse2",
|
||||
.test = alg_test_null,
|
||||
}, {
|
||||
.alg = "__driver-cbc-twofish-avx",
|
||||
.test = alg_test_null,
|
||||
}, {
|
||||
.alg = "__driver-cbc-twofish-avx2",
|
||||
.test = alg_test_null,
|
||||
}, {
|
||||
.alg = "__driver-ecb-aes-aesni",
|
||||
.test = alg_test_null,
|
||||
.fips_allowed = 1,
|
||||
}, {
|
||||
.alg = "__driver-ecb-blowfish-avx2",
|
||||
.test = alg_test_null,
|
||||
}, {
|
||||
.alg = "__driver-ecb-camellia-aesni",
|
||||
.test = alg_test_null,
|
||||
}, {
|
||||
.alg = "__driver-ecb-camellia-aesni-avx2",
|
||||
.test = alg_test_null,
|
||||
}, {
|
||||
.alg = "__driver-ecb-cast5-avx",
|
||||
.test = alg_test_null,
|
||||
@ -1688,12 +1712,18 @@ static const struct alg_test_desc alg_test_descs[] = {
|
||||
}, {
|
||||
.alg = "__driver-ecb-serpent-avx",
|
||||
.test = alg_test_null,
|
||||
}, {
|
||||
.alg = "__driver-ecb-serpent-avx2",
|
||||
.test = alg_test_null,
|
||||
}, {
|
||||
.alg = "__driver-ecb-serpent-sse2",
|
||||
.test = alg_test_null,
|
||||
}, {
|
||||
.alg = "__driver-ecb-twofish-avx",
|
||||
.test = alg_test_null,
|
||||
}, {
|
||||
.alg = "__driver-ecb-twofish-avx2",
|
||||
.test = alg_test_null,
|
||||
}, {
|
||||
.alg = "__ghash-pclmulqdqni",
|
||||
.test = alg_test_null,
|
||||
@ -1912,6 +1942,27 @@ static const struct alg_test_desc alg_test_descs[] = {
|
||||
}
|
||||
}
|
||||
}
|
||||
}, {
|
||||
.alg = "cmac(aes)",
|
||||
.test = alg_test_hash,
|
||||
.suite = {
|
||||
.hash = {
|
||||
.vecs = aes_cmac128_tv_template,
|
||||
.count = CMAC_AES_TEST_VECTORS
|
||||
}
|
||||
}
|
||||
}, {
|
||||
.alg = "cmac(des3_ede)",
|
||||
.test = alg_test_hash,
|
||||
.suite = {
|
||||
.hash = {
|
||||
.vecs = des3_ede_cmac64_tv_template,
|
||||
.count = CMAC_DES3_EDE_TEST_VECTORS
|
||||
}
|
||||
}
|
||||
}, {
|
||||
.alg = "compress_null",
|
||||
.test = alg_test_null,
|
||||
}, {
|
||||
.alg = "crc32c",
|
||||
.test = alg_test_crc32c,
|
||||
@ -1926,16 +1977,31 @@ static const struct alg_test_desc alg_test_descs[] = {
|
||||
.alg = "cryptd(__driver-cbc-aes-aesni)",
|
||||
.test = alg_test_null,
|
||||
.fips_allowed = 1,
|
||||
}, {
|
||||
.alg = "cryptd(__driver-cbc-blowfish-avx2)",
|
||||
.test = alg_test_null,
|
||||
}, {
|
||||
.alg = "cryptd(__driver-cbc-camellia-aesni)",
|
||||
.test = alg_test_null,
|
||||
}, {
|
||||
.alg = "cryptd(__driver-cbc-camellia-aesni-avx2)",
|
||||
.test = alg_test_null,
|
||||
}, {
|
||||
.alg = "cryptd(__driver-cbc-serpent-avx2)",
|
||||
.test = alg_test_null,
|
||||
}, {
|
||||
.alg = "cryptd(__driver-ecb-aes-aesni)",
|
||||
.test = alg_test_null,
|
||||
.fips_allowed = 1,
|
||||
}, {
|
||||
.alg = "cryptd(__driver-ecb-blowfish-avx2)",
|
||||
.test = alg_test_null,
|
||||
}, {
|
||||
.alg = "cryptd(__driver-ecb-camellia-aesni)",
|
||||
.test = alg_test_null,
|
||||
}, {
|
||||
.alg = "cryptd(__driver-ecb-camellia-aesni-avx2)",
|
||||
.test = alg_test_null,
|
||||
}, {
|
||||
.alg = "cryptd(__driver-ecb-cast5-avx)",
|
||||
.test = alg_test_null,
|
||||
@ -1945,12 +2011,18 @@ static const struct alg_test_desc alg_test_descs[] = {
|
||||
}, {
|
||||
.alg = "cryptd(__driver-ecb-serpent-avx)",
|
||||
.test = alg_test_null,
|
||||
}, {
|
||||
.alg = "cryptd(__driver-ecb-serpent-avx2)",
|
||||
.test = alg_test_null,
|
||||
}, {
|
||||
.alg = "cryptd(__driver-ecb-serpent-sse2)",
|
||||
.test = alg_test_null,
|
||||
}, {
|
||||
.alg = "cryptd(__driver-ecb-twofish-avx)",
|
||||
.test = alg_test_null,
|
||||
}, {
|
||||
.alg = "cryptd(__driver-ecb-twofish-avx2)",
|
||||
.test = alg_test_null,
|
||||
}, {
|
||||
.alg = "cryptd(__driver-gcm-aes-aesni)",
|
||||
.test = alg_test_null,
|
||||
@ -2126,6 +2198,9 @@ static const struct alg_test_desc alg_test_descs[] = {
|
||||
}
|
||||
}
|
||||
}
|
||||
}, {
|
||||
.alg = "digest_null",
|
||||
.test = alg_test_null,
|
||||
}, {
|
||||
.alg = "ecb(__aes-aesni)",
|
||||
.test = alg_test_null,
|
||||
@ -2236,6 +2311,9 @@ static const struct alg_test_desc alg_test_descs[] = {
|
||||
}
|
||||
}
|
||||
}
|
||||
}, {
|
||||
.alg = "ecb(cipher_null)",
|
||||
.test = alg_test_null,
|
||||
}, {
|
||||
.alg = "ecb(des)",
|
||||
.test = alg_test_skcipher,
|
||||
@ -2696,8 +2774,6 @@ static const struct alg_test_desc alg_test_descs[] = {
|
||||
}
|
||||
}
|
||||
}, {
|
||||
|
||||
|
||||
.alg = "rfc4309(ccm(aes))",
|
||||
.test = alg_test_aead,
|
||||
.fips_allowed = 1,
|
||||
@ -2713,6 +2789,21 @@ static const struct alg_test_desc alg_test_descs[] = {
|
||||
}
|
||||
}
|
||||
}
|
||||
}, {
|
||||
.alg = "rfc4543(gcm(aes))",
|
||||
.test = alg_test_aead,
|
||||
.suite = {
|
||||
.aead = {
|
||||
.enc = {
|
||||
.vecs = aes_gcm_rfc4543_enc_tv_template,
|
||||
.count = AES_GCM_4543_ENC_TEST_VECTORS
|
||||
},
|
||||
.dec = {
|
||||
.vecs = aes_gcm_rfc4543_dec_tv_template,
|
||||
.count = AES_GCM_4543_DEC_TEST_VECTORS
|
||||
},
|
||||
}
|
||||
}
|
||||
}, {
|
||||
.alg = "rmd128",
|
||||
.test = alg_test_hash,
|
||||
|
1314
crypto/testmgr.h
1314
crypto/testmgr.h
File diff suppressed because it is too large
Load Diff
@ -86,6 +86,18 @@ config HW_RANDOM_BCM63XX
|
||||
|
||||
If unusure, say Y.
|
||||
|
||||
config HW_RANDOM_BCM2835
|
||||
tristate "Broadcom BCM2835 Random Number Generator support"
|
||||
depends on HW_RANDOM && ARCH_BCM2835
|
||||
default HW_RANDOM
|
||||
---help---
|
||||
This driver provides kernel-side support for the Random Number
|
||||
Generator hardware found on the Broadcom BCM2835 SoCs.
|
||||
|
||||
To compile this driver as a module, choose M here: the
|
||||
module will be called bcm2835-rng
|
||||
|
||||
If unsure, say Y.
|
||||
|
||||
config HW_RANDOM_GEODE
|
||||
tristate "AMD Geode HW Random Number Generator support"
|
||||
|
@ -26,3 +26,4 @@ obj-$(CONFIG_HW_RANDOM_PPC4XX) += ppc4xx-rng.o
|
||||
obj-$(CONFIG_HW_RANDOM_PSERIES) += pseries-rng.o
|
||||
obj-$(CONFIG_HW_RANDOM_EXYNOS) += exynos-rng.o
|
||||
obj-$(CONFIG_HW_RANDOM_TPM) += tpm-rng.o
|
||||
obj-$(CONFIG_HW_RANDOM_BCM2835) += bcm2835-rng.o
|
||||
|
113
drivers/char/hw_random/bcm2835-rng.c
Normal file
113
drivers/char/hw_random/bcm2835-rng.c
Normal file
@ -0,0 +1,113 @@
|
||||
/**
|
||||
* Copyright (c) 2010-2012 Broadcom. All rights reserved.
|
||||
* Copyright (c) 2013 Lubomir Rintel
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License ("GPL")
|
||||
* version 2, as published by the Free Software Foundation.
|
||||
*/
|
||||
|
||||
#include <linux/hw_random.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/io.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/of_address.h>
|
||||
#include <linux/of_platform.h>
|
||||
#include <linux/platform_device.h>
|
||||
#include <linux/printk.h>
|
||||
|
||||
#define RNG_CTRL 0x0
|
||||
#define RNG_STATUS 0x4
|
||||
#define RNG_DATA 0x8
|
||||
|
||||
/* enable rng */
|
||||
#define RNG_RBGEN 0x1
|
||||
|
||||
/* the initial numbers generated are "less random" so will be discarded */
|
||||
#define RNG_WARMUP_COUNT 0x40000
|
||||
|
||||
static int bcm2835_rng_read(struct hwrng *rng, void *buf, size_t max,
|
||||
bool wait)
|
||||
{
|
||||
void __iomem *rng_base = (void __iomem *)rng->priv;
|
||||
|
||||
while ((__raw_readl(rng_base + RNG_STATUS) >> 24) == 0) {
|
||||
if (!wait)
|
||||
return 0;
|
||||
cpu_relax();
|
||||
}
|
||||
|
||||
*(u32 *)buf = __raw_readl(rng_base + RNG_DATA);
|
||||
return sizeof(u32);
|
||||
}
|
||||
|
||||
static struct hwrng bcm2835_rng_ops = {
|
||||
.name = "bcm2835",
|
||||
.read = bcm2835_rng_read,
|
||||
};
|
||||
|
||||
static int bcm2835_rng_probe(struct platform_device *pdev)
|
||||
{
|
||||
struct device *dev = &pdev->dev;
|
||||
struct device_node *np = dev->of_node;
|
||||
void __iomem *rng_base;
|
||||
int err;
|
||||
|
||||
/* map peripheral */
|
||||
rng_base = of_iomap(np, 0);
|
||||
if (!rng_base) {
|
||||
dev_err(dev, "failed to remap rng regs");
|
||||
return -ENODEV;
|
||||
}
|
||||
bcm2835_rng_ops.priv = (unsigned long)rng_base;
|
||||
|
||||
/* register driver */
|
||||
err = hwrng_register(&bcm2835_rng_ops);
|
||||
if (err) {
|
||||
dev_err(dev, "hwrng registration failed\n");
|
||||
iounmap(rng_base);
|
||||
} else {
|
||||
dev_info(dev, "hwrng registered\n");
|
||||
|
||||
/* set warm-up count & enable */
|
||||
__raw_writel(RNG_WARMUP_COUNT, rng_base + RNG_STATUS);
|
||||
__raw_writel(RNG_RBGEN, rng_base + RNG_CTRL);
|
||||
}
|
||||
return err;
|
||||
}
|
||||
|
||||
static int bcm2835_rng_remove(struct platform_device *pdev)
|
||||
{
|
||||
void __iomem *rng_base = (void __iomem *)bcm2835_rng_ops.priv;
|
||||
|
||||
/* disable rng hardware */
|
||||
__raw_writel(0, rng_base + RNG_CTRL);
|
||||
|
||||
/* unregister driver */
|
||||
hwrng_unregister(&bcm2835_rng_ops);
|
||||
iounmap(rng_base);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static const struct of_device_id bcm2835_rng_of_match[] = {
|
||||
{ .compatible = "brcm,bcm2835-rng", },
|
||||
{},
|
||||
};
|
||||
MODULE_DEVICE_TABLE(of, bcm2835_rng_of_match);
|
||||
|
||||
static struct platform_driver bcm2835_rng_driver = {
|
||||
.driver = {
|
||||
.name = "bcm2835-rng",
|
||||
.owner = THIS_MODULE,
|
||||
.of_match_table = bcm2835_rng_of_match,
|
||||
},
|
||||
.probe = bcm2835_rng_probe,
|
||||
.remove = bcm2835_rng_remove,
|
||||
};
|
||||
module_platform_driver(bcm2835_rng_driver);
|
||||
|
||||
MODULE_AUTHOR("Lubomir Rintel <lkundrak@v3.sk>");
|
||||
MODULE_DESCRIPTION("BCM2835 Random Number Generator (RNG) driver");
|
||||
MODULE_LICENSE("GPLv2");
|
@ -144,6 +144,7 @@ static int exynos_rng_remove(struct platform_device *pdev)
|
||||
return 0;
|
||||
}
|
||||
|
||||
#if defined(CONFIG_PM_SLEEP) || defined(CONFIG_PM_RUNTIME)
|
||||
static int exynos_rng_runtime_suspend(struct device *dev)
|
||||
{
|
||||
struct platform_device *pdev = to_platform_device(dev);
|
||||
@ -161,7 +162,7 @@ static int exynos_rng_runtime_resume(struct device *dev)
|
||||
|
||||
return clk_prepare_enable(exynos_rng->clk);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
static UNIVERSAL_DEV_PM_OPS(exynos_rng_pm_ops, exynos_rng_runtime_suspend,
|
||||
exynos_rng_runtime_resume, NULL);
|
||||
|
@ -142,7 +142,7 @@ static void mxc_rnga_cleanup(struct hwrng *rng)
|
||||
static int __init mxc_rnga_probe(struct platform_device *pdev)
|
||||
{
|
||||
int err = -ENODEV;
|
||||
struct resource *res, *mem;
|
||||
struct resource *res;
|
||||
struct mxc_rng *mxc_rng;
|
||||
|
||||
mxc_rng = devm_kzalloc(&pdev->dev, sizeof(struct mxc_rng),
|
||||
@ -172,15 +172,9 @@ static int __init mxc_rnga_probe(struct platform_device *pdev)
|
||||
goto err_region;
|
||||
}
|
||||
|
||||
mem = request_mem_region(res->start, resource_size(res), pdev->name);
|
||||
if (mem == NULL) {
|
||||
err = -EBUSY;
|
||||
goto err_region;
|
||||
}
|
||||
|
||||
mxc_rng->mem = ioremap(res->start, resource_size(res));
|
||||
if (!mxc_rng->mem) {
|
||||
err = -ENOMEM;
|
||||
mxc_rng->mem = devm_ioremap_resource(&pdev->dev, res);
|
||||
if (IS_ERR(mxc_rng->mem)) {
|
||||
err = PTR_ERR(mxc_rng->mem);
|
||||
goto err_ioremap;
|
||||
}
|
||||
|
||||
@ -195,8 +189,6 @@ static int __init mxc_rnga_probe(struct platform_device *pdev)
|
||||
return 0;
|
||||
|
||||
err_ioremap:
|
||||
release_mem_region(res->start, resource_size(res));
|
||||
|
||||
err_region:
|
||||
clk_disable_unprepare(mxc_rng->clk);
|
||||
|
||||
@ -206,15 +198,10 @@ out:
|
||||
|
||||
static int __exit mxc_rnga_remove(struct platform_device *pdev)
|
||||
{
|
||||
struct resource *res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
|
||||
struct mxc_rng *mxc_rng = platform_get_drvdata(pdev);
|
||||
|
||||
hwrng_unregister(&mxc_rng->rng);
|
||||
|
||||
iounmap(mxc_rng->mem);
|
||||
|
||||
release_mem_region(res->start, resource_size(res));
|
||||
|
||||
clk_disable_unprepare(mxc_rng->clk);
|
||||
|
||||
return 0;
|
||||
|
@ -23,127 +23,209 @@
|
||||
#include <linux/module.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/platform_device.h>
|
||||
#include <linux/of.h>
|
||||
#include <linux/hw_random.h>
|
||||
#include <linux/io.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/timeriomem-rng.h>
|
||||
#include <linux/jiffies.h>
|
||||
#include <linux/sched.h>
|
||||
#include <linux/timer.h>
|
||||
#include <linux/completion.h>
|
||||
|
||||
static struct timeriomem_rng_data *timeriomem_rng_data;
|
||||
struct timeriomem_rng_private_data {
|
||||
void __iomem *io_base;
|
||||
unsigned int expires;
|
||||
unsigned int period;
|
||||
unsigned int present:1;
|
||||
|
||||
static void timeriomem_rng_trigger(unsigned long);
|
||||
static DEFINE_TIMER(timeriomem_rng_timer, timeriomem_rng_trigger, 0, 0);
|
||||
struct timer_list timer;
|
||||
struct completion completion;
|
||||
|
||||
struct hwrng timeriomem_rng_ops;
|
||||
};
|
||||
|
||||
#define to_rng_priv(rng) \
|
||||
((struct timeriomem_rng_private_data *)rng->priv)
|
||||
|
||||
/*
|
||||
* have data return 1, however return 0 if we have nothing
|
||||
*/
|
||||
static int timeriomem_rng_data_present(struct hwrng *rng, int wait)
|
||||
{
|
||||
if (rng->priv == 0)
|
||||
return 1;
|
||||
struct timeriomem_rng_private_data *priv = to_rng_priv(rng);
|
||||
|
||||
if (!wait || timeriomem_rng_data->present)
|
||||
return timeriomem_rng_data->present;
|
||||
if (!wait || priv->present)
|
||||
return priv->present;
|
||||
|
||||
wait_for_completion(&timeriomem_rng_data->completion);
|
||||
wait_for_completion(&priv->completion);
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
static int timeriomem_rng_data_read(struct hwrng *rng, u32 *data)
|
||||
{
|
||||
struct timeriomem_rng_private_data *priv = to_rng_priv(rng);
|
||||
unsigned long cur;
|
||||
s32 delay;
|
||||
|
||||
*data = readl(timeriomem_rng_data->address);
|
||||
*data = readl(priv->io_base);
|
||||
|
||||
if (rng->priv != 0) {
|
||||
cur = jiffies;
|
||||
cur = jiffies;
|
||||
|
||||
delay = cur - timeriomem_rng_timer.expires;
|
||||
delay = rng->priv - (delay % rng->priv);
|
||||
delay = cur - priv->expires;
|
||||
delay = priv->period - (delay % priv->period);
|
||||
|
||||
timeriomem_rng_timer.expires = cur + delay;
|
||||
timeriomem_rng_data->present = 0;
|
||||
priv->expires = cur + delay;
|
||||
priv->present = 0;
|
||||
|
||||
init_completion(&timeriomem_rng_data->completion);
|
||||
add_timer(&timeriomem_rng_timer);
|
||||
}
|
||||
INIT_COMPLETION(priv->completion);
|
||||
mod_timer(&priv->timer, priv->expires);
|
||||
|
||||
return 4;
|
||||
}
|
||||
|
||||
static void timeriomem_rng_trigger(unsigned long dummy)
|
||||
static void timeriomem_rng_trigger(unsigned long data)
|
||||
{
|
||||
timeriomem_rng_data->present = 1;
|
||||
complete(&timeriomem_rng_data->completion);
|
||||
}
|
||||
struct timeriomem_rng_private_data *priv
|
||||
= (struct timeriomem_rng_private_data *)data;
|
||||
|
||||
static struct hwrng timeriomem_rng_ops = {
|
||||
.name = "timeriomem",
|
||||
.data_present = timeriomem_rng_data_present,
|
||||
.data_read = timeriomem_rng_data_read,
|
||||
.priv = 0,
|
||||
};
|
||||
priv->present = 1;
|
||||
complete(&priv->completion);
|
||||
}
|
||||
|
||||
static int timeriomem_rng_probe(struct platform_device *pdev)
|
||||
{
|
||||
struct timeriomem_rng_data *pdata = pdev->dev.platform_data;
|
||||
struct timeriomem_rng_private_data *priv;
|
||||
struct resource *res;
|
||||
int ret;
|
||||
int err = 0;
|
||||
int period;
|
||||
|
||||
if (!pdev->dev.of_node && !pdata) {
|
||||
dev_err(&pdev->dev, "timeriomem_rng_data is missing\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
|
||||
|
||||
if (!res)
|
||||
return -ENOENT;
|
||||
return -ENXIO;
|
||||
|
||||
timeriomem_rng_data = pdev->dev.platform_data;
|
||||
|
||||
timeriomem_rng_data->address = ioremap(res->start, resource_size(res));
|
||||
if (!timeriomem_rng_data->address)
|
||||
return -EIO;
|
||||
|
||||
if (timeriomem_rng_data->period != 0
|
||||
&& usecs_to_jiffies(timeriomem_rng_data->period) > 0) {
|
||||
timeriomem_rng_timer.expires = jiffies;
|
||||
|
||||
timeriomem_rng_ops.priv = usecs_to_jiffies(
|
||||
timeriomem_rng_data->period);
|
||||
if (res->start % 4 != 0 || resource_size(res) != 4) {
|
||||
dev_err(&pdev->dev,
|
||||
"address must be four bytes wide and aligned\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
timeriomem_rng_data->present = 1;
|
||||
|
||||
ret = hwrng_register(&timeriomem_rng_ops);
|
||||
if (ret)
|
||||
goto failed;
|
||||
/* Allocate memory for the device structure (and zero it) */
|
||||
priv = kzalloc(sizeof(struct timeriomem_rng_private_data), GFP_KERNEL);
|
||||
if (!priv) {
|
||||
dev_err(&pdev->dev, "failed to allocate device structure.\n");
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
platform_set_drvdata(pdev, priv);
|
||||
|
||||
if (pdev->dev.of_node) {
|
||||
int i;
|
||||
|
||||
if (!of_property_read_u32(pdev->dev.of_node,
|
||||
"period", &i))
|
||||
period = i;
|
||||
else {
|
||||
dev_err(&pdev->dev, "missing period\n");
|
||||
err = -EINVAL;
|
||||
goto out_free;
|
||||
}
|
||||
} else
|
||||
period = pdata->period;
|
||||
|
||||
priv->period = usecs_to_jiffies(period);
|
||||
if (priv->period < 1) {
|
||||
dev_err(&pdev->dev, "period is less than one jiffy\n");
|
||||
err = -EINVAL;
|
||||
goto out_free;
|
||||
}
|
||||
|
||||
priv->expires = jiffies;
|
||||
priv->present = 1;
|
||||
|
||||
init_completion(&priv->completion);
|
||||
complete(&priv->completion);
|
||||
|
||||
setup_timer(&priv->timer, timeriomem_rng_trigger, (unsigned long)priv);
|
||||
|
||||
priv->timeriomem_rng_ops.name = dev_name(&pdev->dev);
|
||||
priv->timeriomem_rng_ops.data_present = timeriomem_rng_data_present;
|
||||
priv->timeriomem_rng_ops.data_read = timeriomem_rng_data_read;
|
||||
priv->timeriomem_rng_ops.priv = (unsigned long)priv;
|
||||
|
||||
if (!request_mem_region(res->start, resource_size(res),
|
||||
dev_name(&pdev->dev))) {
|
||||
dev_err(&pdev->dev, "request_mem_region failed\n");
|
||||
err = -EBUSY;
|
||||
goto out_timer;
|
||||
}
|
||||
|
||||
priv->io_base = ioremap(res->start, resource_size(res));
|
||||
if (priv->io_base == NULL) {
|
||||
dev_err(&pdev->dev, "ioremap failed\n");
|
||||
err = -EIO;
|
||||
goto out_release_io;
|
||||
}
|
||||
|
||||
err = hwrng_register(&priv->timeriomem_rng_ops);
|
||||
if (err) {
|
||||
dev_err(&pdev->dev, "problem registering\n");
|
||||
goto out;
|
||||
}
|
||||
|
||||
dev_info(&pdev->dev, "32bits from 0x%p @ %dus\n",
|
||||
timeriomem_rng_data->address,
|
||||
timeriomem_rng_data->period);
|
||||
priv->io_base, period);
|
||||
|
||||
return 0;
|
||||
|
||||
failed:
|
||||
dev_err(&pdev->dev, "problem registering\n");
|
||||
iounmap(timeriomem_rng_data->address);
|
||||
|
||||
return ret;
|
||||
out:
|
||||
iounmap(priv->io_base);
|
||||
out_release_io:
|
||||
release_mem_region(res->start, resource_size(res));
|
||||
out_timer:
|
||||
del_timer_sync(&priv->timer);
|
||||
out_free:
|
||||
platform_set_drvdata(pdev, NULL);
|
||||
kfree(priv);
|
||||
return err;
|
||||
}
|
||||
|
||||
static int timeriomem_rng_remove(struct platform_device *pdev)
|
||||
{
|
||||
del_timer_sync(&timeriomem_rng_timer);
|
||||
hwrng_unregister(&timeriomem_rng_ops);
|
||||
struct timeriomem_rng_private_data *priv = platform_get_drvdata(pdev);
|
||||
struct resource *res;
|
||||
|
||||
iounmap(timeriomem_rng_data->address);
|
||||
res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
|
||||
|
||||
hwrng_unregister(&priv->timeriomem_rng_ops);
|
||||
|
||||
del_timer_sync(&priv->timer);
|
||||
iounmap(priv->io_base);
|
||||
release_mem_region(res->start, resource_size(res));
|
||||
platform_set_drvdata(pdev, NULL);
|
||||
kfree(priv);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static const struct of_device_id timeriomem_rng_match[] = {
|
||||
{ .compatible = "timeriomem_rng" },
|
||||
{},
|
||||
};
|
||||
MODULE_DEVICE_TABLE(of, timeriomem_rng_match);
|
||||
|
||||
static struct platform_driver timeriomem_rng_driver = {
|
||||
.driver = {
|
||||
.name = "timeriomem_rng",
|
||||
.owner = THIS_MODULE,
|
||||
.of_match_table = timeriomem_rng_match,
|
||||
},
|
||||
.probe = timeriomem_rng_probe,
|
||||
.remove = timeriomem_rng_remove,
|
||||
|
@ -276,6 +276,16 @@ config CRYPTO_DEV_PICOXCELL
|
||||
|
||||
Saying m here will build a module named pipcoxcell_crypto.
|
||||
|
||||
config CRYPTO_DEV_SAHARA
|
||||
tristate "Support for SAHARA crypto accelerator"
|
||||
depends on ARCH_MXC && EXPERIMENTAL && OF
|
||||
select CRYPTO_BLKCIPHER
|
||||
select CRYPTO_AES
|
||||
select CRYPTO_ECB
|
||||
help
|
||||
This option enables support for the SAHARA HW crypto accelerator
|
||||
found in some Freescale i.MX chips.
|
||||
|
||||
config CRYPTO_DEV_S5P
|
||||
tristate "Support for Samsung S5PV210 crypto accelerator"
|
||||
depends on ARCH_S5PV210
|
||||
@ -361,15 +371,17 @@ config CRYPTO_DEV_ATMEL_TDES
|
||||
will be called atmel-tdes.
|
||||
|
||||
config CRYPTO_DEV_ATMEL_SHA
|
||||
tristate "Support for Atmel SHA1/SHA256 hw accelerator"
|
||||
tristate "Support for Atmel SHA hw accelerator"
|
||||
depends on ARCH_AT91
|
||||
select CRYPTO_SHA1
|
||||
select CRYPTO_SHA256
|
||||
select CRYPTO_SHA512
|
||||
select CRYPTO_ALGAPI
|
||||
help
|
||||
Some Atmel processors have SHA1/SHA256 hw accelerator.
|
||||
Some Atmel processors have SHA1/SHA224/SHA256/SHA384/SHA512
|
||||
hw accelerator.
|
||||
Select this if you want to use the Atmel module for
|
||||
SHA1/SHA256 algorithms.
|
||||
SHA1/SHA224/SHA256/SHA384/SHA512 algorithms.
|
||||
|
||||
To compile this driver as a module, choose M here: the module
|
||||
will be called atmel-sha.
|
||||
|
@ -12,6 +12,7 @@ obj-$(CONFIG_CRYPTO_DEV_PPC4XX) += amcc/
|
||||
obj-$(CONFIG_CRYPTO_DEV_OMAP_SHAM) += omap-sham.o
|
||||
obj-$(CONFIG_CRYPTO_DEV_OMAP_AES) += omap-aes.o
|
||||
obj-$(CONFIG_CRYPTO_DEV_PICOXCELL) += picoxcell_crypto.o
|
||||
obj-$(CONFIG_CRYPTO_DEV_SAHARA) += sahara.o
|
||||
obj-$(CONFIG_CRYPTO_DEV_S5P) += s5p-sss.o
|
||||
obj-$(CONFIG_CRYPTO_DEV_TEGRA_AES) += tegra-aes.o
|
||||
obj-$(CONFIG_CRYPTO_DEV_UX500) += ux500/
|
||||
|
@ -38,7 +38,7 @@
|
||||
#include <crypto/aes.h>
|
||||
#include <crypto/hash.h>
|
||||
#include <crypto/internal/hash.h>
|
||||
#include <linux/platform_data/atmel-aes.h>
|
||||
#include <linux/platform_data/crypto-atmel.h>
|
||||
#include "atmel-aes-regs.h"
|
||||
|
||||
#define CFB8_BLOCK_SIZE 1
|
||||
@ -47,7 +47,7 @@
|
||||
#define CFB64_BLOCK_SIZE 8
|
||||
|
||||
/* AES flags */
|
||||
#define AES_FLAGS_MODE_MASK 0x01ff
|
||||
#define AES_FLAGS_MODE_MASK 0x03ff
|
||||
#define AES_FLAGS_ENCRYPT BIT(0)
|
||||
#define AES_FLAGS_CBC BIT(1)
|
||||
#define AES_FLAGS_CFB BIT(2)
|
||||
@ -55,21 +55,26 @@
|
||||
#define AES_FLAGS_CFB16 BIT(4)
|
||||
#define AES_FLAGS_CFB32 BIT(5)
|
||||
#define AES_FLAGS_CFB64 BIT(6)
|
||||
#define AES_FLAGS_OFB BIT(7)
|
||||
#define AES_FLAGS_CTR BIT(8)
|
||||
#define AES_FLAGS_CFB128 BIT(7)
|
||||
#define AES_FLAGS_OFB BIT(8)
|
||||
#define AES_FLAGS_CTR BIT(9)
|
||||
|
||||
#define AES_FLAGS_INIT BIT(16)
|
||||
#define AES_FLAGS_DMA BIT(17)
|
||||
#define AES_FLAGS_BUSY BIT(18)
|
||||
#define AES_FLAGS_FAST BIT(19)
|
||||
|
||||
#define AES_FLAGS_DUALBUFF BIT(24)
|
||||
|
||||
#define ATMEL_AES_QUEUE_LENGTH 1
|
||||
#define ATMEL_AES_CACHE_SIZE 0
|
||||
#define ATMEL_AES_QUEUE_LENGTH 50
|
||||
|
||||
#define ATMEL_AES_DMA_THRESHOLD 16
|
||||
|
||||
|
||||
struct atmel_aes_caps {
|
||||
bool has_dualbuff;
|
||||
bool has_cfb64;
|
||||
u32 max_burst_size;
|
||||
};
|
||||
|
||||
struct atmel_aes_dev;
|
||||
|
||||
struct atmel_aes_ctx {
|
||||
@ -77,6 +82,8 @@ struct atmel_aes_ctx {
|
||||
|
||||
int keylen;
|
||||
u32 key[AES_KEYSIZE_256 / sizeof(u32)];
|
||||
|
||||
u16 block_size;
|
||||
};
|
||||
|
||||
struct atmel_aes_reqctx {
|
||||
@ -112,20 +119,27 @@ struct atmel_aes_dev {
|
||||
|
||||
struct scatterlist *in_sg;
|
||||
unsigned int nb_in_sg;
|
||||
|
||||
size_t in_offset;
|
||||
struct scatterlist *out_sg;
|
||||
unsigned int nb_out_sg;
|
||||
size_t out_offset;
|
||||
|
||||
size_t bufcnt;
|
||||
size_t buflen;
|
||||
size_t dma_size;
|
||||
|
||||
u8 buf_in[ATMEL_AES_DMA_THRESHOLD] __aligned(sizeof(u32));
|
||||
int dma_in;
|
||||
void *buf_in;
|
||||
int dma_in;
|
||||
dma_addr_t dma_addr_in;
|
||||
struct atmel_aes_dma dma_lch_in;
|
||||
|
||||
u8 buf_out[ATMEL_AES_DMA_THRESHOLD] __aligned(sizeof(u32));
|
||||
int dma_out;
|
||||
void *buf_out;
|
||||
int dma_out;
|
||||
dma_addr_t dma_addr_out;
|
||||
struct atmel_aes_dma dma_lch_out;
|
||||
|
||||
struct atmel_aes_caps caps;
|
||||
|
||||
u32 hw_version;
|
||||
};
|
||||
|
||||
@ -165,6 +179,37 @@ static int atmel_aes_sg_length(struct ablkcipher_request *req,
|
||||
return sg_nb;
|
||||
}
|
||||
|
||||
static int atmel_aes_sg_copy(struct scatterlist **sg, size_t *offset,
|
||||
void *buf, size_t buflen, size_t total, int out)
|
||||
{
|
||||
unsigned int count, off = 0;
|
||||
|
||||
while (buflen && total) {
|
||||
count = min((*sg)->length - *offset, total);
|
||||
count = min(count, buflen);
|
||||
|
||||
if (!count)
|
||||
return off;
|
||||
|
||||
scatterwalk_map_and_copy(buf + off, *sg, *offset, count, out);
|
||||
|
||||
off += count;
|
||||
buflen -= count;
|
||||
*offset += count;
|
||||
total -= count;
|
||||
|
||||
if (*offset == (*sg)->length) {
|
||||
*sg = sg_next(*sg);
|
||||
if (*sg)
|
||||
*offset = 0;
|
||||
else
|
||||
total = 0;
|
||||
}
|
||||
}
|
||||
|
||||
return off;
|
||||
}
|
||||
|
||||
static inline u32 atmel_aes_read(struct atmel_aes_dev *dd, u32 offset)
|
||||
{
|
||||
return readl_relaxed(dd->io_base + offset);
|
||||
@ -190,14 +235,6 @@ static void atmel_aes_write_n(struct atmel_aes_dev *dd, u32 offset,
|
||||
atmel_aes_write(dd, offset, *value);
|
||||
}
|
||||
|
||||
static void atmel_aes_dualbuff_test(struct atmel_aes_dev *dd)
|
||||
{
|
||||
atmel_aes_write(dd, AES_MR, AES_MR_DUALBUFF);
|
||||
|
||||
if (atmel_aes_read(dd, AES_MR) & AES_MR_DUALBUFF)
|
||||
dd->flags |= AES_FLAGS_DUALBUFF;
|
||||
}
|
||||
|
||||
static struct atmel_aes_dev *atmel_aes_find_dev(struct atmel_aes_ctx *ctx)
|
||||
{
|
||||
struct atmel_aes_dev *aes_dd = NULL;
|
||||
@ -225,7 +262,7 @@ static int atmel_aes_hw_init(struct atmel_aes_dev *dd)
|
||||
|
||||
if (!(dd->flags & AES_FLAGS_INIT)) {
|
||||
atmel_aes_write(dd, AES_CR, AES_CR_SWRST);
|
||||
atmel_aes_dualbuff_test(dd);
|
||||
atmel_aes_write(dd, AES_MR, 0xE << AES_MR_CKEY_OFFSET);
|
||||
dd->flags |= AES_FLAGS_INIT;
|
||||
dd->err = 0;
|
||||
}
|
||||
@ -233,11 +270,19 @@ static int atmel_aes_hw_init(struct atmel_aes_dev *dd)
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline unsigned int atmel_aes_get_version(struct atmel_aes_dev *dd)
|
||||
{
|
||||
return atmel_aes_read(dd, AES_HW_VERSION) & 0x00000fff;
|
||||
}
|
||||
|
||||
static void atmel_aes_hw_version_init(struct atmel_aes_dev *dd)
|
||||
{
|
||||
atmel_aes_hw_init(dd);
|
||||
|
||||
dd->hw_version = atmel_aes_read(dd, AES_HW_VERSION);
|
||||
dd->hw_version = atmel_aes_get_version(dd);
|
||||
|
||||
dev_info(dd->dev,
|
||||
"version: 0x%x\n", dd->hw_version);
|
||||
|
||||
clk_disable_unprepare(dd->iclk);
|
||||
}
|
||||
@ -260,50 +305,77 @@ static void atmel_aes_dma_callback(void *data)
|
||||
tasklet_schedule(&dd->done_task);
|
||||
}
|
||||
|
||||
static int atmel_aes_crypt_dma(struct atmel_aes_dev *dd)
|
||||
static int atmel_aes_crypt_dma(struct atmel_aes_dev *dd,
|
||||
dma_addr_t dma_addr_in, dma_addr_t dma_addr_out, int length)
|
||||
{
|
||||
struct scatterlist sg[2];
|
||||
struct dma_async_tx_descriptor *in_desc, *out_desc;
|
||||
int nb_dma_sg_in, nb_dma_sg_out;
|
||||
|
||||
dd->nb_in_sg = atmel_aes_sg_length(dd->req, dd->in_sg);
|
||||
if (!dd->nb_in_sg)
|
||||
goto exit_err;
|
||||
dd->dma_size = length;
|
||||
|
||||
nb_dma_sg_in = dma_map_sg(dd->dev, dd->in_sg, dd->nb_in_sg,
|
||||
DMA_TO_DEVICE);
|
||||
if (!nb_dma_sg_in)
|
||||
goto exit_err;
|
||||
if (!(dd->flags & AES_FLAGS_FAST)) {
|
||||
dma_sync_single_for_device(dd->dev, dma_addr_in, length,
|
||||
DMA_TO_DEVICE);
|
||||
}
|
||||
|
||||
in_desc = dmaengine_prep_slave_sg(dd->dma_lch_in.chan, dd->in_sg,
|
||||
nb_dma_sg_in, DMA_MEM_TO_DEV,
|
||||
if (dd->flags & AES_FLAGS_CFB8) {
|
||||
dd->dma_lch_in.dma_conf.dst_addr_width =
|
||||
DMA_SLAVE_BUSWIDTH_1_BYTE;
|
||||
dd->dma_lch_out.dma_conf.src_addr_width =
|
||||
DMA_SLAVE_BUSWIDTH_1_BYTE;
|
||||
} else if (dd->flags & AES_FLAGS_CFB16) {
|
||||
dd->dma_lch_in.dma_conf.dst_addr_width =
|
||||
DMA_SLAVE_BUSWIDTH_2_BYTES;
|
||||
dd->dma_lch_out.dma_conf.src_addr_width =
|
||||
DMA_SLAVE_BUSWIDTH_2_BYTES;
|
||||
} else {
|
||||
dd->dma_lch_in.dma_conf.dst_addr_width =
|
||||
DMA_SLAVE_BUSWIDTH_4_BYTES;
|
||||
dd->dma_lch_out.dma_conf.src_addr_width =
|
||||
DMA_SLAVE_BUSWIDTH_4_BYTES;
|
||||
}
|
||||
|
||||
if (dd->flags & (AES_FLAGS_CFB8 | AES_FLAGS_CFB16 |
|
||||
AES_FLAGS_CFB32 | AES_FLAGS_CFB64)) {
|
||||
dd->dma_lch_in.dma_conf.src_maxburst = 1;
|
||||
dd->dma_lch_in.dma_conf.dst_maxburst = 1;
|
||||
dd->dma_lch_out.dma_conf.src_maxburst = 1;
|
||||
dd->dma_lch_out.dma_conf.dst_maxburst = 1;
|
||||
} else {
|
||||
dd->dma_lch_in.dma_conf.src_maxburst = dd->caps.max_burst_size;
|
||||
dd->dma_lch_in.dma_conf.dst_maxburst = dd->caps.max_burst_size;
|
||||
dd->dma_lch_out.dma_conf.src_maxburst = dd->caps.max_burst_size;
|
||||
dd->dma_lch_out.dma_conf.dst_maxburst = dd->caps.max_burst_size;
|
||||
}
|
||||
|
||||
dmaengine_slave_config(dd->dma_lch_in.chan, &dd->dma_lch_in.dma_conf);
|
||||
dmaengine_slave_config(dd->dma_lch_out.chan, &dd->dma_lch_out.dma_conf);
|
||||
|
||||
dd->flags |= AES_FLAGS_DMA;
|
||||
|
||||
sg_init_table(&sg[0], 1);
|
||||
sg_dma_address(&sg[0]) = dma_addr_in;
|
||||
sg_dma_len(&sg[0]) = length;
|
||||
|
||||
sg_init_table(&sg[1], 1);
|
||||
sg_dma_address(&sg[1]) = dma_addr_out;
|
||||
sg_dma_len(&sg[1]) = length;
|
||||
|
||||
in_desc = dmaengine_prep_slave_sg(dd->dma_lch_in.chan, &sg[0],
|
||||
1, DMA_MEM_TO_DEV,
|
||||
DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
|
||||
|
||||
if (!in_desc)
|
||||
goto unmap_in;
|
||||
return -EINVAL;
|
||||
|
||||
/* callback not needed */
|
||||
|
||||
dd->nb_out_sg = atmel_aes_sg_length(dd->req, dd->out_sg);
|
||||
if (!dd->nb_out_sg)
|
||||
goto unmap_in;
|
||||
|
||||
nb_dma_sg_out = dma_map_sg(dd->dev, dd->out_sg, dd->nb_out_sg,
|
||||
DMA_FROM_DEVICE);
|
||||
if (!nb_dma_sg_out)
|
||||
goto unmap_out;
|
||||
|
||||
out_desc = dmaengine_prep_slave_sg(dd->dma_lch_out.chan, dd->out_sg,
|
||||
nb_dma_sg_out, DMA_DEV_TO_MEM,
|
||||
out_desc = dmaengine_prep_slave_sg(dd->dma_lch_out.chan, &sg[1],
|
||||
1, DMA_DEV_TO_MEM,
|
||||
DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
|
||||
|
||||
if (!out_desc)
|
||||
goto unmap_out;
|
||||
return -EINVAL;
|
||||
|
||||
out_desc->callback = atmel_aes_dma_callback;
|
||||
out_desc->callback_param = dd;
|
||||
|
||||
dd->total -= dd->req->nbytes;
|
||||
|
||||
dmaengine_submit(out_desc);
|
||||
dma_async_issue_pending(dd->dma_lch_out.chan);
|
||||
|
||||
@ -311,15 +383,6 @@ static int atmel_aes_crypt_dma(struct atmel_aes_dev *dd)
|
||||
dma_async_issue_pending(dd->dma_lch_in.chan);
|
||||
|
||||
return 0;
|
||||
|
||||
unmap_out:
|
||||
dma_unmap_sg(dd->dev, dd->out_sg, dd->nb_out_sg,
|
||||
DMA_FROM_DEVICE);
|
||||
unmap_in:
|
||||
dma_unmap_sg(dd->dev, dd->in_sg, dd->nb_in_sg,
|
||||
DMA_TO_DEVICE);
|
||||
exit_err:
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
static int atmel_aes_crypt_cpu_start(struct atmel_aes_dev *dd)
|
||||
@ -352,30 +415,66 @@ static int atmel_aes_crypt_cpu_start(struct atmel_aes_dev *dd)
|
||||
|
||||
static int atmel_aes_crypt_dma_start(struct atmel_aes_dev *dd)
|
||||
{
|
||||
int err;
|
||||
int err, fast = 0, in, out;
|
||||
size_t count;
|
||||
dma_addr_t addr_in, addr_out;
|
||||
|
||||
if (dd->flags & AES_FLAGS_CFB8) {
|
||||
dd->dma_lch_in.dma_conf.dst_addr_width =
|
||||
DMA_SLAVE_BUSWIDTH_1_BYTE;
|
||||
dd->dma_lch_out.dma_conf.src_addr_width =
|
||||
DMA_SLAVE_BUSWIDTH_1_BYTE;
|
||||
} else if (dd->flags & AES_FLAGS_CFB16) {
|
||||
dd->dma_lch_in.dma_conf.dst_addr_width =
|
||||
DMA_SLAVE_BUSWIDTH_2_BYTES;
|
||||
dd->dma_lch_out.dma_conf.src_addr_width =
|
||||
DMA_SLAVE_BUSWIDTH_2_BYTES;
|
||||
} else {
|
||||
dd->dma_lch_in.dma_conf.dst_addr_width =
|
||||
DMA_SLAVE_BUSWIDTH_4_BYTES;
|
||||
dd->dma_lch_out.dma_conf.src_addr_width =
|
||||
DMA_SLAVE_BUSWIDTH_4_BYTES;
|
||||
if ((!dd->in_offset) && (!dd->out_offset)) {
|
||||
/* check for alignment */
|
||||
in = IS_ALIGNED((u32)dd->in_sg->offset, sizeof(u32)) &&
|
||||
IS_ALIGNED(dd->in_sg->length, dd->ctx->block_size);
|
||||
out = IS_ALIGNED((u32)dd->out_sg->offset, sizeof(u32)) &&
|
||||
IS_ALIGNED(dd->out_sg->length, dd->ctx->block_size);
|
||||
fast = in && out;
|
||||
|
||||
if (sg_dma_len(dd->in_sg) != sg_dma_len(dd->out_sg))
|
||||
fast = 0;
|
||||
}
|
||||
|
||||
dmaengine_slave_config(dd->dma_lch_in.chan, &dd->dma_lch_in.dma_conf);
|
||||
dmaengine_slave_config(dd->dma_lch_out.chan, &dd->dma_lch_out.dma_conf);
|
||||
|
||||
dd->flags |= AES_FLAGS_DMA;
|
||||
err = atmel_aes_crypt_dma(dd);
|
||||
if (fast) {
|
||||
count = min(dd->total, sg_dma_len(dd->in_sg));
|
||||
count = min(count, sg_dma_len(dd->out_sg));
|
||||
|
||||
err = dma_map_sg(dd->dev, dd->in_sg, 1, DMA_TO_DEVICE);
|
||||
if (!err) {
|
||||
dev_err(dd->dev, "dma_map_sg() error\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
err = dma_map_sg(dd->dev, dd->out_sg, 1,
|
||||
DMA_FROM_DEVICE);
|
||||
if (!err) {
|
||||
dev_err(dd->dev, "dma_map_sg() error\n");
|
||||
dma_unmap_sg(dd->dev, dd->in_sg, 1,
|
||||
DMA_TO_DEVICE);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
addr_in = sg_dma_address(dd->in_sg);
|
||||
addr_out = sg_dma_address(dd->out_sg);
|
||||
|
||||
dd->flags |= AES_FLAGS_FAST;
|
||||
|
||||
} else {
|
||||
/* use cache buffers */
|
||||
count = atmel_aes_sg_copy(&dd->in_sg, &dd->in_offset,
|
||||
dd->buf_in, dd->buflen, dd->total, 0);
|
||||
|
||||
addr_in = dd->dma_addr_in;
|
||||
addr_out = dd->dma_addr_out;
|
||||
|
||||
dd->flags &= ~AES_FLAGS_FAST;
|
||||
}
|
||||
|
||||
dd->total -= count;
|
||||
|
||||
err = atmel_aes_crypt_dma(dd, addr_in, addr_out, count);
|
||||
|
||||
if (err && (dd->flags & AES_FLAGS_FAST)) {
|
||||
dma_unmap_sg(dd->dev, dd->in_sg, 1, DMA_TO_DEVICE);
|
||||
dma_unmap_sg(dd->dev, dd->out_sg, 1, DMA_TO_DEVICE);
|
||||
}
|
||||
|
||||
return err;
|
||||
}
|
||||
@ -410,6 +509,8 @@ static int atmel_aes_write_ctrl(struct atmel_aes_dev *dd)
|
||||
valmr |= AES_MR_CFBS_32b;
|
||||
else if (dd->flags & AES_FLAGS_CFB64)
|
||||
valmr |= AES_MR_CFBS_64b;
|
||||
else if (dd->flags & AES_FLAGS_CFB128)
|
||||
valmr |= AES_MR_CFBS_128b;
|
||||
} else if (dd->flags & AES_FLAGS_OFB) {
|
||||
valmr |= AES_MR_OPMOD_OFB;
|
||||
} else if (dd->flags & AES_FLAGS_CTR) {
|
||||
@ -423,7 +524,7 @@ static int atmel_aes_write_ctrl(struct atmel_aes_dev *dd)
|
||||
|
||||
if (dd->total > ATMEL_AES_DMA_THRESHOLD) {
|
||||
valmr |= AES_MR_SMOD_IDATAR0;
|
||||
if (dd->flags & AES_FLAGS_DUALBUFF)
|
||||
if (dd->caps.has_dualbuff)
|
||||
valmr |= AES_MR_DUALBUFF;
|
||||
} else {
|
||||
valmr |= AES_MR_SMOD_AUTO;
|
||||
@ -477,7 +578,9 @@ static int atmel_aes_handle_queue(struct atmel_aes_dev *dd,
|
||||
/* assign new request to device */
|
||||
dd->req = req;
|
||||
dd->total = req->nbytes;
|
||||
dd->in_offset = 0;
|
||||
dd->in_sg = req->src;
|
||||
dd->out_offset = 0;
|
||||
dd->out_sg = req->dst;
|
||||
|
||||
rctx = ablkcipher_request_ctx(req);
|
||||
@ -506,18 +609,86 @@ static int atmel_aes_handle_queue(struct atmel_aes_dev *dd,
|
||||
static int atmel_aes_crypt_dma_stop(struct atmel_aes_dev *dd)
|
||||
{
|
||||
int err = -EINVAL;
|
||||
size_t count;
|
||||
|
||||
if (dd->flags & AES_FLAGS_DMA) {
|
||||
dma_unmap_sg(dd->dev, dd->out_sg,
|
||||
dd->nb_out_sg, DMA_FROM_DEVICE);
|
||||
dma_unmap_sg(dd->dev, dd->in_sg,
|
||||
dd->nb_in_sg, DMA_TO_DEVICE);
|
||||
err = 0;
|
||||
if (dd->flags & AES_FLAGS_FAST) {
|
||||
dma_unmap_sg(dd->dev, dd->out_sg, 1, DMA_FROM_DEVICE);
|
||||
dma_unmap_sg(dd->dev, dd->in_sg, 1, DMA_TO_DEVICE);
|
||||
} else {
|
||||
dma_sync_single_for_device(dd->dev, dd->dma_addr_out,
|
||||
dd->dma_size, DMA_FROM_DEVICE);
|
||||
|
||||
/* copy data */
|
||||
count = atmel_aes_sg_copy(&dd->out_sg, &dd->out_offset,
|
||||
dd->buf_out, dd->buflen, dd->dma_size, 1);
|
||||
if (count != dd->dma_size) {
|
||||
err = -EINVAL;
|
||||
pr_err("not all data converted: %u\n", count);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
|
||||
static int atmel_aes_buff_init(struct atmel_aes_dev *dd)
|
||||
{
|
||||
int err = -ENOMEM;
|
||||
|
||||
dd->buf_in = (void *)__get_free_pages(GFP_KERNEL, 0);
|
||||
dd->buf_out = (void *)__get_free_pages(GFP_KERNEL, 0);
|
||||
dd->buflen = PAGE_SIZE;
|
||||
dd->buflen &= ~(AES_BLOCK_SIZE - 1);
|
||||
|
||||
if (!dd->buf_in || !dd->buf_out) {
|
||||
dev_err(dd->dev, "unable to alloc pages.\n");
|
||||
goto err_alloc;
|
||||
}
|
||||
|
||||
/* MAP here */
|
||||
dd->dma_addr_in = dma_map_single(dd->dev, dd->buf_in,
|
||||
dd->buflen, DMA_TO_DEVICE);
|
||||
if (dma_mapping_error(dd->dev, dd->dma_addr_in)) {
|
||||
dev_err(dd->dev, "dma %d bytes error\n", dd->buflen);
|
||||
err = -EINVAL;
|
||||
goto err_map_in;
|
||||
}
|
||||
|
||||
dd->dma_addr_out = dma_map_single(dd->dev, dd->buf_out,
|
||||
dd->buflen, DMA_FROM_DEVICE);
|
||||
if (dma_mapping_error(dd->dev, dd->dma_addr_out)) {
|
||||
dev_err(dd->dev, "dma %d bytes error\n", dd->buflen);
|
||||
err = -EINVAL;
|
||||
goto err_map_out;
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
||||
err_map_out:
|
||||
dma_unmap_single(dd->dev, dd->dma_addr_in, dd->buflen,
|
||||
DMA_TO_DEVICE);
|
||||
err_map_in:
|
||||
free_page((unsigned long)dd->buf_out);
|
||||
free_page((unsigned long)dd->buf_in);
|
||||
err_alloc:
|
||||
if (err)
|
||||
pr_err("error: %d\n", err);
|
||||
return err;
|
||||
}
|
||||
|
||||
static void atmel_aes_buff_cleanup(struct atmel_aes_dev *dd)
|
||||
{
|
||||
dma_unmap_single(dd->dev, dd->dma_addr_out, dd->buflen,
|
||||
DMA_FROM_DEVICE);
|
||||
dma_unmap_single(dd->dev, dd->dma_addr_in, dd->buflen,
|
||||
DMA_TO_DEVICE);
|
||||
free_page((unsigned long)dd->buf_out);
|
||||
free_page((unsigned long)dd->buf_in);
|
||||
}
|
||||
|
||||
static int atmel_aes_crypt(struct ablkcipher_request *req, unsigned long mode)
|
||||
{
|
||||
struct atmel_aes_ctx *ctx = crypto_ablkcipher_ctx(
|
||||
@ -525,9 +696,30 @@ static int atmel_aes_crypt(struct ablkcipher_request *req, unsigned long mode)
|
||||
struct atmel_aes_reqctx *rctx = ablkcipher_request_ctx(req);
|
||||
struct atmel_aes_dev *dd;
|
||||
|
||||
if (!IS_ALIGNED(req->nbytes, AES_BLOCK_SIZE)) {
|
||||
pr_err("request size is not exact amount of AES blocks\n");
|
||||
return -EINVAL;
|
||||
if (mode & AES_FLAGS_CFB8) {
|
||||
if (!IS_ALIGNED(req->nbytes, CFB8_BLOCK_SIZE)) {
|
||||
pr_err("request size is not exact amount of CFB8 blocks\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
ctx->block_size = CFB8_BLOCK_SIZE;
|
||||
} else if (mode & AES_FLAGS_CFB16) {
|
||||
if (!IS_ALIGNED(req->nbytes, CFB16_BLOCK_SIZE)) {
|
||||
pr_err("request size is not exact amount of CFB16 blocks\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
ctx->block_size = CFB16_BLOCK_SIZE;
|
||||
} else if (mode & AES_FLAGS_CFB32) {
|
||||
if (!IS_ALIGNED(req->nbytes, CFB32_BLOCK_SIZE)) {
|
||||
pr_err("request size is not exact amount of CFB32 blocks\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
ctx->block_size = CFB32_BLOCK_SIZE;
|
||||
} else {
|
||||
if (!IS_ALIGNED(req->nbytes, AES_BLOCK_SIZE)) {
|
||||
pr_err("request size is not exact amount of AES blocks\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
ctx->block_size = AES_BLOCK_SIZE;
|
||||
}
|
||||
|
||||
dd = atmel_aes_find_dev(ctx);
|
||||
@ -551,14 +743,12 @@ static bool atmel_aes_filter(struct dma_chan *chan, void *slave)
|
||||
}
|
||||
}
|
||||
|
||||
static int atmel_aes_dma_init(struct atmel_aes_dev *dd)
|
||||
static int atmel_aes_dma_init(struct atmel_aes_dev *dd,
|
||||
struct crypto_platform_data *pdata)
|
||||
{
|
||||
int err = -ENOMEM;
|
||||
struct aes_platform_data *pdata;
|
||||
dma_cap_mask_t mask_in, mask_out;
|
||||
|
||||
pdata = dd->dev->platform_data;
|
||||
|
||||
if (pdata && pdata->dma_slave->txdata.dma_dev &&
|
||||
pdata->dma_slave->rxdata.dma_dev) {
|
||||
|
||||
@ -568,28 +758,38 @@ static int atmel_aes_dma_init(struct atmel_aes_dev *dd)
|
||||
|
||||
dd->dma_lch_in.chan = dma_request_channel(mask_in,
|
||||
atmel_aes_filter, &pdata->dma_slave->rxdata);
|
||||
|
||||
if (!dd->dma_lch_in.chan)
|
||||
goto err_dma_in;
|
||||
|
||||
dd->dma_lch_in.dma_conf.direction = DMA_MEM_TO_DEV;
|
||||
dd->dma_lch_in.dma_conf.dst_addr = dd->phys_base +
|
||||
AES_IDATAR(0);
|
||||
dd->dma_lch_in.dma_conf.src_maxburst = 1;
|
||||
dd->dma_lch_in.dma_conf.dst_maxburst = 1;
|
||||
dd->dma_lch_in.dma_conf.src_maxburst = dd->caps.max_burst_size;
|
||||
dd->dma_lch_in.dma_conf.src_addr_width =
|
||||
DMA_SLAVE_BUSWIDTH_4_BYTES;
|
||||
dd->dma_lch_in.dma_conf.dst_maxburst = dd->caps.max_burst_size;
|
||||
dd->dma_lch_in.dma_conf.dst_addr_width =
|
||||
DMA_SLAVE_BUSWIDTH_4_BYTES;
|
||||
dd->dma_lch_in.dma_conf.device_fc = false;
|
||||
|
||||
dma_cap_zero(mask_out);
|
||||
dma_cap_set(DMA_SLAVE, mask_out);
|
||||
dd->dma_lch_out.chan = dma_request_channel(mask_out,
|
||||
atmel_aes_filter, &pdata->dma_slave->txdata);
|
||||
|
||||
if (!dd->dma_lch_out.chan)
|
||||
goto err_dma_out;
|
||||
|
||||
dd->dma_lch_out.dma_conf.direction = DMA_DEV_TO_MEM;
|
||||
dd->dma_lch_out.dma_conf.src_addr = dd->phys_base +
|
||||
AES_ODATAR(0);
|
||||
dd->dma_lch_out.dma_conf.src_maxburst = 1;
|
||||
dd->dma_lch_out.dma_conf.dst_maxburst = 1;
|
||||
dd->dma_lch_out.dma_conf.src_maxburst = dd->caps.max_burst_size;
|
||||
dd->dma_lch_out.dma_conf.src_addr_width =
|
||||
DMA_SLAVE_BUSWIDTH_4_BYTES;
|
||||
dd->dma_lch_out.dma_conf.dst_maxburst = dd->caps.max_burst_size;
|
||||
dd->dma_lch_out.dma_conf.dst_addr_width =
|
||||
DMA_SLAVE_BUSWIDTH_4_BYTES;
|
||||
dd->dma_lch_out.dma_conf.device_fc = false;
|
||||
|
||||
return 0;
|
||||
@ -665,13 +865,13 @@ static int atmel_aes_ofb_decrypt(struct ablkcipher_request *req)
|
||||
static int atmel_aes_cfb_encrypt(struct ablkcipher_request *req)
|
||||
{
|
||||
return atmel_aes_crypt(req,
|
||||
AES_FLAGS_ENCRYPT | AES_FLAGS_CFB);
|
||||
AES_FLAGS_ENCRYPT | AES_FLAGS_CFB | AES_FLAGS_CFB128);
|
||||
}
|
||||
|
||||
static int atmel_aes_cfb_decrypt(struct ablkcipher_request *req)
|
||||
{
|
||||
return atmel_aes_crypt(req,
|
||||
AES_FLAGS_CFB);
|
||||
AES_FLAGS_CFB | AES_FLAGS_CFB128);
|
||||
}
|
||||
|
||||
static int atmel_aes_cfb64_encrypt(struct ablkcipher_request *req)
|
||||
@ -753,7 +953,7 @@ static struct crypto_alg aes_algs[] = {
|
||||
.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
|
||||
.cra_blocksize = AES_BLOCK_SIZE,
|
||||
.cra_ctxsize = sizeof(struct atmel_aes_ctx),
|
||||
.cra_alignmask = 0x0,
|
||||
.cra_alignmask = 0xf,
|
||||
.cra_type = &crypto_ablkcipher_type,
|
||||
.cra_module = THIS_MODULE,
|
||||
.cra_init = atmel_aes_cra_init,
|
||||
@ -773,7 +973,7 @@ static struct crypto_alg aes_algs[] = {
|
||||
.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
|
||||
.cra_blocksize = AES_BLOCK_SIZE,
|
||||
.cra_ctxsize = sizeof(struct atmel_aes_ctx),
|
||||
.cra_alignmask = 0x0,
|
||||
.cra_alignmask = 0xf,
|
||||
.cra_type = &crypto_ablkcipher_type,
|
||||
.cra_module = THIS_MODULE,
|
||||
.cra_init = atmel_aes_cra_init,
|
||||
@ -794,7 +994,7 @@ static struct crypto_alg aes_algs[] = {
|
||||
.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
|
||||
.cra_blocksize = AES_BLOCK_SIZE,
|
||||
.cra_ctxsize = sizeof(struct atmel_aes_ctx),
|
||||
.cra_alignmask = 0x0,
|
||||
.cra_alignmask = 0xf,
|
||||
.cra_type = &crypto_ablkcipher_type,
|
||||
.cra_module = THIS_MODULE,
|
||||
.cra_init = atmel_aes_cra_init,
|
||||
@ -815,7 +1015,7 @@ static struct crypto_alg aes_algs[] = {
|
||||
.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
|
||||
.cra_blocksize = AES_BLOCK_SIZE,
|
||||
.cra_ctxsize = sizeof(struct atmel_aes_ctx),
|
||||
.cra_alignmask = 0x0,
|
||||
.cra_alignmask = 0xf,
|
||||
.cra_type = &crypto_ablkcipher_type,
|
||||
.cra_module = THIS_MODULE,
|
||||
.cra_init = atmel_aes_cra_init,
|
||||
@ -836,7 +1036,7 @@ static struct crypto_alg aes_algs[] = {
|
||||
.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
|
||||
.cra_blocksize = CFB32_BLOCK_SIZE,
|
||||
.cra_ctxsize = sizeof(struct atmel_aes_ctx),
|
||||
.cra_alignmask = 0x0,
|
||||
.cra_alignmask = 0x3,
|
||||
.cra_type = &crypto_ablkcipher_type,
|
||||
.cra_module = THIS_MODULE,
|
||||
.cra_init = atmel_aes_cra_init,
|
||||
@ -857,7 +1057,7 @@ static struct crypto_alg aes_algs[] = {
|
||||
.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
|
||||
.cra_blocksize = CFB16_BLOCK_SIZE,
|
||||
.cra_ctxsize = sizeof(struct atmel_aes_ctx),
|
||||
.cra_alignmask = 0x0,
|
||||
.cra_alignmask = 0x1,
|
||||
.cra_type = &crypto_ablkcipher_type,
|
||||
.cra_module = THIS_MODULE,
|
||||
.cra_init = atmel_aes_cra_init,
|
||||
@ -899,7 +1099,7 @@ static struct crypto_alg aes_algs[] = {
|
||||
.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
|
||||
.cra_blocksize = AES_BLOCK_SIZE,
|
||||
.cra_ctxsize = sizeof(struct atmel_aes_ctx),
|
||||
.cra_alignmask = 0x0,
|
||||
.cra_alignmask = 0xf,
|
||||
.cra_type = &crypto_ablkcipher_type,
|
||||
.cra_module = THIS_MODULE,
|
||||
.cra_init = atmel_aes_cra_init,
|
||||
@ -915,15 +1115,14 @@ static struct crypto_alg aes_algs[] = {
|
||||
},
|
||||
};
|
||||
|
||||
static struct crypto_alg aes_cfb64_alg[] = {
|
||||
{
|
||||
static struct crypto_alg aes_cfb64_alg = {
|
||||
.cra_name = "cfb64(aes)",
|
||||
.cra_driver_name = "atmel-cfb64-aes",
|
||||
.cra_priority = 100,
|
||||
.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
|
||||
.cra_blocksize = CFB64_BLOCK_SIZE,
|
||||
.cra_ctxsize = sizeof(struct atmel_aes_ctx),
|
||||
.cra_alignmask = 0x0,
|
||||
.cra_alignmask = 0x7,
|
||||
.cra_type = &crypto_ablkcipher_type,
|
||||
.cra_module = THIS_MODULE,
|
||||
.cra_init = atmel_aes_cra_init,
|
||||
@ -936,7 +1135,6 @@ static struct crypto_alg aes_cfb64_alg[] = {
|
||||
.encrypt = atmel_aes_cfb64_encrypt,
|
||||
.decrypt = atmel_aes_cfb64_decrypt,
|
||||
}
|
||||
},
|
||||
};
|
||||
|
||||
static void atmel_aes_queue_task(unsigned long data)
|
||||
@ -969,7 +1167,14 @@ static void atmel_aes_done_task(unsigned long data)
|
||||
err = dd->err ? : err;
|
||||
|
||||
if (dd->total && !err) {
|
||||
err = atmel_aes_crypt_dma_start(dd);
|
||||
if (dd->flags & AES_FLAGS_FAST) {
|
||||
dd->in_sg = sg_next(dd->in_sg);
|
||||
dd->out_sg = sg_next(dd->out_sg);
|
||||
if (!dd->in_sg || !dd->out_sg)
|
||||
err = -EINVAL;
|
||||
}
|
||||
if (!err)
|
||||
err = atmel_aes_crypt_dma_start(dd);
|
||||
if (!err)
|
||||
return; /* DMA started. Not fininishing. */
|
||||
}
|
||||
@ -1003,8 +1208,8 @@ static void atmel_aes_unregister_algs(struct atmel_aes_dev *dd)
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(aes_algs); i++)
|
||||
crypto_unregister_alg(&aes_algs[i]);
|
||||
if (dd->hw_version >= 0x130)
|
||||
crypto_unregister_alg(&aes_cfb64_alg[0]);
|
||||
if (dd->caps.has_cfb64)
|
||||
crypto_unregister_alg(&aes_cfb64_alg);
|
||||
}
|
||||
|
||||
static int atmel_aes_register_algs(struct atmel_aes_dev *dd)
|
||||
@ -1017,10 +1222,8 @@ static int atmel_aes_register_algs(struct atmel_aes_dev *dd)
|
||||
goto err_aes_algs;
|
||||
}
|
||||
|
||||
atmel_aes_hw_version_init(dd);
|
||||
|
||||
if (dd->hw_version >= 0x130) {
|
||||
err = crypto_register_alg(&aes_cfb64_alg[0]);
|
||||
if (dd->caps.has_cfb64) {
|
||||
err = crypto_register_alg(&aes_cfb64_alg);
|
||||
if (err)
|
||||
goto err_aes_cfb64_alg;
|
||||
}
|
||||
@ -1036,10 +1239,32 @@ err_aes_algs:
|
||||
return err;
|
||||
}
|
||||
|
||||
static void atmel_aes_get_cap(struct atmel_aes_dev *dd)
|
||||
{
|
||||
dd->caps.has_dualbuff = 0;
|
||||
dd->caps.has_cfb64 = 0;
|
||||
dd->caps.max_burst_size = 1;
|
||||
|
||||
/* keep only major version number */
|
||||
switch (dd->hw_version & 0xff0) {
|
||||
case 0x130:
|
||||
dd->caps.has_dualbuff = 1;
|
||||
dd->caps.has_cfb64 = 1;
|
||||
dd->caps.max_burst_size = 4;
|
||||
break;
|
||||
case 0x120:
|
||||
break;
|
||||
default:
|
||||
dev_warn(dd->dev,
|
||||
"Unmanaged aes version, set minimum capabilities\n");
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
static int atmel_aes_probe(struct platform_device *pdev)
|
||||
{
|
||||
struct atmel_aes_dev *aes_dd;
|
||||
struct aes_platform_data *pdata;
|
||||
struct crypto_platform_data *pdata;
|
||||
struct device *dev = &pdev->dev;
|
||||
struct resource *aes_res;
|
||||
unsigned long aes_phys_size;
|
||||
@ -1099,7 +1324,7 @@ static int atmel_aes_probe(struct platform_device *pdev)
|
||||
}
|
||||
|
||||
/* Initializing the clock */
|
||||
aes_dd->iclk = clk_get(&pdev->dev, NULL);
|
||||
aes_dd->iclk = clk_get(&pdev->dev, "aes_clk");
|
||||
if (IS_ERR(aes_dd->iclk)) {
|
||||
dev_err(dev, "clock intialization failed.\n");
|
||||
err = PTR_ERR(aes_dd->iclk);
|
||||
@ -1113,7 +1338,15 @@ static int atmel_aes_probe(struct platform_device *pdev)
|
||||
goto aes_io_err;
|
||||
}
|
||||
|
||||
err = atmel_aes_dma_init(aes_dd);
|
||||
atmel_aes_hw_version_init(aes_dd);
|
||||
|
||||
atmel_aes_get_cap(aes_dd);
|
||||
|
||||
err = atmel_aes_buff_init(aes_dd);
|
||||
if (err)
|
||||
goto err_aes_buff;
|
||||
|
||||
err = atmel_aes_dma_init(aes_dd, pdata);
|
||||
if (err)
|
||||
goto err_aes_dma;
|
||||
|
||||
@ -1135,6 +1368,8 @@ err_algs:
|
||||
spin_unlock(&atmel_aes.lock);
|
||||
atmel_aes_dma_cleanup(aes_dd);
|
||||
err_aes_dma:
|
||||
atmel_aes_buff_cleanup(aes_dd);
|
||||
err_aes_buff:
|
||||
iounmap(aes_dd->io_base);
|
||||
aes_io_err:
|
||||
clk_put(aes_dd->iclk);
|
||||
|
@ -14,10 +14,13 @@
|
||||
#define SHA_MR_MODE_MANUAL 0x0
|
||||
#define SHA_MR_MODE_AUTO 0x1
|
||||
#define SHA_MR_MODE_PDC 0x2
|
||||
#define SHA_MR_DUALBUFF (1 << 3)
|
||||
#define SHA_MR_PROCDLY (1 << 4)
|
||||
#define SHA_MR_ALGO_SHA1 (0 << 8)
|
||||
#define SHA_MR_ALGO_SHA256 (1 << 8)
|
||||
#define SHA_MR_ALGO_SHA384 (2 << 8)
|
||||
#define SHA_MR_ALGO_SHA512 (3 << 8)
|
||||
#define SHA_MR_ALGO_SHA224 (4 << 8)
|
||||
#define SHA_MR_DUALBUFF (1 << 16)
|
||||
|
||||
#define SHA_IER 0x10
|
||||
#define SHA_IDR 0x14
|
||||
@ -33,6 +36,8 @@
|
||||
#define SHA_ISR_URAT_MR (0x2 << 12)
|
||||
#define SHA_ISR_URAT_WO (0x5 << 12)
|
||||
|
||||
#define SHA_HW_VERSION 0xFC
|
||||
|
||||
#define SHA_TPR 0x108
|
||||
#define SHA_TCR 0x10C
|
||||
#define SHA_TNPR 0x118
|
||||
|
@ -38,6 +38,7 @@
|
||||
#include <crypto/sha.h>
|
||||
#include <crypto/hash.h>
|
||||
#include <crypto/internal/hash.h>
|
||||
#include <linux/platform_data/crypto-atmel.h>
|
||||
#include "atmel-sha-regs.h"
|
||||
|
||||
/* SHA flags */
|
||||
@ -52,11 +53,12 @@
|
||||
#define SHA_FLAGS_FINUP BIT(16)
|
||||
#define SHA_FLAGS_SG BIT(17)
|
||||
#define SHA_FLAGS_SHA1 BIT(18)
|
||||
#define SHA_FLAGS_SHA256 BIT(19)
|
||||
#define SHA_FLAGS_ERROR BIT(20)
|
||||
#define SHA_FLAGS_PAD BIT(21)
|
||||
|
||||
#define SHA_FLAGS_DUALBUFF BIT(24)
|
||||
#define SHA_FLAGS_SHA224 BIT(19)
|
||||
#define SHA_FLAGS_SHA256 BIT(20)
|
||||
#define SHA_FLAGS_SHA384 BIT(21)
|
||||
#define SHA_FLAGS_SHA512 BIT(22)
|
||||
#define SHA_FLAGS_ERROR BIT(23)
|
||||
#define SHA_FLAGS_PAD BIT(24)
|
||||
|
||||
#define SHA_OP_UPDATE 1
|
||||
#define SHA_OP_FINAL 2
|
||||
@ -65,6 +67,12 @@
|
||||
|
||||
#define ATMEL_SHA_DMA_THRESHOLD 56
|
||||
|
||||
struct atmel_sha_caps {
|
||||
bool has_dma;
|
||||
bool has_dualbuff;
|
||||
bool has_sha224;
|
||||
bool has_sha_384_512;
|
||||
};
|
||||
|
||||
struct atmel_sha_dev;
|
||||
|
||||
@ -73,8 +81,8 @@ struct atmel_sha_reqctx {
|
||||
unsigned long flags;
|
||||
unsigned long op;
|
||||
|
||||
u8 digest[SHA256_DIGEST_SIZE] __aligned(sizeof(u32));
|
||||
size_t digcnt;
|
||||
u8 digest[SHA512_DIGEST_SIZE] __aligned(sizeof(u32));
|
||||
u64 digcnt[2];
|
||||
size_t bufcnt;
|
||||
size_t buflen;
|
||||
dma_addr_t dma_addr;
|
||||
@ -84,6 +92,8 @@ struct atmel_sha_reqctx {
|
||||
unsigned int offset; /* offset in current sg */
|
||||
unsigned int total; /* total request */
|
||||
|
||||
size_t block_size;
|
||||
|
||||
u8 buffer[0] __aligned(sizeof(u32));
|
||||
};
|
||||
|
||||
@ -97,7 +107,12 @@ struct atmel_sha_ctx {
|
||||
|
||||
};
|
||||
|
||||
#define ATMEL_SHA_QUEUE_LENGTH 1
|
||||
#define ATMEL_SHA_QUEUE_LENGTH 50
|
||||
|
||||
struct atmel_sha_dma {
|
||||
struct dma_chan *chan;
|
||||
struct dma_slave_config dma_conf;
|
||||
};
|
||||
|
||||
struct atmel_sha_dev {
|
||||
struct list_head list;
|
||||
@ -114,6 +129,12 @@ struct atmel_sha_dev {
|
||||
unsigned long flags;
|
||||
struct crypto_queue queue;
|
||||
struct ahash_request *req;
|
||||
|
||||
struct atmel_sha_dma dma_lch_in;
|
||||
|
||||
struct atmel_sha_caps caps;
|
||||
|
||||
u32 hw_version;
|
||||
};
|
||||
|
||||
struct atmel_sha_drv {
|
||||
@ -137,14 +158,6 @@ static inline void atmel_sha_write(struct atmel_sha_dev *dd,
|
||||
writel_relaxed(value, dd->io_base + offset);
|
||||
}
|
||||
|
||||
static void atmel_sha_dualbuff_test(struct atmel_sha_dev *dd)
|
||||
{
|
||||
atmel_sha_write(dd, SHA_MR, SHA_MR_DUALBUFF);
|
||||
|
||||
if (atmel_sha_read(dd, SHA_MR) & SHA_MR_DUALBUFF)
|
||||
dd->flags |= SHA_FLAGS_DUALBUFF;
|
||||
}
|
||||
|
||||
static size_t atmel_sha_append_sg(struct atmel_sha_reqctx *ctx)
|
||||
{
|
||||
size_t count;
|
||||
@ -176,31 +189,58 @@ static size_t atmel_sha_append_sg(struct atmel_sha_reqctx *ctx)
|
||||
}
|
||||
|
||||
/*
|
||||
* The purpose of this padding is to ensure that the padded message
|
||||
* is a multiple of 512 bits. The bit "1" is appended at the end of
|
||||
* the message followed by "padlen-1" zero bits. Then a 64 bits block
|
||||
* equals to the message length in bits is appended.
|
||||
* The purpose of this padding is to ensure that the padded message is a
|
||||
* multiple of 512 bits (SHA1/SHA224/SHA256) or 1024 bits (SHA384/SHA512).
|
||||
* The bit "1" is appended at the end of the message followed by
|
||||
* "padlen-1" zero bits. Then a 64 bits block (SHA1/SHA224/SHA256) or
|
||||
* 128 bits block (SHA384/SHA512) equals to the message length in bits
|
||||
* is appended.
|
||||
*
|
||||
* padlen is calculated as followed:
|
||||
* For SHA1/SHA224/SHA256, padlen is calculated as followed:
|
||||
* - if message length < 56 bytes then padlen = 56 - message length
|
||||
* - else padlen = 64 + 56 - message length
|
||||
*
|
||||
* For SHA384/SHA512, padlen is calculated as followed:
|
||||
* - if message length < 112 bytes then padlen = 112 - message length
|
||||
* - else padlen = 128 + 112 - message length
|
||||
*/
|
||||
static void atmel_sha_fill_padding(struct atmel_sha_reqctx *ctx, int length)
|
||||
{
|
||||
unsigned int index, padlen;
|
||||
u64 bits;
|
||||
u64 size;
|
||||
u64 bits[2];
|
||||
u64 size[2];
|
||||
|
||||
bits = (ctx->bufcnt + ctx->digcnt + length) << 3;
|
||||
size = cpu_to_be64(bits);
|
||||
size[0] = ctx->digcnt[0];
|
||||
size[1] = ctx->digcnt[1];
|
||||
|
||||
index = ctx->bufcnt & 0x3f;
|
||||
padlen = (index < 56) ? (56 - index) : ((64+56) - index);
|
||||
*(ctx->buffer + ctx->bufcnt) = 0x80;
|
||||
memset(ctx->buffer + ctx->bufcnt + 1, 0, padlen-1);
|
||||
memcpy(ctx->buffer + ctx->bufcnt + padlen, &size, 8);
|
||||
ctx->bufcnt += padlen + 8;
|
||||
ctx->flags |= SHA_FLAGS_PAD;
|
||||
size[0] += ctx->bufcnt;
|
||||
if (size[0] < ctx->bufcnt)
|
||||
size[1]++;
|
||||
|
||||
size[0] += length;
|
||||
if (size[0] < length)
|
||||
size[1]++;
|
||||
|
||||
bits[1] = cpu_to_be64(size[0] << 3);
|
||||
bits[0] = cpu_to_be64(size[1] << 3 | size[0] >> 61);
|
||||
|
||||
if (ctx->flags & (SHA_FLAGS_SHA384 | SHA_FLAGS_SHA512)) {
|
||||
index = ctx->bufcnt & 0x7f;
|
||||
padlen = (index < 112) ? (112 - index) : ((128+112) - index);
|
||||
*(ctx->buffer + ctx->bufcnt) = 0x80;
|
||||
memset(ctx->buffer + ctx->bufcnt + 1, 0, padlen-1);
|
||||
memcpy(ctx->buffer + ctx->bufcnt + padlen, bits, 16);
|
||||
ctx->bufcnt += padlen + 16;
|
||||
ctx->flags |= SHA_FLAGS_PAD;
|
||||
} else {
|
||||
index = ctx->bufcnt & 0x3f;
|
||||
padlen = (index < 56) ? (56 - index) : ((64+56) - index);
|
||||
*(ctx->buffer + ctx->bufcnt) = 0x80;
|
||||
memset(ctx->buffer + ctx->bufcnt + 1, 0, padlen-1);
|
||||
memcpy(ctx->buffer + ctx->bufcnt + padlen, &bits[1], 8);
|
||||
ctx->bufcnt += padlen + 8;
|
||||
ctx->flags |= SHA_FLAGS_PAD;
|
||||
}
|
||||
}
|
||||
|
||||
static int atmel_sha_init(struct ahash_request *req)
|
||||
@ -231,13 +271,35 @@ static int atmel_sha_init(struct ahash_request *req)
|
||||
dev_dbg(dd->dev, "init: digest size: %d\n",
|
||||
crypto_ahash_digestsize(tfm));
|
||||
|
||||
if (crypto_ahash_digestsize(tfm) == SHA1_DIGEST_SIZE)
|
||||
switch (crypto_ahash_digestsize(tfm)) {
|
||||
case SHA1_DIGEST_SIZE:
|
||||
ctx->flags |= SHA_FLAGS_SHA1;
|
||||
else if (crypto_ahash_digestsize(tfm) == SHA256_DIGEST_SIZE)
|
||||
ctx->block_size = SHA1_BLOCK_SIZE;
|
||||
break;
|
||||
case SHA224_DIGEST_SIZE:
|
||||
ctx->flags |= SHA_FLAGS_SHA224;
|
||||
ctx->block_size = SHA224_BLOCK_SIZE;
|
||||
break;
|
||||
case SHA256_DIGEST_SIZE:
|
||||
ctx->flags |= SHA_FLAGS_SHA256;
|
||||
ctx->block_size = SHA256_BLOCK_SIZE;
|
||||
break;
|
||||
case SHA384_DIGEST_SIZE:
|
||||
ctx->flags |= SHA_FLAGS_SHA384;
|
||||
ctx->block_size = SHA384_BLOCK_SIZE;
|
||||
break;
|
||||
case SHA512_DIGEST_SIZE:
|
||||
ctx->flags |= SHA_FLAGS_SHA512;
|
||||
ctx->block_size = SHA512_BLOCK_SIZE;
|
||||
break;
|
||||
default:
|
||||
return -EINVAL;
|
||||
break;
|
||||
}
|
||||
|
||||
ctx->bufcnt = 0;
|
||||
ctx->digcnt = 0;
|
||||
ctx->digcnt[0] = 0;
|
||||
ctx->digcnt[1] = 0;
|
||||
ctx->buflen = SHA_BUFFER_LEN;
|
||||
|
||||
return 0;
|
||||
@ -249,19 +311,28 @@ static void atmel_sha_write_ctrl(struct atmel_sha_dev *dd, int dma)
|
||||
u32 valcr = 0, valmr = SHA_MR_MODE_AUTO;
|
||||
|
||||
if (likely(dma)) {
|
||||
atmel_sha_write(dd, SHA_IER, SHA_INT_TXBUFE);
|
||||
if (!dd->caps.has_dma)
|
||||
atmel_sha_write(dd, SHA_IER, SHA_INT_TXBUFE);
|
||||
valmr = SHA_MR_MODE_PDC;
|
||||
if (dd->flags & SHA_FLAGS_DUALBUFF)
|
||||
valmr = SHA_MR_DUALBUFF;
|
||||
if (dd->caps.has_dualbuff)
|
||||
valmr |= SHA_MR_DUALBUFF;
|
||||
} else {
|
||||
atmel_sha_write(dd, SHA_IER, SHA_INT_DATARDY);
|
||||
}
|
||||
|
||||
if (ctx->flags & SHA_FLAGS_SHA256)
|
||||
if (ctx->flags & SHA_FLAGS_SHA1)
|
||||
valmr |= SHA_MR_ALGO_SHA1;
|
||||
else if (ctx->flags & SHA_FLAGS_SHA224)
|
||||
valmr |= SHA_MR_ALGO_SHA224;
|
||||
else if (ctx->flags & SHA_FLAGS_SHA256)
|
||||
valmr |= SHA_MR_ALGO_SHA256;
|
||||
else if (ctx->flags & SHA_FLAGS_SHA384)
|
||||
valmr |= SHA_MR_ALGO_SHA384;
|
||||
else if (ctx->flags & SHA_FLAGS_SHA512)
|
||||
valmr |= SHA_MR_ALGO_SHA512;
|
||||
|
||||
/* Setting CR_FIRST only for the first iteration */
|
||||
if (!ctx->digcnt)
|
||||
if (!(ctx->digcnt[0] || ctx->digcnt[1]))
|
||||
valcr = SHA_CR_FIRST;
|
||||
|
||||
atmel_sha_write(dd, SHA_CR, valcr);
|
||||
@ -275,13 +346,15 @@ static int atmel_sha_xmit_cpu(struct atmel_sha_dev *dd, const u8 *buf,
|
||||
int count, len32;
|
||||
const u32 *buffer = (const u32 *)buf;
|
||||
|
||||
dev_dbg(dd->dev, "xmit_cpu: digcnt: %d, length: %d, final: %d\n",
|
||||
ctx->digcnt, length, final);
|
||||
dev_dbg(dd->dev, "xmit_cpu: digcnt: 0x%llx 0x%llx, length: %d, final: %d\n",
|
||||
ctx->digcnt[1], ctx->digcnt[0], length, final);
|
||||
|
||||
atmel_sha_write_ctrl(dd, 0);
|
||||
|
||||
/* should be non-zero before next lines to disable clocks later */
|
||||
ctx->digcnt += length;
|
||||
ctx->digcnt[0] += length;
|
||||
if (ctx->digcnt[0] < length)
|
||||
ctx->digcnt[1]++;
|
||||
|
||||
if (final)
|
||||
dd->flags |= SHA_FLAGS_FINAL; /* catch last interrupt */
|
||||
@ -302,8 +375,8 @@ static int atmel_sha_xmit_pdc(struct atmel_sha_dev *dd, dma_addr_t dma_addr1,
|
||||
struct atmel_sha_reqctx *ctx = ahash_request_ctx(dd->req);
|
||||
int len32;
|
||||
|
||||
dev_dbg(dd->dev, "xmit_pdc: digcnt: %d, length: %d, final: %d\n",
|
||||
ctx->digcnt, length1, final);
|
||||
dev_dbg(dd->dev, "xmit_pdc: digcnt: 0x%llx 0x%llx, length: %d, final: %d\n",
|
||||
ctx->digcnt[1], ctx->digcnt[0], length1, final);
|
||||
|
||||
len32 = DIV_ROUND_UP(length1, sizeof(u32));
|
||||
atmel_sha_write(dd, SHA_PTCR, SHA_PTCR_TXTDIS);
|
||||
@ -317,7 +390,9 @@ static int atmel_sha_xmit_pdc(struct atmel_sha_dev *dd, dma_addr_t dma_addr1,
|
||||
atmel_sha_write_ctrl(dd, 1);
|
||||
|
||||
/* should be non-zero before next lines to disable clocks later */
|
||||
ctx->digcnt += length1;
|
||||
ctx->digcnt[0] += length1;
|
||||
if (ctx->digcnt[0] < length1)
|
||||
ctx->digcnt[1]++;
|
||||
|
||||
if (final)
|
||||
dd->flags |= SHA_FLAGS_FINAL; /* catch last interrupt */
|
||||
@ -330,6 +405,86 @@ static int atmel_sha_xmit_pdc(struct atmel_sha_dev *dd, dma_addr_t dma_addr1,
|
||||
return -EINPROGRESS;
|
||||
}
|
||||
|
||||
static void atmel_sha_dma_callback(void *data)
|
||||
{
|
||||
struct atmel_sha_dev *dd = data;
|
||||
|
||||
/* dma_lch_in - completed - wait DATRDY */
|
||||
atmel_sha_write(dd, SHA_IER, SHA_INT_DATARDY);
|
||||
}
|
||||
|
||||
static int atmel_sha_xmit_dma(struct atmel_sha_dev *dd, dma_addr_t dma_addr1,
|
||||
size_t length1, dma_addr_t dma_addr2, size_t length2, int final)
|
||||
{
|
||||
struct atmel_sha_reqctx *ctx = ahash_request_ctx(dd->req);
|
||||
struct dma_async_tx_descriptor *in_desc;
|
||||
struct scatterlist sg[2];
|
||||
|
||||
dev_dbg(dd->dev, "xmit_dma: digcnt: 0x%llx 0x%llx, length: %d, final: %d\n",
|
||||
ctx->digcnt[1], ctx->digcnt[0], length1, final);
|
||||
|
||||
if (ctx->flags & (SHA_FLAGS_SHA1 | SHA_FLAGS_SHA224 |
|
||||
SHA_FLAGS_SHA256)) {
|
||||
dd->dma_lch_in.dma_conf.src_maxburst = 16;
|
||||
dd->dma_lch_in.dma_conf.dst_maxburst = 16;
|
||||
} else {
|
||||
dd->dma_lch_in.dma_conf.src_maxburst = 32;
|
||||
dd->dma_lch_in.dma_conf.dst_maxburst = 32;
|
||||
}
|
||||
|
||||
dmaengine_slave_config(dd->dma_lch_in.chan, &dd->dma_lch_in.dma_conf);
|
||||
|
||||
if (length2) {
|
||||
sg_init_table(sg, 2);
|
||||
sg_dma_address(&sg[0]) = dma_addr1;
|
||||
sg_dma_len(&sg[0]) = length1;
|
||||
sg_dma_address(&sg[1]) = dma_addr2;
|
||||
sg_dma_len(&sg[1]) = length2;
|
||||
in_desc = dmaengine_prep_slave_sg(dd->dma_lch_in.chan, sg, 2,
|
||||
DMA_MEM_TO_DEV, DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
|
||||
} else {
|
||||
sg_init_table(sg, 1);
|
||||
sg_dma_address(&sg[0]) = dma_addr1;
|
||||
sg_dma_len(&sg[0]) = length1;
|
||||
in_desc = dmaengine_prep_slave_sg(dd->dma_lch_in.chan, sg, 1,
|
||||
DMA_MEM_TO_DEV, DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
|
||||
}
|
||||
if (!in_desc)
|
||||
return -EINVAL;
|
||||
|
||||
in_desc->callback = atmel_sha_dma_callback;
|
||||
in_desc->callback_param = dd;
|
||||
|
||||
atmel_sha_write_ctrl(dd, 1);
|
||||
|
||||
/* should be non-zero before next lines to disable clocks later */
|
||||
ctx->digcnt[0] += length1;
|
||||
if (ctx->digcnt[0] < length1)
|
||||
ctx->digcnt[1]++;
|
||||
|
||||
if (final)
|
||||
dd->flags |= SHA_FLAGS_FINAL; /* catch last interrupt */
|
||||
|
||||
dd->flags |= SHA_FLAGS_DMA_ACTIVE;
|
||||
|
||||
/* Start DMA transfer */
|
||||
dmaengine_submit(in_desc);
|
||||
dma_async_issue_pending(dd->dma_lch_in.chan);
|
||||
|
||||
return -EINPROGRESS;
|
||||
}
|
||||
|
||||
static int atmel_sha_xmit_start(struct atmel_sha_dev *dd, dma_addr_t dma_addr1,
|
||||
size_t length1, dma_addr_t dma_addr2, size_t length2, int final)
|
||||
{
|
||||
if (dd->caps.has_dma)
|
||||
return atmel_sha_xmit_dma(dd, dma_addr1, length1,
|
||||
dma_addr2, length2, final);
|
||||
else
|
||||
return atmel_sha_xmit_pdc(dd, dma_addr1, length1,
|
||||
dma_addr2, length2, final);
|
||||
}
|
||||
|
||||
static int atmel_sha_update_cpu(struct atmel_sha_dev *dd)
|
||||
{
|
||||
struct atmel_sha_reqctx *ctx = ahash_request_ctx(dd->req);
|
||||
@ -337,7 +492,6 @@ static int atmel_sha_update_cpu(struct atmel_sha_dev *dd)
|
||||
|
||||
atmel_sha_append_sg(ctx);
|
||||
atmel_sha_fill_padding(ctx, 0);
|
||||
|
||||
bufcnt = ctx->bufcnt;
|
||||
ctx->bufcnt = 0;
|
||||
|
||||
@ -349,17 +503,17 @@ static int atmel_sha_xmit_dma_map(struct atmel_sha_dev *dd,
|
||||
size_t length, int final)
|
||||
{
|
||||
ctx->dma_addr = dma_map_single(dd->dev, ctx->buffer,
|
||||
ctx->buflen + SHA1_BLOCK_SIZE, DMA_TO_DEVICE);
|
||||
ctx->buflen + ctx->block_size, DMA_TO_DEVICE);
|
||||
if (dma_mapping_error(dd->dev, ctx->dma_addr)) {
|
||||
dev_err(dd->dev, "dma %u bytes error\n", ctx->buflen +
|
||||
SHA1_BLOCK_SIZE);
|
||||
ctx->block_size);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
ctx->flags &= ~SHA_FLAGS_SG;
|
||||
|
||||
/* next call does not fail... so no unmap in the case of error */
|
||||
return atmel_sha_xmit_pdc(dd, ctx->dma_addr, length, 0, 0, final);
|
||||
return atmel_sha_xmit_start(dd, ctx->dma_addr, length, 0, 0, final);
|
||||
}
|
||||
|
||||
static int atmel_sha_update_dma_slow(struct atmel_sha_dev *dd)
|
||||
@ -372,8 +526,8 @@ static int atmel_sha_update_dma_slow(struct atmel_sha_dev *dd)
|
||||
|
||||
final = (ctx->flags & SHA_FLAGS_FINUP) && !ctx->total;
|
||||
|
||||
dev_dbg(dd->dev, "slow: bufcnt: %u, digcnt: %d, final: %d\n",
|
||||
ctx->bufcnt, ctx->digcnt, final);
|
||||
dev_dbg(dd->dev, "slow: bufcnt: %u, digcnt: 0x%llx 0x%llx, final: %d\n",
|
||||
ctx->bufcnt, ctx->digcnt[1], ctx->digcnt[0], final);
|
||||
|
||||
if (final)
|
||||
atmel_sha_fill_padding(ctx, 0);
|
||||
@ -400,30 +554,25 @@ static int atmel_sha_update_dma_start(struct atmel_sha_dev *dd)
|
||||
if (ctx->bufcnt || ctx->offset)
|
||||
return atmel_sha_update_dma_slow(dd);
|
||||
|
||||
dev_dbg(dd->dev, "fast: digcnt: %d, bufcnt: %u, total: %u\n",
|
||||
ctx->digcnt, ctx->bufcnt, ctx->total);
|
||||
dev_dbg(dd->dev, "fast: digcnt: 0x%llx 0x%llx, bufcnt: %u, total: %u\n",
|
||||
ctx->digcnt[1], ctx->digcnt[0], ctx->bufcnt, ctx->total);
|
||||
|
||||
sg = ctx->sg;
|
||||
|
||||
if (!IS_ALIGNED(sg->offset, sizeof(u32)))
|
||||
return atmel_sha_update_dma_slow(dd);
|
||||
|
||||
if (!sg_is_last(sg) && !IS_ALIGNED(sg->length, SHA1_BLOCK_SIZE))
|
||||
/* size is not SHA1_BLOCK_SIZE aligned */
|
||||
if (!sg_is_last(sg) && !IS_ALIGNED(sg->length, ctx->block_size))
|
||||
/* size is not ctx->block_size aligned */
|
||||
return atmel_sha_update_dma_slow(dd);
|
||||
|
||||
length = min(ctx->total, sg->length);
|
||||
|
||||
if (sg_is_last(sg)) {
|
||||
if (!(ctx->flags & SHA_FLAGS_FINUP)) {
|
||||
/* not last sg must be SHA1_BLOCK_SIZE aligned */
|
||||
tail = length & (SHA1_BLOCK_SIZE - 1);
|
||||
/* not last sg must be ctx->block_size aligned */
|
||||
tail = length & (ctx->block_size - 1);
|
||||
length -= tail;
|
||||
if (length == 0) {
|
||||
/* offset where to start slow */
|
||||
ctx->offset = length;
|
||||
return atmel_sha_update_dma_slow(dd);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -434,7 +583,7 @@ static int atmel_sha_update_dma_start(struct atmel_sha_dev *dd)
|
||||
|
||||
/* Add padding */
|
||||
if (final) {
|
||||
tail = length & (SHA1_BLOCK_SIZE - 1);
|
||||
tail = length & (ctx->block_size - 1);
|
||||
length -= tail;
|
||||
ctx->total += tail;
|
||||
ctx->offset = length; /* offset where to start slow */
|
||||
@ -445,10 +594,10 @@ static int atmel_sha_update_dma_start(struct atmel_sha_dev *dd)
|
||||
atmel_sha_fill_padding(ctx, length);
|
||||
|
||||
ctx->dma_addr = dma_map_single(dd->dev, ctx->buffer,
|
||||
ctx->buflen + SHA1_BLOCK_SIZE, DMA_TO_DEVICE);
|
||||
ctx->buflen + ctx->block_size, DMA_TO_DEVICE);
|
||||
if (dma_mapping_error(dd->dev, ctx->dma_addr)) {
|
||||
dev_err(dd->dev, "dma %u bytes error\n",
|
||||
ctx->buflen + SHA1_BLOCK_SIZE);
|
||||
ctx->buflen + ctx->block_size);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
@ -456,7 +605,7 @@ static int atmel_sha_update_dma_start(struct atmel_sha_dev *dd)
|
||||
ctx->flags &= ~SHA_FLAGS_SG;
|
||||
count = ctx->bufcnt;
|
||||
ctx->bufcnt = 0;
|
||||
return atmel_sha_xmit_pdc(dd, ctx->dma_addr, count, 0,
|
||||
return atmel_sha_xmit_start(dd, ctx->dma_addr, count, 0,
|
||||
0, final);
|
||||
} else {
|
||||
ctx->sg = sg;
|
||||
@ -470,7 +619,7 @@ static int atmel_sha_update_dma_start(struct atmel_sha_dev *dd)
|
||||
|
||||
count = ctx->bufcnt;
|
||||
ctx->bufcnt = 0;
|
||||
return atmel_sha_xmit_pdc(dd, sg_dma_address(ctx->sg),
|
||||
return atmel_sha_xmit_start(dd, sg_dma_address(ctx->sg),
|
||||
length, ctx->dma_addr, count, final);
|
||||
}
|
||||
}
|
||||
@ -483,7 +632,7 @@ static int atmel_sha_update_dma_start(struct atmel_sha_dev *dd)
|
||||
ctx->flags |= SHA_FLAGS_SG;
|
||||
|
||||
/* next call does not fail... so no unmap in the case of error */
|
||||
return atmel_sha_xmit_pdc(dd, sg_dma_address(ctx->sg), length, 0,
|
||||
return atmel_sha_xmit_start(dd, sg_dma_address(ctx->sg), length, 0,
|
||||
0, final);
|
||||
}
|
||||
|
||||
@ -498,12 +647,13 @@ static int atmel_sha_update_dma_stop(struct atmel_sha_dev *dd)
|
||||
if (ctx->sg)
|
||||
ctx->offset = 0;
|
||||
}
|
||||
if (ctx->flags & SHA_FLAGS_PAD)
|
||||
if (ctx->flags & SHA_FLAGS_PAD) {
|
||||
dma_unmap_single(dd->dev, ctx->dma_addr,
|
||||
ctx->buflen + SHA1_BLOCK_SIZE, DMA_TO_DEVICE);
|
||||
ctx->buflen + ctx->block_size, DMA_TO_DEVICE);
|
||||
}
|
||||
} else {
|
||||
dma_unmap_single(dd->dev, ctx->dma_addr, ctx->buflen +
|
||||
SHA1_BLOCK_SIZE, DMA_TO_DEVICE);
|
||||
ctx->block_size, DMA_TO_DEVICE);
|
||||
}
|
||||
|
||||
return 0;
|
||||
@ -515,8 +665,8 @@ static int atmel_sha_update_req(struct atmel_sha_dev *dd)
|
||||
struct atmel_sha_reqctx *ctx = ahash_request_ctx(req);
|
||||
int err;
|
||||
|
||||
dev_dbg(dd->dev, "update_req: total: %u, digcnt: %d, finup: %d\n",
|
||||
ctx->total, ctx->digcnt, (ctx->flags & SHA_FLAGS_FINUP) != 0);
|
||||
dev_dbg(dd->dev, "update_req: total: %u, digcnt: 0x%llx 0x%llx\n",
|
||||
ctx->total, ctx->digcnt[1], ctx->digcnt[0]);
|
||||
|
||||
if (ctx->flags & SHA_FLAGS_CPU)
|
||||
err = atmel_sha_update_cpu(dd);
|
||||
@ -524,8 +674,8 @@ static int atmel_sha_update_req(struct atmel_sha_dev *dd)
|
||||
err = atmel_sha_update_dma_start(dd);
|
||||
|
||||
/* wait for dma completion before can take more data */
|
||||
dev_dbg(dd->dev, "update: err: %d, digcnt: %d\n",
|
||||
err, ctx->digcnt);
|
||||
dev_dbg(dd->dev, "update: err: %d, digcnt: 0x%llx 0%llx\n",
|
||||
err, ctx->digcnt[1], ctx->digcnt[0]);
|
||||
|
||||
return err;
|
||||
}
|
||||
@ -562,12 +712,21 @@ static void atmel_sha_copy_hash(struct ahash_request *req)
|
||||
u32 *hash = (u32 *)ctx->digest;
|
||||
int i;
|
||||
|
||||
if (likely(ctx->flags & SHA_FLAGS_SHA1))
|
||||
if (ctx->flags & SHA_FLAGS_SHA1)
|
||||
for (i = 0; i < SHA1_DIGEST_SIZE / sizeof(u32); i++)
|
||||
hash[i] = atmel_sha_read(ctx->dd, SHA_REG_DIGEST(i));
|
||||
else
|
||||
else if (ctx->flags & SHA_FLAGS_SHA224)
|
||||
for (i = 0; i < SHA224_DIGEST_SIZE / sizeof(u32); i++)
|
||||
hash[i] = atmel_sha_read(ctx->dd, SHA_REG_DIGEST(i));
|
||||
else if (ctx->flags & SHA_FLAGS_SHA256)
|
||||
for (i = 0; i < SHA256_DIGEST_SIZE / sizeof(u32); i++)
|
||||
hash[i] = atmel_sha_read(ctx->dd, SHA_REG_DIGEST(i));
|
||||
else if (ctx->flags & SHA_FLAGS_SHA384)
|
||||
for (i = 0; i < SHA384_DIGEST_SIZE / sizeof(u32); i++)
|
||||
hash[i] = atmel_sha_read(ctx->dd, SHA_REG_DIGEST(i));
|
||||
else
|
||||
for (i = 0; i < SHA512_DIGEST_SIZE / sizeof(u32); i++)
|
||||
hash[i] = atmel_sha_read(ctx->dd, SHA_REG_DIGEST(i));
|
||||
}
|
||||
|
||||
static void atmel_sha_copy_ready_hash(struct ahash_request *req)
|
||||
@ -577,10 +736,16 @@ static void atmel_sha_copy_ready_hash(struct ahash_request *req)
|
||||
if (!req->result)
|
||||
return;
|
||||
|
||||
if (likely(ctx->flags & SHA_FLAGS_SHA1))
|
||||
if (ctx->flags & SHA_FLAGS_SHA1)
|
||||
memcpy(req->result, ctx->digest, SHA1_DIGEST_SIZE);
|
||||
else
|
||||
else if (ctx->flags & SHA_FLAGS_SHA224)
|
||||
memcpy(req->result, ctx->digest, SHA224_DIGEST_SIZE);
|
||||
else if (ctx->flags & SHA_FLAGS_SHA256)
|
||||
memcpy(req->result, ctx->digest, SHA256_DIGEST_SIZE);
|
||||
else if (ctx->flags & SHA_FLAGS_SHA384)
|
||||
memcpy(req->result, ctx->digest, SHA384_DIGEST_SIZE);
|
||||
else
|
||||
memcpy(req->result, ctx->digest, SHA512_DIGEST_SIZE);
|
||||
}
|
||||
|
||||
static int atmel_sha_finish(struct ahash_request *req)
|
||||
@ -589,11 +754,11 @@ static int atmel_sha_finish(struct ahash_request *req)
|
||||
struct atmel_sha_dev *dd = ctx->dd;
|
||||
int err = 0;
|
||||
|
||||
if (ctx->digcnt)
|
||||
if (ctx->digcnt[0] || ctx->digcnt[1])
|
||||
atmel_sha_copy_ready_hash(req);
|
||||
|
||||
dev_dbg(dd->dev, "digcnt: %d, bufcnt: %d\n", ctx->digcnt,
|
||||
ctx->bufcnt);
|
||||
dev_dbg(dd->dev, "digcnt: 0x%llx 0x%llx, bufcnt: %d\n", ctx->digcnt[1],
|
||||
ctx->digcnt[0], ctx->bufcnt);
|
||||
|
||||
return err;
|
||||
}
|
||||
@ -628,9 +793,8 @@ static int atmel_sha_hw_init(struct atmel_sha_dev *dd)
|
||||
{
|
||||
clk_prepare_enable(dd->iclk);
|
||||
|
||||
if (SHA_FLAGS_INIT & dd->flags) {
|
||||
if (!(SHA_FLAGS_INIT & dd->flags)) {
|
||||
atmel_sha_write(dd, SHA_CR, SHA_CR_SWRST);
|
||||
atmel_sha_dualbuff_test(dd);
|
||||
dd->flags |= SHA_FLAGS_INIT;
|
||||
dd->err = 0;
|
||||
}
|
||||
@ -638,6 +802,23 @@ static int atmel_sha_hw_init(struct atmel_sha_dev *dd)
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline unsigned int atmel_sha_get_version(struct atmel_sha_dev *dd)
|
||||
{
|
||||
return atmel_sha_read(dd, SHA_HW_VERSION) & 0x00000fff;
|
||||
}
|
||||
|
||||
static void atmel_sha_hw_version_init(struct atmel_sha_dev *dd)
|
||||
{
|
||||
atmel_sha_hw_init(dd);
|
||||
|
||||
dd->hw_version = atmel_sha_get_version(dd);
|
||||
|
||||
dev_info(dd->dev,
|
||||
"version: 0x%x\n", dd->hw_version);
|
||||
|
||||
clk_disable_unprepare(dd->iclk);
|
||||
}
|
||||
|
||||
static int atmel_sha_handle_queue(struct atmel_sha_dev *dd,
|
||||
struct ahash_request *req)
|
||||
{
|
||||
@ -682,10 +863,9 @@ static int atmel_sha_handle_queue(struct atmel_sha_dev *dd,
|
||||
|
||||
if (ctx->op == SHA_OP_UPDATE) {
|
||||
err = atmel_sha_update_req(dd);
|
||||
if (err != -EINPROGRESS && (ctx->flags & SHA_FLAGS_FINUP)) {
|
||||
if (err != -EINPROGRESS && (ctx->flags & SHA_FLAGS_FINUP))
|
||||
/* no final() after finup() */
|
||||
err = atmel_sha_final_req(dd);
|
||||
}
|
||||
} else if (ctx->op == SHA_OP_FINAL) {
|
||||
err = atmel_sha_final_req(dd);
|
||||
}
|
||||
@ -808,7 +988,7 @@ static int atmel_sha_cra_init_alg(struct crypto_tfm *tfm, const char *alg_base)
|
||||
}
|
||||
crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm),
|
||||
sizeof(struct atmel_sha_reqctx) +
|
||||
SHA_BUFFER_LEN + SHA256_BLOCK_SIZE);
|
||||
SHA_BUFFER_LEN + SHA512_BLOCK_SIZE);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@ -826,7 +1006,7 @@ static void atmel_sha_cra_exit(struct crypto_tfm *tfm)
|
||||
tctx->fallback = NULL;
|
||||
}
|
||||
|
||||
static struct ahash_alg sha_algs[] = {
|
||||
static struct ahash_alg sha_1_256_algs[] = {
|
||||
{
|
||||
.init = atmel_sha_init,
|
||||
.update = atmel_sha_update,
|
||||
@ -875,6 +1055,79 @@ static struct ahash_alg sha_algs[] = {
|
||||
},
|
||||
};
|
||||
|
||||
static struct ahash_alg sha_224_alg = {
|
||||
.init = atmel_sha_init,
|
||||
.update = atmel_sha_update,
|
||||
.final = atmel_sha_final,
|
||||
.finup = atmel_sha_finup,
|
||||
.digest = atmel_sha_digest,
|
||||
.halg = {
|
||||
.digestsize = SHA224_DIGEST_SIZE,
|
||||
.base = {
|
||||
.cra_name = "sha224",
|
||||
.cra_driver_name = "atmel-sha224",
|
||||
.cra_priority = 100,
|
||||
.cra_flags = CRYPTO_ALG_ASYNC |
|
||||
CRYPTO_ALG_NEED_FALLBACK,
|
||||
.cra_blocksize = SHA224_BLOCK_SIZE,
|
||||
.cra_ctxsize = sizeof(struct atmel_sha_ctx),
|
||||
.cra_alignmask = 0,
|
||||
.cra_module = THIS_MODULE,
|
||||
.cra_init = atmel_sha_cra_init,
|
||||
.cra_exit = atmel_sha_cra_exit,
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
static struct ahash_alg sha_384_512_algs[] = {
|
||||
{
|
||||
.init = atmel_sha_init,
|
||||
.update = atmel_sha_update,
|
||||
.final = atmel_sha_final,
|
||||
.finup = atmel_sha_finup,
|
||||
.digest = atmel_sha_digest,
|
||||
.halg = {
|
||||
.digestsize = SHA384_DIGEST_SIZE,
|
||||
.base = {
|
||||
.cra_name = "sha384",
|
||||
.cra_driver_name = "atmel-sha384",
|
||||
.cra_priority = 100,
|
||||
.cra_flags = CRYPTO_ALG_ASYNC |
|
||||
CRYPTO_ALG_NEED_FALLBACK,
|
||||
.cra_blocksize = SHA384_BLOCK_SIZE,
|
||||
.cra_ctxsize = sizeof(struct atmel_sha_ctx),
|
||||
.cra_alignmask = 0x3,
|
||||
.cra_module = THIS_MODULE,
|
||||
.cra_init = atmel_sha_cra_init,
|
||||
.cra_exit = atmel_sha_cra_exit,
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
.init = atmel_sha_init,
|
||||
.update = atmel_sha_update,
|
||||
.final = atmel_sha_final,
|
||||
.finup = atmel_sha_finup,
|
||||
.digest = atmel_sha_digest,
|
||||
.halg = {
|
||||
.digestsize = SHA512_DIGEST_SIZE,
|
||||
.base = {
|
||||
.cra_name = "sha512",
|
||||
.cra_driver_name = "atmel-sha512",
|
||||
.cra_priority = 100,
|
||||
.cra_flags = CRYPTO_ALG_ASYNC |
|
||||
CRYPTO_ALG_NEED_FALLBACK,
|
||||
.cra_blocksize = SHA512_BLOCK_SIZE,
|
||||
.cra_ctxsize = sizeof(struct atmel_sha_ctx),
|
||||
.cra_alignmask = 0x3,
|
||||
.cra_module = THIS_MODULE,
|
||||
.cra_init = atmel_sha_cra_init,
|
||||
.cra_exit = atmel_sha_cra_exit,
|
||||
}
|
||||
}
|
||||
},
|
||||
};
|
||||
|
||||
static void atmel_sha_done_task(unsigned long data)
|
||||
{
|
||||
struct atmel_sha_dev *dd = (struct atmel_sha_dev *)data;
|
||||
@ -941,32 +1194,142 @@ static void atmel_sha_unregister_algs(struct atmel_sha_dev *dd)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(sha_algs); i++)
|
||||
crypto_unregister_ahash(&sha_algs[i]);
|
||||
for (i = 0; i < ARRAY_SIZE(sha_1_256_algs); i++)
|
||||
crypto_unregister_ahash(&sha_1_256_algs[i]);
|
||||
|
||||
if (dd->caps.has_sha224)
|
||||
crypto_unregister_ahash(&sha_224_alg);
|
||||
|
||||
if (dd->caps.has_sha_384_512) {
|
||||
for (i = 0; i < ARRAY_SIZE(sha_384_512_algs); i++)
|
||||
crypto_unregister_ahash(&sha_384_512_algs[i]);
|
||||
}
|
||||
}
|
||||
|
||||
static int atmel_sha_register_algs(struct atmel_sha_dev *dd)
|
||||
{
|
||||
int err, i, j;
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(sha_algs); i++) {
|
||||
err = crypto_register_ahash(&sha_algs[i]);
|
||||
for (i = 0; i < ARRAY_SIZE(sha_1_256_algs); i++) {
|
||||
err = crypto_register_ahash(&sha_1_256_algs[i]);
|
||||
if (err)
|
||||
goto err_sha_algs;
|
||||
goto err_sha_1_256_algs;
|
||||
}
|
||||
|
||||
if (dd->caps.has_sha224) {
|
||||
err = crypto_register_ahash(&sha_224_alg);
|
||||
if (err)
|
||||
goto err_sha_224_algs;
|
||||
}
|
||||
|
||||
if (dd->caps.has_sha_384_512) {
|
||||
for (i = 0; i < ARRAY_SIZE(sha_384_512_algs); i++) {
|
||||
err = crypto_register_ahash(&sha_384_512_algs[i]);
|
||||
if (err)
|
||||
goto err_sha_384_512_algs;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
||||
err_sha_algs:
|
||||
err_sha_384_512_algs:
|
||||
for (j = 0; j < i; j++)
|
||||
crypto_unregister_ahash(&sha_algs[j]);
|
||||
crypto_unregister_ahash(&sha_384_512_algs[j]);
|
||||
crypto_unregister_ahash(&sha_224_alg);
|
||||
err_sha_224_algs:
|
||||
i = ARRAY_SIZE(sha_1_256_algs);
|
||||
err_sha_1_256_algs:
|
||||
for (j = 0; j < i; j++)
|
||||
crypto_unregister_ahash(&sha_1_256_algs[j]);
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
static bool atmel_sha_filter(struct dma_chan *chan, void *slave)
|
||||
{
|
||||
struct at_dma_slave *sl = slave;
|
||||
|
||||
if (sl && sl->dma_dev == chan->device->dev) {
|
||||
chan->private = sl;
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
static int atmel_sha_dma_init(struct atmel_sha_dev *dd,
|
||||
struct crypto_platform_data *pdata)
|
||||
{
|
||||
int err = -ENOMEM;
|
||||
dma_cap_mask_t mask_in;
|
||||
|
||||
if (pdata && pdata->dma_slave->rxdata.dma_dev) {
|
||||
/* Try to grab DMA channel */
|
||||
dma_cap_zero(mask_in);
|
||||
dma_cap_set(DMA_SLAVE, mask_in);
|
||||
|
||||
dd->dma_lch_in.chan = dma_request_channel(mask_in,
|
||||
atmel_sha_filter, &pdata->dma_slave->rxdata);
|
||||
|
||||
if (!dd->dma_lch_in.chan)
|
||||
return err;
|
||||
|
||||
dd->dma_lch_in.dma_conf.direction = DMA_MEM_TO_DEV;
|
||||
dd->dma_lch_in.dma_conf.dst_addr = dd->phys_base +
|
||||
SHA_REG_DIN(0);
|
||||
dd->dma_lch_in.dma_conf.src_maxburst = 1;
|
||||
dd->dma_lch_in.dma_conf.src_addr_width =
|
||||
DMA_SLAVE_BUSWIDTH_4_BYTES;
|
||||
dd->dma_lch_in.dma_conf.dst_maxburst = 1;
|
||||
dd->dma_lch_in.dma_conf.dst_addr_width =
|
||||
DMA_SLAVE_BUSWIDTH_4_BYTES;
|
||||
dd->dma_lch_in.dma_conf.device_fc = false;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
return -ENODEV;
|
||||
}
|
||||
|
||||
static void atmel_sha_dma_cleanup(struct atmel_sha_dev *dd)
|
||||
{
|
||||
dma_release_channel(dd->dma_lch_in.chan);
|
||||
}
|
||||
|
||||
static void atmel_sha_get_cap(struct atmel_sha_dev *dd)
|
||||
{
|
||||
|
||||
dd->caps.has_dma = 0;
|
||||
dd->caps.has_dualbuff = 0;
|
||||
dd->caps.has_sha224 = 0;
|
||||
dd->caps.has_sha_384_512 = 0;
|
||||
|
||||
/* keep only major version number */
|
||||
switch (dd->hw_version & 0xff0) {
|
||||
case 0x410:
|
||||
dd->caps.has_dma = 1;
|
||||
dd->caps.has_dualbuff = 1;
|
||||
dd->caps.has_sha224 = 1;
|
||||
dd->caps.has_sha_384_512 = 1;
|
||||
break;
|
||||
case 0x400:
|
||||
dd->caps.has_dma = 1;
|
||||
dd->caps.has_dualbuff = 1;
|
||||
dd->caps.has_sha224 = 1;
|
||||
break;
|
||||
case 0x320:
|
||||
break;
|
||||
default:
|
||||
dev_warn(dd->dev,
|
||||
"Unmanaged sha version, set minimum capabilities\n");
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
static int atmel_sha_probe(struct platform_device *pdev)
|
||||
{
|
||||
struct atmel_sha_dev *sha_dd;
|
||||
struct crypto_platform_data *pdata;
|
||||
struct device *dev = &pdev->dev;
|
||||
struct resource *sha_res;
|
||||
unsigned long sha_phys_size;
|
||||
@ -1018,7 +1381,7 @@ static int atmel_sha_probe(struct platform_device *pdev)
|
||||
}
|
||||
|
||||
/* Initializing the clock */
|
||||
sha_dd->iclk = clk_get(&pdev->dev, NULL);
|
||||
sha_dd->iclk = clk_get(&pdev->dev, "sha_clk");
|
||||
if (IS_ERR(sha_dd->iclk)) {
|
||||
dev_err(dev, "clock intialization failed.\n");
|
||||
err = PTR_ERR(sha_dd->iclk);
|
||||
@ -1032,6 +1395,22 @@ static int atmel_sha_probe(struct platform_device *pdev)
|
||||
goto sha_io_err;
|
||||
}
|
||||
|
||||
atmel_sha_hw_version_init(sha_dd);
|
||||
|
||||
atmel_sha_get_cap(sha_dd);
|
||||
|
||||
if (sha_dd->caps.has_dma) {
|
||||
pdata = pdev->dev.platform_data;
|
||||
if (!pdata) {
|
||||
dev_err(&pdev->dev, "platform data not available\n");
|
||||
err = -ENXIO;
|
||||
goto err_pdata;
|
||||
}
|
||||
err = atmel_sha_dma_init(sha_dd, pdata);
|
||||
if (err)
|
||||
goto err_sha_dma;
|
||||
}
|
||||
|
||||
spin_lock(&atmel_sha.lock);
|
||||
list_add_tail(&sha_dd->list, &atmel_sha.dev_list);
|
||||
spin_unlock(&atmel_sha.lock);
|
||||
@ -1048,6 +1427,10 @@ err_algs:
|
||||
spin_lock(&atmel_sha.lock);
|
||||
list_del(&sha_dd->list);
|
||||
spin_unlock(&atmel_sha.lock);
|
||||
if (sha_dd->caps.has_dma)
|
||||
atmel_sha_dma_cleanup(sha_dd);
|
||||
err_sha_dma:
|
||||
err_pdata:
|
||||
iounmap(sha_dd->io_base);
|
||||
sha_io_err:
|
||||
clk_put(sha_dd->iclk);
|
||||
@ -1078,6 +1461,9 @@ static int atmel_sha_remove(struct platform_device *pdev)
|
||||
|
||||
tasklet_kill(&sha_dd->done_task);
|
||||
|
||||
if (sha_dd->caps.has_dma)
|
||||
atmel_sha_dma_cleanup(sha_dd);
|
||||
|
||||
iounmap(sha_dd->io_base);
|
||||
|
||||
clk_put(sha_dd->iclk);
|
||||
@ -1102,6 +1488,6 @@ static struct platform_driver atmel_sha_driver = {
|
||||
|
||||
module_platform_driver(atmel_sha_driver);
|
||||
|
||||
MODULE_DESCRIPTION("Atmel SHA1/SHA256 hw acceleration support.");
|
||||
MODULE_DESCRIPTION("Atmel SHA (1/256/224/384/512) hw acceleration support.");
|
||||
MODULE_LICENSE("GPL v2");
|
||||
MODULE_AUTHOR("Nicolas Royer - Eukréa Electromatique");
|
||||
|
@ -69,6 +69,8 @@
|
||||
#define TDES_XTEARNDR_XTEA_RNDS_MASK (0x3F << 0)
|
||||
#define TDES_XTEARNDR_XTEA_RNDS_OFFSET 0
|
||||
|
||||
#define TDES_HW_VERSION 0xFC
|
||||
|
||||
#define TDES_RPR 0x100
|
||||
#define TDES_RCR 0x104
|
||||
#define TDES_TPR 0x108
|
||||
|
@ -38,29 +38,35 @@
|
||||
#include <crypto/des.h>
|
||||
#include <crypto/hash.h>
|
||||
#include <crypto/internal/hash.h>
|
||||
#include <linux/platform_data/crypto-atmel.h>
|
||||
#include "atmel-tdes-regs.h"
|
||||
|
||||
/* TDES flags */
|
||||
#define TDES_FLAGS_MODE_MASK 0x007f
|
||||
#define TDES_FLAGS_MODE_MASK 0x00ff
|
||||
#define TDES_FLAGS_ENCRYPT BIT(0)
|
||||
#define TDES_FLAGS_CBC BIT(1)
|
||||
#define TDES_FLAGS_CFB BIT(2)
|
||||
#define TDES_FLAGS_CFB8 BIT(3)
|
||||
#define TDES_FLAGS_CFB16 BIT(4)
|
||||
#define TDES_FLAGS_CFB32 BIT(5)
|
||||
#define TDES_FLAGS_OFB BIT(6)
|
||||
#define TDES_FLAGS_CFB64 BIT(6)
|
||||
#define TDES_FLAGS_OFB BIT(7)
|
||||
|
||||
#define TDES_FLAGS_INIT BIT(16)
|
||||
#define TDES_FLAGS_FAST BIT(17)
|
||||
#define TDES_FLAGS_BUSY BIT(18)
|
||||
#define TDES_FLAGS_DMA BIT(19)
|
||||
|
||||
#define ATMEL_TDES_QUEUE_LENGTH 1
|
||||
#define ATMEL_TDES_QUEUE_LENGTH 50
|
||||
|
||||
#define CFB8_BLOCK_SIZE 1
|
||||
#define CFB16_BLOCK_SIZE 2
|
||||
#define CFB32_BLOCK_SIZE 4
|
||||
#define CFB64_BLOCK_SIZE 8
|
||||
|
||||
struct atmel_tdes_caps {
|
||||
bool has_dma;
|
||||
u32 has_cfb_3keys;
|
||||
};
|
||||
|
||||
struct atmel_tdes_dev;
|
||||
|
||||
@ -70,12 +76,19 @@ struct atmel_tdes_ctx {
|
||||
int keylen;
|
||||
u32 key[3*DES_KEY_SIZE / sizeof(u32)];
|
||||
unsigned long flags;
|
||||
|
||||
u16 block_size;
|
||||
};
|
||||
|
||||
struct atmel_tdes_reqctx {
|
||||
unsigned long mode;
|
||||
};
|
||||
|
||||
struct atmel_tdes_dma {
|
||||
struct dma_chan *chan;
|
||||
struct dma_slave_config dma_conf;
|
||||
};
|
||||
|
||||
struct atmel_tdes_dev {
|
||||
struct list_head list;
|
||||
unsigned long phys_base;
|
||||
@ -99,8 +112,10 @@ struct atmel_tdes_dev {
|
||||
size_t total;
|
||||
|
||||
struct scatterlist *in_sg;
|
||||
unsigned int nb_in_sg;
|
||||
size_t in_offset;
|
||||
struct scatterlist *out_sg;
|
||||
unsigned int nb_out_sg;
|
||||
size_t out_offset;
|
||||
|
||||
size_t buflen;
|
||||
@ -109,10 +124,16 @@ struct atmel_tdes_dev {
|
||||
void *buf_in;
|
||||
int dma_in;
|
||||
dma_addr_t dma_addr_in;
|
||||
struct atmel_tdes_dma dma_lch_in;
|
||||
|
||||
void *buf_out;
|
||||
int dma_out;
|
||||
dma_addr_t dma_addr_out;
|
||||
struct atmel_tdes_dma dma_lch_out;
|
||||
|
||||
struct atmel_tdes_caps caps;
|
||||
|
||||
u32 hw_version;
|
||||
};
|
||||
|
||||
struct atmel_tdes_drv {
|
||||
@ -207,6 +228,31 @@ static int atmel_tdes_hw_init(struct atmel_tdes_dev *dd)
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline unsigned int atmel_tdes_get_version(struct atmel_tdes_dev *dd)
|
||||
{
|
||||
return atmel_tdes_read(dd, TDES_HW_VERSION) & 0x00000fff;
|
||||
}
|
||||
|
||||
static void atmel_tdes_hw_version_init(struct atmel_tdes_dev *dd)
|
||||
{
|
||||
atmel_tdes_hw_init(dd);
|
||||
|
||||
dd->hw_version = atmel_tdes_get_version(dd);
|
||||
|
||||
dev_info(dd->dev,
|
||||
"version: 0x%x\n", dd->hw_version);
|
||||
|
||||
clk_disable_unprepare(dd->iclk);
|
||||
}
|
||||
|
||||
static void atmel_tdes_dma_callback(void *data)
|
||||
{
|
||||
struct atmel_tdes_dev *dd = data;
|
||||
|
||||
/* dma_lch_out - completed */
|
||||
tasklet_schedule(&dd->done_task);
|
||||
}
|
||||
|
||||
static int atmel_tdes_write_ctrl(struct atmel_tdes_dev *dd)
|
||||
{
|
||||
int err;
|
||||
@ -217,7 +263,9 @@ static int atmel_tdes_write_ctrl(struct atmel_tdes_dev *dd)
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
atmel_tdes_write(dd, TDES_PTCR, TDES_PTCR_TXTDIS|TDES_PTCR_RXTDIS);
|
||||
if (!dd->caps.has_dma)
|
||||
atmel_tdes_write(dd, TDES_PTCR,
|
||||
TDES_PTCR_TXTDIS | TDES_PTCR_RXTDIS);
|
||||
|
||||
/* MR register must be set before IV registers */
|
||||
if (dd->ctx->keylen > (DES_KEY_SIZE << 1)) {
|
||||
@ -241,6 +289,8 @@ static int atmel_tdes_write_ctrl(struct atmel_tdes_dev *dd)
|
||||
valmr |= TDES_MR_CFBS_16b;
|
||||
else if (dd->flags & TDES_FLAGS_CFB32)
|
||||
valmr |= TDES_MR_CFBS_32b;
|
||||
else if (dd->flags & TDES_FLAGS_CFB64)
|
||||
valmr |= TDES_MR_CFBS_64b;
|
||||
} else if (dd->flags & TDES_FLAGS_OFB) {
|
||||
valmr |= TDES_MR_OPMOD_OFB;
|
||||
}
|
||||
@ -262,7 +312,7 @@ static int atmel_tdes_write_ctrl(struct atmel_tdes_dev *dd)
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int atmel_tdes_crypt_dma_stop(struct atmel_tdes_dev *dd)
|
||||
static int atmel_tdes_crypt_pdc_stop(struct atmel_tdes_dev *dd)
|
||||
{
|
||||
int err = 0;
|
||||
size_t count;
|
||||
@ -288,7 +338,7 @@ static int atmel_tdes_crypt_dma_stop(struct atmel_tdes_dev *dd)
|
||||
return err;
|
||||
}
|
||||
|
||||
static int atmel_tdes_dma_init(struct atmel_tdes_dev *dd)
|
||||
static int atmel_tdes_buff_init(struct atmel_tdes_dev *dd)
|
||||
{
|
||||
int err = -ENOMEM;
|
||||
|
||||
@ -333,7 +383,7 @@ err_alloc:
|
||||
return err;
|
||||
}
|
||||
|
||||
static void atmel_tdes_dma_cleanup(struct atmel_tdes_dev *dd)
|
||||
static void atmel_tdes_buff_cleanup(struct atmel_tdes_dev *dd)
|
||||
{
|
||||
dma_unmap_single(dd->dev, dd->dma_addr_out, dd->buflen,
|
||||
DMA_FROM_DEVICE);
|
||||
@ -343,7 +393,7 @@ static void atmel_tdes_dma_cleanup(struct atmel_tdes_dev *dd)
|
||||
free_page((unsigned long)dd->buf_in);
|
||||
}
|
||||
|
||||
static int atmel_tdes_crypt_dma(struct crypto_tfm *tfm, dma_addr_t dma_addr_in,
|
||||
static int atmel_tdes_crypt_pdc(struct crypto_tfm *tfm, dma_addr_t dma_addr_in,
|
||||
dma_addr_t dma_addr_out, int length)
|
||||
{
|
||||
struct atmel_tdes_ctx *ctx = crypto_tfm_ctx(tfm);
|
||||
@ -379,7 +429,76 @@ static int atmel_tdes_crypt_dma(struct crypto_tfm *tfm, dma_addr_t dma_addr_in,
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int atmel_tdes_crypt_dma_start(struct atmel_tdes_dev *dd)
|
||||
static int atmel_tdes_crypt_dma(struct crypto_tfm *tfm, dma_addr_t dma_addr_in,
|
||||
dma_addr_t dma_addr_out, int length)
|
||||
{
|
||||
struct atmel_tdes_ctx *ctx = crypto_tfm_ctx(tfm);
|
||||
struct atmel_tdes_dev *dd = ctx->dd;
|
||||
struct scatterlist sg[2];
|
||||
struct dma_async_tx_descriptor *in_desc, *out_desc;
|
||||
|
||||
dd->dma_size = length;
|
||||
|
||||
if (!(dd->flags & TDES_FLAGS_FAST)) {
|
||||
dma_sync_single_for_device(dd->dev, dma_addr_in, length,
|
||||
DMA_TO_DEVICE);
|
||||
}
|
||||
|
||||
if (dd->flags & TDES_FLAGS_CFB8) {
|
||||
dd->dma_lch_in.dma_conf.dst_addr_width =
|
||||
DMA_SLAVE_BUSWIDTH_1_BYTE;
|
||||
dd->dma_lch_out.dma_conf.src_addr_width =
|
||||
DMA_SLAVE_BUSWIDTH_1_BYTE;
|
||||
} else if (dd->flags & TDES_FLAGS_CFB16) {
|
||||
dd->dma_lch_in.dma_conf.dst_addr_width =
|
||||
DMA_SLAVE_BUSWIDTH_2_BYTES;
|
||||
dd->dma_lch_out.dma_conf.src_addr_width =
|
||||
DMA_SLAVE_BUSWIDTH_2_BYTES;
|
||||
} else {
|
||||
dd->dma_lch_in.dma_conf.dst_addr_width =
|
||||
DMA_SLAVE_BUSWIDTH_4_BYTES;
|
||||
dd->dma_lch_out.dma_conf.src_addr_width =
|
||||
DMA_SLAVE_BUSWIDTH_4_BYTES;
|
||||
}
|
||||
|
||||
dmaengine_slave_config(dd->dma_lch_in.chan, &dd->dma_lch_in.dma_conf);
|
||||
dmaengine_slave_config(dd->dma_lch_out.chan, &dd->dma_lch_out.dma_conf);
|
||||
|
||||
dd->flags |= TDES_FLAGS_DMA;
|
||||
|
||||
sg_init_table(&sg[0], 1);
|
||||
sg_dma_address(&sg[0]) = dma_addr_in;
|
||||
sg_dma_len(&sg[0]) = length;
|
||||
|
||||
sg_init_table(&sg[1], 1);
|
||||
sg_dma_address(&sg[1]) = dma_addr_out;
|
||||
sg_dma_len(&sg[1]) = length;
|
||||
|
||||
in_desc = dmaengine_prep_slave_sg(dd->dma_lch_in.chan, &sg[0],
|
||||
1, DMA_MEM_TO_DEV,
|
||||
DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
|
||||
if (!in_desc)
|
||||
return -EINVAL;
|
||||
|
||||
out_desc = dmaengine_prep_slave_sg(dd->dma_lch_out.chan, &sg[1],
|
||||
1, DMA_DEV_TO_MEM,
|
||||
DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
|
||||
if (!out_desc)
|
||||
return -EINVAL;
|
||||
|
||||
out_desc->callback = atmel_tdes_dma_callback;
|
||||
out_desc->callback_param = dd;
|
||||
|
||||
dmaengine_submit(out_desc);
|
||||
dma_async_issue_pending(dd->dma_lch_out.chan);
|
||||
|
||||
dmaengine_submit(in_desc);
|
||||
dma_async_issue_pending(dd->dma_lch_in.chan);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int atmel_tdes_crypt_start(struct atmel_tdes_dev *dd)
|
||||
{
|
||||
struct crypto_tfm *tfm = crypto_ablkcipher_tfm(
|
||||
crypto_ablkcipher_reqtfm(dd->req));
|
||||
@ -387,23 +506,23 @@ static int atmel_tdes_crypt_dma_start(struct atmel_tdes_dev *dd)
|
||||
size_t count;
|
||||
dma_addr_t addr_in, addr_out;
|
||||
|
||||
if (sg_is_last(dd->in_sg) && sg_is_last(dd->out_sg)) {
|
||||
if ((!dd->in_offset) && (!dd->out_offset)) {
|
||||
/* check for alignment */
|
||||
in = IS_ALIGNED((u32)dd->in_sg->offset, sizeof(u32));
|
||||
out = IS_ALIGNED((u32)dd->out_sg->offset, sizeof(u32));
|
||||
|
||||
in = IS_ALIGNED((u32)dd->in_sg->offset, sizeof(u32)) &&
|
||||
IS_ALIGNED(dd->in_sg->length, dd->ctx->block_size);
|
||||
out = IS_ALIGNED((u32)dd->out_sg->offset, sizeof(u32)) &&
|
||||
IS_ALIGNED(dd->out_sg->length, dd->ctx->block_size);
|
||||
fast = in && out;
|
||||
|
||||
if (sg_dma_len(dd->in_sg) != sg_dma_len(dd->out_sg))
|
||||
fast = 0;
|
||||
}
|
||||
|
||||
|
||||
if (fast) {
|
||||
count = min(dd->total, sg_dma_len(dd->in_sg));
|
||||
count = min(count, sg_dma_len(dd->out_sg));
|
||||
|
||||
if (count != dd->total) {
|
||||
pr_err("request length != buffer length\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
err = dma_map_sg(dd->dev, dd->in_sg, 1, DMA_TO_DEVICE);
|
||||
if (!err) {
|
||||
dev_err(dd->dev, "dma_map_sg() error\n");
|
||||
@ -433,13 +552,16 @@ static int atmel_tdes_crypt_dma_start(struct atmel_tdes_dev *dd)
|
||||
addr_out = dd->dma_addr_out;
|
||||
|
||||
dd->flags &= ~TDES_FLAGS_FAST;
|
||||
|
||||
}
|
||||
|
||||
dd->total -= count;
|
||||
|
||||
err = atmel_tdes_crypt_dma(tfm, addr_in, addr_out, count);
|
||||
if (err) {
|
||||
if (dd->caps.has_dma)
|
||||
err = atmel_tdes_crypt_dma(tfm, addr_in, addr_out, count);
|
||||
else
|
||||
err = atmel_tdes_crypt_pdc(tfm, addr_in, addr_out, count);
|
||||
|
||||
if (err && (dd->flags & TDES_FLAGS_FAST)) {
|
||||
dma_unmap_sg(dd->dev, dd->in_sg, 1, DMA_TO_DEVICE);
|
||||
dma_unmap_sg(dd->dev, dd->out_sg, 1, DMA_TO_DEVICE);
|
||||
}
|
||||
@ -447,7 +569,6 @@ static int atmel_tdes_crypt_dma_start(struct atmel_tdes_dev *dd)
|
||||
return err;
|
||||
}
|
||||
|
||||
|
||||
static void atmel_tdes_finish_req(struct atmel_tdes_dev *dd, int err)
|
||||
{
|
||||
struct ablkcipher_request *req = dd->req;
|
||||
@ -506,7 +627,7 @@ static int atmel_tdes_handle_queue(struct atmel_tdes_dev *dd,
|
||||
|
||||
err = atmel_tdes_write_ctrl(dd);
|
||||
if (!err)
|
||||
err = atmel_tdes_crypt_dma_start(dd);
|
||||
err = atmel_tdes_crypt_start(dd);
|
||||
if (err) {
|
||||
/* des_task will not finish it, so do it here */
|
||||
atmel_tdes_finish_req(dd, err);
|
||||
@ -516,41 +637,145 @@ static int atmel_tdes_handle_queue(struct atmel_tdes_dev *dd,
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int atmel_tdes_crypt_dma_stop(struct atmel_tdes_dev *dd)
|
||||
{
|
||||
int err = -EINVAL;
|
||||
size_t count;
|
||||
|
||||
if (dd->flags & TDES_FLAGS_DMA) {
|
||||
err = 0;
|
||||
if (dd->flags & TDES_FLAGS_FAST) {
|
||||
dma_unmap_sg(dd->dev, dd->out_sg, 1, DMA_FROM_DEVICE);
|
||||
dma_unmap_sg(dd->dev, dd->in_sg, 1, DMA_TO_DEVICE);
|
||||
} else {
|
||||
dma_sync_single_for_device(dd->dev, dd->dma_addr_out,
|
||||
dd->dma_size, DMA_FROM_DEVICE);
|
||||
|
||||
/* copy data */
|
||||
count = atmel_tdes_sg_copy(&dd->out_sg, &dd->out_offset,
|
||||
dd->buf_out, dd->buflen, dd->dma_size, 1);
|
||||
if (count != dd->dma_size) {
|
||||
err = -EINVAL;
|
||||
pr_err("not all data converted: %u\n", count);
|
||||
}
|
||||
}
|
||||
}
|
||||
return err;
|
||||
}
|
||||
|
||||
static int atmel_tdes_crypt(struct ablkcipher_request *req, unsigned long mode)
|
||||
{
|
||||
struct atmel_tdes_ctx *ctx = crypto_ablkcipher_ctx(
|
||||
crypto_ablkcipher_reqtfm(req));
|
||||
struct atmel_tdes_reqctx *rctx = ablkcipher_request_ctx(req);
|
||||
struct atmel_tdes_dev *dd;
|
||||
|
||||
if (mode & TDES_FLAGS_CFB8) {
|
||||
if (!IS_ALIGNED(req->nbytes, CFB8_BLOCK_SIZE)) {
|
||||
pr_err("request size is not exact amount of CFB8 blocks\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
ctx->block_size = CFB8_BLOCK_SIZE;
|
||||
} else if (mode & TDES_FLAGS_CFB16) {
|
||||
if (!IS_ALIGNED(req->nbytes, CFB16_BLOCK_SIZE)) {
|
||||
pr_err("request size is not exact amount of CFB16 blocks\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
ctx->block_size = CFB16_BLOCK_SIZE;
|
||||
} else if (mode & TDES_FLAGS_CFB32) {
|
||||
if (!IS_ALIGNED(req->nbytes, CFB32_BLOCK_SIZE)) {
|
||||
pr_err("request size is not exact amount of CFB32 blocks\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
} else if (!IS_ALIGNED(req->nbytes, DES_BLOCK_SIZE)) {
|
||||
pr_err("request size is not exact amount of DES blocks\n");
|
||||
return -EINVAL;
|
||||
ctx->block_size = CFB32_BLOCK_SIZE;
|
||||
} else {
|
||||
if (!IS_ALIGNED(req->nbytes, DES_BLOCK_SIZE)) {
|
||||
pr_err("request size is not exact amount of DES blocks\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
ctx->block_size = DES_BLOCK_SIZE;
|
||||
}
|
||||
|
||||
dd = atmel_tdes_find_dev(ctx);
|
||||
if (!dd)
|
||||
return -ENODEV;
|
||||
|
||||
rctx->mode = mode;
|
||||
|
||||
return atmel_tdes_handle_queue(dd, req);
|
||||
return atmel_tdes_handle_queue(ctx->dd, req);
|
||||
}
|
||||
|
||||
static bool atmel_tdes_filter(struct dma_chan *chan, void *slave)
|
||||
{
|
||||
struct at_dma_slave *sl = slave;
|
||||
|
||||
if (sl && sl->dma_dev == chan->device->dev) {
|
||||
chan->private = sl;
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
static int atmel_tdes_dma_init(struct atmel_tdes_dev *dd,
|
||||
struct crypto_platform_data *pdata)
|
||||
{
|
||||
int err = -ENOMEM;
|
||||
dma_cap_mask_t mask_in, mask_out;
|
||||
|
||||
if (pdata && pdata->dma_slave->txdata.dma_dev &&
|
||||
pdata->dma_slave->rxdata.dma_dev) {
|
||||
|
||||
/* Try to grab 2 DMA channels */
|
||||
dma_cap_zero(mask_in);
|
||||
dma_cap_set(DMA_SLAVE, mask_in);
|
||||
|
||||
dd->dma_lch_in.chan = dma_request_channel(mask_in,
|
||||
atmel_tdes_filter, &pdata->dma_slave->rxdata);
|
||||
|
||||
if (!dd->dma_lch_in.chan)
|
||||
goto err_dma_in;
|
||||
|
||||
dd->dma_lch_in.dma_conf.direction = DMA_MEM_TO_DEV;
|
||||
dd->dma_lch_in.dma_conf.dst_addr = dd->phys_base +
|
||||
TDES_IDATA1R;
|
||||
dd->dma_lch_in.dma_conf.src_maxburst = 1;
|
||||
dd->dma_lch_in.dma_conf.src_addr_width =
|
||||
DMA_SLAVE_BUSWIDTH_4_BYTES;
|
||||
dd->dma_lch_in.dma_conf.dst_maxburst = 1;
|
||||
dd->dma_lch_in.dma_conf.dst_addr_width =
|
||||
DMA_SLAVE_BUSWIDTH_4_BYTES;
|
||||
dd->dma_lch_in.dma_conf.device_fc = false;
|
||||
|
||||
dma_cap_zero(mask_out);
|
||||
dma_cap_set(DMA_SLAVE, mask_out);
|
||||
dd->dma_lch_out.chan = dma_request_channel(mask_out,
|
||||
atmel_tdes_filter, &pdata->dma_slave->txdata);
|
||||
|
||||
if (!dd->dma_lch_out.chan)
|
||||
goto err_dma_out;
|
||||
|
||||
dd->dma_lch_out.dma_conf.direction = DMA_DEV_TO_MEM;
|
||||
dd->dma_lch_out.dma_conf.src_addr = dd->phys_base +
|
||||
TDES_ODATA1R;
|
||||
dd->dma_lch_out.dma_conf.src_maxburst = 1;
|
||||
dd->dma_lch_out.dma_conf.src_addr_width =
|
||||
DMA_SLAVE_BUSWIDTH_4_BYTES;
|
||||
dd->dma_lch_out.dma_conf.dst_maxburst = 1;
|
||||
dd->dma_lch_out.dma_conf.dst_addr_width =
|
||||
DMA_SLAVE_BUSWIDTH_4_BYTES;
|
||||
dd->dma_lch_out.dma_conf.device_fc = false;
|
||||
|
||||
return 0;
|
||||
} else {
|
||||
return -ENODEV;
|
||||
}
|
||||
|
||||
err_dma_out:
|
||||
dma_release_channel(dd->dma_lch_in.chan);
|
||||
err_dma_in:
|
||||
return err;
|
||||
}
|
||||
|
||||
static void atmel_tdes_dma_cleanup(struct atmel_tdes_dev *dd)
|
||||
{
|
||||
dma_release_channel(dd->dma_lch_in.chan);
|
||||
dma_release_channel(dd->dma_lch_out.chan);
|
||||
}
|
||||
|
||||
static int atmel_des_setkey(struct crypto_ablkcipher *tfm, const u8 *key,
|
||||
@ -590,7 +815,8 @@ static int atmel_tdes_setkey(struct crypto_ablkcipher *tfm, const u8 *key,
|
||||
/*
|
||||
* HW bug in cfb 3-keys mode.
|
||||
*/
|
||||
if (strstr(alg_name, "cfb") && (keylen != 2*DES_KEY_SIZE)) {
|
||||
if (!ctx->dd->caps.has_cfb_3keys && strstr(alg_name, "cfb")
|
||||
&& (keylen != 2*DES_KEY_SIZE)) {
|
||||
crypto_ablkcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
|
||||
return -EINVAL;
|
||||
} else if ((keylen != 2*DES_KEY_SIZE) && (keylen != 3*DES_KEY_SIZE)) {
|
||||
@ -678,8 +904,15 @@ static int atmel_tdes_ofb_decrypt(struct ablkcipher_request *req)
|
||||
|
||||
static int atmel_tdes_cra_init(struct crypto_tfm *tfm)
|
||||
{
|
||||
struct atmel_tdes_ctx *ctx = crypto_tfm_ctx(tfm);
|
||||
struct atmel_tdes_dev *dd;
|
||||
|
||||
tfm->crt_ablkcipher.reqsize = sizeof(struct atmel_tdes_reqctx);
|
||||
|
||||
dd = atmel_tdes_find_dev(ctx);
|
||||
if (!dd)
|
||||
return -ENODEV;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -695,7 +928,7 @@ static struct crypto_alg tdes_algs[] = {
|
||||
.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
|
||||
.cra_blocksize = DES_BLOCK_SIZE,
|
||||
.cra_ctxsize = sizeof(struct atmel_tdes_ctx),
|
||||
.cra_alignmask = 0,
|
||||
.cra_alignmask = 0x7,
|
||||
.cra_type = &crypto_ablkcipher_type,
|
||||
.cra_module = THIS_MODULE,
|
||||
.cra_init = atmel_tdes_cra_init,
|
||||
@ -715,7 +948,7 @@ static struct crypto_alg tdes_algs[] = {
|
||||
.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
|
||||
.cra_blocksize = DES_BLOCK_SIZE,
|
||||
.cra_ctxsize = sizeof(struct atmel_tdes_ctx),
|
||||
.cra_alignmask = 0,
|
||||
.cra_alignmask = 0x7,
|
||||
.cra_type = &crypto_ablkcipher_type,
|
||||
.cra_module = THIS_MODULE,
|
||||
.cra_init = atmel_tdes_cra_init,
|
||||
@ -736,7 +969,7 @@ static struct crypto_alg tdes_algs[] = {
|
||||
.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
|
||||
.cra_blocksize = DES_BLOCK_SIZE,
|
||||
.cra_ctxsize = sizeof(struct atmel_tdes_ctx),
|
||||
.cra_alignmask = 0,
|
||||
.cra_alignmask = 0x7,
|
||||
.cra_type = &crypto_ablkcipher_type,
|
||||
.cra_module = THIS_MODULE,
|
||||
.cra_init = atmel_tdes_cra_init,
|
||||
@ -778,7 +1011,7 @@ static struct crypto_alg tdes_algs[] = {
|
||||
.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
|
||||
.cra_blocksize = CFB16_BLOCK_SIZE,
|
||||
.cra_ctxsize = sizeof(struct atmel_tdes_ctx),
|
||||
.cra_alignmask = 0,
|
||||
.cra_alignmask = 0x1,
|
||||
.cra_type = &crypto_ablkcipher_type,
|
||||
.cra_module = THIS_MODULE,
|
||||
.cra_init = atmel_tdes_cra_init,
|
||||
@ -799,7 +1032,7 @@ static struct crypto_alg tdes_algs[] = {
|
||||
.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
|
||||
.cra_blocksize = CFB32_BLOCK_SIZE,
|
||||
.cra_ctxsize = sizeof(struct atmel_tdes_ctx),
|
||||
.cra_alignmask = 0,
|
||||
.cra_alignmask = 0x3,
|
||||
.cra_type = &crypto_ablkcipher_type,
|
||||
.cra_module = THIS_MODULE,
|
||||
.cra_init = atmel_tdes_cra_init,
|
||||
@ -820,7 +1053,7 @@ static struct crypto_alg tdes_algs[] = {
|
||||
.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
|
||||
.cra_blocksize = DES_BLOCK_SIZE,
|
||||
.cra_ctxsize = sizeof(struct atmel_tdes_ctx),
|
||||
.cra_alignmask = 0,
|
||||
.cra_alignmask = 0x7,
|
||||
.cra_type = &crypto_ablkcipher_type,
|
||||
.cra_module = THIS_MODULE,
|
||||
.cra_init = atmel_tdes_cra_init,
|
||||
@ -841,7 +1074,7 @@ static struct crypto_alg tdes_algs[] = {
|
||||
.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
|
||||
.cra_blocksize = DES_BLOCK_SIZE,
|
||||
.cra_ctxsize = sizeof(struct atmel_tdes_ctx),
|
||||
.cra_alignmask = 0,
|
||||
.cra_alignmask = 0x7,
|
||||
.cra_type = &crypto_ablkcipher_type,
|
||||
.cra_module = THIS_MODULE,
|
||||
.cra_init = atmel_tdes_cra_init,
|
||||
@ -861,7 +1094,7 @@ static struct crypto_alg tdes_algs[] = {
|
||||
.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
|
||||
.cra_blocksize = DES_BLOCK_SIZE,
|
||||
.cra_ctxsize = sizeof(struct atmel_tdes_ctx),
|
||||
.cra_alignmask = 0,
|
||||
.cra_alignmask = 0x7,
|
||||
.cra_type = &crypto_ablkcipher_type,
|
||||
.cra_module = THIS_MODULE,
|
||||
.cra_init = atmel_tdes_cra_init,
|
||||
@ -882,7 +1115,7 @@ static struct crypto_alg tdes_algs[] = {
|
||||
.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
|
||||
.cra_blocksize = DES_BLOCK_SIZE,
|
||||
.cra_ctxsize = sizeof(struct atmel_tdes_ctx),
|
||||
.cra_alignmask = 0,
|
||||
.cra_alignmask = 0x7,
|
||||
.cra_type = &crypto_ablkcipher_type,
|
||||
.cra_module = THIS_MODULE,
|
||||
.cra_init = atmel_tdes_cra_init,
|
||||
@ -924,7 +1157,7 @@ static struct crypto_alg tdes_algs[] = {
|
||||
.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
|
||||
.cra_blocksize = CFB16_BLOCK_SIZE,
|
||||
.cra_ctxsize = sizeof(struct atmel_tdes_ctx),
|
||||
.cra_alignmask = 0,
|
||||
.cra_alignmask = 0x1,
|
||||
.cra_type = &crypto_ablkcipher_type,
|
||||
.cra_module = THIS_MODULE,
|
||||
.cra_init = atmel_tdes_cra_init,
|
||||
@ -945,7 +1178,7 @@ static struct crypto_alg tdes_algs[] = {
|
||||
.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
|
||||
.cra_blocksize = CFB32_BLOCK_SIZE,
|
||||
.cra_ctxsize = sizeof(struct atmel_tdes_ctx),
|
||||
.cra_alignmask = 0,
|
||||
.cra_alignmask = 0x3,
|
||||
.cra_type = &crypto_ablkcipher_type,
|
||||
.cra_module = THIS_MODULE,
|
||||
.cra_init = atmel_tdes_cra_init,
|
||||
@ -966,7 +1199,7 @@ static struct crypto_alg tdes_algs[] = {
|
||||
.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
|
||||
.cra_blocksize = DES_BLOCK_SIZE,
|
||||
.cra_ctxsize = sizeof(struct atmel_tdes_ctx),
|
||||
.cra_alignmask = 0,
|
||||
.cra_alignmask = 0x7,
|
||||
.cra_type = &crypto_ablkcipher_type,
|
||||
.cra_module = THIS_MODULE,
|
||||
.cra_init = atmel_tdes_cra_init,
|
||||
@ -994,14 +1227,24 @@ static void atmel_tdes_done_task(unsigned long data)
|
||||
struct atmel_tdes_dev *dd = (struct atmel_tdes_dev *) data;
|
||||
int err;
|
||||
|
||||
err = atmel_tdes_crypt_dma_stop(dd);
|
||||
if (!(dd->flags & TDES_FLAGS_DMA))
|
||||
err = atmel_tdes_crypt_pdc_stop(dd);
|
||||
else
|
||||
err = atmel_tdes_crypt_dma_stop(dd);
|
||||
|
||||
err = dd->err ? : err;
|
||||
|
||||
if (dd->total && !err) {
|
||||
err = atmel_tdes_crypt_dma_start(dd);
|
||||
if (dd->flags & TDES_FLAGS_FAST) {
|
||||
dd->in_sg = sg_next(dd->in_sg);
|
||||
dd->out_sg = sg_next(dd->out_sg);
|
||||
if (!dd->in_sg || !dd->out_sg)
|
||||
err = -EINVAL;
|
||||
}
|
||||
if (!err)
|
||||
return;
|
||||
err = atmel_tdes_crypt_start(dd);
|
||||
if (!err)
|
||||
return; /* DMA started. Not fininishing. */
|
||||
}
|
||||
|
||||
atmel_tdes_finish_req(dd, err);
|
||||
@ -1053,9 +1296,31 @@ err_tdes_algs:
|
||||
return err;
|
||||
}
|
||||
|
||||
static void atmel_tdes_get_cap(struct atmel_tdes_dev *dd)
|
||||
{
|
||||
|
||||
dd->caps.has_dma = 0;
|
||||
dd->caps.has_cfb_3keys = 0;
|
||||
|
||||
/* keep only major version number */
|
||||
switch (dd->hw_version & 0xf00) {
|
||||
case 0x700:
|
||||
dd->caps.has_dma = 1;
|
||||
dd->caps.has_cfb_3keys = 1;
|
||||
break;
|
||||
case 0x600:
|
||||
break;
|
||||
default:
|
||||
dev_warn(dd->dev,
|
||||
"Unmanaged tdes version, set minimum capabilities\n");
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
static int atmel_tdes_probe(struct platform_device *pdev)
|
||||
{
|
||||
struct atmel_tdes_dev *tdes_dd;
|
||||
struct crypto_platform_data *pdata;
|
||||
struct device *dev = &pdev->dev;
|
||||
struct resource *tdes_res;
|
||||
unsigned long tdes_phys_size;
|
||||
@ -1109,7 +1374,7 @@ static int atmel_tdes_probe(struct platform_device *pdev)
|
||||
}
|
||||
|
||||
/* Initializing the clock */
|
||||
tdes_dd->iclk = clk_get(&pdev->dev, NULL);
|
||||
tdes_dd->iclk = clk_get(&pdev->dev, "tdes_clk");
|
||||
if (IS_ERR(tdes_dd->iclk)) {
|
||||
dev_err(dev, "clock intialization failed.\n");
|
||||
err = PTR_ERR(tdes_dd->iclk);
|
||||
@ -1123,9 +1388,25 @@ static int atmel_tdes_probe(struct platform_device *pdev)
|
||||
goto tdes_io_err;
|
||||
}
|
||||
|
||||
err = atmel_tdes_dma_init(tdes_dd);
|
||||
atmel_tdes_hw_version_init(tdes_dd);
|
||||
|
||||
atmel_tdes_get_cap(tdes_dd);
|
||||
|
||||
err = atmel_tdes_buff_init(tdes_dd);
|
||||
if (err)
|
||||
goto err_tdes_dma;
|
||||
goto err_tdes_buff;
|
||||
|
||||
if (tdes_dd->caps.has_dma) {
|
||||
pdata = pdev->dev.platform_data;
|
||||
if (!pdata) {
|
||||
dev_err(&pdev->dev, "platform data not available\n");
|
||||
err = -ENXIO;
|
||||
goto err_pdata;
|
||||
}
|
||||
err = atmel_tdes_dma_init(tdes_dd, pdata);
|
||||
if (err)
|
||||
goto err_tdes_dma;
|
||||
}
|
||||
|
||||
spin_lock(&atmel_tdes.lock);
|
||||
list_add_tail(&tdes_dd->list, &atmel_tdes.dev_list);
|
||||
@ -1143,8 +1424,12 @@ err_algs:
|
||||
spin_lock(&atmel_tdes.lock);
|
||||
list_del(&tdes_dd->list);
|
||||
spin_unlock(&atmel_tdes.lock);
|
||||
atmel_tdes_dma_cleanup(tdes_dd);
|
||||
if (tdes_dd->caps.has_dma)
|
||||
atmel_tdes_dma_cleanup(tdes_dd);
|
||||
err_tdes_dma:
|
||||
err_pdata:
|
||||
atmel_tdes_buff_cleanup(tdes_dd);
|
||||
err_tdes_buff:
|
||||
iounmap(tdes_dd->io_base);
|
||||
tdes_io_err:
|
||||
clk_put(tdes_dd->iclk);
|
||||
@ -1178,7 +1463,10 @@ static int atmel_tdes_remove(struct platform_device *pdev)
|
||||
tasklet_kill(&tdes_dd->done_task);
|
||||
tasklet_kill(&tdes_dd->queue_task);
|
||||
|
||||
atmel_tdes_dma_cleanup(tdes_dd);
|
||||
if (tdes_dd->caps.has_dma)
|
||||
atmel_tdes_dma_cleanup(tdes_dd);
|
||||
|
||||
atmel_tdes_buff_cleanup(tdes_dd);
|
||||
|
||||
iounmap(tdes_dd->io_base);
|
||||
|
||||
|
@ -151,7 +151,7 @@ static int bfin_crypto_crc_init(struct ahash_request *req)
|
||||
struct bfin_crypto_crc_reqctx *ctx = ahash_request_ctx(req);
|
||||
struct bfin_crypto_crc *crc;
|
||||
|
||||
dev_dbg(crc->dev, "crc_init\n");
|
||||
dev_dbg(ctx->crc->dev, "crc_init\n");
|
||||
spin_lock_bh(&crc_list.lock);
|
||||
list_for_each_entry(crc, &crc_list.dev_list, list) {
|
||||
crc_ctx->crc = crc;
|
||||
@ -160,7 +160,7 @@ static int bfin_crypto_crc_init(struct ahash_request *req)
|
||||
spin_unlock_bh(&crc_list.lock);
|
||||
|
||||
if (sg_count(req->src) > CRC_MAX_DMA_DESC) {
|
||||
dev_dbg(crc->dev, "init: requested sg list is too big > %d\n",
|
||||
dev_dbg(ctx->crc->dev, "init: requested sg list is too big > %d\n",
|
||||
CRC_MAX_DMA_DESC);
|
||||
return -EINVAL;
|
||||
}
|
||||
@ -175,7 +175,7 @@ static int bfin_crypto_crc_init(struct ahash_request *req)
|
||||
/* init crc results */
|
||||
put_unaligned_le32(crc_ctx->key, req->result);
|
||||
|
||||
dev_dbg(crc->dev, "init: digest size: %d\n",
|
||||
dev_dbg(ctx->crc->dev, "init: digest size: %d\n",
|
||||
crypto_ahash_digestsize(tfm));
|
||||
|
||||
return bfin_crypto_crc_init_hw(crc, crc_ctx->key);
|
||||
|
@ -78,7 +78,7 @@ config CRYPTO_DEV_FSL_CAAM_AHASH_API
|
||||
tristate "Register hash algorithm implementations with Crypto API"
|
||||
depends on CRYPTO_DEV_FSL_CAAM
|
||||
default y
|
||||
select CRYPTO_AHASH
|
||||
select CRYPTO_HASH
|
||||
help
|
||||
Selecting this will offload ahash for users of the
|
||||
scatterlist crypto API to the SEC4 via job ring.
|
||||
|
@ -1693,6 +1693,7 @@ static struct caam_alg_template driver_algs[] = {
|
||||
.name = "authenc(hmac(sha224),cbc(aes))",
|
||||
.driver_name = "authenc-hmac-sha224-cbc-aes-caam",
|
||||
.blocksize = AES_BLOCK_SIZE,
|
||||
.type = CRYPTO_ALG_TYPE_AEAD,
|
||||
.template_aead = {
|
||||
.setkey = aead_setkey,
|
||||
.setauthsize = aead_setauthsize,
|
||||
@ -1732,6 +1733,7 @@ static struct caam_alg_template driver_algs[] = {
|
||||
.name = "authenc(hmac(sha384),cbc(aes))",
|
||||
.driver_name = "authenc-hmac-sha384-cbc-aes-caam",
|
||||
.blocksize = AES_BLOCK_SIZE,
|
||||
.type = CRYPTO_ALG_TYPE_AEAD,
|
||||
.template_aead = {
|
||||
.setkey = aead_setkey,
|
||||
.setauthsize = aead_setauthsize,
|
||||
@ -1810,6 +1812,7 @@ static struct caam_alg_template driver_algs[] = {
|
||||
.name = "authenc(hmac(sha224),cbc(des3_ede))",
|
||||
.driver_name = "authenc-hmac-sha224-cbc-des3_ede-caam",
|
||||
.blocksize = DES3_EDE_BLOCK_SIZE,
|
||||
.type = CRYPTO_ALG_TYPE_AEAD,
|
||||
.template_aead = {
|
||||
.setkey = aead_setkey,
|
||||
.setauthsize = aead_setauthsize,
|
||||
@ -1849,6 +1852,7 @@ static struct caam_alg_template driver_algs[] = {
|
||||
.name = "authenc(hmac(sha384),cbc(des3_ede))",
|
||||
.driver_name = "authenc-hmac-sha384-cbc-des3_ede-caam",
|
||||
.blocksize = DES3_EDE_BLOCK_SIZE,
|
||||
.type = CRYPTO_ALG_TYPE_AEAD,
|
||||
.template_aead = {
|
||||
.setkey = aead_setkey,
|
||||
.setauthsize = aead_setauthsize,
|
||||
@ -1926,6 +1930,7 @@ static struct caam_alg_template driver_algs[] = {
|
||||
.name = "authenc(hmac(sha224),cbc(des))",
|
||||
.driver_name = "authenc-hmac-sha224-cbc-des-caam",
|
||||
.blocksize = DES_BLOCK_SIZE,
|
||||
.type = CRYPTO_ALG_TYPE_AEAD,
|
||||
.template_aead = {
|
||||
.setkey = aead_setkey,
|
||||
.setauthsize = aead_setauthsize,
|
||||
@ -1965,6 +1970,7 @@ static struct caam_alg_template driver_algs[] = {
|
||||
.name = "authenc(hmac(sha384),cbc(des))",
|
||||
.driver_name = "authenc-hmac-sha384-cbc-des-caam",
|
||||
.blocksize = DES_BLOCK_SIZE,
|
||||
.type = CRYPTO_ALG_TYPE_AEAD,
|
||||
.template_aead = {
|
||||
.setkey = aead_setkey,
|
||||
.setauthsize = aead_setauthsize,
|
||||
|
@ -411,7 +411,7 @@ static int ahash_set_sh_desc(struct crypto_ahash *ahash)
|
||||
return 0;
|
||||
}
|
||||
|
||||
static u32 gen_split_hash_key(struct caam_hash_ctx *ctx, const u8 *key_in,
|
||||
static int gen_split_hash_key(struct caam_hash_ctx *ctx, const u8 *key_in,
|
||||
u32 keylen)
|
||||
{
|
||||
return gen_split_key(ctx->jrdev, ctx->key, ctx->split_key_len,
|
||||
@ -420,7 +420,7 @@ static u32 gen_split_hash_key(struct caam_hash_ctx *ctx, const u8 *key_in,
|
||||
}
|
||||
|
||||
/* Digest hash size if it is too large */
|
||||
static u32 hash_digest_key(struct caam_hash_ctx *ctx, const u8 *key_in,
|
||||
static int hash_digest_key(struct caam_hash_ctx *ctx, const u8 *key_in,
|
||||
u32 *keylen, u8 *key_out, u32 digestsize)
|
||||
{
|
||||
struct device *jrdev = ctx->jrdev;
|
||||
|
@ -304,6 +304,9 @@ static int caam_probe(struct platform_device *pdev)
|
||||
caam_remove(pdev);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* Enable RDB bit so that RNG works faster */
|
||||
setbits32(&topregs->ctrl.scfgr, SCFGR_RDBENABLE);
|
||||
}
|
||||
|
||||
/* NOTE: RTIC detection ought to go here, around Si time */
|
||||
|
@ -36,7 +36,7 @@ static void report_jump_idx(u32 status, char *outstr)
|
||||
|
||||
static void report_ccb_status(u32 status, char *outstr)
|
||||
{
|
||||
char *cha_id_list[] = {
|
||||
static const char * const cha_id_list[] = {
|
||||
"",
|
||||
"AES",
|
||||
"DES",
|
||||
@ -51,7 +51,7 @@ static void report_ccb_status(u32 status, char *outstr)
|
||||
"ZUCE",
|
||||
"ZUCA",
|
||||
};
|
||||
char *err_id_list[] = {
|
||||
static const char * const err_id_list[] = {
|
||||
"No error.",
|
||||
"Mode error.",
|
||||
"Data size error.",
|
||||
@ -69,7 +69,7 @@ static void report_ccb_status(u32 status, char *outstr)
|
||||
"Invalid CHA combination was selected",
|
||||
"Invalid CHA selected.",
|
||||
};
|
||||
char *rng_err_id_list[] = {
|
||||
static const char * const rng_err_id_list[] = {
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
@ -117,7 +117,7 @@ static void report_jump_status(u32 status, char *outstr)
|
||||
|
||||
static void report_deco_status(u32 status, char *outstr)
|
||||
{
|
||||
const struct {
|
||||
static const struct {
|
||||
u8 value;
|
||||
char *error_text;
|
||||
} desc_error_list[] = {
|
||||
@ -245,7 +245,7 @@ static void report_cond_code_status(u32 status, char *outstr)
|
||||
|
||||
char *caam_jr_strstatus(char *outstr, u32 status)
|
||||
{
|
||||
struct stat_src {
|
||||
static const struct stat_src {
|
||||
void (*report_ssed)(u32 status, char *outstr);
|
||||
char *error;
|
||||
} status_src[] = {
|
||||
|
@ -41,6 +41,7 @@ struct caam_jrentry_info {
|
||||
/* Private sub-storage for a single JobR */
|
||||
struct caam_drv_private_jr {
|
||||
struct device *parentdev; /* points back to controller dev */
|
||||
struct platform_device *jr_pdev;/* points to platform device for JR */
|
||||
int ridx;
|
||||
struct caam_job_ring __iomem *rregs; /* JobR's register space */
|
||||
struct tasklet_struct irqtask;
|
||||
|
@ -407,6 +407,7 @@ int caam_jr_shutdown(struct device *dev)
|
||||
dma_free_coherent(dev, sizeof(struct jr_outentry) * JOBR_DEPTH,
|
||||
jrp->outring, outbusaddr);
|
||||
kfree(jrp->entinfo);
|
||||
of_device_unregister(jrp->jr_pdev);
|
||||
|
||||
return ret;
|
||||
}
|
||||
@ -454,6 +455,8 @@ int caam_jr_probe(struct platform_device *pdev, struct device_node *np,
|
||||
kfree(jrpriv);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
jrpriv->jr_pdev = jr_pdev;
|
||||
jrdev = &jr_pdev->dev;
|
||||
dev_set_drvdata(jrdev, jrpriv);
|
||||
ctrlpriv->jrdev[ring] = jrdev;
|
||||
@ -472,6 +475,7 @@ int caam_jr_probe(struct platform_device *pdev, struct device_node *np,
|
||||
/* Now do the platform independent part */
|
||||
error = caam_jr_init(jrdev); /* now turn on hardware */
|
||||
if (error) {
|
||||
of_device_unregister(jr_pdev);
|
||||
kfree(jrpriv);
|
||||
return error;
|
||||
}
|
||||
|
@ -44,7 +44,7 @@ Split key generation-----------------------------------------------
|
||||
[06] 0x64260028 fifostr: class2 mdsplit-jdk len=40
|
||||
@0xffe04000
|
||||
*/
|
||||
u32 gen_split_key(struct device *jrdev, u8 *key_out, int split_key_len,
|
||||
int gen_split_key(struct device *jrdev, u8 *key_out, int split_key_len,
|
||||
int split_key_pad_len, const u8 *key_in, u32 keylen,
|
||||
u32 alg_op)
|
||||
{
|
||||
|
@ -12,6 +12,6 @@ struct split_key_result {
|
||||
|
||||
void split_key_done(struct device *dev, u32 *desc, u32 err, void *context);
|
||||
|
||||
u32 gen_split_key(struct device *jrdev, u8 *key_out, int split_key_len,
|
||||
int gen_split_key(struct device *jrdev, u8 *key_out, int split_key_len,
|
||||
int split_key_pad_len, const u8 *key_in, u32 keylen,
|
||||
u32 alg_op);
|
||||
|
@ -252,7 +252,8 @@ struct caam_ctrl {
|
||||
/* Read/Writable */
|
||||
u32 rsvd1;
|
||||
u32 mcr; /* MCFG Master Config Register */
|
||||
u32 rsvd2[2];
|
||||
u32 rsvd2;
|
||||
u32 scfgr; /* SCFGR, Security Config Register */
|
||||
|
||||
/* Bus Access Configuration Section 010-11f */
|
||||
/* Read/Writable */
|
||||
@ -299,6 +300,7 @@ struct caam_ctrl {
|
||||
#define MCFGR_WDFAIL 0x20000000 /* DECO watchdog force-fail */
|
||||
#define MCFGR_DMA_RESET 0x10000000
|
||||
#define MCFGR_LONG_PTR 0x00010000 /* Use >32-bit desc addressing */
|
||||
#define SCFGR_RDBENABLE 0x00000400
|
||||
|
||||
/* AXI read cache control */
|
||||
#define MCFGR_ARCACHE_SHIFT 12
|
||||
|
@ -636,7 +636,7 @@ static void omap_aes_finish_req(struct omap_aes_dev *dd, int err)
|
||||
|
||||
pr_debug("err: %d\n", err);
|
||||
|
||||
pm_runtime_put_sync(dd->dev);
|
||||
pm_runtime_put(dd->dev);
|
||||
dd->flags &= ~FLAGS_BUSY;
|
||||
|
||||
req->base.complete(&req->base, err);
|
||||
@ -1248,18 +1248,7 @@ static struct platform_driver omap_aes_driver = {
|
||||
},
|
||||
};
|
||||
|
||||
static int __init omap_aes_mod_init(void)
|
||||
{
|
||||
return platform_driver_register(&omap_aes_driver);
|
||||
}
|
||||
|
||||
static void __exit omap_aes_mod_exit(void)
|
||||
{
|
||||
platform_driver_unregister(&omap_aes_driver);
|
||||
}
|
||||
|
||||
module_init(omap_aes_mod_init);
|
||||
module_exit(omap_aes_mod_exit);
|
||||
module_platform_driver(omap_aes_driver);
|
||||
|
||||
MODULE_DESCRIPTION("OMAP AES hw acceleration support.");
|
||||
MODULE_LICENSE("GPL v2");
|
||||
|
@ -923,7 +923,7 @@ static void omap_sham_finish_req(struct ahash_request *req, int err)
|
||||
dd->flags &= ~(BIT(FLAGS_BUSY) | BIT(FLAGS_FINAL) | BIT(FLAGS_CPU) |
|
||||
BIT(FLAGS_DMA_READY) | BIT(FLAGS_OUTPUT_READY));
|
||||
|
||||
pm_runtime_put_sync(dd->dev);
|
||||
pm_runtime_put(dd->dev);
|
||||
|
||||
if (req->base.complete)
|
||||
req->base.complete(&req->base, err);
|
||||
@ -1813,18 +1813,7 @@ static struct platform_driver omap_sham_driver = {
|
||||
},
|
||||
};
|
||||
|
||||
static int __init omap_sham_mod_init(void)
|
||||
{
|
||||
return platform_driver_register(&omap_sham_driver);
|
||||
}
|
||||
|
||||
static void __exit omap_sham_mod_exit(void)
|
||||
{
|
||||
platform_driver_unregister(&omap_sham_driver);
|
||||
}
|
||||
|
||||
module_init(omap_sham_mod_init);
|
||||
module_exit(omap_sham_mod_exit);
|
||||
module_platform_driver(omap_sham_driver);
|
||||
|
||||
MODULE_DESCRIPTION("OMAP SHA1/MD5 hw acceleration support.");
|
||||
MODULE_LICENSE("GPL v2");
|
||||
|
@ -1688,8 +1688,6 @@ static const struct of_device_id spacc_of_id_table[] = {
|
||||
{ .compatible = "picochip,spacc-l2" },
|
||||
{}
|
||||
};
|
||||
#else /* CONFIG_OF */
|
||||
#define spacc_of_id_table NULL
|
||||
#endif /* CONFIG_OF */
|
||||
|
||||
static bool spacc_is_compatible(struct platform_device *pdev,
|
||||
@ -1874,7 +1872,7 @@ static struct platform_driver spacc_driver = {
|
||||
#ifdef CONFIG_PM
|
||||
.pm = &spacc_pm_ops,
|
||||
#endif /* CONFIG_PM */
|
||||
.of_match_table = spacc_of_id_table,
|
||||
.of_match_table = of_match_ptr(spacc_of_id_table),
|
||||
},
|
||||
.id_table = spacc_id_table,
|
||||
};
|
||||
|
1070
drivers/crypto/sahara.c
Normal file
1070
drivers/crypto/sahara.c
Normal file
File diff suppressed because it is too large
Load Diff
@ -938,6 +938,7 @@ static int hash_dma_final(struct ahash_request *req)
|
||||
if (!ctx->device->dma.nents) {
|
||||
dev_err(device_data->dev, "[%s] "
|
||||
"ctx->device->dma.nents = 0", __func__);
|
||||
ret = ctx->device->dma.nents;
|
||||
goto out;
|
||||
}
|
||||
|
||||
@ -945,6 +946,7 @@ static int hash_dma_final(struct ahash_request *req)
|
||||
if (bytes_written != req->nbytes) {
|
||||
dev_err(device_data->dev, "[%s] "
|
||||
"hash_dma_write() failed!", __func__);
|
||||
ret = bytes_written;
|
||||
goto out;
|
||||
}
|
||||
|
||||
@ -1367,14 +1369,12 @@ static int hash_setkey(struct crypto_ahash *tfm,
|
||||
/**
|
||||
* Freed in final.
|
||||
*/
|
||||
ctx->key = kmalloc(keylen, GFP_KERNEL);
|
||||
ctx->key = kmemdup(key, keylen, GFP_KERNEL);
|
||||
if (!ctx->key) {
|
||||
pr_err(DEV_DBG_NAME " [%s] Failed to allocate ctx->key "
|
||||
"for %d\n", __func__, alg);
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
memcpy(ctx->key, key, keylen);
|
||||
ctx->keylen = keylen;
|
||||
|
||||
return ret;
|
||||
|
@ -87,4 +87,9 @@ struct shash_desc;
|
||||
extern int crypto_sha1_update(struct shash_desc *desc, const u8 *data,
|
||||
unsigned int len);
|
||||
|
||||
extern int crypto_sha256_update(struct shash_desc *desc, const u8 *data,
|
||||
unsigned int len);
|
||||
|
||||
extern int crypto_sha512_update(struct shash_desc *desc, const u8 *data,
|
||||
unsigned int len);
|
||||
#endif
|
||||
|
@ -1,22 +0,0 @@
|
||||
#ifndef __LINUX_ATMEL_AES_H
|
||||
#define __LINUX_ATMEL_AES_H
|
||||
|
||||
#include <linux/platform_data/dma-atmel.h>
|
||||
|
||||
/**
|
||||
* struct aes_dma_data - DMA data for AES
|
||||
*/
|
||||
struct aes_dma_data {
|
||||
struct at_dma_slave txdata;
|
||||
struct at_dma_slave rxdata;
|
||||
};
|
||||
|
||||
/**
|
||||
* struct aes_platform_data - board-specific AES configuration
|
||||
* @dma_slave: DMA slave interface to use in data transfers.
|
||||
*/
|
||||
struct aes_platform_data {
|
||||
struct aes_dma_data *dma_slave;
|
||||
};
|
||||
|
||||
#endif /* __LINUX_ATMEL_AES_H */
|
22
include/linux/platform_data/crypto-atmel.h
Normal file
22
include/linux/platform_data/crypto-atmel.h
Normal file
@ -0,0 +1,22 @@
|
||||
#ifndef __LINUX_CRYPTO_ATMEL_H
|
||||
#define __LINUX_CRYPTO_ATMEL_H
|
||||
|
||||
#include <linux/platform_data/dma-atmel.h>
|
||||
|
||||
/**
|
||||
* struct crypto_dma_data - DMA data for AES/TDES/SHA
|
||||
*/
|
||||
struct crypto_dma_data {
|
||||
struct at_dma_slave txdata;
|
||||
struct at_dma_slave rxdata;
|
||||
};
|
||||
|
||||
/**
|
||||
* struct crypto_platform_data - board-specific AES/TDES/SHA configuration
|
||||
* @dma_slave: DMA slave interface to use in data transfers.
|
||||
*/
|
||||
struct crypto_platform_data {
|
||||
struct crypto_dma_data *dma_slave;
|
||||
};
|
||||
|
||||
#endif /* __LINUX_CRYPTO_ATMEL_H */
|
@ -8,12 +8,7 @@
|
||||
* published by the Free Software Foundation.
|
||||
*/
|
||||
|
||||
#include <linux/completion.h>
|
||||
|
||||
struct timeriomem_rng_data {
|
||||
struct completion completion;
|
||||
unsigned int present:1;
|
||||
|
||||
void __iomem *address;
|
||||
|
||||
/* measures in usecs */
|
||||
|
@ -311,6 +311,19 @@ static struct xfrm_algo_desc aalg_list[] = {
|
||||
.sadb_alg_maxbits = 128
|
||||
}
|
||||
},
|
||||
{
|
||||
/* rfc4494 */
|
||||
.name = "cmac(aes)",
|
||||
|
||||
.uinfo = {
|
||||
.auth = {
|
||||
.icv_truncbits = 96,
|
||||
.icv_fullbits = 128,
|
||||
}
|
||||
},
|
||||
|
||||
.pfkey_supported = 0,
|
||||
},
|
||||
};
|
||||
|
||||
static struct xfrm_algo_desc ealg_list[] = {
|
||||
|
Loading…
x
Reference in New Issue
Block a user