From 286233e604d79f0c7fa04abec2180d5d89a74749 Mon Sep 17 00:00:00 2001 From: Horia Geanta Date: Fri, 10 May 2013 15:08:39 +0300 Subject: [PATCH 1/6] crypto: caam - fix inconsistent assoc dma mapping direction req->assoc is dma mapped BIDIRECTIONAL and unmapped TO_DEVICE. Since it is read-only for the device, use TO_DEVICE both for mapping and unmapping. Cc: # 3.9, 3.8 Signed-off-by: Horia Geanta Signed-off-by: Herbert Xu --- drivers/crypto/caam/caamalg.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/crypto/caam/caamalg.c b/drivers/crypto/caam/caamalg.c index 765fdf5ce579..bf416a8391a7 100644 --- a/drivers/crypto/caam/caamalg.c +++ b/drivers/crypto/caam/caamalg.c @@ -1154,7 +1154,7 @@ static struct aead_edesc *aead_edesc_alloc(struct aead_request *req, dst_nents = sg_count(req->dst, req->cryptlen, &dst_chained); sgc = dma_map_sg_chained(jrdev, req->assoc, assoc_nents ? : 1, - DMA_BIDIRECTIONAL, assoc_chained); + DMA_TO_DEVICE, assoc_chained); if (likely(req->src == req->dst)) { sgc = dma_map_sg_chained(jrdev, req->src, src_nents ? : 1, DMA_BIDIRECTIONAL, src_chained); @@ -1336,7 +1336,7 @@ static struct aead_edesc *aead_giv_edesc_alloc(struct aead_givcrypt_request dst_nents = sg_count(req->dst, req->cryptlen, &dst_chained); sgc = dma_map_sg_chained(jrdev, req->assoc, assoc_nents ? : 1, - DMA_BIDIRECTIONAL, assoc_chained); + DMA_TO_DEVICE, assoc_chained); if (likely(req->src == req->dst)) { sgc = dma_map_sg_chained(jrdev, req->src, src_nents ? : 1, DMA_BIDIRECTIONAL, src_chained); From de614e561b9c633073caae8f86399aa8923ef85d Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Tue, 21 May 2013 17:09:41 +0300 Subject: [PATCH 2/6] crypto: sha256_ssse3 - fix stack corruption with SSSE3 and AVX implementations The _XFER stack element size was set too small, 8 bytes, when it needs to be 16 bytes. As _XFER is the last stack element used by these implementations, the 16 byte stores with 'movdqa' corrupt the stack where the value of register %r12 is temporarily stored. As these implementations align the stack pointer to 16 bytes, this corruption did not happen every time. Patch corrects this issue. Reported-by: Julian Wollrath Signed-off-by: Jussi Kivilinna Tested-by: Julian Wollrath Acked-by: Tim Chen Signed-off-by: Herbert Xu --- arch/x86/crypto/sha256-avx-asm.S | 2 +- arch/x86/crypto/sha256-ssse3-asm.S | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/crypto/sha256-avx-asm.S b/arch/x86/crypto/sha256-avx-asm.S index 56610c4bf31b..642f15687a0a 100644 --- a/arch/x86/crypto/sha256-avx-asm.S +++ b/arch/x86/crypto/sha256-avx-asm.S @@ -118,7 +118,7 @@ y2 = %r15d _INP_END_SIZE = 8 _INP_SIZE = 8 -_XFER_SIZE = 8 +_XFER_SIZE = 16 _XMM_SAVE_SIZE = 0 _INP_END = 0 diff --git a/arch/x86/crypto/sha256-ssse3-asm.S b/arch/x86/crypto/sha256-ssse3-asm.S index 98d3c391da81..f833b74d902b 100644 --- a/arch/x86/crypto/sha256-ssse3-asm.S +++ b/arch/x86/crypto/sha256-ssse3-asm.S @@ -111,7 +111,7 @@ y2 = %r15d _INP_END_SIZE = 8 _INP_SIZE = 8 -_XFER_SIZE = 8 +_XFER_SIZE = 16 _XMM_SAVE_SIZE = 0 _INP_END = 0 From 3ef91f21a66826cf1003bb7b3c51ac2de4c28182 Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Sun, 2 Jun 2013 19:51:47 +0300 Subject: [PATCH 3/6] crypto: twofish - disable AVX2 implementation It appears that the performance of 'vpgatherdd' is suboptimal for this kind of workload (tested on Core i5-4570) and causes twofish_avx2 to be significantly slower than twofish_avx. So disable the AVX2 implementation to avoid performance regressions. Signed-off-by: Jussi Kivilinna Signed-off-by: Herbert Xu --- crypto/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/crypto/Kconfig b/crypto/Kconfig index 622d8a48cbe9..27720af741b4 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -1299,6 +1299,7 @@ config CRYPTO_TWOFISH_AVX_X86_64 config CRYPTO_TWOFISH_AVX2_X86_64 tristate "Twofish cipher algorithm (x86_64/AVX2)" depends on X86 && 64BIT + depends on BROKEN select CRYPTO_ALGAPI select CRYPTO_CRYPTD select CRYPTO_ABLK_HELPER_X86 From edb7c7cdba90c3fb2339e51c15d21982bdf72fdc Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Sun, 2 Jun 2013 19:51:52 +0300 Subject: [PATCH 4/6] crypto: blowfish - disable AVX2 implementation It appears that the performance of 'vpgatherdd' is suboptimal for this kind of workload (tested on Core i5-4570) and causes blowfish-avx2 to be significantly slower than blowfish-amd64. So disable the AVX2 implementation to avoid performance regressions. Signed-off-by: Jussi Kivilinna Signed-off-by: Herbert Xu --- crypto/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/crypto/Kconfig b/crypto/Kconfig index 27720af741b4..bf8148e74e73 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -823,6 +823,7 @@ config CRYPTO_BLOWFISH_X86_64 config CRYPTO_BLOWFISH_AVX2_X86_64 tristate "Blowfish cipher algorithm (x86_64/AVX2)" depends on X86 && 64BIT + depends on BROKEN select CRYPTO_ALGAPI select CRYPTO_CRYPTD select CRYPTO_ABLK_HELPER_X86 From 68be0b1ae355c6deb11326df6758f80154f44cf0 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 3 Jun 2013 23:57:37 +0200 Subject: [PATCH 5/6] crypto: sahara - fix building as module The sahara crypto driver has an incorrect MODULE_DEVICE_TABLE, which prevents us from actually building this driver as a loadable module. sahara_dt_ids is a of_device_id array, so we have to use MODULE_DEVICE_TABLE(of, ...). Signed-off-by: Arnd Bergmann Cc: Javier Martin Cc: Herbert Xu Signed-off-by: Herbert Xu --- drivers/crypto/sahara.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/crypto/sahara.c b/drivers/crypto/sahara.c index a97bb6c1596c..c3dc1c04a5df 100644 --- a/drivers/crypto/sahara.c +++ b/drivers/crypto/sahara.c @@ -863,7 +863,7 @@ static struct of_device_id sahara_dt_ids[] = { { .compatible = "fsl,imx27-sahara" }, { /* sentinel */ } }; -MODULE_DEVICE_TABLE(platform, sahara_dt_ids); +MODULE_DEVICE_TABLE(of, sahara_dt_ids); static int sahara_probe(struct platform_device *pdev) { From fe6510b5d6349a8999b83ef7c5671e5a561b803a Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Tue, 11 Jun 2013 22:25:22 +0300 Subject: [PATCH 6/6] crypto: aesni_intel - fix accessing of unaligned memory The new XTS code for aesni_intel uses input buffers directly as memory operands for pxor instructions, which causes crash if those buffers are not aligned to 16 bytes. Patch changes XTS code to handle unaligned memory correctly, by loading memory with movdqu instead. Reported-by: Dave Jones Tested-by: Dave Jones Signed-off-by: Jussi Kivilinna Signed-off-by: Herbert Xu --- arch/x86/crypto/aesni-intel_asm.S | 48 ++++++++++++++++++++----------- 1 file changed, 32 insertions(+), 16 deletions(-) diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S index 62fe22cd4cba..477e9d75149b 100644 --- a/arch/x86/crypto/aesni-intel_asm.S +++ b/arch/x86/crypto/aesni-intel_asm.S @@ -2681,56 +2681,68 @@ ENTRY(aesni_xts_crypt8) addq %rcx, KEYP movdqa IV, STATE1 - pxor 0x00(INP), STATE1 + movdqu 0x00(INP), INC + pxor INC, STATE1 movdqu IV, 0x00(OUTP) _aesni_gf128mul_x_ble() movdqa IV, STATE2 - pxor 0x10(INP), STATE2 + movdqu 0x10(INP), INC + pxor INC, STATE2 movdqu IV, 0x10(OUTP) _aesni_gf128mul_x_ble() movdqa IV, STATE3 - pxor 0x20(INP), STATE3 + movdqu 0x20(INP), INC + pxor INC, STATE3 movdqu IV, 0x20(OUTP) _aesni_gf128mul_x_ble() movdqa IV, STATE4 - pxor 0x30(INP), STATE4 + movdqu 0x30(INP), INC + pxor INC, STATE4 movdqu IV, 0x30(OUTP) call *%r11 - pxor 0x00(OUTP), STATE1 + movdqu 0x00(OUTP), INC + pxor INC, STATE1 movdqu STATE1, 0x00(OUTP) _aesni_gf128mul_x_ble() movdqa IV, STATE1 - pxor 0x40(INP), STATE1 + movdqu 0x40(INP), INC + pxor INC, STATE1 movdqu IV, 0x40(OUTP) - pxor 0x10(OUTP), STATE2 + movdqu 0x10(OUTP), INC + pxor INC, STATE2 movdqu STATE2, 0x10(OUTP) _aesni_gf128mul_x_ble() movdqa IV, STATE2 - pxor 0x50(INP), STATE2 + movdqu 0x50(INP), INC + pxor INC, STATE2 movdqu IV, 0x50(OUTP) - pxor 0x20(OUTP), STATE3 + movdqu 0x20(OUTP), INC + pxor INC, STATE3 movdqu STATE3, 0x20(OUTP) _aesni_gf128mul_x_ble() movdqa IV, STATE3 - pxor 0x60(INP), STATE3 + movdqu 0x60(INP), INC + pxor INC, STATE3 movdqu IV, 0x60(OUTP) - pxor 0x30(OUTP), STATE4 + movdqu 0x30(OUTP), INC + pxor INC, STATE4 movdqu STATE4, 0x30(OUTP) _aesni_gf128mul_x_ble() movdqa IV, STATE4 - pxor 0x70(INP), STATE4 + movdqu 0x70(INP), INC + pxor INC, STATE4 movdqu IV, 0x70(OUTP) _aesni_gf128mul_x_ble() @@ -2738,16 +2750,20 @@ ENTRY(aesni_xts_crypt8) call *%r11 - pxor 0x40(OUTP), STATE1 + movdqu 0x40(OUTP), INC + pxor INC, STATE1 movdqu STATE1, 0x40(OUTP) - pxor 0x50(OUTP), STATE2 + movdqu 0x50(OUTP), INC + pxor INC, STATE2 movdqu STATE2, 0x50(OUTP) - pxor 0x60(OUTP), STATE3 + movdqu 0x60(OUTP), INC + pxor INC, STATE3 movdqu STATE3, 0x60(OUTP) - pxor 0x70(OUTP), STATE4 + movdqu 0x70(OUTP), INC + pxor INC, STATE4 movdqu STATE4, 0x70(OUTP) ret