Merge branch 'linus' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6

Pull crypto updates from Herbert Xu:
 "API:
   - Remove VLA usage
   - Add cryptostat user-space interface
   - Add notifier for new crypto algorithms

  Algorithms:
   - Add OFB mode
   - Remove speck

  Drivers:
   - Remove x86/sha*-mb as they are buggy
   - Remove pcbc(aes) from x86/aesni
   - Improve performance of arm/ghash-ce by up to 85%
   - Implement CTS-CBC in arm64/aes-blk, faster by up to 50%
   - Remove PMULL based arm64/crc32 driver
   - Use PMULL in arm64/crct10dif
   - Add aes-ctr support in s5p-sss
   - Add caam/qi2 driver

  Others:
   - Pick better transform if one becomes available in crc-t10dif"

* 'linus' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6: (124 commits)
  crypto: chelsio - Update ntx queue received from cxgb4
  crypto: ccree - avoid implicit enum conversion
  crypto: caam - add SPDX license identifier to all files
  crypto: caam/qi - simplify CGR allocation, freeing
  crypto: mxs-dcp - make symbols 'sha1_null_hash' and 'sha256_null_hash' static
  crypto: arm64/aes-blk - ensure XTS mask is always loaded
  crypto: testmgr - fix sizeof() on COMP_BUF_SIZE
  crypto: chtls - remove set but not used variable 'csk'
  crypto: axis - fix platform_no_drv_owner.cocci warnings
  crypto: x86/aes-ni - fix build error following fpu template removal
  crypto: arm64/aes - fix handling sub-block CTS-CBC inputs
  crypto: caam/qi2 - avoid double export
  crypto: mxs-dcp - Fix AES issues
  crypto: mxs-dcp - Fix SHA null hashes and output length
  crypto: mxs-dcp - Implement sha import/export
  crypto: aegis/generic - fix for big endian systems
  crypto: morus/generic - fix for big endian systems
  crypto: lrw - fix rebase error after out of bounds fix
  crypto: cavium/nitrox - use pci_alloc_irq_vectors() while enabling MSI-X.
  crypto: cavium/nitrox - NITROX command queue changes.
  ...
This commit is contained in:
Linus Torvalds 2018-10-25 16:43:35 -07:00
commit 62606c224d
234 changed files with 11956 additions and 15864 deletions

View File

@ -191,21 +191,11 @@ Currently, the following pairs of encryption modes are supported:
- AES-256-XTS for contents and AES-256-CTS-CBC for filenames
- AES-128-CBC for contents and AES-128-CTS-CBC for filenames
- Speck128/256-XTS for contents and Speck128/256-CTS-CBC for filenames
It is strongly recommended to use AES-256-XTS for contents encryption.
AES-128-CBC was added only for low-powered embedded devices with
crypto accelerators such as CAAM or CESA that do not support XTS.
Similarly, Speck128/256 support was only added for older or low-end
CPUs which cannot do AES fast enough -- especially ARM CPUs which have
NEON instructions but not the Cryptography Extensions -- and for which
it would not otherwise be feasible to use encryption at all. It is
not recommended to use Speck on CPUs that have AES instructions.
Speck support is only available if it has been enabled in the crypto
API via CONFIG_CRYPTO_SPECK. Also, on ARM platforms, to get
acceptable performance CONFIG_CRYPTO_SPECK_NEON must be enabled.
New encryption modes can be added relatively easily, without changes
to individual filesystems. However, authenticated encryption (AE)
modes are not currently supported because of the difficulty of dealing

View File

@ -7578,14 +7578,6 @@ S: Supported
F: drivers/infiniband/hw/i40iw/
F: include/uapi/rdma/i40iw-abi.h
INTEL SHA MULTIBUFFER DRIVER
M: Megha Dey <megha.dey@linux.intel.com>
R: Tim Chen <tim.c.chen@linux.intel.com>
L: linux-crypto@vger.kernel.org
S: Supported
F: arch/x86/crypto/sha*-mb/
F: crypto/mcryptd.c
INTEL TELEMETRY DRIVER
M: Souvik Kumar Chakravarty <souvik.k.chakravarty@intel.com>
L: platform-driver-x86@vger.kernel.org

View File

@ -99,6 +99,7 @@ config CRYPTO_GHASH_ARM_CE
depends on KERNEL_MODE_NEON
select CRYPTO_HASH
select CRYPTO_CRYPTD
select CRYPTO_GF128MUL
help
Use an implementation of GHASH (used by the GCM AEAD chaining mode)
that uses the 64x64 to 128 bit polynomial multiplication (vmull.p64)
@ -121,10 +122,4 @@ config CRYPTO_CHACHA20_NEON
select CRYPTO_BLKCIPHER
select CRYPTO_CHACHA20
config CRYPTO_SPECK_NEON
tristate "NEON accelerated Speck cipher algorithms"
depends on KERNEL_MODE_NEON
select CRYPTO_BLKCIPHER
select CRYPTO_SPECK
endif

View File

@ -10,7 +10,6 @@ obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o
obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o
obj-$(CONFIG_CRYPTO_SHA512_ARM) += sha512-arm.o
obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha20-neon.o
obj-$(CONFIG_CRYPTO_SPECK_NEON) += speck-neon.o
ce-obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o
ce-obj-$(CONFIG_CRYPTO_SHA1_ARM_CE) += sha1-arm-ce.o
@ -54,7 +53,6 @@ ghash-arm-ce-y := ghash-ce-core.o ghash-ce-glue.o
crct10dif-arm-ce-y := crct10dif-ce-core.o crct10dif-ce-glue.o
crc32-arm-ce-y:= crc32-ce-core.o crc32-ce-glue.o
chacha20-neon-y := chacha20-neon-core.o chacha20-neon-glue.o
speck-neon-y := speck-neon-core.o speck-neon-glue.o
ifdef REGENERATE_ARM_CRYPTO
quiet_cmd_perl = PERL $@

View File

@ -18,6 +18,34 @@
* (at your option) any later version.
*/
/*
* NEON doesn't have a rotate instruction. The alternatives are, more or less:
*
* (a) vshl.u32 + vsri.u32 (needs temporary register)
* (b) vshl.u32 + vshr.u32 + vorr (needs temporary register)
* (c) vrev32.16 (16-bit rotations only)
* (d) vtbl.8 + vtbl.8 (multiple of 8 bits rotations only,
* needs index vector)
*
* ChaCha20 has 16, 12, 8, and 7-bit rotations. For the 12 and 7-bit
* rotations, the only choices are (a) and (b). We use (a) since it takes
* two-thirds the cycles of (b) on both Cortex-A7 and Cortex-A53.
*
* For the 16-bit rotation, we use vrev32.16 since it's consistently fastest
* and doesn't need a temporary register.
*
* For the 8-bit rotation, we use vtbl.8 + vtbl.8. On Cortex-A7, this sequence
* is twice as fast as (a), even when doing (a) on multiple registers
* simultaneously to eliminate the stall between vshl and vsri. Also, it
* parallelizes better when temporary registers are scarce.
*
* A disadvantage is that on Cortex-A53, the vtbl sequence is the same speed as
* (a), so the need to load the rotation table actually makes the vtbl method
* slightly slower overall on that CPU (~1.3% slower ChaCha20). Still, it
* seems to be a good compromise to get a more significant speed boost on some
* CPUs, e.g. ~4.8% faster ChaCha20 on Cortex-A7.
*/
#include <linux/linkage.h>
.text
@ -46,7 +74,9 @@ ENTRY(chacha20_block_xor_neon)
vmov q10, q2
vmov q11, q3
adr ip, .Lrol8_table
mov r3, #10
vld1.8 {d10}, [ip, :64]
.Ldoubleround:
// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
@ -62,9 +92,9 @@ ENTRY(chacha20_block_xor_neon)
// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
vadd.i32 q0, q0, q1
veor q4, q3, q0
vshl.u32 q3, q4, #8
vsri.u32 q3, q4, #24
veor q3, q3, q0
vtbl.8 d6, {d6}, d10
vtbl.8 d7, {d7}, d10
// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
vadd.i32 q2, q2, q3
@ -92,9 +122,9 @@ ENTRY(chacha20_block_xor_neon)
// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
vadd.i32 q0, q0, q1
veor q4, q3, q0
vshl.u32 q3, q4, #8
vsri.u32 q3, q4, #24
veor q3, q3, q0
vtbl.8 d6, {d6}, d10
vtbl.8 d7, {d7}, d10
// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
vadd.i32 q2, q2, q3
@ -139,13 +169,17 @@ ENTRY(chacha20_block_xor_neon)
bx lr
ENDPROC(chacha20_block_xor_neon)
.align 4
.Lctrinc: .word 0, 1, 2, 3
.Lrol8_table: .byte 3, 0, 1, 2, 7, 4, 5, 6
.align 5
ENTRY(chacha20_4block_xor_neon)
push {r4-r6, lr}
mov ip, sp // preserve the stack pointer
sub r3, sp, #0x20 // allocate a 32 byte buffer
bic r3, r3, #0x1f // aligned to 32 bytes
mov sp, r3
push {r4-r5}
mov r4, sp // preserve the stack pointer
sub ip, sp, #0x20 // allocate a 32 byte buffer
bic ip, ip, #0x1f // aligned to 32 bytes
mov sp, ip
// r0: Input state matrix, s
// r1: 4 data blocks output, o
@ -155,25 +189,24 @@ ENTRY(chacha20_4block_xor_neon)
// This function encrypts four consecutive ChaCha20 blocks by loading
// the state matrix in NEON registers four times. The algorithm performs
// each operation on the corresponding word of each state matrix, hence
// requires no word shuffling. For final XORing step we transpose the
// matrix by interleaving 32- and then 64-bit words, which allows us to
// do XOR in NEON registers.
// requires no word shuffling. The words are re-interleaved before the
// final addition of the original state and the XORing step.
//
// x0..15[0-3] = s0..3[0..3]
add r3, r0, #0x20
// x0..15[0-3] = s0..15[0-3]
add ip, r0, #0x20
vld1.32 {q0-q1}, [r0]
vld1.32 {q2-q3}, [r3]
vld1.32 {q2-q3}, [ip]
adr r3, CTRINC
adr r5, .Lctrinc
vdup.32 q15, d7[1]
vdup.32 q14, d7[0]
vld1.32 {q11}, [r3, :128]
vld1.32 {q4}, [r5, :128]
vdup.32 q13, d6[1]
vdup.32 q12, d6[0]
vadd.i32 q12, q12, q11 // x12 += counter values 0-3
vdup.32 q11, d5[1]
vdup.32 q10, d5[0]
vadd.u32 q12, q12, q4 // x12 += counter values 0-3
vdup.32 q9, d4[1]
vdup.32 q8, d4[0]
vdup.32 q7, d3[1]
@ -185,9 +218,13 @@ ENTRY(chacha20_4block_xor_neon)
vdup.32 q1, d0[1]
vdup.32 q0, d0[0]
adr ip, .Lrol8_table
mov r3, #10
b 1f
.Ldoubleround4:
vld1.32 {q8-q9}, [sp, :256]
1:
// x0 += x4, x12 = rotl32(x12 ^ x0, 16)
// x1 += x5, x13 = rotl32(x13 ^ x1, 16)
// x2 += x6, x14 = rotl32(x14 ^ x2, 16)
@ -236,24 +273,25 @@ ENTRY(chacha20_4block_xor_neon)
// x1 += x5, x13 = rotl32(x13 ^ x1, 8)
// x2 += x6, x14 = rotl32(x14 ^ x2, 8)
// x3 += x7, x15 = rotl32(x15 ^ x3, 8)
vld1.8 {d16}, [ip, :64]
vadd.i32 q0, q0, q4
vadd.i32 q1, q1, q5
vadd.i32 q2, q2, q6
vadd.i32 q3, q3, q7
veor q8, q12, q0
veor q9, q13, q1
vshl.u32 q12, q8, #8
vshl.u32 q13, q9, #8
vsri.u32 q12, q8, #24
vsri.u32 q13, q9, #24
veor q12, q12, q0
veor q13, q13, q1
veor q14, q14, q2
veor q15, q15, q3
veor q8, q14, q2
veor q9, q15, q3
vshl.u32 q14, q8, #8
vshl.u32 q15, q9, #8
vsri.u32 q14, q8, #24
vsri.u32 q15, q9, #24
vtbl.8 d24, {d24}, d16
vtbl.8 d25, {d25}, d16
vtbl.8 d26, {d26}, d16
vtbl.8 d27, {d27}, d16
vtbl.8 d28, {d28}, d16
vtbl.8 d29, {d29}, d16
vtbl.8 d30, {d30}, d16
vtbl.8 d31, {d31}, d16
vld1.32 {q8-q9}, [sp, :256]
@ -332,24 +370,25 @@ ENTRY(chacha20_4block_xor_neon)
// x1 += x6, x12 = rotl32(x12 ^ x1, 8)
// x2 += x7, x13 = rotl32(x13 ^ x2, 8)
// x3 += x4, x14 = rotl32(x14 ^ x3, 8)
vld1.8 {d16}, [ip, :64]
vadd.i32 q0, q0, q5
vadd.i32 q1, q1, q6
vadd.i32 q2, q2, q7
vadd.i32 q3, q3, q4
veor q8, q15, q0
veor q9, q12, q1
vshl.u32 q15, q8, #8
vshl.u32 q12, q9, #8
vsri.u32 q15, q8, #24
vsri.u32 q12, q9, #24
veor q15, q15, q0
veor q12, q12, q1
veor q13, q13, q2
veor q14, q14, q3
veor q8, q13, q2
veor q9, q14, q3
vshl.u32 q13, q8, #8
vshl.u32 q14, q9, #8
vsri.u32 q13, q8, #24
vsri.u32 q14, q9, #24
vtbl.8 d30, {d30}, d16
vtbl.8 d31, {d31}, d16
vtbl.8 d24, {d24}, d16
vtbl.8 d25, {d25}, d16
vtbl.8 d26, {d26}, d16
vtbl.8 d27, {d27}, d16
vtbl.8 d28, {d28}, d16
vtbl.8 d29, {d29}, d16
vld1.32 {q8-q9}, [sp, :256]
@ -379,104 +418,76 @@ ENTRY(chacha20_4block_xor_neon)
vsri.u32 q6, q9, #25
subs r3, r3, #1
beq 0f
bne .Ldoubleround4
vld1.32 {q8-q9}, [sp, :256]
b .Ldoubleround4
// x0..7[0-3] are in q0-q7, x10..15[0-3] are in q10-q15.
// x8..9[0-3] are on the stack.
// x0[0-3] += s0[0]
// x1[0-3] += s0[1]
// x2[0-3] += s0[2]
// x3[0-3] += s0[3]
0: ldmia r0!, {r3-r6}
vdup.32 q8, r3
vdup.32 q9, r4
vadd.i32 q0, q0, q8
vadd.i32 q1, q1, q9
vdup.32 q8, r5
vdup.32 q9, r6
vadd.i32 q2, q2, q8
vadd.i32 q3, q3, q9
// x4[0-3] += s1[0]
// x5[0-3] += s1[1]
// x6[0-3] += s1[2]
// x7[0-3] += s1[3]
ldmia r0!, {r3-r6}
vdup.32 q8, r3
vdup.32 q9, r4
vadd.i32 q4, q4, q8
vadd.i32 q5, q5, q9
vdup.32 q8, r5
vdup.32 q9, r6
vadd.i32 q6, q6, q8
vadd.i32 q7, q7, q9
// interleave 32-bit words in state n, n+1
vzip.32 q0, q1
vzip.32 q2, q3
vzip.32 q4, q5
vzip.32 q6, q7
// interleave 64-bit words in state n, n+2
// Re-interleave the words in the first two rows of each block (x0..7).
// Also add the counter values 0-3 to x12[0-3].
vld1.32 {q8}, [r5, :128] // load counter values 0-3
vzip.32 q0, q1 // => (0 1 0 1) (0 1 0 1)
vzip.32 q2, q3 // => (2 3 2 3) (2 3 2 3)
vzip.32 q4, q5 // => (4 5 4 5) (4 5 4 5)
vzip.32 q6, q7 // => (6 7 6 7) (6 7 6 7)
vadd.u32 q12, q8 // x12 += counter values 0-3
vswp d1, d4
vswp d3, d6
vld1.32 {q8-q9}, [r0]! // load s0..7
vswp d9, d12
vswp d11, d14
// xor with corresponding input, write to output
// Swap q1 and q4 so that we'll free up consecutive registers (q0-q1)
// after XORing the first 32 bytes.
vswp q1, q4
// First two rows of each block are (q0 q1) (q2 q6) (q4 q5) (q3 q7)
// x0..3[0-3] += s0..3[0-3] (add orig state to 1st row of each block)
vadd.u32 q0, q0, q8
vadd.u32 q2, q2, q8
vadd.u32 q4, q4, q8
vadd.u32 q3, q3, q8
// x4..7[0-3] += s4..7[0-3] (add orig state to 2nd row of each block)
vadd.u32 q1, q1, q9
vadd.u32 q6, q6, q9
vadd.u32 q5, q5, q9
vadd.u32 q7, q7, q9
// XOR first 32 bytes using keystream from first two rows of first block
vld1.8 {q8-q9}, [r2]!
veor q8, q8, q0
veor q9, q9, q4
veor q9, q9, q1
vst1.8 {q8-q9}, [r1]!
// Re-interleave the words in the last two rows of each block (x8..15).
vld1.32 {q8-q9}, [sp, :256]
// x8[0-3] += s2[0]
// x9[0-3] += s2[1]
// x10[0-3] += s2[2]
// x11[0-3] += s2[3]
ldmia r0!, {r3-r6}
vdup.32 q0, r3
vdup.32 q4, r4
vadd.i32 q8, q8, q0
vadd.i32 q9, q9, q4
vdup.32 q0, r5
vdup.32 q4, r6
vadd.i32 q10, q10, q0
vadd.i32 q11, q11, q4
// x12[0-3] += s3[0]
// x13[0-3] += s3[1]
// x14[0-3] += s3[2]
// x15[0-3] += s3[3]
ldmia r0!, {r3-r6}
vdup.32 q0, r3
vdup.32 q4, r4
adr r3, CTRINC
vadd.i32 q12, q12, q0
vld1.32 {q0}, [r3, :128]
vadd.i32 q13, q13, q4
vadd.i32 q12, q12, q0 // x12 += counter values 0-3
vdup.32 q0, r5
vdup.32 q4, r6
vadd.i32 q14, q14, q0
vadd.i32 q15, q15, q4
// interleave 32-bit words in state n, n+1
vzip.32 q8, q9
vzip.32 q10, q11
vzip.32 q12, q13
vzip.32 q14, q15
// interleave 64-bit words in state n, n+2
vswp d17, d20
vswp d19, d22
vzip.32 q12, q13 // => (12 13 12 13) (12 13 12 13)
vzip.32 q14, q15 // => (14 15 14 15) (14 15 14 15)
vzip.32 q8, q9 // => (8 9 8 9) (8 9 8 9)
vzip.32 q10, q11 // => (10 11 10 11) (10 11 10 11)
vld1.32 {q0-q1}, [r0] // load s8..15
vswp d25, d28
vswp d27, d30
vswp d17, d20
vswp d19, d22
vmov q4, q1
// Last two rows of each block are (q8 q12) (q10 q14) (q9 q13) (q11 q15)
// x8..11[0-3] += s8..11[0-3] (add orig state to 3rd row of each block)
vadd.u32 q8, q8, q0
vadd.u32 q10, q10, q0
vadd.u32 q9, q9, q0
vadd.u32 q11, q11, q0
// x12..15[0-3] += s12..15[0-3] (add orig state to 4th row of each block)
vadd.u32 q12, q12, q1
vadd.u32 q14, q14, q1
vadd.u32 q13, q13, q1
vadd.u32 q15, q15, q1
// XOR the rest of the data with the keystream
vld1.8 {q0-q1}, [r2]!
veor q0, q0, q8
@ -509,13 +520,11 @@ ENTRY(chacha20_4block_xor_neon)
vst1.8 {q0-q1}, [r1]!
vld1.8 {q0-q1}, [r2]
mov sp, r4 // restore original stack pointer
veor q0, q0, q11
veor q1, q1, q15
vst1.8 {q0-q1}, [r1]
mov sp, ip
pop {r4-r6, pc}
pop {r4-r5}
bx lr
ENDPROC(chacha20_4block_xor_neon)
.align 4
CTRINC: .word 0, 1, 2, 3

View File

@ -236,7 +236,7 @@ static void __exit crc32_pmull_mod_exit(void)
ARRAY_SIZE(crc32_pmull_algs));
}
static const struct cpu_feature crc32_cpu_feature[] = {
static const struct cpu_feature __maybe_unused crc32_cpu_feature[] = {
{ cpu_feature(CRC32) }, { cpu_feature(PMULL) }, { }
};
MODULE_DEVICE_TABLE(cpu, crc32_cpu_feature);

View File

@ -63,6 +63,33 @@
k48 .req d31
SHASH2_p64 .req d31
HH .req q10
HH3 .req q11
HH4 .req q12
HH34 .req q13
HH_L .req d20
HH_H .req d21
HH3_L .req d22
HH3_H .req d23
HH4_L .req d24
HH4_H .req d25
HH34_L .req d26
HH34_H .req d27
SHASH2_H .req d29
XL2 .req q5
XM2 .req q6
XH2 .req q7
T3 .req q8
XL2_L .req d10
XL2_H .req d11
XM2_L .req d12
XM2_H .req d13
T3_L .req d16
T3_H .req d17
.text
.fpu crypto-neon-fp-armv8
@ -175,12 +202,77 @@
beq 0f
vld1.64 {T1}, [ip]
teq r0, #0
b 1f
b 3f
0: vld1.64 {T1}, [r2]!
0: .ifc \pn, p64
tst r0, #3 // skip until #blocks is a
bne 2f // round multiple of 4
vld1.8 {XL2-XM2}, [r2]!
1: vld1.8 {T3-T2}, [r2]!
vrev64.8 XL2, XL2
vrev64.8 XM2, XM2
subs r0, r0, #4
vext.8 T1, XL2, XL2, #8
veor XL2_H, XL2_H, XL_L
veor XL, XL, T1
vrev64.8 T3, T3
vrev64.8 T1, T2
vmull.p64 XH, HH4_H, XL_H // a1 * b1
veor XL2_H, XL2_H, XL_H
vmull.p64 XL, HH4_L, XL_L // a0 * b0
vmull.p64 XM, HH34_H, XL2_H // (a1 + a0)(b1 + b0)
vmull.p64 XH2, HH3_H, XM2_L // a1 * b1
veor XM2_L, XM2_L, XM2_H
vmull.p64 XL2, HH3_L, XM2_H // a0 * b0
vmull.p64 XM2, HH34_L, XM2_L // (a1 + a0)(b1 + b0)
veor XH, XH, XH2
veor XL, XL, XL2
veor XM, XM, XM2
vmull.p64 XH2, HH_H, T3_L // a1 * b1
veor T3_L, T3_L, T3_H
vmull.p64 XL2, HH_L, T3_H // a0 * b0
vmull.p64 XM2, SHASH2_H, T3_L // (a1 + a0)(b1 + b0)
veor XH, XH, XH2
veor XL, XL, XL2
veor XM, XM, XM2
vmull.p64 XH2, SHASH_H, T1_L // a1 * b1
veor T1_L, T1_L, T1_H
vmull.p64 XL2, SHASH_L, T1_H // a0 * b0
vmull.p64 XM2, SHASH2_p64, T1_L // (a1 + a0)(b1 + b0)
veor XH, XH, XH2
veor XL, XL, XL2
veor XM, XM, XM2
beq 4f
vld1.8 {XL2-XM2}, [r2]!
veor T1, XL, XH
veor XM, XM, T1
__pmull_reduce_p64
veor T1, T1, XH
veor XL, XL, T1
b 1b
.endif
2: vld1.64 {T1}, [r2]!
subs r0, r0, #1
1: /* multiply XL by SHASH in GF(2^128) */
3: /* multiply XL by SHASH in GF(2^128) */
#ifndef CONFIG_CPU_BIG_ENDIAN
vrev64.8 T1, T1
#endif
@ -193,7 +285,7 @@
__pmull_\pn XL, XL_L, SHASH_L, s1l, s2l, s3l, s4l @ a0 * b0
__pmull_\pn XM, T1_L, SHASH2_\pn @ (a1+a0)(b1+b0)
veor T1, XL, XH
4: veor T1, XL, XH
veor XM, XM, T1
__pmull_reduce_\pn
@ -212,8 +304,14 @@
* struct ghash_key const *k, const char *head)
*/
ENTRY(pmull_ghash_update_p64)
vld1.64 {SHASH}, [r3]
vld1.64 {SHASH}, [r3]!
vld1.64 {HH}, [r3]!
vld1.64 {HH3-HH4}, [r3]
veor SHASH2_p64, SHASH_L, SHASH_H
veor SHASH2_H, HH_L, HH_H
veor HH34_L, HH3_L, HH3_H
veor HH34_H, HH4_L, HH4_H
vmov.i8 MASK, #0xe1
vshl.u64 MASK, MASK, #57

View File

@ -1,7 +1,7 @@
/*
* Accelerated GHASH implementation with ARMv8 vmull.p64 instructions.
*
* Copyright (C) 2015 Linaro Ltd. <ard.biesheuvel@linaro.org>
* Copyright (C) 2015 - 2018 Linaro Ltd. <ard.biesheuvel@linaro.org>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published
@ -28,8 +28,10 @@ MODULE_ALIAS_CRYPTO("ghash");
#define GHASH_DIGEST_SIZE 16
struct ghash_key {
u64 a;
u64 b;
u64 h[2];
u64 h2[2];
u64 h3[2];
u64 h4[2];
};
struct ghash_desc_ctx {
@ -117,26 +119,40 @@ static int ghash_final(struct shash_desc *desc, u8 *dst)
return 0;
}
static void ghash_reflect(u64 h[], const be128 *k)
{
u64 carry = be64_to_cpu(k->a) >> 63;
h[0] = (be64_to_cpu(k->b) << 1) | carry;
h[1] = (be64_to_cpu(k->a) << 1) | (be64_to_cpu(k->b) >> 63);
if (carry)
h[1] ^= 0xc200000000000000UL;
}
static int ghash_setkey(struct crypto_shash *tfm,
const u8 *inkey, unsigned int keylen)
{
struct ghash_key *key = crypto_shash_ctx(tfm);
u64 a, b;
be128 h, k;
if (keylen != GHASH_BLOCK_SIZE) {
crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
return -EINVAL;
}
/* perform multiplication by 'x' in GF(2^128) */
b = get_unaligned_be64(inkey);
a = get_unaligned_be64(inkey + 8);
memcpy(&k, inkey, GHASH_BLOCK_SIZE);
ghash_reflect(key->h, &k);
key->a = (a << 1) | (b >> 63);
key->b = (b << 1) | (a >> 63);
h = k;
gf128mul_lle(&h, &k);
ghash_reflect(key->h2, &h);
if (b >> 63)
key->b ^= 0xc200000000000000UL;
gf128mul_lle(&h, &k);
ghash_reflect(key->h3, &h);
gf128mul_lle(&h, &k);
ghash_reflect(key->h4, &h);
return 0;
}

View File

@ -1,434 +0,0 @@
// SPDX-License-Identifier: GPL-2.0
/*
* NEON-accelerated implementation of Speck128-XTS and Speck64-XTS
*
* Copyright (c) 2018 Google, Inc
*
* Author: Eric Biggers <ebiggers@google.com>
*/
#include <linux/linkage.h>
.text
.fpu neon
// arguments
ROUND_KEYS .req r0 // const {u64,u32} *round_keys
NROUNDS .req r1 // int nrounds
DST .req r2 // void *dst
SRC .req r3 // const void *src
NBYTES .req r4 // unsigned int nbytes
TWEAK .req r5 // void *tweak
// registers which hold the data being encrypted/decrypted
X0 .req q0
X0_L .req d0
X0_H .req d1
Y0 .req q1
Y0_H .req d3
X1 .req q2
X1_L .req d4
X1_H .req d5
Y1 .req q3
Y1_H .req d7
X2 .req q4
X2_L .req d8
X2_H .req d9
Y2 .req q5
Y2_H .req d11
X3 .req q6
X3_L .req d12
X3_H .req d13
Y3 .req q7
Y3_H .req d15
// the round key, duplicated in all lanes
ROUND_KEY .req q8
ROUND_KEY_L .req d16
ROUND_KEY_H .req d17
// index vector for vtbl-based 8-bit rotates
ROTATE_TABLE .req d18
// multiplication table for updating XTS tweaks
GF128MUL_TABLE .req d19
GF64MUL_TABLE .req d19
// current XTS tweak value(s)
TWEAKV .req q10
TWEAKV_L .req d20
TWEAKV_H .req d21
TMP0 .req q12
TMP0_L .req d24
TMP0_H .req d25
TMP1 .req q13
TMP2 .req q14
TMP3 .req q15
.align 4
.Lror64_8_table:
.byte 1, 2, 3, 4, 5, 6, 7, 0
.Lror32_8_table:
.byte 1, 2, 3, 0, 5, 6, 7, 4
.Lrol64_8_table:
.byte 7, 0, 1, 2, 3, 4, 5, 6
.Lrol32_8_table:
.byte 3, 0, 1, 2, 7, 4, 5, 6
.Lgf128mul_table:
.byte 0, 0x87
.fill 14
.Lgf64mul_table:
.byte 0, 0x1b, (0x1b << 1), (0x1b << 1) ^ 0x1b
.fill 12
/*
* _speck_round_128bytes() - Speck encryption round on 128 bytes at a time
*
* Do one Speck encryption round on the 128 bytes (8 blocks for Speck128, 16 for
* Speck64) stored in X0-X3 and Y0-Y3, using the round key stored in all lanes
* of ROUND_KEY. 'n' is the lane size: 64 for Speck128, or 32 for Speck64.
*
* The 8-bit rotates are implemented using vtbl instead of vshr + vsli because
* the vtbl approach is faster on some processors and the same speed on others.
*/
.macro _speck_round_128bytes n
// x = ror(x, 8)
vtbl.8 X0_L, {X0_L}, ROTATE_TABLE
vtbl.8 X0_H, {X0_H}, ROTATE_TABLE
vtbl.8 X1_L, {X1_L}, ROTATE_TABLE
vtbl.8 X1_H, {X1_H}, ROTATE_TABLE
vtbl.8 X2_L, {X2_L}, ROTATE_TABLE
vtbl.8 X2_H, {X2_H}, ROTATE_TABLE
vtbl.8 X3_L, {X3_L}, ROTATE_TABLE
vtbl.8 X3_H, {X3_H}, ROTATE_TABLE
// x += y
vadd.u\n X0, Y0
vadd.u\n X1, Y1
vadd.u\n X2, Y2
vadd.u\n X3, Y3
// x ^= k
veor X0, ROUND_KEY
veor X1, ROUND_KEY
veor X2, ROUND_KEY
veor X3, ROUND_KEY
// y = rol(y, 3)
vshl.u\n TMP0, Y0, #3
vshl.u\n TMP1, Y1, #3
vshl.u\n TMP2, Y2, #3
vshl.u\n TMP3, Y3, #3
vsri.u\n TMP0, Y0, #(\n - 3)
vsri.u\n TMP1, Y1, #(\n - 3)
vsri.u\n TMP2, Y2, #(\n - 3)
vsri.u\n TMP3, Y3, #(\n - 3)
// y ^= x
veor Y0, TMP0, X0
veor Y1, TMP1, X1
veor Y2, TMP2, X2
veor Y3, TMP3, X3
.endm
/*
* _speck_unround_128bytes() - Speck decryption round on 128 bytes at a time
*
* This is the inverse of _speck_round_128bytes().
*/
.macro _speck_unround_128bytes n
// y ^= x
veor TMP0, Y0, X0
veor TMP1, Y1, X1
veor TMP2, Y2, X2
veor TMP3, Y3, X3
// y = ror(y, 3)
vshr.u\n Y0, TMP0, #3
vshr.u\n Y1, TMP1, #3
vshr.u\n Y2, TMP2, #3
vshr.u\n Y3, TMP3, #3
vsli.u\n Y0, TMP0, #(\n - 3)
vsli.u\n Y1, TMP1, #(\n - 3)
vsli.u\n Y2, TMP2, #(\n - 3)
vsli.u\n Y3, TMP3, #(\n - 3)
// x ^= k
veor X0, ROUND_KEY
veor X1, ROUND_KEY
veor X2, ROUND_KEY
veor X3, ROUND_KEY
// x -= y
vsub.u\n X0, Y0
vsub.u\n X1, Y1
vsub.u\n X2, Y2
vsub.u\n X3, Y3
// x = rol(x, 8);
vtbl.8 X0_L, {X0_L}, ROTATE_TABLE
vtbl.8 X0_H, {X0_H}, ROTATE_TABLE
vtbl.8 X1_L, {X1_L}, ROTATE_TABLE
vtbl.8 X1_H, {X1_H}, ROTATE_TABLE
vtbl.8 X2_L, {X2_L}, ROTATE_TABLE
vtbl.8 X2_H, {X2_H}, ROTATE_TABLE
vtbl.8 X3_L, {X3_L}, ROTATE_TABLE
vtbl.8 X3_H, {X3_H}, ROTATE_TABLE
.endm
.macro _xts128_precrypt_one dst_reg, tweak_buf, tmp
// Load the next source block
vld1.8 {\dst_reg}, [SRC]!
// Save the current tweak in the tweak buffer
vst1.8 {TWEAKV}, [\tweak_buf:128]!
// XOR the next source block with the current tweak
veor \dst_reg, TWEAKV
/*
* Calculate the next tweak by multiplying the current one by x,
* modulo p(x) = x^128 + x^7 + x^2 + x + 1.
*/
vshr.u64 \tmp, TWEAKV, #63
vshl.u64 TWEAKV, #1
veor TWEAKV_H, \tmp\()_L
vtbl.8 \tmp\()_H, {GF128MUL_TABLE}, \tmp\()_H
veor TWEAKV_L, \tmp\()_H
.endm
.macro _xts64_precrypt_two dst_reg, tweak_buf, tmp
// Load the next two source blocks
vld1.8 {\dst_reg}, [SRC]!
// Save the current two tweaks in the tweak buffer
vst1.8 {TWEAKV}, [\tweak_buf:128]!
// XOR the next two source blocks with the current two tweaks
veor \dst_reg, TWEAKV
/*
* Calculate the next two tweaks by multiplying the current ones by x^2,
* modulo p(x) = x^64 + x^4 + x^3 + x + 1.
*/
vshr.u64 \tmp, TWEAKV, #62
vshl.u64 TWEAKV, #2
vtbl.8 \tmp\()_L, {GF64MUL_TABLE}, \tmp\()_L
vtbl.8 \tmp\()_H, {GF64MUL_TABLE}, \tmp\()_H
veor TWEAKV, \tmp
.endm
/*
* _speck_xts_crypt() - Speck-XTS encryption/decryption
*
* Encrypt or decrypt NBYTES bytes of data from the SRC buffer to the DST buffer
* using Speck-XTS, specifically the variant with a block size of '2n' and round
* count given by NROUNDS. The expanded round keys are given in ROUND_KEYS, and
* the current XTS tweak value is given in TWEAK. It's assumed that NBYTES is a
* nonzero multiple of 128.
*/
.macro _speck_xts_crypt n, decrypting
push {r4-r7}
mov r7, sp
/*
* The first four parameters were passed in registers r0-r3. Load the
* additional parameters, which were passed on the stack.
*/
ldr NBYTES, [sp, #16]
ldr TWEAK, [sp, #20]
/*
* If decrypting, modify the ROUND_KEYS parameter to point to the last
* round key rather than the first, since for decryption the round keys
* are used in reverse order.
*/
.if \decrypting
.if \n == 64
add ROUND_KEYS, ROUND_KEYS, NROUNDS, lsl #3
sub ROUND_KEYS, #8
.else
add ROUND_KEYS, ROUND_KEYS, NROUNDS, lsl #2
sub ROUND_KEYS, #4
.endif
.endif
// Load the index vector for vtbl-based 8-bit rotates
.if \decrypting
ldr r12, =.Lrol\n\()_8_table
.else
ldr r12, =.Lror\n\()_8_table
.endif
vld1.8 {ROTATE_TABLE}, [r12:64]
// One-time XTS preparation
/*
* Allocate stack space to store 128 bytes worth of tweaks. For
* performance, this space is aligned to a 16-byte boundary so that we
* can use the load/store instructions that declare 16-byte alignment.
* For Thumb2 compatibility, don't do the 'bic' directly on 'sp'.
*/
sub r12, sp, #128
bic r12, #0xf
mov sp, r12
.if \n == 64
// Load first tweak
vld1.8 {TWEAKV}, [TWEAK]
// Load GF(2^128) multiplication table
ldr r12, =.Lgf128mul_table
vld1.8 {GF128MUL_TABLE}, [r12:64]
.else
// Load first tweak
vld1.8 {TWEAKV_L}, [TWEAK]
// Load GF(2^64) multiplication table
ldr r12, =.Lgf64mul_table
vld1.8 {GF64MUL_TABLE}, [r12:64]
// Calculate second tweak, packing it together with the first
vshr.u64 TMP0_L, TWEAKV_L, #63
vtbl.u8 TMP0_L, {GF64MUL_TABLE}, TMP0_L
vshl.u64 TWEAKV_H, TWEAKV_L, #1
veor TWEAKV_H, TMP0_L
.endif
.Lnext_128bytes_\@:
/*
* Load the source blocks into {X,Y}[0-3], XOR them with their XTS tweak
* values, and save the tweaks on the stack for later. Then
* de-interleave the 'x' and 'y' elements of each block, i.e. make it so
* that the X[0-3] registers contain only the second halves of blocks,
* and the Y[0-3] registers contain only the first halves of blocks.
* (Speck uses the order (y, x) rather than the more intuitive (x, y).)
*/
mov r12, sp
.if \n == 64
_xts128_precrypt_one X0, r12, TMP0
_xts128_precrypt_one Y0, r12, TMP0
_xts128_precrypt_one X1, r12, TMP0
_xts128_precrypt_one Y1, r12, TMP0
_xts128_precrypt_one X2, r12, TMP0
_xts128_precrypt_one Y2, r12, TMP0
_xts128_precrypt_one X3, r12, TMP0
_xts128_precrypt_one Y3, r12, TMP0
vswp X0_L, Y0_H
vswp X1_L, Y1_H
vswp X2_L, Y2_H
vswp X3_L, Y3_H
.else
_xts64_precrypt_two X0, r12, TMP0
_xts64_precrypt_two Y0, r12, TMP0
_xts64_precrypt_two X1, r12, TMP0
_xts64_precrypt_two Y1, r12, TMP0
_xts64_precrypt_two X2, r12, TMP0
_xts64_precrypt_two Y2, r12, TMP0
_xts64_precrypt_two X3, r12, TMP0
_xts64_precrypt_two Y3, r12, TMP0
vuzp.32 Y0, X0
vuzp.32 Y1, X1
vuzp.32 Y2, X2
vuzp.32 Y3, X3
.endif
// Do the cipher rounds
mov r12, ROUND_KEYS
mov r6, NROUNDS
.Lnext_round_\@:
.if \decrypting
.if \n == 64
vld1.64 ROUND_KEY_L, [r12]
sub r12, #8
vmov ROUND_KEY_H, ROUND_KEY_L
.else
vld1.32 {ROUND_KEY_L[],ROUND_KEY_H[]}, [r12]
sub r12, #4
.endif
_speck_unround_128bytes \n
.else
.if \n == 64
vld1.64 ROUND_KEY_L, [r12]!
vmov ROUND_KEY_H, ROUND_KEY_L
.else
vld1.32 {ROUND_KEY_L[],ROUND_KEY_H[]}, [r12]!
.endif
_speck_round_128bytes \n
.endif
subs r6, r6, #1
bne .Lnext_round_\@
// Re-interleave the 'x' and 'y' elements of each block
.if \n == 64
vswp X0_L, Y0_H
vswp X1_L, Y1_H
vswp X2_L, Y2_H
vswp X3_L, Y3_H
.else
vzip.32 Y0, X0
vzip.32 Y1, X1
vzip.32 Y2, X2
vzip.32 Y3, X3
.endif
// XOR the encrypted/decrypted blocks with the tweaks we saved earlier
mov r12, sp
vld1.8 {TMP0, TMP1}, [r12:128]!
vld1.8 {TMP2, TMP3}, [r12:128]!
veor X0, TMP0
veor Y0, TMP1
veor X1, TMP2
veor Y1, TMP3
vld1.8 {TMP0, TMP1}, [r12:128]!
vld1.8 {TMP2, TMP3}, [r12:128]!
veor X2, TMP0
veor Y2, TMP1
veor X3, TMP2
veor Y3, TMP3
// Store the ciphertext in the destination buffer
vst1.8 {X0, Y0}, [DST]!
vst1.8 {X1, Y1}, [DST]!
vst1.8 {X2, Y2}, [DST]!
vst1.8 {X3, Y3}, [DST]!
// Continue if there are more 128-byte chunks remaining, else return
subs NBYTES, #128
bne .Lnext_128bytes_\@
// Store the next tweak
.if \n == 64
vst1.8 {TWEAKV}, [TWEAK]
.else
vst1.8 {TWEAKV_L}, [TWEAK]
.endif
mov sp, r7
pop {r4-r7}
bx lr
.endm
ENTRY(speck128_xts_encrypt_neon)
_speck_xts_crypt n=64, decrypting=0
ENDPROC(speck128_xts_encrypt_neon)
ENTRY(speck128_xts_decrypt_neon)
_speck_xts_crypt n=64, decrypting=1
ENDPROC(speck128_xts_decrypt_neon)
ENTRY(speck64_xts_encrypt_neon)
_speck_xts_crypt n=32, decrypting=0
ENDPROC(speck64_xts_encrypt_neon)
ENTRY(speck64_xts_decrypt_neon)
_speck_xts_crypt n=32, decrypting=1
ENDPROC(speck64_xts_decrypt_neon)

View File

@ -1,288 +0,0 @@
// SPDX-License-Identifier: GPL-2.0
/*
* NEON-accelerated implementation of Speck128-XTS and Speck64-XTS
*
* Copyright (c) 2018 Google, Inc
*
* Note: the NIST recommendation for XTS only specifies a 128-bit block size,
* but a 64-bit version (needed for Speck64) is fairly straightforward; the math
* is just done in GF(2^64) instead of GF(2^128), with the reducing polynomial
* x^64 + x^4 + x^3 + x + 1 from the original XEX paper (Rogaway, 2004:
* "Efficient Instantiations of Tweakable Blockciphers and Refinements to Modes
* OCB and PMAC"), represented as 0x1B.
*/
#include <asm/hwcap.h>
#include <asm/neon.h>
#include <asm/simd.h>
#include <crypto/algapi.h>
#include <crypto/gf128mul.h>
#include <crypto/internal/skcipher.h>
#include <crypto/speck.h>
#include <crypto/xts.h>
#include <linux/kernel.h>
#include <linux/module.h>
/* The assembly functions only handle multiples of 128 bytes */
#define SPECK_NEON_CHUNK_SIZE 128
/* Speck128 */
struct speck128_xts_tfm_ctx {
struct speck128_tfm_ctx main_key;
struct speck128_tfm_ctx tweak_key;
};
asmlinkage void speck128_xts_encrypt_neon(const u64 *round_keys, int nrounds,
void *dst, const void *src,
unsigned int nbytes, void *tweak);
asmlinkage void speck128_xts_decrypt_neon(const u64 *round_keys, int nrounds,
void *dst, const void *src,
unsigned int nbytes, void *tweak);
typedef void (*speck128_crypt_one_t)(const struct speck128_tfm_ctx *,
u8 *, const u8 *);
typedef void (*speck128_xts_crypt_many_t)(const u64 *, int, void *,
const void *, unsigned int, void *);
static __always_inline int
__speck128_xts_crypt(struct skcipher_request *req,
speck128_crypt_one_t crypt_one,
speck128_xts_crypt_many_t crypt_many)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
const struct speck128_xts_tfm_ctx *ctx = crypto_skcipher_ctx(tfm);
struct skcipher_walk walk;
le128 tweak;
int err;
err = skcipher_walk_virt(&walk, req, true);
crypto_speck128_encrypt(&ctx->tweak_key, (u8 *)&tweak, walk.iv);
while (walk.nbytes > 0) {
unsigned int nbytes = walk.nbytes;
u8 *dst = walk.dst.virt.addr;
const u8 *src = walk.src.virt.addr;
if (nbytes >= SPECK_NEON_CHUNK_SIZE && may_use_simd()) {
unsigned int count;
count = round_down(nbytes, SPECK_NEON_CHUNK_SIZE);
kernel_neon_begin();
(*crypt_many)(ctx->main_key.round_keys,
ctx->main_key.nrounds,
dst, src, count, &tweak);
kernel_neon_end();
dst += count;
src += count;
nbytes -= count;
}
/* Handle any remainder with generic code */
while (nbytes >= sizeof(tweak)) {
le128_xor((le128 *)dst, (const le128 *)src, &tweak);
(*crypt_one)(&ctx->main_key, dst, dst);
le128_xor((le128 *)dst, (const le128 *)dst, &tweak);
gf128mul_x_ble(&tweak, &tweak);
dst += sizeof(tweak);
src += sizeof(tweak);
nbytes -= sizeof(tweak);
}
err = skcipher_walk_done(&walk, nbytes);
}
return err;
}
static int speck128_xts_encrypt(struct skcipher_request *req)
{
return __speck128_xts_crypt(req, crypto_speck128_encrypt,
speck128_xts_encrypt_neon);
}
static int speck128_xts_decrypt(struct skcipher_request *req)
{
return __speck128_xts_crypt(req, crypto_speck128_decrypt,
speck128_xts_decrypt_neon);
}
static int speck128_xts_setkey(struct crypto_skcipher *tfm, const u8 *key,
unsigned int keylen)
{
struct speck128_xts_tfm_ctx *ctx = crypto_skcipher_ctx(tfm);
int err;
err = xts_verify_key(tfm, key, keylen);
if (err)
return err;
keylen /= 2;
err = crypto_speck128_setkey(&ctx->main_key, key, keylen);
if (err)
return err;
return crypto_speck128_setkey(&ctx->tweak_key, key + keylen, keylen);
}
/* Speck64 */
struct speck64_xts_tfm_ctx {
struct speck64_tfm_ctx main_key;
struct speck64_tfm_ctx tweak_key;
};
asmlinkage void speck64_xts_encrypt_neon(const u32 *round_keys, int nrounds,
void *dst, const void *src,
unsigned int nbytes, void *tweak);
asmlinkage void speck64_xts_decrypt_neon(const u32 *round_keys, int nrounds,
void *dst, const void *src,
unsigned int nbytes, void *tweak);
typedef void (*speck64_crypt_one_t)(const struct speck64_tfm_ctx *,
u8 *, const u8 *);
typedef void (*speck64_xts_crypt_many_t)(const u32 *, int, void *,
const void *, unsigned int, void *);
static __always_inline int
__speck64_xts_crypt(struct skcipher_request *req, speck64_crypt_one_t crypt_one,
speck64_xts_crypt_many_t crypt_many)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
const struct speck64_xts_tfm_ctx *ctx = crypto_skcipher_ctx(tfm);
struct skcipher_walk walk;
__le64 tweak;
int err;
err = skcipher_walk_virt(&walk, req, true);
crypto_speck64_encrypt(&ctx->tweak_key, (u8 *)&tweak, walk.iv);
while (walk.nbytes > 0) {
unsigned int nbytes = walk.nbytes;
u8 *dst = walk.dst.virt.addr;
const u8 *src = walk.src.virt.addr;
if (nbytes >= SPECK_NEON_CHUNK_SIZE && may_use_simd()) {
unsigned int count;
count = round_down(nbytes, SPECK_NEON_CHUNK_SIZE);
kernel_neon_begin();
(*crypt_many)(ctx->main_key.round_keys,
ctx->main_key.nrounds,
dst, src, count, &tweak);
kernel_neon_end();
dst += count;
src += count;
nbytes -= count;
}
/* Handle any remainder with generic code */
while (nbytes >= sizeof(tweak)) {
*(__le64 *)dst = *(__le64 *)src ^ tweak;
(*crypt_one)(&ctx->main_key, dst, dst);
*(__le64 *)dst ^= tweak;
tweak = cpu_to_le64((le64_to_cpu(tweak) << 1) ^
((tweak & cpu_to_le64(1ULL << 63)) ?
0x1B : 0));
dst += sizeof(tweak);
src += sizeof(tweak);
nbytes -= sizeof(tweak);
}
err = skcipher_walk_done(&walk, nbytes);
}
return err;
}
static int speck64_xts_encrypt(struct skcipher_request *req)
{
return __speck64_xts_crypt(req, crypto_speck64_encrypt,
speck64_xts_encrypt_neon);
}
static int speck64_xts_decrypt(struct skcipher_request *req)
{
return __speck64_xts_crypt(req, crypto_speck64_decrypt,
speck64_xts_decrypt_neon);
}
static int speck64_xts_setkey(struct crypto_skcipher *tfm, const u8 *key,
unsigned int keylen)
{
struct speck64_xts_tfm_ctx *ctx = crypto_skcipher_ctx(tfm);
int err;
err = xts_verify_key(tfm, key, keylen);
if (err)
return err;
keylen /= 2;
err = crypto_speck64_setkey(&ctx->main_key, key, keylen);
if (err)
return err;
return crypto_speck64_setkey(&ctx->tweak_key, key + keylen, keylen);
}
static struct skcipher_alg speck_algs[] = {
{
.base.cra_name = "xts(speck128)",
.base.cra_driver_name = "xts-speck128-neon",
.base.cra_priority = 300,
.base.cra_blocksize = SPECK128_BLOCK_SIZE,
.base.cra_ctxsize = sizeof(struct speck128_xts_tfm_ctx),
.base.cra_alignmask = 7,
.base.cra_module = THIS_MODULE,
.min_keysize = 2 * SPECK128_128_KEY_SIZE,
.max_keysize = 2 * SPECK128_256_KEY_SIZE,
.ivsize = SPECK128_BLOCK_SIZE,
.walksize = SPECK_NEON_CHUNK_SIZE,
.setkey = speck128_xts_setkey,
.encrypt = speck128_xts_encrypt,
.decrypt = speck128_xts_decrypt,
}, {
.base.cra_name = "xts(speck64)",
.base.cra_driver_name = "xts-speck64-neon",
.base.cra_priority = 300,
.base.cra_blocksize = SPECK64_BLOCK_SIZE,
.base.cra_ctxsize = sizeof(struct speck64_xts_tfm_ctx),
.base.cra_alignmask = 7,
.base.cra_module = THIS_MODULE,
.min_keysize = 2 * SPECK64_96_KEY_SIZE,
.max_keysize = 2 * SPECK64_128_KEY_SIZE,
.ivsize = SPECK64_BLOCK_SIZE,
.walksize = SPECK_NEON_CHUNK_SIZE,
.setkey = speck64_xts_setkey,
.encrypt = speck64_xts_encrypt,
.decrypt = speck64_xts_decrypt,
}
};
static int __init speck_neon_module_init(void)
{
if (!(elf_hwcap & HWCAP_NEON))
return -ENODEV;
return crypto_register_skciphers(speck_algs, ARRAY_SIZE(speck_algs));
}
static void __exit speck_neon_module_exit(void)
{
crypto_unregister_skciphers(speck_algs, ARRAY_SIZE(speck_algs));
}
module_init(speck_neon_module_init);
module_exit(speck_neon_module_exit);
MODULE_DESCRIPTION("Speck block cipher (NEON-accelerated)");
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Eric Biggers <ebiggers@google.com>");
MODULE_ALIAS_CRYPTO("xts(speck128)");
MODULE_ALIAS_CRYPTO("xts-speck128-neon");
MODULE_ALIAS_CRYPTO("xts(speck64)");
MODULE_ALIAS_CRYPTO("xts-speck64-neon");

View File

@ -698,6 +698,7 @@ CONFIG_MEMTEST=y
CONFIG_SECURITY=y
CONFIG_CRYPTO_ECHAINIV=y
CONFIG_CRYPTO_ANSI_CPRNG=y
CONFIG_CRYPTO_DEV_FSL_DPAA2_CAAM=y
CONFIG_ARM64_CRYPTO=y
CONFIG_CRYPTO_SHA1_ARM64_CE=y
CONFIG_CRYPTO_SHA2_ARM64_CE=y
@ -706,7 +707,6 @@ CONFIG_CRYPTO_SHA3_ARM64=m
CONFIG_CRYPTO_SM3_ARM64_CE=m
CONFIG_CRYPTO_GHASH_ARM64_CE=y
CONFIG_CRYPTO_CRCT10DIF_ARM64_CE=m
CONFIG_CRYPTO_CRC32_ARM64_CE=m
CONFIG_CRYPTO_AES_ARM64_CE_CCM=y
CONFIG_CRYPTO_AES_ARM64_CE_BLK=y
CONFIG_CRYPTO_CHACHA20_NEON=m

View File

@ -66,11 +66,6 @@ config CRYPTO_CRCT10DIF_ARM64_CE
depends on KERNEL_MODE_NEON && CRC_T10DIF
select CRYPTO_HASH
config CRYPTO_CRC32_ARM64_CE
tristate "CRC32 and CRC32C digest algorithms using ARMv8 extensions"
depends on CRC32
select CRYPTO_HASH
config CRYPTO_AES_ARM64
tristate "AES core cipher using scalar instructions"
select CRYPTO_AES
@ -119,10 +114,4 @@ config CRYPTO_AES_ARM64_BS
select CRYPTO_AES_ARM64
select CRYPTO_SIMD
config CRYPTO_SPECK_NEON
tristate "NEON accelerated Speck cipher algorithms"
depends on KERNEL_MODE_NEON
select CRYPTO_BLKCIPHER
select CRYPTO_SPECK
endif

View File

@ -32,9 +32,6 @@ ghash-ce-y := ghash-ce-glue.o ghash-ce-core.o
obj-$(CONFIG_CRYPTO_CRCT10DIF_ARM64_CE) += crct10dif-ce.o
crct10dif-ce-y := crct10dif-ce-core.o crct10dif-ce-glue.o
obj-$(CONFIG_CRYPTO_CRC32_ARM64_CE) += crc32-ce.o
crc32-ce-y:= crc32-ce-core.o crc32-ce-glue.o
obj-$(CONFIG_CRYPTO_AES_ARM64_CE) += aes-ce-cipher.o
aes-ce-cipher-y := aes-ce-core.o aes-ce-glue.o
@ -56,9 +53,6 @@ sha512-arm64-y := sha512-glue.o sha512-core.o
obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha20-neon.o
chacha20-neon-y := chacha20-neon-core.o chacha20-neon-glue.o
obj-$(CONFIG_CRYPTO_SPECK_NEON) += speck-neon.o
speck-neon-y := speck-neon-core.o speck-neon-glue.o
obj-$(CONFIG_CRYPTO_AES_ARM64) += aes-arm64.o
aes-arm64-y := aes-cipher-core.o aes-cipher-glue.o

View File

@ -17,6 +17,11 @@
.arch armv8-a+crypto
xtsmask .req v16
.macro xts_reload_mask, tmp
.endm
/* preload all round keys */
.macro load_round_keys, rounds, rk
cmp \rounds, #12

View File

@ -15,6 +15,7 @@
#include <crypto/internal/hash.h>
#include <crypto/internal/simd.h>
#include <crypto/internal/skcipher.h>
#include <crypto/scatterwalk.h>
#include <linux/module.h>
#include <linux/cpufeature.h>
#include <crypto/xts.h>
@ -31,6 +32,8 @@
#define aes_ecb_decrypt ce_aes_ecb_decrypt
#define aes_cbc_encrypt ce_aes_cbc_encrypt
#define aes_cbc_decrypt ce_aes_cbc_decrypt
#define aes_cbc_cts_encrypt ce_aes_cbc_cts_encrypt
#define aes_cbc_cts_decrypt ce_aes_cbc_cts_decrypt
#define aes_ctr_encrypt ce_aes_ctr_encrypt
#define aes_xts_encrypt ce_aes_xts_encrypt
#define aes_xts_decrypt ce_aes_xts_decrypt
@ -45,6 +48,8 @@ MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 Crypto Extensions");
#define aes_ecb_decrypt neon_aes_ecb_decrypt
#define aes_cbc_encrypt neon_aes_cbc_encrypt
#define aes_cbc_decrypt neon_aes_cbc_decrypt
#define aes_cbc_cts_encrypt neon_aes_cbc_cts_encrypt
#define aes_cbc_cts_decrypt neon_aes_cbc_cts_decrypt
#define aes_ctr_encrypt neon_aes_ctr_encrypt
#define aes_xts_encrypt neon_aes_xts_encrypt
#define aes_xts_decrypt neon_aes_xts_decrypt
@ -63,30 +68,41 @@ MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
MODULE_LICENSE("GPL v2");
/* defined in aes-modes.S */
asmlinkage void aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[],
asmlinkage void aes_ecb_encrypt(u8 out[], u8 const in[], u32 const rk[],
int rounds, int blocks);
asmlinkage void aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[],
asmlinkage void aes_ecb_decrypt(u8 out[], u8 const in[], u32 const rk[],
int rounds, int blocks);
asmlinkage void aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[],
asmlinkage void aes_cbc_encrypt(u8 out[], u8 const in[], u32 const rk[],
int rounds, int blocks, u8 iv[]);
asmlinkage void aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[],
asmlinkage void aes_cbc_decrypt(u8 out[], u8 const in[], u32 const rk[],
int rounds, int blocks, u8 iv[]);
asmlinkage void aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
asmlinkage void aes_cbc_cts_encrypt(u8 out[], u8 const in[], u32 const rk[],
int rounds, int bytes, u8 const iv[]);
asmlinkage void aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[],
int rounds, int bytes, u8 const iv[]);
asmlinkage void aes_ctr_encrypt(u8 out[], u8 const in[], u32 const rk[],
int rounds, int blocks, u8 ctr[]);
asmlinkage void aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[],
int rounds, int blocks, u8 const rk2[], u8 iv[],
asmlinkage void aes_xts_encrypt(u8 out[], u8 const in[], u32 const rk1[],
int rounds, int blocks, u32 const rk2[], u8 iv[],
int first);
asmlinkage void aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[],
int rounds, int blocks, u8 const rk2[], u8 iv[],
asmlinkage void aes_xts_decrypt(u8 out[], u8 const in[], u32 const rk1[],
int rounds, int blocks, u32 const rk2[], u8 iv[],
int first);
asmlinkage void aes_mac_update(u8 const in[], u32 const rk[], int rounds,
int blocks, u8 dg[], int enc_before,
int enc_after);
struct cts_cbc_req_ctx {
struct scatterlist sg_src[2];
struct scatterlist sg_dst[2];
struct skcipher_request subreq;
};
struct crypto_aes_xts_ctx {
struct crypto_aes_ctx key1;
struct crypto_aes_ctx __aligned(8) key2;
@ -142,7 +158,7 @@ static int ecb_encrypt(struct skcipher_request *req)
while ((blocks = (walk.nbytes / AES_BLOCK_SIZE))) {
kernel_neon_begin();
aes_ecb_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
(u8 *)ctx->key_enc, rounds, blocks);
ctx->key_enc, rounds, blocks);
kernel_neon_end();
err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
}
@ -162,7 +178,7 @@ static int ecb_decrypt(struct skcipher_request *req)
while ((blocks = (walk.nbytes / AES_BLOCK_SIZE))) {
kernel_neon_begin();
aes_ecb_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
(u8 *)ctx->key_dec, rounds, blocks);
ctx->key_dec, rounds, blocks);
kernel_neon_end();
err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
}
@ -182,7 +198,7 @@ static int cbc_encrypt(struct skcipher_request *req)
while ((blocks = (walk.nbytes / AES_BLOCK_SIZE))) {
kernel_neon_begin();
aes_cbc_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
(u8 *)ctx->key_enc, rounds, blocks, walk.iv);
ctx->key_enc, rounds, blocks, walk.iv);
kernel_neon_end();
err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
}
@ -202,13 +218,149 @@ static int cbc_decrypt(struct skcipher_request *req)
while ((blocks = (walk.nbytes / AES_BLOCK_SIZE))) {
kernel_neon_begin();
aes_cbc_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
(u8 *)ctx->key_dec, rounds, blocks, walk.iv);
ctx->key_dec, rounds, blocks, walk.iv);
kernel_neon_end();
err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
}
return err;
}
static int cts_cbc_init_tfm(struct crypto_skcipher *tfm)
{
crypto_skcipher_set_reqsize(tfm, sizeof(struct cts_cbc_req_ctx));
return 0;
}
static int cts_cbc_encrypt(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm);
struct cts_cbc_req_ctx *rctx = skcipher_request_ctx(req);
int err, rounds = 6 + ctx->key_length / 4;
int cbc_blocks = DIV_ROUND_UP(req->cryptlen, AES_BLOCK_SIZE) - 2;
struct scatterlist *src = req->src, *dst = req->dst;
struct skcipher_walk walk;
skcipher_request_set_tfm(&rctx->subreq, tfm);
if (req->cryptlen <= AES_BLOCK_SIZE) {
if (req->cryptlen < AES_BLOCK_SIZE)
return -EINVAL;
cbc_blocks = 1;
}
if (cbc_blocks > 0) {
unsigned int blocks;
skcipher_request_set_crypt(&rctx->subreq, req->src, req->dst,
cbc_blocks * AES_BLOCK_SIZE,
req->iv);
err = skcipher_walk_virt(&walk, &rctx->subreq, false);
while ((blocks = (walk.nbytes / AES_BLOCK_SIZE))) {
kernel_neon_begin();
aes_cbc_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
ctx->key_enc, rounds, blocks, walk.iv);
kernel_neon_end();
err = skcipher_walk_done(&walk,
walk.nbytes % AES_BLOCK_SIZE);
}
if (err)
return err;
if (req->cryptlen == AES_BLOCK_SIZE)
return 0;
dst = src = scatterwalk_ffwd(rctx->sg_src, req->src,
rctx->subreq.cryptlen);
if (req->dst != req->src)
dst = scatterwalk_ffwd(rctx->sg_dst, req->dst,
rctx->subreq.cryptlen);
}
/* handle ciphertext stealing */
skcipher_request_set_crypt(&rctx->subreq, src, dst,
req->cryptlen - cbc_blocks * AES_BLOCK_SIZE,
req->iv);
err = skcipher_walk_virt(&walk, &rctx->subreq, false);
if (err)
return err;
kernel_neon_begin();
aes_cbc_cts_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
ctx->key_enc, rounds, walk.nbytes, walk.iv);
kernel_neon_end();
return skcipher_walk_done(&walk, 0);
}
static int cts_cbc_decrypt(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm);
struct cts_cbc_req_ctx *rctx = skcipher_request_ctx(req);
int err, rounds = 6 + ctx->key_length / 4;
int cbc_blocks = DIV_ROUND_UP(req->cryptlen, AES_BLOCK_SIZE) - 2;
struct scatterlist *src = req->src, *dst = req->dst;
struct skcipher_walk walk;
skcipher_request_set_tfm(&rctx->subreq, tfm);
if (req->cryptlen <= AES_BLOCK_SIZE) {
if (req->cryptlen < AES_BLOCK_SIZE)
return -EINVAL;
cbc_blocks = 1;
}
if (cbc_blocks > 0) {
unsigned int blocks;
skcipher_request_set_crypt(&rctx->subreq, req->src, req->dst,
cbc_blocks * AES_BLOCK_SIZE,
req->iv);
err = skcipher_walk_virt(&walk, &rctx->subreq, false);
while ((blocks = (walk.nbytes / AES_BLOCK_SIZE))) {
kernel_neon_begin();
aes_cbc_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
ctx->key_dec, rounds, blocks, walk.iv);
kernel_neon_end();
err = skcipher_walk_done(&walk,
walk.nbytes % AES_BLOCK_SIZE);
}
if (err)
return err;
if (req->cryptlen == AES_BLOCK_SIZE)
return 0;
dst = src = scatterwalk_ffwd(rctx->sg_src, req->src,
rctx->subreq.cryptlen);
if (req->dst != req->src)
dst = scatterwalk_ffwd(rctx->sg_dst, req->dst,
rctx->subreq.cryptlen);
}
/* handle ciphertext stealing */
skcipher_request_set_crypt(&rctx->subreq, src, dst,
req->cryptlen - cbc_blocks * AES_BLOCK_SIZE,
req->iv);
err = skcipher_walk_virt(&walk, &rctx->subreq, false);
if (err)
return err;
kernel_neon_begin();
aes_cbc_cts_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
ctx->key_dec, rounds, walk.nbytes, walk.iv);
kernel_neon_end();
return skcipher_walk_done(&walk, 0);
}
static int ctr_encrypt(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
@ -222,7 +374,7 @@ static int ctr_encrypt(struct skcipher_request *req)
while ((blocks = (walk.nbytes / AES_BLOCK_SIZE))) {
kernel_neon_begin();
aes_ctr_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
(u8 *)ctx->key_enc, rounds, blocks, walk.iv);
ctx->key_enc, rounds, blocks, walk.iv);
kernel_neon_end();
err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
}
@ -238,7 +390,7 @@ static int ctr_encrypt(struct skcipher_request *req)
blocks = -1;
kernel_neon_begin();
aes_ctr_encrypt(tail, NULL, (u8 *)ctx->key_enc, rounds,
aes_ctr_encrypt(tail, NULL, ctx->key_enc, rounds,
blocks, walk.iv);
kernel_neon_end();
crypto_xor_cpy(tdst, tsrc, tail, nbytes);
@ -272,8 +424,8 @@ static int xts_encrypt(struct skcipher_request *req)
for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) {
kernel_neon_begin();
aes_xts_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
(u8 *)ctx->key1.key_enc, rounds, blocks,
(u8 *)ctx->key2.key_enc, walk.iv, first);
ctx->key1.key_enc, rounds, blocks,
ctx->key2.key_enc, walk.iv, first);
kernel_neon_end();
err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
}
@ -294,8 +446,8 @@ static int xts_decrypt(struct skcipher_request *req)
for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) {
kernel_neon_begin();
aes_xts_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
(u8 *)ctx->key1.key_dec, rounds, blocks,
(u8 *)ctx->key2.key_enc, walk.iv, first);
ctx->key1.key_dec, rounds, blocks,
ctx->key2.key_enc, walk.iv, first);
kernel_neon_end();
err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
}
@ -334,6 +486,24 @@ static struct skcipher_alg aes_algs[] = { {
.setkey = skcipher_aes_setkey,
.encrypt = cbc_encrypt,
.decrypt = cbc_decrypt,
}, {
.base = {
.cra_name = "__cts(cbc(aes))",
.cra_driver_name = "__cts-cbc-aes-" MODE,
.cra_priority = PRIO,
.cra_flags = CRYPTO_ALG_INTERNAL,
.cra_blocksize = AES_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct crypto_aes_ctx),
.cra_module = THIS_MODULE,
},
.min_keysize = AES_MIN_KEY_SIZE,
.max_keysize = AES_MAX_KEY_SIZE,
.ivsize = AES_BLOCK_SIZE,
.walksize = 2 * AES_BLOCK_SIZE,
.setkey = skcipher_aes_setkey,
.encrypt = cts_cbc_encrypt,
.decrypt = cts_cbc_decrypt,
.init = cts_cbc_init_tfm,
}, {
.base = {
.cra_name = "__ctr(aes)",
@ -412,7 +582,6 @@ static int cmac_setkey(struct crypto_shash *tfm, const u8 *in_key,
{
struct mac_tfm_ctx *ctx = crypto_shash_ctx(tfm);
be128 *consts = (be128 *)ctx->consts;
u8 *rk = (u8 *)ctx->key.key_enc;
int rounds = 6 + key_len / 4;
int err;
@ -422,7 +591,8 @@ static int cmac_setkey(struct crypto_shash *tfm, const u8 *in_key,
/* encrypt the zero vector */
kernel_neon_begin();
aes_ecb_encrypt(ctx->consts, (u8[AES_BLOCK_SIZE]){}, rk, rounds, 1);
aes_ecb_encrypt(ctx->consts, (u8[AES_BLOCK_SIZE]){}, ctx->key.key_enc,
rounds, 1);
kernel_neon_end();
cmac_gf128_mul_by_x(consts, consts);
@ -441,7 +611,6 @@ static int xcbc_setkey(struct crypto_shash *tfm, const u8 *in_key,
};
struct mac_tfm_ctx *ctx = crypto_shash_ctx(tfm);
u8 *rk = (u8 *)ctx->key.key_enc;
int rounds = 6 + key_len / 4;
u8 key[AES_BLOCK_SIZE];
int err;
@ -451,8 +620,8 @@ static int xcbc_setkey(struct crypto_shash *tfm, const u8 *in_key,
return err;
kernel_neon_begin();
aes_ecb_encrypt(key, ks[0], rk, rounds, 1);
aes_ecb_encrypt(ctx->consts, ks[1], rk, rounds, 2);
aes_ecb_encrypt(key, ks[0], ctx->key.key_enc, rounds, 1);
aes_ecb_encrypt(ctx->consts, ks[1], ctx->key.key_enc, rounds, 2);
kernel_neon_end();
return cbcmac_setkey(tfm, key, sizeof(key));

View File

@ -14,12 +14,12 @@
.align 4
aes_encrypt_block4x:
encrypt_block4x v0, v1, v2, v3, w22, x21, x8, w7
encrypt_block4x v0, v1, v2, v3, w3, x2, x8, w7
ret
ENDPROC(aes_encrypt_block4x)
aes_decrypt_block4x:
decrypt_block4x v0, v1, v2, v3, w22, x21, x8, w7
decrypt_block4x v0, v1, v2, v3, w3, x2, x8, w7
ret
ENDPROC(aes_decrypt_block4x)
@ -31,71 +31,57 @@ ENDPROC(aes_decrypt_block4x)
*/
AES_ENTRY(aes_ecb_encrypt)
frame_push 5
stp x29, x30, [sp, #-16]!
mov x29, sp
mov x19, x0
mov x20, x1
mov x21, x2
mov x22, x3
mov x23, x4
.Lecbencrestart:
enc_prepare w22, x21, x5
enc_prepare w3, x2, x5
.LecbencloopNx:
subs w23, w23, #4
subs w4, w4, #4
bmi .Lecbenc1x
ld1 {v0.16b-v3.16b}, [x20], #64 /* get 4 pt blocks */
ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */
bl aes_encrypt_block4x
st1 {v0.16b-v3.16b}, [x19], #64
cond_yield_neon .Lecbencrestart
st1 {v0.16b-v3.16b}, [x0], #64
b .LecbencloopNx
.Lecbenc1x:
adds w23, w23, #4
adds w4, w4, #4
beq .Lecbencout
.Lecbencloop:
ld1 {v0.16b}, [x20], #16 /* get next pt block */
encrypt_block v0, w22, x21, x5, w6
st1 {v0.16b}, [x19], #16
subs w23, w23, #1
ld1 {v0.16b}, [x1], #16 /* get next pt block */
encrypt_block v0, w3, x2, x5, w6
st1 {v0.16b}, [x0], #16
subs w4, w4, #1
bne .Lecbencloop
.Lecbencout:
frame_pop
ldp x29, x30, [sp], #16
ret
AES_ENDPROC(aes_ecb_encrypt)
AES_ENTRY(aes_ecb_decrypt)
frame_push 5
stp x29, x30, [sp, #-16]!
mov x29, sp
mov x19, x0
mov x20, x1
mov x21, x2
mov x22, x3
mov x23, x4
.Lecbdecrestart:
dec_prepare w22, x21, x5
dec_prepare w3, x2, x5
.LecbdecloopNx:
subs w23, w23, #4
subs w4, w4, #4
bmi .Lecbdec1x
ld1 {v0.16b-v3.16b}, [x20], #64 /* get 4 ct blocks */
ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */
bl aes_decrypt_block4x
st1 {v0.16b-v3.16b}, [x19], #64
cond_yield_neon .Lecbdecrestart
st1 {v0.16b-v3.16b}, [x0], #64
b .LecbdecloopNx
.Lecbdec1x:
adds w23, w23, #4
adds w4, w4, #4
beq .Lecbdecout
.Lecbdecloop:
ld1 {v0.16b}, [x20], #16 /* get next ct block */
decrypt_block v0, w22, x21, x5, w6
st1 {v0.16b}, [x19], #16
subs w23, w23, #1
ld1 {v0.16b}, [x1], #16 /* get next ct block */
decrypt_block v0, w3, x2, x5, w6
st1 {v0.16b}, [x0], #16
subs w4, w4, #1
bne .Lecbdecloop
.Lecbdecout:
frame_pop
ldp x29, x30, [sp], #16
ret
AES_ENDPROC(aes_ecb_decrypt)
@ -108,162 +94,211 @@ AES_ENDPROC(aes_ecb_decrypt)
*/
AES_ENTRY(aes_cbc_encrypt)
frame_push 6
mov x19, x0
mov x20, x1
mov x21, x2
mov x22, x3
mov x23, x4
mov x24, x5
.Lcbcencrestart:
ld1 {v4.16b}, [x24] /* get iv */
enc_prepare w22, x21, x6
ld1 {v4.16b}, [x5] /* get iv */
enc_prepare w3, x2, x6
.Lcbcencloop4x:
subs w23, w23, #4
subs w4, w4, #4
bmi .Lcbcenc1x
ld1 {v0.16b-v3.16b}, [x20], #64 /* get 4 pt blocks */
ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */
eor v0.16b, v0.16b, v4.16b /* ..and xor with iv */
encrypt_block v0, w22, x21, x6, w7
encrypt_block v0, w3, x2, x6, w7
eor v1.16b, v1.16b, v0.16b
encrypt_block v1, w22, x21, x6, w7
encrypt_block v1, w3, x2, x6, w7
eor v2.16b, v2.16b, v1.16b
encrypt_block v2, w22, x21, x6, w7
encrypt_block v2, w3, x2, x6, w7
eor v3.16b, v3.16b, v2.16b
encrypt_block v3, w22, x21, x6, w7
st1 {v0.16b-v3.16b}, [x19], #64
encrypt_block v3, w3, x2, x6, w7
st1 {v0.16b-v3.16b}, [x0], #64
mov v4.16b, v3.16b
st1 {v4.16b}, [x24] /* return iv */
cond_yield_neon .Lcbcencrestart
b .Lcbcencloop4x
.Lcbcenc1x:
adds w23, w23, #4
adds w4, w4, #4
beq .Lcbcencout
.Lcbcencloop:
ld1 {v0.16b}, [x20], #16 /* get next pt block */
ld1 {v0.16b}, [x1], #16 /* get next pt block */
eor v4.16b, v4.16b, v0.16b /* ..and xor with iv */
encrypt_block v4, w22, x21, x6, w7
st1 {v4.16b}, [x19], #16
subs w23, w23, #1
encrypt_block v4, w3, x2, x6, w7
st1 {v4.16b}, [x0], #16
subs w4, w4, #1
bne .Lcbcencloop
.Lcbcencout:
st1 {v4.16b}, [x24] /* return iv */
frame_pop
st1 {v4.16b}, [x5] /* return iv */
ret
AES_ENDPROC(aes_cbc_encrypt)
AES_ENTRY(aes_cbc_decrypt)
frame_push 6
stp x29, x30, [sp, #-16]!
mov x29, sp
mov x19, x0
mov x20, x1
mov x21, x2
mov x22, x3
mov x23, x4
mov x24, x5
.Lcbcdecrestart:
ld1 {v7.16b}, [x24] /* get iv */
dec_prepare w22, x21, x6
ld1 {v7.16b}, [x5] /* get iv */
dec_prepare w3, x2, x6
.LcbcdecloopNx:
subs w23, w23, #4
subs w4, w4, #4
bmi .Lcbcdec1x
ld1 {v0.16b-v3.16b}, [x20], #64 /* get 4 ct blocks */
ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */
mov v4.16b, v0.16b
mov v5.16b, v1.16b
mov v6.16b, v2.16b
bl aes_decrypt_block4x
sub x20, x20, #16
sub x1, x1, #16
eor v0.16b, v0.16b, v7.16b
eor v1.16b, v1.16b, v4.16b
ld1 {v7.16b}, [x20], #16 /* reload 1 ct block */
ld1 {v7.16b}, [x1], #16 /* reload 1 ct block */
eor v2.16b, v2.16b, v5.16b
eor v3.16b, v3.16b, v6.16b
st1 {v0.16b-v3.16b}, [x19], #64
st1 {v7.16b}, [x24] /* return iv */
cond_yield_neon .Lcbcdecrestart
st1 {v0.16b-v3.16b}, [x0], #64
b .LcbcdecloopNx
.Lcbcdec1x:
adds w23, w23, #4
adds w4, w4, #4
beq .Lcbcdecout
.Lcbcdecloop:
ld1 {v1.16b}, [x20], #16 /* get next ct block */
ld1 {v1.16b}, [x1], #16 /* get next ct block */
mov v0.16b, v1.16b /* ...and copy to v0 */
decrypt_block v0, w22, x21, x6, w7
decrypt_block v0, w3, x2, x6, w7
eor v0.16b, v0.16b, v7.16b /* xor with iv => pt */
mov v7.16b, v1.16b /* ct is next iv */
st1 {v0.16b}, [x19], #16
subs w23, w23, #1
st1 {v0.16b}, [x0], #16
subs w4, w4, #1
bne .Lcbcdecloop
.Lcbcdecout:
st1 {v7.16b}, [x24] /* return iv */
frame_pop
st1 {v7.16b}, [x5] /* return iv */
ldp x29, x30, [sp], #16
ret
AES_ENDPROC(aes_cbc_decrypt)
/*
* aes_cbc_cts_encrypt(u8 out[], u8 const in[], u32 const rk[],
* int rounds, int bytes, u8 const iv[])
* aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[],
* int rounds, int bytes, u8 const iv[])
*/
AES_ENTRY(aes_cbc_cts_encrypt)
adr_l x8, .Lcts_permute_table
sub x4, x4, #16
add x9, x8, #32
add x8, x8, x4
sub x9, x9, x4
ld1 {v3.16b}, [x8]
ld1 {v4.16b}, [x9]
ld1 {v0.16b}, [x1], x4 /* overlapping loads */
ld1 {v1.16b}, [x1]
ld1 {v5.16b}, [x5] /* get iv */
enc_prepare w3, x2, x6
eor v0.16b, v0.16b, v5.16b /* xor with iv */
tbl v1.16b, {v1.16b}, v4.16b
encrypt_block v0, w3, x2, x6, w7
eor v1.16b, v1.16b, v0.16b
tbl v0.16b, {v0.16b}, v3.16b
encrypt_block v1, w3, x2, x6, w7
add x4, x0, x4
st1 {v0.16b}, [x4] /* overlapping stores */
st1 {v1.16b}, [x0]
ret
AES_ENDPROC(aes_cbc_cts_encrypt)
AES_ENTRY(aes_cbc_cts_decrypt)
adr_l x8, .Lcts_permute_table
sub x4, x4, #16
add x9, x8, #32
add x8, x8, x4
sub x9, x9, x4
ld1 {v3.16b}, [x8]
ld1 {v4.16b}, [x9]
ld1 {v0.16b}, [x1], x4 /* overlapping loads */
ld1 {v1.16b}, [x1]
ld1 {v5.16b}, [x5] /* get iv */
dec_prepare w3, x2, x6
tbl v2.16b, {v1.16b}, v4.16b
decrypt_block v0, w3, x2, x6, w7
eor v2.16b, v2.16b, v0.16b
tbx v0.16b, {v1.16b}, v4.16b
tbl v2.16b, {v2.16b}, v3.16b
decrypt_block v0, w3, x2, x6, w7
eor v0.16b, v0.16b, v5.16b /* xor with iv */
add x4, x0, x4
st1 {v2.16b}, [x4] /* overlapping stores */
st1 {v0.16b}, [x0]
ret
AES_ENDPROC(aes_cbc_cts_decrypt)
.section ".rodata", "a"
.align 6
.Lcts_permute_table:
.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
.byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
.byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf
.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
.previous
/*
* aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
* int blocks, u8 ctr[])
*/
AES_ENTRY(aes_ctr_encrypt)
frame_push 6
stp x29, x30, [sp, #-16]!
mov x29, sp
mov x19, x0
mov x20, x1
mov x21, x2
mov x22, x3
mov x23, x4
mov x24, x5
.Lctrrestart:
enc_prepare w22, x21, x6
ld1 {v4.16b}, [x24]
enc_prepare w3, x2, x6
ld1 {v4.16b}, [x5]
umov x6, v4.d[1] /* keep swabbed ctr in reg */
rev x6, x6
cmn w6, w4 /* 32 bit overflow? */
bcs .Lctrloop
.LctrloopNx:
subs w23, w23, #4
subs w4, w4, #4
bmi .Lctr1x
cmn w6, #4 /* 32 bit overflow? */
bcs .Lctr1x
ldr q8, =0x30000000200000001 /* addends 1,2,3[,0] */
dup v7.4s, w6
add w7, w6, #1
mov v0.16b, v4.16b
add v7.4s, v7.4s, v8.4s
add w8, w6, #2
mov v1.16b, v4.16b
rev32 v8.16b, v7.16b
add w9, w6, #3
mov v2.16b, v4.16b
rev w7, w7
mov v3.16b, v4.16b
mov v1.s[3], v8.s[0]
mov v2.s[3], v8.s[1]
mov v3.s[3], v8.s[2]
ld1 {v5.16b-v7.16b}, [x20], #48 /* get 3 input blocks */
rev w8, w8
mov v1.s[3], w7
rev w9, w9
mov v2.s[3], w8
mov v3.s[3], w9
ld1 {v5.16b-v7.16b}, [x1], #48 /* get 3 input blocks */
bl aes_encrypt_block4x
eor v0.16b, v5.16b, v0.16b
ld1 {v5.16b}, [x20], #16 /* get 1 input block */
ld1 {v5.16b}, [x1], #16 /* get 1 input block */
eor v1.16b, v6.16b, v1.16b
eor v2.16b, v7.16b, v2.16b
eor v3.16b, v5.16b, v3.16b
st1 {v0.16b-v3.16b}, [x19], #64
st1 {v0.16b-v3.16b}, [x0], #64
add x6, x6, #4
rev x7, x6
ins v4.d[1], x7
cbz w23, .Lctrout
st1 {v4.16b}, [x24] /* return next CTR value */
cond_yield_neon .Lctrrestart
cbz w4, .Lctrout
b .LctrloopNx
.Lctr1x:
adds w23, w23, #4
adds w4, w4, #4
beq .Lctrout
.Lctrloop:
mov v0.16b, v4.16b
encrypt_block v0, w22, x21, x8, w7
encrypt_block v0, w3, x2, x8, w7
adds x6, x6, #1 /* increment BE ctr */
rev x7, x6
@ -271,22 +306,22 @@ AES_ENTRY(aes_ctr_encrypt)
bcs .Lctrcarry /* overflow? */
.Lctrcarrydone:
subs w23, w23, #1
subs w4, w4, #1
bmi .Lctrtailblock /* blocks <0 means tail block */
ld1 {v3.16b}, [x20], #16
ld1 {v3.16b}, [x1], #16
eor v3.16b, v0.16b, v3.16b
st1 {v3.16b}, [x19], #16
st1 {v3.16b}, [x0], #16
bne .Lctrloop
.Lctrout:
st1 {v4.16b}, [x24] /* return next CTR value */
.Lctrret:
frame_pop
st1 {v4.16b}, [x5] /* return next CTR value */
ldp x29, x30, [sp], #16
ret
.Lctrtailblock:
st1 {v0.16b}, [x19]
b .Lctrret
st1 {v0.16b}, [x0]
ldp x29, x30, [sp], #16
ret
.Lctrcarry:
umov x7, v4.d[0] /* load upper word of ctr */
@ -296,7 +331,6 @@ AES_ENTRY(aes_ctr_encrypt)
ins v4.d[0], x7
b .Lctrcarrydone
AES_ENDPROC(aes_ctr_encrypt)
.ltorg
/*
@ -306,150 +340,132 @@ AES_ENDPROC(aes_ctr_encrypt)
* int blocks, u8 const rk2[], u8 iv[], int first)
*/
.macro next_tweak, out, in, const, tmp
.macro next_tweak, out, in, tmp
sshr \tmp\().2d, \in\().2d, #63
and \tmp\().16b, \tmp\().16b, \const\().16b
and \tmp\().16b, \tmp\().16b, xtsmask.16b
add \out\().2d, \in\().2d, \in\().2d
ext \tmp\().16b, \tmp\().16b, \tmp\().16b, #8
eor \out\().16b, \out\().16b, \tmp\().16b
.endm
.Lxts_mul_x:
CPU_LE( .quad 1, 0x87 )
CPU_BE( .quad 0x87, 1 )
.macro xts_load_mask, tmp
movi xtsmask.2s, #0x1
movi \tmp\().2s, #0x87
uzp1 xtsmask.4s, xtsmask.4s, \tmp\().4s
.endm
AES_ENTRY(aes_xts_encrypt)
frame_push 6
stp x29, x30, [sp, #-16]!
mov x29, sp
mov x19, x0
mov x20, x1
mov x21, x2
mov x22, x3
mov x23, x4
mov x24, x6
ld1 {v4.16b}, [x24]
ld1 {v4.16b}, [x6]
xts_load_mask v8
cbz w7, .Lxtsencnotfirst
enc_prepare w3, x5, x8
encrypt_block v4, w3, x5, x8, w7 /* first tweak */
enc_switch_key w3, x2, x8
ldr q7, .Lxts_mul_x
b .LxtsencNx
.Lxtsencrestart:
ld1 {v4.16b}, [x24]
.Lxtsencnotfirst:
enc_prepare w22, x21, x8
enc_prepare w3, x2, x8
.LxtsencloopNx:
ldr q7, .Lxts_mul_x
next_tweak v4, v4, v7, v8
next_tweak v4, v4, v8
.LxtsencNx:
subs w23, w23, #4
subs w4, w4, #4
bmi .Lxtsenc1x
ld1 {v0.16b-v3.16b}, [x20], #64 /* get 4 pt blocks */
next_tweak v5, v4, v7, v8
ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */
next_tweak v5, v4, v8
eor v0.16b, v0.16b, v4.16b
next_tweak v6, v5, v7, v8
next_tweak v6, v5, v8
eor v1.16b, v1.16b, v5.16b
eor v2.16b, v2.16b, v6.16b
next_tweak v7, v6, v7, v8
next_tweak v7, v6, v8
eor v3.16b, v3.16b, v7.16b
bl aes_encrypt_block4x
eor v3.16b, v3.16b, v7.16b
eor v0.16b, v0.16b, v4.16b
eor v1.16b, v1.16b, v5.16b
eor v2.16b, v2.16b, v6.16b
st1 {v0.16b-v3.16b}, [x19], #64
st1 {v0.16b-v3.16b}, [x0], #64
mov v4.16b, v7.16b
cbz w23, .Lxtsencout
st1 {v4.16b}, [x24]
cond_yield_neon .Lxtsencrestart
cbz w4, .Lxtsencout
xts_reload_mask v8
b .LxtsencloopNx
.Lxtsenc1x:
adds w23, w23, #4
adds w4, w4, #4
beq .Lxtsencout
.Lxtsencloop:
ld1 {v1.16b}, [x20], #16
ld1 {v1.16b}, [x1], #16
eor v0.16b, v1.16b, v4.16b
encrypt_block v0, w22, x21, x8, w7
encrypt_block v0, w3, x2, x8, w7
eor v0.16b, v0.16b, v4.16b
st1 {v0.16b}, [x19], #16
subs w23, w23, #1
st1 {v0.16b}, [x0], #16
subs w4, w4, #1
beq .Lxtsencout
next_tweak v4, v4, v7, v8
next_tweak v4, v4, v8
b .Lxtsencloop
.Lxtsencout:
st1 {v4.16b}, [x24]
frame_pop
st1 {v4.16b}, [x6]
ldp x29, x30, [sp], #16
ret
AES_ENDPROC(aes_xts_encrypt)
AES_ENTRY(aes_xts_decrypt)
frame_push 6
stp x29, x30, [sp, #-16]!
mov x29, sp
mov x19, x0
mov x20, x1
mov x21, x2
mov x22, x3
mov x23, x4
mov x24, x6
ld1 {v4.16b}, [x24]
ld1 {v4.16b}, [x6]
xts_load_mask v8
cbz w7, .Lxtsdecnotfirst
enc_prepare w3, x5, x8
encrypt_block v4, w3, x5, x8, w7 /* first tweak */
dec_prepare w3, x2, x8
ldr q7, .Lxts_mul_x
b .LxtsdecNx
.Lxtsdecrestart:
ld1 {v4.16b}, [x24]
.Lxtsdecnotfirst:
dec_prepare w22, x21, x8
dec_prepare w3, x2, x8
.LxtsdecloopNx:
ldr q7, .Lxts_mul_x
next_tweak v4, v4, v7, v8
next_tweak v4, v4, v8
.LxtsdecNx:
subs w23, w23, #4
subs w4, w4, #4
bmi .Lxtsdec1x
ld1 {v0.16b-v3.16b}, [x20], #64 /* get 4 ct blocks */
next_tweak v5, v4, v7, v8
ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */
next_tweak v5, v4, v8
eor v0.16b, v0.16b, v4.16b
next_tweak v6, v5, v7, v8
next_tweak v6, v5, v8
eor v1.16b, v1.16b, v5.16b
eor v2.16b, v2.16b, v6.16b
next_tweak v7, v6, v7, v8
next_tweak v7, v6, v8
eor v3.16b, v3.16b, v7.16b
bl aes_decrypt_block4x
eor v3.16b, v3.16b, v7.16b
eor v0.16b, v0.16b, v4.16b
eor v1.16b, v1.16b, v5.16b
eor v2.16b, v2.16b, v6.16b
st1 {v0.16b-v3.16b}, [x19], #64
st1 {v0.16b-v3.16b}, [x0], #64
mov v4.16b, v7.16b
cbz w23, .Lxtsdecout
st1 {v4.16b}, [x24]
cond_yield_neon .Lxtsdecrestart
cbz w4, .Lxtsdecout
xts_reload_mask v8
b .LxtsdecloopNx
.Lxtsdec1x:
adds w23, w23, #4
adds w4, w4, #4
beq .Lxtsdecout
.Lxtsdecloop:
ld1 {v1.16b}, [x20], #16
ld1 {v1.16b}, [x1], #16
eor v0.16b, v1.16b, v4.16b
decrypt_block v0, w22, x21, x8, w7
decrypt_block v0, w3, x2, x8, w7
eor v0.16b, v0.16b, v4.16b
st1 {v0.16b}, [x19], #16
subs w23, w23, #1
st1 {v0.16b}, [x0], #16
subs w4, w4, #1
beq .Lxtsdecout
next_tweak v4, v4, v7, v8
next_tweak v4, v4, v8
b .Lxtsdecloop
.Lxtsdecout:
st1 {v4.16b}, [x24]
frame_pop
st1 {v4.16b}, [x6]
ldp x29, x30, [sp], #16
ret
AES_ENDPROC(aes_xts_decrypt)

View File

@ -14,6 +14,12 @@
#define AES_ENTRY(func) ENTRY(neon_ ## func)
#define AES_ENDPROC(func) ENDPROC(neon_ ## func)
xtsmask .req v7
.macro xts_reload_mask, tmp
xts_load_mask \tmp
.endm
/* multiply by polynomial 'x' in GF(2^8) */
.macro mul_by_x, out, in, temp, const
sshr \temp, \in, #7

View File

@ -1,287 +0,0 @@
/*
* Accelerated CRC32(C) using arm64 CRC, NEON and Crypto Extensions instructions
*
* Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
/* GPL HEADER START
*
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 only,
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License version 2 for more details (a copy is included
* in the LICENSE file that accompanied this code).
*
* You should have received a copy of the GNU General Public License
* version 2 along with this program; If not, see http://www.gnu.org/licenses
*
* Please visit http://www.xyratex.com/contact if you need additional
* information or have any questions.
*
* GPL HEADER END
*/
/*
* Copyright 2012 Xyratex Technology Limited
*
* Using hardware provided PCLMULQDQ instruction to accelerate the CRC32
* calculation.
* CRC32 polynomial:0x04c11db7(BE)/0xEDB88320(LE)
* PCLMULQDQ is a new instruction in Intel SSE4.2, the reference can be found
* at:
* http://www.intel.com/products/processor/manuals/
* Intel(R) 64 and IA-32 Architectures Software Developer's Manual
* Volume 2B: Instruction Set Reference, N-Z
*
* Authors: Gregory Prestas <Gregory_Prestas@us.xyratex.com>
* Alexander Boyko <Alexander_Boyko@xyratex.com>
*/
#include <linux/linkage.h>
#include <asm/assembler.h>
.section ".rodata", "a"
.align 6
.cpu generic+crypto+crc
.Lcrc32_constants:
/*
* [x4*128+32 mod P(x) << 32)]' << 1 = 0x154442bd4
* #define CONSTANT_R1 0x154442bd4LL
*
* [(x4*128-32 mod P(x) << 32)]' << 1 = 0x1c6e41596
* #define CONSTANT_R2 0x1c6e41596LL
*/
.octa 0x00000001c6e415960000000154442bd4
/*
* [(x128+32 mod P(x) << 32)]' << 1 = 0x1751997d0
* #define CONSTANT_R3 0x1751997d0LL
*
* [(x128-32 mod P(x) << 32)]' << 1 = 0x0ccaa009e
* #define CONSTANT_R4 0x0ccaa009eLL
*/
.octa 0x00000000ccaa009e00000001751997d0
/*
* [(x64 mod P(x) << 32)]' << 1 = 0x163cd6124
* #define CONSTANT_R5 0x163cd6124LL
*/
.quad 0x0000000163cd6124
.quad 0x00000000FFFFFFFF
/*
* #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL
*
* Barrett Reduction constant (u64`) = u` = (x**64 / P(x))`
* = 0x1F7011641LL
* #define CONSTANT_RU 0x1F7011641LL
*/
.octa 0x00000001F701164100000001DB710641
.Lcrc32c_constants:
.octa 0x000000009e4addf800000000740eef02
.octa 0x000000014cd00bd600000000f20c0dfe
.quad 0x00000000dd45aab8
.quad 0x00000000FFFFFFFF
.octa 0x00000000dea713f10000000105ec76f0
vCONSTANT .req v0
dCONSTANT .req d0
qCONSTANT .req q0
BUF .req x19
LEN .req x20
CRC .req x21
CONST .req x22
vzr .req v9
/**
* Calculate crc32
* BUF - buffer
* LEN - sizeof buffer (multiple of 16 bytes), LEN should be > 63
* CRC - initial crc32
* return %eax crc32
* uint crc32_pmull_le(unsigned char const *buffer,
* size_t len, uint crc32)
*/
.text
ENTRY(crc32_pmull_le)
adr_l x3, .Lcrc32_constants
b 0f
ENTRY(crc32c_pmull_le)
adr_l x3, .Lcrc32c_constants
0: frame_push 4, 64
mov BUF, x0
mov LEN, x1
mov CRC, x2
mov CONST, x3
bic LEN, LEN, #15
ld1 {v1.16b-v4.16b}, [BUF], #0x40
movi vzr.16b, #0
fmov dCONSTANT, CRC
eor v1.16b, v1.16b, vCONSTANT.16b
sub LEN, LEN, #0x40
cmp LEN, #0x40
b.lt less_64
ldr qCONSTANT, [CONST]
loop_64: /* 64 bytes Full cache line folding */
sub LEN, LEN, #0x40
pmull2 v5.1q, v1.2d, vCONSTANT.2d
pmull2 v6.1q, v2.2d, vCONSTANT.2d
pmull2 v7.1q, v3.2d, vCONSTANT.2d
pmull2 v8.1q, v4.2d, vCONSTANT.2d
pmull v1.1q, v1.1d, vCONSTANT.1d
pmull v2.1q, v2.1d, vCONSTANT.1d
pmull v3.1q, v3.1d, vCONSTANT.1d
pmull v4.1q, v4.1d, vCONSTANT.1d
eor v1.16b, v1.16b, v5.16b
ld1 {v5.16b}, [BUF], #0x10
eor v2.16b, v2.16b, v6.16b
ld1 {v6.16b}, [BUF], #0x10
eor v3.16b, v3.16b, v7.16b
ld1 {v7.16b}, [BUF], #0x10
eor v4.16b, v4.16b, v8.16b
ld1 {v8.16b}, [BUF], #0x10
eor v1.16b, v1.16b, v5.16b
eor v2.16b, v2.16b, v6.16b
eor v3.16b, v3.16b, v7.16b
eor v4.16b, v4.16b, v8.16b
cmp LEN, #0x40
b.lt less_64
if_will_cond_yield_neon
stp q1, q2, [sp, #.Lframe_local_offset]
stp q3, q4, [sp, #.Lframe_local_offset + 32]
do_cond_yield_neon
ldp q1, q2, [sp, #.Lframe_local_offset]
ldp q3, q4, [sp, #.Lframe_local_offset + 32]
ldr qCONSTANT, [CONST]
movi vzr.16b, #0
endif_yield_neon
b loop_64
less_64: /* Folding cache line into 128bit */
ldr qCONSTANT, [CONST, #16]
pmull2 v5.1q, v1.2d, vCONSTANT.2d
pmull v1.1q, v1.1d, vCONSTANT.1d
eor v1.16b, v1.16b, v5.16b
eor v1.16b, v1.16b, v2.16b
pmull2 v5.1q, v1.2d, vCONSTANT.2d
pmull v1.1q, v1.1d, vCONSTANT.1d
eor v1.16b, v1.16b, v5.16b
eor v1.16b, v1.16b, v3.16b
pmull2 v5.1q, v1.2d, vCONSTANT.2d
pmull v1.1q, v1.1d, vCONSTANT.1d
eor v1.16b, v1.16b, v5.16b
eor v1.16b, v1.16b, v4.16b
cbz LEN, fold_64
loop_16: /* Folding rest buffer into 128bit */
subs LEN, LEN, #0x10
ld1 {v2.16b}, [BUF], #0x10
pmull2 v5.1q, v1.2d, vCONSTANT.2d
pmull v1.1q, v1.1d, vCONSTANT.1d
eor v1.16b, v1.16b, v5.16b
eor v1.16b, v1.16b, v2.16b
b.ne loop_16
fold_64:
/* perform the last 64 bit fold, also adds 32 zeroes
* to the input stream */
ext v2.16b, v1.16b, v1.16b, #8
pmull2 v2.1q, v2.2d, vCONSTANT.2d
ext v1.16b, v1.16b, vzr.16b, #8
eor v1.16b, v1.16b, v2.16b
/* final 32-bit fold */
ldr dCONSTANT, [CONST, #32]
ldr d3, [CONST, #40]
ext v2.16b, v1.16b, vzr.16b, #4
and v1.16b, v1.16b, v3.16b
pmull v1.1q, v1.1d, vCONSTANT.1d
eor v1.16b, v1.16b, v2.16b
/* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */
ldr qCONSTANT, [CONST, #48]
and v2.16b, v1.16b, v3.16b
ext v2.16b, vzr.16b, v2.16b, #8
pmull2 v2.1q, v2.2d, vCONSTANT.2d
and v2.16b, v2.16b, v3.16b
pmull v2.1q, v2.1d, vCONSTANT.1d
eor v1.16b, v1.16b, v2.16b
mov w0, v1.s[1]
frame_pop
ret
ENDPROC(crc32_pmull_le)
ENDPROC(crc32c_pmull_le)
.macro __crc32, c
0: subs x2, x2, #16
b.mi 8f
ldp x3, x4, [x1], #16
CPU_BE( rev x3, x3 )
CPU_BE( rev x4, x4 )
crc32\c\()x w0, w0, x3
crc32\c\()x w0, w0, x4
b.ne 0b
ret
8: tbz x2, #3, 4f
ldr x3, [x1], #8
CPU_BE( rev x3, x3 )
crc32\c\()x w0, w0, x3
4: tbz x2, #2, 2f
ldr w3, [x1], #4
CPU_BE( rev w3, w3 )
crc32\c\()w w0, w0, w3
2: tbz x2, #1, 1f
ldrh w3, [x1], #2
CPU_BE( rev16 w3, w3 )
crc32\c\()h w0, w0, w3
1: tbz x2, #0, 0f
ldrb w3, [x1]
crc32\c\()b w0, w0, w3
0: ret
.endm
.align 5
ENTRY(crc32_armv8_le)
__crc32
ENDPROC(crc32_armv8_le)
.align 5
ENTRY(crc32c_armv8_le)
__crc32 c
ENDPROC(crc32c_armv8_le)

View File

@ -1,244 +0,0 @@
/*
* Accelerated CRC32(C) using arm64 NEON and Crypto Extensions instructions
*
* Copyright (C) 2016 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
#include <linux/cpufeature.h>
#include <linux/crc32.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/string.h>
#include <crypto/internal/hash.h>
#include <asm/hwcap.h>
#include <asm/neon.h>
#include <asm/simd.h>
#include <asm/unaligned.h>
#define PMULL_MIN_LEN 64L /* minimum size of buffer
* for crc32_pmull_le_16 */
#define SCALE_F 16L /* size of NEON register */
asmlinkage u32 crc32_pmull_le(const u8 buf[], u64 len, u32 init_crc);
asmlinkage u32 crc32_armv8_le(u32 init_crc, const u8 buf[], size_t len);
asmlinkage u32 crc32c_pmull_le(const u8 buf[], u64 len, u32 init_crc);
asmlinkage u32 crc32c_armv8_le(u32 init_crc, const u8 buf[], size_t len);
static u32 (*fallback_crc32)(u32 init_crc, const u8 buf[], size_t len);
static u32 (*fallback_crc32c)(u32 init_crc, const u8 buf[], size_t len);
static int crc32_pmull_cra_init(struct crypto_tfm *tfm)
{
u32 *key = crypto_tfm_ctx(tfm);
*key = 0;
return 0;
}
static int crc32c_pmull_cra_init(struct crypto_tfm *tfm)
{
u32 *key = crypto_tfm_ctx(tfm);
*key = ~0;
return 0;
}
static int crc32_pmull_setkey(struct crypto_shash *hash, const u8 *key,
unsigned int keylen)
{
u32 *mctx = crypto_shash_ctx(hash);
if (keylen != sizeof(u32)) {
crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN);
return -EINVAL;
}
*mctx = le32_to_cpup((__le32 *)key);
return 0;
}
static int crc32_pmull_init(struct shash_desc *desc)
{
u32 *mctx = crypto_shash_ctx(desc->tfm);
u32 *crc = shash_desc_ctx(desc);
*crc = *mctx;
return 0;
}
static int crc32_update(struct shash_desc *desc, const u8 *data,
unsigned int length)
{
u32 *crc = shash_desc_ctx(desc);
*crc = crc32_armv8_le(*crc, data, length);
return 0;
}
static int crc32c_update(struct shash_desc *desc, const u8 *data,
unsigned int length)
{
u32 *crc = shash_desc_ctx(desc);
*crc = crc32c_armv8_le(*crc, data, length);
return 0;
}
static int crc32_pmull_update(struct shash_desc *desc, const u8 *data,
unsigned int length)
{
u32 *crc = shash_desc_ctx(desc);
unsigned int l;
if ((u64)data % SCALE_F) {
l = min_t(u32, length, SCALE_F - ((u64)data % SCALE_F));
*crc = fallback_crc32(*crc, data, l);
data += l;
length -= l;
}
if (length >= PMULL_MIN_LEN && may_use_simd()) {
l = round_down(length, SCALE_F);
kernel_neon_begin();
*crc = crc32_pmull_le(data, l, *crc);
kernel_neon_end();
data += l;
length -= l;
}
if (length > 0)
*crc = fallback_crc32(*crc, data, length);
return 0;
}
static int crc32c_pmull_update(struct shash_desc *desc, const u8 *data,
unsigned int length)
{
u32 *crc = shash_desc_ctx(desc);
unsigned int l;
if ((u64)data % SCALE_F) {
l = min_t(u32, length, SCALE_F - ((u64)data % SCALE_F));
*crc = fallback_crc32c(*crc, data, l);
data += l;
length -= l;
}
if (length >= PMULL_MIN_LEN && may_use_simd()) {
l = round_down(length, SCALE_F);
kernel_neon_begin();
*crc = crc32c_pmull_le(data, l, *crc);
kernel_neon_end();
data += l;
length -= l;
}
if (length > 0) {
*crc = fallback_crc32c(*crc, data, length);
}
return 0;
}
static int crc32_pmull_final(struct shash_desc *desc, u8 *out)
{
u32 *crc = shash_desc_ctx(desc);
put_unaligned_le32(*crc, out);
return 0;
}
static int crc32c_pmull_final(struct shash_desc *desc, u8 *out)
{
u32 *crc = shash_desc_ctx(desc);
put_unaligned_le32(~*crc, out);
return 0;
}
static struct shash_alg crc32_pmull_algs[] = { {
.setkey = crc32_pmull_setkey,
.init = crc32_pmull_init,
.update = crc32_update,
.final = crc32_pmull_final,
.descsize = sizeof(u32),
.digestsize = sizeof(u32),
.base.cra_ctxsize = sizeof(u32),
.base.cra_init = crc32_pmull_cra_init,
.base.cra_name = "crc32",
.base.cra_driver_name = "crc32-arm64-ce",
.base.cra_priority = 200,
.base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
.base.cra_blocksize = 1,
.base.cra_module = THIS_MODULE,
}, {
.setkey = crc32_pmull_setkey,
.init = crc32_pmull_init,
.update = crc32c_update,
.final = crc32c_pmull_final,
.descsize = sizeof(u32),
.digestsize = sizeof(u32),
.base.cra_ctxsize = sizeof(u32),
.base.cra_init = crc32c_pmull_cra_init,
.base.cra_name = "crc32c",
.base.cra_driver_name = "crc32c-arm64-ce",
.base.cra_priority = 200,
.base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
.base.cra_blocksize = 1,
.base.cra_module = THIS_MODULE,
} };
static int __init crc32_pmull_mod_init(void)
{
if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_PMULL)) {
crc32_pmull_algs[0].update = crc32_pmull_update;
crc32_pmull_algs[1].update = crc32c_pmull_update;
if (elf_hwcap & HWCAP_CRC32) {
fallback_crc32 = crc32_armv8_le;
fallback_crc32c = crc32c_armv8_le;
} else {
fallback_crc32 = crc32_le;
fallback_crc32c = __crc32c_le;
}
} else if (!(elf_hwcap & HWCAP_CRC32)) {
return -ENODEV;
}
return crypto_register_shashes(crc32_pmull_algs,
ARRAY_SIZE(crc32_pmull_algs));
}
static void __exit crc32_pmull_mod_exit(void)
{
crypto_unregister_shashes(crc32_pmull_algs,
ARRAY_SIZE(crc32_pmull_algs));
}
static const struct cpu_feature crc32_cpu_feature[] = {
{ cpu_feature(CRC32) }, { cpu_feature(PMULL) }, { }
};
MODULE_DEVICE_TABLE(cpu, crc32_cpu_feature);
module_init(crc32_pmull_mod_init);
module_exit(crc32_pmull_mod_exit);
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
MODULE_LICENSE("GPL v2");

View File

@ -80,7 +80,186 @@
vzr .req v13
ENTRY(crc_t10dif_pmull)
ad .req v14
bd .req v10
k00_16 .req v15
k32_48 .req v16
t3 .req v17
t4 .req v18
t5 .req v19
t6 .req v20
t7 .req v21
t8 .req v22
t9 .req v23
perm1 .req v24
perm2 .req v25
perm3 .req v26
perm4 .req v27
bd1 .req v28
bd2 .req v29
bd3 .req v30
bd4 .req v31
.macro __pmull_init_p64
.endm
.macro __pmull_pre_p64, bd
.endm
.macro __pmull_init_p8
// k00_16 := 0x0000000000000000_000000000000ffff
// k32_48 := 0x00000000ffffffff_0000ffffffffffff
movi k32_48.2d, #0xffffffff
mov k32_48.h[2], k32_48.h[0]
ushr k00_16.2d, k32_48.2d, #32
// prepare the permutation vectors
mov_q x5, 0x080f0e0d0c0b0a09
movi perm4.8b, #8
dup perm1.2d, x5
eor perm1.16b, perm1.16b, perm4.16b
ushr perm2.2d, perm1.2d, #8
ushr perm3.2d, perm1.2d, #16
ushr perm4.2d, perm1.2d, #24
sli perm2.2d, perm1.2d, #56
sli perm3.2d, perm1.2d, #48
sli perm4.2d, perm1.2d, #40
.endm
.macro __pmull_pre_p8, bd
tbl bd1.16b, {\bd\().16b}, perm1.16b
tbl bd2.16b, {\bd\().16b}, perm2.16b
tbl bd3.16b, {\bd\().16b}, perm3.16b
tbl bd4.16b, {\bd\().16b}, perm4.16b
.endm
__pmull_p8_core:
.L__pmull_p8_core:
ext t4.8b, ad.8b, ad.8b, #1 // A1
ext t5.8b, ad.8b, ad.8b, #2 // A2
ext t6.8b, ad.8b, ad.8b, #3 // A3
pmull t4.8h, t4.8b, bd.8b // F = A1*B
pmull t8.8h, ad.8b, bd1.8b // E = A*B1
pmull t5.8h, t5.8b, bd.8b // H = A2*B
pmull t7.8h, ad.8b, bd2.8b // G = A*B2
pmull t6.8h, t6.8b, bd.8b // J = A3*B
pmull t9.8h, ad.8b, bd3.8b // I = A*B3
pmull t3.8h, ad.8b, bd4.8b // K = A*B4
b 0f
.L__pmull_p8_core2:
tbl t4.16b, {ad.16b}, perm1.16b // A1
tbl t5.16b, {ad.16b}, perm2.16b // A2
tbl t6.16b, {ad.16b}, perm3.16b // A3
pmull2 t4.8h, t4.16b, bd.16b // F = A1*B
pmull2 t8.8h, ad.16b, bd1.16b // E = A*B1
pmull2 t5.8h, t5.16b, bd.16b // H = A2*B
pmull2 t7.8h, ad.16b, bd2.16b // G = A*B2
pmull2 t6.8h, t6.16b, bd.16b // J = A3*B
pmull2 t9.8h, ad.16b, bd3.16b // I = A*B3
pmull2 t3.8h, ad.16b, bd4.16b // K = A*B4
0: eor t4.16b, t4.16b, t8.16b // L = E + F
eor t5.16b, t5.16b, t7.16b // M = G + H
eor t6.16b, t6.16b, t9.16b // N = I + J
uzp1 t8.2d, t4.2d, t5.2d
uzp2 t4.2d, t4.2d, t5.2d
uzp1 t7.2d, t6.2d, t3.2d
uzp2 t6.2d, t6.2d, t3.2d
// t4 = (L) (P0 + P1) << 8
// t5 = (M) (P2 + P3) << 16
eor t8.16b, t8.16b, t4.16b
and t4.16b, t4.16b, k32_48.16b
// t6 = (N) (P4 + P5) << 24
// t7 = (K) (P6 + P7) << 32
eor t7.16b, t7.16b, t6.16b
and t6.16b, t6.16b, k00_16.16b
eor t8.16b, t8.16b, t4.16b
eor t7.16b, t7.16b, t6.16b
zip2 t5.2d, t8.2d, t4.2d
zip1 t4.2d, t8.2d, t4.2d
zip2 t3.2d, t7.2d, t6.2d
zip1 t6.2d, t7.2d, t6.2d
ext t4.16b, t4.16b, t4.16b, #15
ext t5.16b, t5.16b, t5.16b, #14
ext t6.16b, t6.16b, t6.16b, #13
ext t3.16b, t3.16b, t3.16b, #12
eor t4.16b, t4.16b, t5.16b
eor t6.16b, t6.16b, t3.16b
ret
ENDPROC(__pmull_p8_core)
.macro __pmull_p8, rq, ad, bd, i
.ifnc \bd, v10
.err
.endif
mov ad.16b, \ad\().16b
.ifb \i
pmull \rq\().8h, \ad\().8b, bd.8b // D = A*B
.else
pmull2 \rq\().8h, \ad\().16b, bd.16b // D = A*B
.endif
bl .L__pmull_p8_core\i
eor \rq\().16b, \rq\().16b, t4.16b
eor \rq\().16b, \rq\().16b, t6.16b
.endm
.macro fold64, p, reg1, reg2
ldp q11, q12, [arg2], #0x20
__pmull_\p v8, \reg1, v10, 2
__pmull_\p \reg1, \reg1, v10
CPU_LE( rev64 v11.16b, v11.16b )
CPU_LE( rev64 v12.16b, v12.16b )
__pmull_\p v9, \reg2, v10, 2
__pmull_\p \reg2, \reg2, v10
CPU_LE( ext v11.16b, v11.16b, v11.16b, #8 )
CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 )
eor \reg1\().16b, \reg1\().16b, v8.16b
eor \reg2\().16b, \reg2\().16b, v9.16b
eor \reg1\().16b, \reg1\().16b, v11.16b
eor \reg2\().16b, \reg2\().16b, v12.16b
.endm
.macro fold16, p, reg, rk
__pmull_\p v8, \reg, v10
__pmull_\p \reg, \reg, v10, 2
.ifnb \rk
ldr_l q10, \rk, x8
__pmull_pre_\p v10
.endif
eor v7.16b, v7.16b, v8.16b
eor v7.16b, v7.16b, \reg\().16b
.endm
.macro __pmull_p64, rd, rn, rm, n
.ifb \n
pmull \rd\().1q, \rn\().1d, \rm\().1d
.else
pmull2 \rd\().1q, \rn\().2d, \rm\().2d
.endif
.endm
.macro crc_t10dif_pmull, p
frame_push 3, 128
mov arg1_low32, w0
@ -89,6 +268,8 @@ ENTRY(crc_t10dif_pmull)
movi vzr.16b, #0 // init zero register
__pmull_init_\p
// adjust the 16-bit initial_crc value, scale it to 32 bits
lsl arg1_low32, arg1_low32, #16
@ -96,7 +277,7 @@ ENTRY(crc_t10dif_pmull)
cmp arg3, #256
// for sizes less than 128, we can't fold 64B at a time...
b.lt _less_than_128
b.lt .L_less_than_128_\@
// load the initial crc value
// crc value does not need to be byte-reflected, but it needs
@ -137,6 +318,7 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
ldr_l q10, rk3, x8 // xmm10 has rk3 and rk4
// type of pmull instruction
// will determine which constant to use
__pmull_pre_\p v10
//
// we subtract 256 instead of 128 to save one instruction from the loop
@ -147,41 +329,19 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
// buffer. The _fold_64_B_loop will fold 64B at a time
// until we have 64+y Bytes of buffer
// fold 64B at a time. This section of the code folds 4 vector
// registers in parallel
_fold_64_B_loop:
.L_fold_64_B_loop_\@:
.macro fold64, reg1, reg2
ldp q11, q12, [arg2], #0x20
pmull2 v8.1q, \reg1\().2d, v10.2d
pmull \reg1\().1q, \reg1\().1d, v10.1d
CPU_LE( rev64 v11.16b, v11.16b )
CPU_LE( rev64 v12.16b, v12.16b )
pmull2 v9.1q, \reg2\().2d, v10.2d
pmull \reg2\().1q, \reg2\().1d, v10.1d
CPU_LE( ext v11.16b, v11.16b, v11.16b, #8 )
CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 )
eor \reg1\().16b, \reg1\().16b, v8.16b
eor \reg2\().16b, \reg2\().16b, v9.16b
eor \reg1\().16b, \reg1\().16b, v11.16b
eor \reg2\().16b, \reg2\().16b, v12.16b
.endm
fold64 v0, v1
fold64 v2, v3
fold64 v4, v5
fold64 v6, v7
fold64 \p, v0, v1
fold64 \p, v2, v3
fold64 \p, v4, v5
fold64 \p, v6, v7
subs arg3, arg3, #128
// check if there is another 64B in the buffer to be able to fold
b.lt _fold_64_B_end
b.lt .L_fold_64_B_end_\@
if_will_cond_yield_neon
stp q0, q1, [sp, #.Lframe_local_offset]
@ -195,11 +355,13 @@ CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 )
ldp q6, q7, [sp, #.Lframe_local_offset + 96]
ldr_l q10, rk3, x8
movi vzr.16b, #0 // init zero register
__pmull_init_\p
__pmull_pre_\p v10
endif_yield_neon
b _fold_64_B_loop
b .L_fold_64_B_loop_\@
_fold_64_B_end:
.L_fold_64_B_end_\@:
// at this point, the buffer pointer is pointing at the last y Bytes
// of the buffer the 64B of folded data is in 4 of the vector
// registers: v0, v1, v2, v3
@ -208,38 +370,29 @@ _fold_64_B_end:
// constants
ldr_l q10, rk9, x8
__pmull_pre_\p v10
.macro fold16, reg, rk
pmull v8.1q, \reg\().1d, v10.1d
pmull2 \reg\().1q, \reg\().2d, v10.2d
.ifnb \rk
ldr_l q10, \rk, x8
.endif
eor v7.16b, v7.16b, v8.16b
eor v7.16b, v7.16b, \reg\().16b
.endm
fold16 v0, rk11
fold16 v1, rk13
fold16 v2, rk15
fold16 v3, rk17
fold16 v4, rk19
fold16 v5, rk1
fold16 v6
fold16 \p, v0, rk11
fold16 \p, v1, rk13
fold16 \p, v2, rk15
fold16 \p, v3, rk17
fold16 \p, v4, rk19
fold16 \p, v5, rk1
fold16 \p, v6
// instead of 64, we add 48 to the loop counter to save 1 instruction
// from the loop instead of a cmp instruction, we use the negative
// flag with the jl instruction
adds arg3, arg3, #(128-16)
b.lt _final_reduction_for_128
b.lt .L_final_reduction_for_128_\@
// now we have 16+y bytes left to reduce. 16 Bytes is in register v7
// and the rest is in memory. We can fold 16 bytes at a time if y>=16
// continue folding 16B at a time
_16B_reduction_loop:
pmull v8.1q, v7.1d, v10.1d
pmull2 v7.1q, v7.2d, v10.2d
.L_16B_reduction_loop_\@:
__pmull_\p v8, v7, v10
__pmull_\p v7, v7, v10, 2
eor v7.16b, v7.16b, v8.16b
ldr q0, [arg2], #16
@ -251,22 +404,22 @@ CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
// instead of a cmp instruction, we utilize the flags with the
// jge instruction equivalent of: cmp arg3, 16-16
// check if there is any more 16B in the buffer to be able to fold
b.ge _16B_reduction_loop
b.ge .L_16B_reduction_loop_\@
// now we have 16+z bytes left to reduce, where 0<= z < 16.
// first, we reduce the data in the xmm7 register
_final_reduction_for_128:
.L_final_reduction_for_128_\@:
// check if any more data to fold. If not, compute the CRC of
// the final 128 bits
adds arg3, arg3, #16
b.eq _128_done
b.eq .L_128_done_\@
// here we are getting data that is less than 16 bytes.
// since we know that there was data before the pointer, we can
// offset the input pointer before the actual point, to receive
// exactly 16 bytes. after that the registers need to be adjusted.
_get_last_two_regs:
.L_get_last_two_regs_\@:
add arg2, arg2, arg3
ldr q1, [arg2, #-16]
CPU_LE( rev64 v1.16b, v1.16b )
@ -291,47 +444,48 @@ CPU_LE( ext v1.16b, v1.16b, v1.16b, #8 )
bsl v0.16b, v2.16b, v1.16b
// fold 16 Bytes
pmull v8.1q, v7.1d, v10.1d
pmull2 v7.1q, v7.2d, v10.2d
__pmull_\p v8, v7, v10
__pmull_\p v7, v7, v10, 2
eor v7.16b, v7.16b, v8.16b
eor v7.16b, v7.16b, v0.16b
_128_done:
.L_128_done_\@:
// compute crc of a 128-bit value
ldr_l q10, rk5, x8 // rk5 and rk6 in xmm10
__pmull_pre_\p v10
// 64b fold
ext v0.16b, vzr.16b, v7.16b, #8
mov v7.d[0], v7.d[1]
pmull v7.1q, v7.1d, v10.1d
__pmull_\p v7, v7, v10
eor v7.16b, v7.16b, v0.16b
// 32b fold
ext v0.16b, v7.16b, vzr.16b, #4
mov v7.s[3], vzr.s[0]
pmull2 v0.1q, v0.2d, v10.2d
__pmull_\p v0, v0, v10, 2
eor v7.16b, v7.16b, v0.16b
// barrett reduction
_barrett:
ldr_l q10, rk7, x8
__pmull_pre_\p v10
mov v0.d[0], v7.d[1]
pmull v0.1q, v0.1d, v10.1d
__pmull_\p v0, v0, v10
ext v0.16b, vzr.16b, v0.16b, #12
pmull2 v0.1q, v0.2d, v10.2d
__pmull_\p v0, v0, v10, 2
ext v0.16b, vzr.16b, v0.16b, #12
eor v7.16b, v7.16b, v0.16b
mov w0, v7.s[1]
_cleanup:
.L_cleanup_\@:
// scale the result back to 16 bits
lsr x0, x0, #16
frame_pop
ret
_less_than_128:
cbz arg3, _cleanup
.L_less_than_128_\@:
cbz arg3, .L_cleanup_\@
movi v0.16b, #0
mov v0.s[3], arg1_low32 // get the initial crc value
@ -342,20 +496,21 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
eor v7.16b, v7.16b, v0.16b // xor the initial crc value
cmp arg3, #16
b.eq _128_done // exactly 16 left
b.lt _less_than_16_left
b.eq .L_128_done_\@ // exactly 16 left
b.lt .L_less_than_16_left_\@
ldr_l q10, rk1, x8 // rk1 and rk2 in xmm10
__pmull_pre_\p v10
// update the counter. subtract 32 instead of 16 to save one
// instruction from the loop
subs arg3, arg3, #32
b.ge _16B_reduction_loop
b.ge .L_16B_reduction_loop_\@
add arg3, arg3, #16
b _get_last_two_regs
b .L_get_last_two_regs_\@
_less_than_16_left:
.L_less_than_16_left_\@:
// shl r9, 4
adr_l x0, tbl_shf_table + 16
sub x0, x0, arg3
@ -363,8 +518,17 @@ _less_than_16_left:
movi v9.16b, #0x80
eor v0.16b, v0.16b, v9.16b
tbl v7.16b, {v7.16b}, v0.16b
b _128_done
ENDPROC(crc_t10dif_pmull)
b .L_128_done_\@
.endm
ENTRY(crc_t10dif_pmull_p8)
crc_t10dif_pmull p8
ENDPROC(crc_t10dif_pmull_p8)
.align 5
ENTRY(crc_t10dif_pmull_p64)
crc_t10dif_pmull p64
ENDPROC(crc_t10dif_pmull_p64)
// precomputed constants
// these constants are precomputed from the poly:

View File

@ -22,7 +22,10 @@
#define CRC_T10DIF_PMULL_CHUNK_SIZE 16U
asmlinkage u16 crc_t10dif_pmull(u16 init_crc, const u8 buf[], u64 len);
asmlinkage u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 buf[], u64 len);
asmlinkage u16 crc_t10dif_pmull_p8(u16 init_crc, const u8 buf[], u64 len);
static u16 (*crc_t10dif_pmull)(u16 init_crc, const u8 buf[], u64 len);
static int crct10dif_init(struct shash_desc *desc)
{
@ -85,6 +88,11 @@ static struct shash_alg crc_t10dif_alg = {
static int __init crc_t10dif_mod_init(void)
{
if (elf_hwcap & HWCAP_PMULL)
crc_t10dif_pmull = crc_t10dif_pmull_p64;
else
crc_t10dif_pmull = crc_t10dif_pmull_p8;
return crypto_register_shash(&crc_t10dif_alg);
}
@ -93,8 +101,10 @@ static void __exit crc_t10dif_mod_exit(void)
crypto_unregister_shash(&crc_t10dif_alg);
}
module_cpu_feature_match(PMULL, crc_t10dif_mod_init);
module_cpu_feature_match(ASIMD, crc_t10dif_mod_init);
module_exit(crc_t10dif_mod_exit);
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
MODULE_LICENSE("GPL v2");
MODULE_ALIAS_CRYPTO("crct10dif");
MODULE_ALIAS_CRYPTO("crct10dif-arm64-ce");

View File

@ -1,352 +0,0 @@
// SPDX-License-Identifier: GPL-2.0
/*
* ARM64 NEON-accelerated implementation of Speck128-XTS and Speck64-XTS
*
* Copyright (c) 2018 Google, Inc
*
* Author: Eric Biggers <ebiggers@google.com>
*/
#include <linux/linkage.h>
.text
// arguments
ROUND_KEYS .req x0 // const {u64,u32} *round_keys
NROUNDS .req w1 // int nrounds
NROUNDS_X .req x1
DST .req x2 // void *dst
SRC .req x3 // const void *src
NBYTES .req w4 // unsigned int nbytes
TWEAK .req x5 // void *tweak
// registers which hold the data being encrypted/decrypted
// (underscores avoid a naming collision with ARM64 registers x0-x3)
X_0 .req v0
Y_0 .req v1
X_1 .req v2
Y_1 .req v3
X_2 .req v4
Y_2 .req v5
X_3 .req v6
Y_3 .req v7
// the round key, duplicated in all lanes
ROUND_KEY .req v8
// index vector for tbl-based 8-bit rotates
ROTATE_TABLE .req v9
ROTATE_TABLE_Q .req q9
// temporary registers
TMP0 .req v10
TMP1 .req v11
TMP2 .req v12
TMP3 .req v13
// multiplication table for updating XTS tweaks
GFMUL_TABLE .req v14
GFMUL_TABLE_Q .req q14
// next XTS tweak value(s)
TWEAKV_NEXT .req v15
// XTS tweaks for the blocks currently being encrypted/decrypted
TWEAKV0 .req v16
TWEAKV1 .req v17
TWEAKV2 .req v18
TWEAKV3 .req v19
TWEAKV4 .req v20
TWEAKV5 .req v21
TWEAKV6 .req v22
TWEAKV7 .req v23
.align 4
.Lror64_8_table:
.octa 0x080f0e0d0c0b0a090007060504030201
.Lror32_8_table:
.octa 0x0c0f0e0d080b0a090407060500030201
.Lrol64_8_table:
.octa 0x0e0d0c0b0a09080f0605040302010007
.Lrol32_8_table:
.octa 0x0e0d0c0f0a09080b0605040702010003
.Lgf128mul_table:
.octa 0x00000000000000870000000000000001
.Lgf64mul_table:
.octa 0x0000000000000000000000002d361b00
/*
* _speck_round_128bytes() - Speck encryption round on 128 bytes at a time
*
* Do one Speck encryption round on the 128 bytes (8 blocks for Speck128, 16 for
* Speck64) stored in X0-X3 and Y0-Y3, using the round key stored in all lanes
* of ROUND_KEY. 'n' is the lane size: 64 for Speck128, or 32 for Speck64.
* 'lanes' is the lane specifier: "2d" for Speck128 or "4s" for Speck64.
*/
.macro _speck_round_128bytes n, lanes
// x = ror(x, 8)
tbl X_0.16b, {X_0.16b}, ROTATE_TABLE.16b
tbl X_1.16b, {X_1.16b}, ROTATE_TABLE.16b
tbl X_2.16b, {X_2.16b}, ROTATE_TABLE.16b
tbl X_3.16b, {X_3.16b}, ROTATE_TABLE.16b
// x += y
add X_0.\lanes, X_0.\lanes, Y_0.\lanes
add X_1.\lanes, X_1.\lanes, Y_1.\lanes
add X_2.\lanes, X_2.\lanes, Y_2.\lanes
add X_3.\lanes, X_3.\lanes, Y_3.\lanes
// x ^= k
eor X_0.16b, X_0.16b, ROUND_KEY.16b
eor X_1.16b, X_1.16b, ROUND_KEY.16b
eor X_2.16b, X_2.16b, ROUND_KEY.16b
eor X_3.16b, X_3.16b, ROUND_KEY.16b
// y = rol(y, 3)
shl TMP0.\lanes, Y_0.\lanes, #3
shl TMP1.\lanes, Y_1.\lanes, #3
shl TMP2.\lanes, Y_2.\lanes, #3
shl TMP3.\lanes, Y_3.\lanes, #3
sri TMP0.\lanes, Y_0.\lanes, #(\n - 3)
sri TMP1.\lanes, Y_1.\lanes, #(\n - 3)
sri TMP2.\lanes, Y_2.\lanes, #(\n - 3)
sri TMP3.\lanes, Y_3.\lanes, #(\n - 3)
// y ^= x
eor Y_0.16b, TMP0.16b, X_0.16b
eor Y_1.16b, TMP1.16b, X_1.16b
eor Y_2.16b, TMP2.16b, X_2.16b
eor Y_3.16b, TMP3.16b, X_3.16b
.endm
/*
* _speck_unround_128bytes() - Speck decryption round on 128 bytes at a time
*
* This is the inverse of _speck_round_128bytes().
*/
.macro _speck_unround_128bytes n, lanes
// y ^= x
eor TMP0.16b, Y_0.16b, X_0.16b
eor TMP1.16b, Y_1.16b, X_1.16b
eor TMP2.16b, Y_2.16b, X_2.16b
eor TMP3.16b, Y_3.16b, X_3.16b
// y = ror(y, 3)
ushr Y_0.\lanes, TMP0.\lanes, #3
ushr Y_1.\lanes, TMP1.\lanes, #3
ushr Y_2.\lanes, TMP2.\lanes, #3
ushr Y_3.\lanes, TMP3.\lanes, #3
sli Y_0.\lanes, TMP0.\lanes, #(\n - 3)
sli Y_1.\lanes, TMP1.\lanes, #(\n - 3)
sli Y_2.\lanes, TMP2.\lanes, #(\n - 3)
sli Y_3.\lanes, TMP3.\lanes, #(\n - 3)
// x ^= k
eor X_0.16b, X_0.16b, ROUND_KEY.16b
eor X_1.16b, X_1.16b, ROUND_KEY.16b
eor X_2.16b, X_2.16b, ROUND_KEY.16b
eor X_3.16b, X_3.16b, ROUND_KEY.16b
// x -= y
sub X_0.\lanes, X_0.\lanes, Y_0.\lanes
sub X_1.\lanes, X_1.\lanes, Y_1.\lanes
sub X_2.\lanes, X_2.\lanes, Y_2.\lanes
sub X_3.\lanes, X_3.\lanes, Y_3.\lanes
// x = rol(x, 8)
tbl X_0.16b, {X_0.16b}, ROTATE_TABLE.16b
tbl X_1.16b, {X_1.16b}, ROTATE_TABLE.16b
tbl X_2.16b, {X_2.16b}, ROTATE_TABLE.16b
tbl X_3.16b, {X_3.16b}, ROTATE_TABLE.16b
.endm
.macro _next_xts_tweak next, cur, tmp, n
.if \n == 64
/*
* Calculate the next tweak by multiplying the current one by x,
* modulo p(x) = x^128 + x^7 + x^2 + x + 1.
*/
sshr \tmp\().2d, \cur\().2d, #63
and \tmp\().16b, \tmp\().16b, GFMUL_TABLE.16b
shl \next\().2d, \cur\().2d, #1
ext \tmp\().16b, \tmp\().16b, \tmp\().16b, #8
eor \next\().16b, \next\().16b, \tmp\().16b
.else
/*
* Calculate the next two tweaks by multiplying the current ones by x^2,
* modulo p(x) = x^64 + x^4 + x^3 + x + 1.
*/
ushr \tmp\().2d, \cur\().2d, #62
shl \next\().2d, \cur\().2d, #2
tbl \tmp\().16b, {GFMUL_TABLE.16b}, \tmp\().16b
eor \next\().16b, \next\().16b, \tmp\().16b
.endif
.endm
/*
* _speck_xts_crypt() - Speck-XTS encryption/decryption
*
* Encrypt or decrypt NBYTES bytes of data from the SRC buffer to the DST buffer
* using Speck-XTS, specifically the variant with a block size of '2n' and round
* count given by NROUNDS. The expanded round keys are given in ROUND_KEYS, and
* the current XTS tweak value is given in TWEAK. It's assumed that NBYTES is a
* nonzero multiple of 128.
*/
.macro _speck_xts_crypt n, lanes, decrypting
/*
* If decrypting, modify the ROUND_KEYS parameter to point to the last
* round key rather than the first, since for decryption the round keys
* are used in reverse order.
*/
.if \decrypting
mov NROUNDS, NROUNDS /* zero the high 32 bits */
.if \n == 64
add ROUND_KEYS, ROUND_KEYS, NROUNDS_X, lsl #3
sub ROUND_KEYS, ROUND_KEYS, #8
.else
add ROUND_KEYS, ROUND_KEYS, NROUNDS_X, lsl #2
sub ROUND_KEYS, ROUND_KEYS, #4
.endif
.endif
// Load the index vector for tbl-based 8-bit rotates
.if \decrypting
ldr ROTATE_TABLE_Q, .Lrol\n\()_8_table
.else
ldr ROTATE_TABLE_Q, .Lror\n\()_8_table
.endif
// One-time XTS preparation
.if \n == 64
// Load first tweak
ld1 {TWEAKV0.16b}, [TWEAK]
// Load GF(2^128) multiplication table
ldr GFMUL_TABLE_Q, .Lgf128mul_table
.else
// Load first tweak
ld1 {TWEAKV0.8b}, [TWEAK]
// Load GF(2^64) multiplication table
ldr GFMUL_TABLE_Q, .Lgf64mul_table
// Calculate second tweak, packing it together with the first
ushr TMP0.2d, TWEAKV0.2d, #63
shl TMP1.2d, TWEAKV0.2d, #1
tbl TMP0.8b, {GFMUL_TABLE.16b}, TMP0.8b
eor TMP0.8b, TMP0.8b, TMP1.8b
mov TWEAKV0.d[1], TMP0.d[0]
.endif
.Lnext_128bytes_\@:
// Calculate XTS tweaks for next 128 bytes
_next_xts_tweak TWEAKV1, TWEAKV0, TMP0, \n
_next_xts_tweak TWEAKV2, TWEAKV1, TMP0, \n
_next_xts_tweak TWEAKV3, TWEAKV2, TMP0, \n
_next_xts_tweak TWEAKV4, TWEAKV3, TMP0, \n
_next_xts_tweak TWEAKV5, TWEAKV4, TMP0, \n
_next_xts_tweak TWEAKV6, TWEAKV5, TMP0, \n
_next_xts_tweak TWEAKV7, TWEAKV6, TMP0, \n
_next_xts_tweak TWEAKV_NEXT, TWEAKV7, TMP0, \n
// Load the next source blocks into {X,Y}[0-3]
ld1 {X_0.16b-Y_1.16b}, [SRC], #64
ld1 {X_2.16b-Y_3.16b}, [SRC], #64
// XOR the source blocks with their XTS tweaks
eor TMP0.16b, X_0.16b, TWEAKV0.16b
eor Y_0.16b, Y_0.16b, TWEAKV1.16b
eor TMP1.16b, X_1.16b, TWEAKV2.16b
eor Y_1.16b, Y_1.16b, TWEAKV3.16b
eor TMP2.16b, X_2.16b, TWEAKV4.16b
eor Y_2.16b, Y_2.16b, TWEAKV5.16b
eor TMP3.16b, X_3.16b, TWEAKV6.16b
eor Y_3.16b, Y_3.16b, TWEAKV7.16b
/*
* De-interleave the 'x' and 'y' elements of each block, i.e. make it so
* that the X[0-3] registers contain only the second halves of blocks,
* and the Y[0-3] registers contain only the first halves of blocks.
* (Speck uses the order (y, x) rather than the more intuitive (x, y).)
*/
uzp2 X_0.\lanes, TMP0.\lanes, Y_0.\lanes
uzp1 Y_0.\lanes, TMP0.\lanes, Y_0.\lanes
uzp2 X_1.\lanes, TMP1.\lanes, Y_1.\lanes
uzp1 Y_1.\lanes, TMP1.\lanes, Y_1.\lanes
uzp2 X_2.\lanes, TMP2.\lanes, Y_2.\lanes
uzp1 Y_2.\lanes, TMP2.\lanes, Y_2.\lanes
uzp2 X_3.\lanes, TMP3.\lanes, Y_3.\lanes
uzp1 Y_3.\lanes, TMP3.\lanes, Y_3.\lanes
// Do the cipher rounds
mov x6, ROUND_KEYS
mov w7, NROUNDS
.Lnext_round_\@:
.if \decrypting
ld1r {ROUND_KEY.\lanes}, [x6]
sub x6, x6, #( \n / 8 )
_speck_unround_128bytes \n, \lanes
.else
ld1r {ROUND_KEY.\lanes}, [x6], #( \n / 8 )
_speck_round_128bytes \n, \lanes
.endif
subs w7, w7, #1
bne .Lnext_round_\@
// Re-interleave the 'x' and 'y' elements of each block
zip1 TMP0.\lanes, Y_0.\lanes, X_0.\lanes
zip2 Y_0.\lanes, Y_0.\lanes, X_0.\lanes
zip1 TMP1.\lanes, Y_1.\lanes, X_1.\lanes
zip2 Y_1.\lanes, Y_1.\lanes, X_1.\lanes
zip1 TMP2.\lanes, Y_2.\lanes, X_2.\lanes
zip2 Y_2.\lanes, Y_2.\lanes, X_2.\lanes
zip1 TMP3.\lanes, Y_3.\lanes, X_3.\lanes
zip2 Y_3.\lanes, Y_3.\lanes, X_3.\lanes
// XOR the encrypted/decrypted blocks with the tweaks calculated earlier
eor X_0.16b, TMP0.16b, TWEAKV0.16b
eor Y_0.16b, Y_0.16b, TWEAKV1.16b
eor X_1.16b, TMP1.16b, TWEAKV2.16b
eor Y_1.16b, Y_1.16b, TWEAKV3.16b
eor X_2.16b, TMP2.16b, TWEAKV4.16b
eor Y_2.16b, Y_2.16b, TWEAKV5.16b
eor X_3.16b, TMP3.16b, TWEAKV6.16b
eor Y_3.16b, Y_3.16b, TWEAKV7.16b
mov TWEAKV0.16b, TWEAKV_NEXT.16b
// Store the ciphertext in the destination buffer
st1 {X_0.16b-Y_1.16b}, [DST], #64
st1 {X_2.16b-Y_3.16b}, [DST], #64
// Continue if there are more 128-byte chunks remaining
subs NBYTES, NBYTES, #128
bne .Lnext_128bytes_\@
// Store the next tweak and return
.if \n == 64
st1 {TWEAKV_NEXT.16b}, [TWEAK]
.else
st1 {TWEAKV_NEXT.8b}, [TWEAK]
.endif
ret
.endm
ENTRY(speck128_xts_encrypt_neon)
_speck_xts_crypt n=64, lanes=2d, decrypting=0
ENDPROC(speck128_xts_encrypt_neon)
ENTRY(speck128_xts_decrypt_neon)
_speck_xts_crypt n=64, lanes=2d, decrypting=1
ENDPROC(speck128_xts_decrypt_neon)
ENTRY(speck64_xts_encrypt_neon)
_speck_xts_crypt n=32, lanes=4s, decrypting=0
ENDPROC(speck64_xts_encrypt_neon)
ENTRY(speck64_xts_decrypt_neon)
_speck_xts_crypt n=32, lanes=4s, decrypting=1
ENDPROC(speck64_xts_decrypt_neon)

View File

@ -1,282 +0,0 @@
// SPDX-License-Identifier: GPL-2.0
/*
* NEON-accelerated implementation of Speck128-XTS and Speck64-XTS
* (64-bit version; based on the 32-bit version)
*
* Copyright (c) 2018 Google, Inc
*/
#include <asm/hwcap.h>
#include <asm/neon.h>
#include <asm/simd.h>
#include <crypto/algapi.h>
#include <crypto/gf128mul.h>
#include <crypto/internal/skcipher.h>
#include <crypto/speck.h>
#include <crypto/xts.h>
#include <linux/kernel.h>
#include <linux/module.h>
/* The assembly functions only handle multiples of 128 bytes */
#define SPECK_NEON_CHUNK_SIZE 128
/* Speck128 */
struct speck128_xts_tfm_ctx {
struct speck128_tfm_ctx main_key;
struct speck128_tfm_ctx tweak_key;
};
asmlinkage void speck128_xts_encrypt_neon(const u64 *round_keys, int nrounds,
void *dst, const void *src,
unsigned int nbytes, void *tweak);
asmlinkage void speck128_xts_decrypt_neon(const u64 *round_keys, int nrounds,
void *dst, const void *src,
unsigned int nbytes, void *tweak);
typedef void (*speck128_crypt_one_t)(const struct speck128_tfm_ctx *,
u8 *, const u8 *);
typedef void (*speck128_xts_crypt_many_t)(const u64 *, int, void *,
const void *, unsigned int, void *);
static __always_inline int
__speck128_xts_crypt(struct skcipher_request *req,
speck128_crypt_one_t crypt_one,
speck128_xts_crypt_many_t crypt_many)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
const struct speck128_xts_tfm_ctx *ctx = crypto_skcipher_ctx(tfm);
struct skcipher_walk walk;
le128 tweak;
int err;
err = skcipher_walk_virt(&walk, req, true);
crypto_speck128_encrypt(&ctx->tweak_key, (u8 *)&tweak, walk.iv);
while (walk.nbytes > 0) {
unsigned int nbytes = walk.nbytes;
u8 *dst = walk.dst.virt.addr;
const u8 *src = walk.src.virt.addr;
if (nbytes >= SPECK_NEON_CHUNK_SIZE && may_use_simd()) {
unsigned int count;
count = round_down(nbytes, SPECK_NEON_CHUNK_SIZE);
kernel_neon_begin();
(*crypt_many)(ctx->main_key.round_keys,
ctx->main_key.nrounds,
dst, src, count, &tweak);
kernel_neon_end();
dst += count;
src += count;
nbytes -= count;
}
/* Handle any remainder with generic code */
while (nbytes >= sizeof(tweak)) {
le128_xor((le128 *)dst, (const le128 *)src, &tweak);
(*crypt_one)(&ctx->main_key, dst, dst);
le128_xor((le128 *)dst, (const le128 *)dst, &tweak);
gf128mul_x_ble(&tweak, &tweak);
dst += sizeof(tweak);
src += sizeof(tweak);
nbytes -= sizeof(tweak);
}
err = skcipher_walk_done(&walk, nbytes);
}
return err;
}
static int speck128_xts_encrypt(struct skcipher_request *req)
{
return __speck128_xts_crypt(req, crypto_speck128_encrypt,
speck128_xts_encrypt_neon);
}
static int speck128_xts_decrypt(struct skcipher_request *req)
{
return __speck128_xts_crypt(req, crypto_speck128_decrypt,
speck128_xts_decrypt_neon);
}
static int speck128_xts_setkey(struct crypto_skcipher *tfm, const u8 *key,
unsigned int keylen)
{
struct speck128_xts_tfm_ctx *ctx = crypto_skcipher_ctx(tfm);
int err;
err = xts_verify_key(tfm, key, keylen);
if (err)
return err;
keylen /= 2;
err = crypto_speck128_setkey(&ctx->main_key, key, keylen);
if (err)
return err;
return crypto_speck128_setkey(&ctx->tweak_key, key + keylen, keylen);
}
/* Speck64 */
struct speck64_xts_tfm_ctx {
struct speck64_tfm_ctx main_key;
struct speck64_tfm_ctx tweak_key;
};
asmlinkage void speck64_xts_encrypt_neon(const u32 *round_keys, int nrounds,
void *dst, const void *src,
unsigned int nbytes, void *tweak);
asmlinkage void speck64_xts_decrypt_neon(const u32 *round_keys, int nrounds,
void *dst, const void *src,
unsigned int nbytes, void *tweak);
typedef void (*speck64_crypt_one_t)(const struct speck64_tfm_ctx *,
u8 *, const u8 *);
typedef void (*speck64_xts_crypt_many_t)(const u32 *, int, void *,
const void *, unsigned int, void *);
static __always_inline int
__speck64_xts_crypt(struct skcipher_request *req, speck64_crypt_one_t crypt_one,
speck64_xts_crypt_many_t crypt_many)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
const struct speck64_xts_tfm_ctx *ctx = crypto_skcipher_ctx(tfm);
struct skcipher_walk walk;
__le64 tweak;
int err;
err = skcipher_walk_virt(&walk, req, true);
crypto_speck64_encrypt(&ctx->tweak_key, (u8 *)&tweak, walk.iv);
while (walk.nbytes > 0) {
unsigned int nbytes = walk.nbytes;
u8 *dst = walk.dst.virt.addr;
const u8 *src = walk.src.virt.addr;
if (nbytes >= SPECK_NEON_CHUNK_SIZE && may_use_simd()) {
unsigned int count;
count = round_down(nbytes, SPECK_NEON_CHUNK_SIZE);
kernel_neon_begin();
(*crypt_many)(ctx->main_key.round_keys,
ctx->main_key.nrounds,
dst, src, count, &tweak);
kernel_neon_end();
dst += count;
src += count;
nbytes -= count;
}
/* Handle any remainder with generic code */
while (nbytes >= sizeof(tweak)) {
*(__le64 *)dst = *(__le64 *)src ^ tweak;
(*crypt_one)(&ctx->main_key, dst, dst);
*(__le64 *)dst ^= tweak;
tweak = cpu_to_le64((le64_to_cpu(tweak) << 1) ^
((tweak & cpu_to_le64(1ULL << 63)) ?
0x1B : 0));
dst += sizeof(tweak);
src += sizeof(tweak);
nbytes -= sizeof(tweak);
}
err = skcipher_walk_done(&walk, nbytes);
}
return err;
}
static int speck64_xts_encrypt(struct skcipher_request *req)
{
return __speck64_xts_crypt(req, crypto_speck64_encrypt,
speck64_xts_encrypt_neon);
}
static int speck64_xts_decrypt(struct skcipher_request *req)
{
return __speck64_xts_crypt(req, crypto_speck64_decrypt,
speck64_xts_decrypt_neon);
}
static int speck64_xts_setkey(struct crypto_skcipher *tfm, const u8 *key,
unsigned int keylen)
{
struct speck64_xts_tfm_ctx *ctx = crypto_skcipher_ctx(tfm);
int err;
err = xts_verify_key(tfm, key, keylen);
if (err)
return err;
keylen /= 2;
err = crypto_speck64_setkey(&ctx->main_key, key, keylen);
if (err)
return err;
return crypto_speck64_setkey(&ctx->tweak_key, key + keylen, keylen);
}
static struct skcipher_alg speck_algs[] = {
{
.base.cra_name = "xts(speck128)",
.base.cra_driver_name = "xts-speck128-neon",
.base.cra_priority = 300,
.base.cra_blocksize = SPECK128_BLOCK_SIZE,
.base.cra_ctxsize = sizeof(struct speck128_xts_tfm_ctx),
.base.cra_alignmask = 7,
.base.cra_module = THIS_MODULE,
.min_keysize = 2 * SPECK128_128_KEY_SIZE,
.max_keysize = 2 * SPECK128_256_KEY_SIZE,
.ivsize = SPECK128_BLOCK_SIZE,
.walksize = SPECK_NEON_CHUNK_SIZE,
.setkey = speck128_xts_setkey,
.encrypt = speck128_xts_encrypt,
.decrypt = speck128_xts_decrypt,
}, {
.base.cra_name = "xts(speck64)",
.base.cra_driver_name = "xts-speck64-neon",
.base.cra_priority = 300,
.base.cra_blocksize = SPECK64_BLOCK_SIZE,
.base.cra_ctxsize = sizeof(struct speck64_xts_tfm_ctx),
.base.cra_alignmask = 7,
.base.cra_module = THIS_MODULE,
.min_keysize = 2 * SPECK64_96_KEY_SIZE,
.max_keysize = 2 * SPECK64_128_KEY_SIZE,
.ivsize = SPECK64_BLOCK_SIZE,
.walksize = SPECK_NEON_CHUNK_SIZE,
.setkey = speck64_xts_setkey,
.encrypt = speck64_xts_encrypt,
.decrypt = speck64_xts_decrypt,
}
};
static int __init speck_neon_module_init(void)
{
if (!(elf_hwcap & HWCAP_ASIMD))
return -ENODEV;
return crypto_register_skciphers(speck_algs, ARRAY_SIZE(speck_algs));
}
static void __exit speck_neon_module_exit(void)
{
crypto_unregister_skciphers(speck_algs, ARRAY_SIZE(speck_algs));
}
module_init(speck_neon_module_init);
module_exit(speck_neon_module_exit);
MODULE_DESCRIPTION("Speck block cipher (NEON-accelerated)");
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Eric Biggers <ebiggers@google.com>");
MODULE_ALIAS_CRYPTO("xts(speck128)");
MODULE_ALIAS_CRYPTO("xts-speck128-neon");
MODULE_ALIAS_CRYPTO("xts(speck64)");
MODULE_ALIAS_CRYPTO("xts-speck64-neon");

View File

@ -621,7 +621,6 @@ CONFIG_CRYPTO_ECDH=m
CONFIG_CRYPTO_MANAGER=y
CONFIG_CRYPTO_USER=m
CONFIG_CRYPTO_CRYPTD=m
CONFIG_CRYPTO_MCRYPTD=m
CONFIG_CRYPTO_TEST=m
CONFIG_CRYPTO_CHACHA20POLY1305=m
CONFIG_CRYPTO_AEGIS128=m
@ -657,7 +656,6 @@ CONFIG_CRYPTO_SALSA20=m
CONFIG_CRYPTO_SEED=m
CONFIG_CRYPTO_SERPENT=m
CONFIG_CRYPTO_SM4=m
CONFIG_CRYPTO_SPECK=m
CONFIG_CRYPTO_TEA=m
CONFIG_CRYPTO_TWOFISH=m
CONFIG_CRYPTO_LZO=m

View File

@ -578,7 +578,6 @@ CONFIG_CRYPTO_ECDH=m
CONFIG_CRYPTO_MANAGER=y
CONFIG_CRYPTO_USER=m
CONFIG_CRYPTO_CRYPTD=m
CONFIG_CRYPTO_MCRYPTD=m
CONFIG_CRYPTO_TEST=m
CONFIG_CRYPTO_CHACHA20POLY1305=m
CONFIG_CRYPTO_AEGIS128=m
@ -614,7 +613,6 @@ CONFIG_CRYPTO_SALSA20=m
CONFIG_CRYPTO_SEED=m
CONFIG_CRYPTO_SERPENT=m
CONFIG_CRYPTO_SM4=m
CONFIG_CRYPTO_SPECK=m
CONFIG_CRYPTO_TEA=m
CONFIG_CRYPTO_TWOFISH=m
CONFIG_CRYPTO_LZO=m

View File

@ -599,7 +599,6 @@ CONFIG_CRYPTO_ECDH=m
CONFIG_CRYPTO_MANAGER=y
CONFIG_CRYPTO_USER=m
CONFIG_CRYPTO_CRYPTD=m
CONFIG_CRYPTO_MCRYPTD=m
CONFIG_CRYPTO_TEST=m
CONFIG_CRYPTO_CHACHA20POLY1305=m
CONFIG_CRYPTO_AEGIS128=m
@ -635,7 +634,6 @@ CONFIG_CRYPTO_SALSA20=m
CONFIG_CRYPTO_SEED=m
CONFIG_CRYPTO_SERPENT=m
CONFIG_CRYPTO_SM4=m
CONFIG_CRYPTO_SPECK=m
CONFIG_CRYPTO_TEA=m
CONFIG_CRYPTO_TWOFISH=m
CONFIG_CRYPTO_LZO=m

View File

@ -570,7 +570,6 @@ CONFIG_CRYPTO_ECDH=m
CONFIG_CRYPTO_MANAGER=y
CONFIG_CRYPTO_USER=m
CONFIG_CRYPTO_CRYPTD=m
CONFIG_CRYPTO_MCRYPTD=m
CONFIG_CRYPTO_TEST=m
CONFIG_CRYPTO_CHACHA20POLY1305=m
CONFIG_CRYPTO_AEGIS128=m
@ -606,7 +605,6 @@ CONFIG_CRYPTO_SALSA20=m
CONFIG_CRYPTO_SEED=m
CONFIG_CRYPTO_SERPENT=m
CONFIG_CRYPTO_SM4=m
CONFIG_CRYPTO_SPECK=m
CONFIG_CRYPTO_TEA=m
CONFIG_CRYPTO_TWOFISH=m
CONFIG_CRYPTO_LZO=m

View File

@ -580,7 +580,6 @@ CONFIG_CRYPTO_ECDH=m
CONFIG_CRYPTO_MANAGER=y
CONFIG_CRYPTO_USER=m
CONFIG_CRYPTO_CRYPTD=m
CONFIG_CRYPTO_MCRYPTD=m
CONFIG_CRYPTO_TEST=m
CONFIG_CRYPTO_CHACHA20POLY1305=m
CONFIG_CRYPTO_AEGIS128=m
@ -616,7 +615,6 @@ CONFIG_CRYPTO_SALSA20=m
CONFIG_CRYPTO_SEED=m
CONFIG_CRYPTO_SERPENT=m
CONFIG_CRYPTO_SM4=m
CONFIG_CRYPTO_SPECK=m
CONFIG_CRYPTO_TEA=m
CONFIG_CRYPTO_TWOFISH=m
CONFIG_CRYPTO_LZO=m

View File

@ -602,7 +602,6 @@ CONFIG_CRYPTO_ECDH=m
CONFIG_CRYPTO_MANAGER=y
CONFIG_CRYPTO_USER=m
CONFIG_CRYPTO_CRYPTD=m
CONFIG_CRYPTO_MCRYPTD=m
CONFIG_CRYPTO_TEST=m
CONFIG_CRYPTO_CHACHA20POLY1305=m
CONFIG_CRYPTO_AEGIS128=m
@ -638,7 +637,6 @@ CONFIG_CRYPTO_SALSA20=m
CONFIG_CRYPTO_SEED=m
CONFIG_CRYPTO_SERPENT=m
CONFIG_CRYPTO_SM4=m
CONFIG_CRYPTO_SPECK=m
CONFIG_CRYPTO_TEA=m
CONFIG_CRYPTO_TWOFISH=m
CONFIG_CRYPTO_LZO=m

View File

@ -684,7 +684,6 @@ CONFIG_CRYPTO_ECDH=m
CONFIG_CRYPTO_MANAGER=y
CONFIG_CRYPTO_USER=m
CONFIG_CRYPTO_CRYPTD=m
CONFIG_CRYPTO_MCRYPTD=m
CONFIG_CRYPTO_TEST=m
CONFIG_CRYPTO_CHACHA20POLY1305=m
CONFIG_CRYPTO_AEGIS128=m
@ -720,7 +719,6 @@ CONFIG_CRYPTO_SALSA20=m
CONFIG_CRYPTO_SEED=m
CONFIG_CRYPTO_SERPENT=m
CONFIG_CRYPTO_SM4=m
CONFIG_CRYPTO_SPECK=m
CONFIG_CRYPTO_TEA=m
CONFIG_CRYPTO_TWOFISH=m
CONFIG_CRYPTO_LZO=m

View File

@ -570,7 +570,6 @@ CONFIG_CRYPTO_ECDH=m
CONFIG_CRYPTO_MANAGER=y
CONFIG_CRYPTO_USER=m
CONFIG_CRYPTO_CRYPTD=m
CONFIG_CRYPTO_MCRYPTD=m
CONFIG_CRYPTO_TEST=m
CONFIG_CRYPTO_CHACHA20POLY1305=m
CONFIG_CRYPTO_AEGIS128=m
@ -606,7 +605,6 @@ CONFIG_CRYPTO_SALSA20=m
CONFIG_CRYPTO_SEED=m
CONFIG_CRYPTO_SERPENT=m
CONFIG_CRYPTO_SM4=m
CONFIG_CRYPTO_SPECK=m
CONFIG_CRYPTO_TEA=m
CONFIG_CRYPTO_TWOFISH=m
CONFIG_CRYPTO_LZO=m

View File

@ -570,7 +570,6 @@ CONFIG_CRYPTO_ECDH=m
CONFIG_CRYPTO_MANAGER=y
CONFIG_CRYPTO_USER=m
CONFIG_CRYPTO_CRYPTD=m
CONFIG_CRYPTO_MCRYPTD=m
CONFIG_CRYPTO_TEST=m
CONFIG_CRYPTO_CHACHA20POLY1305=m
CONFIG_CRYPTO_AEGIS128=m
@ -606,7 +605,6 @@ CONFIG_CRYPTO_SALSA20=m
CONFIG_CRYPTO_SEED=m
CONFIG_CRYPTO_SERPENT=m
CONFIG_CRYPTO_SM4=m
CONFIG_CRYPTO_SPECK=m
CONFIG_CRYPTO_TEA=m
CONFIG_CRYPTO_TWOFISH=m
CONFIG_CRYPTO_LZO=m

View File

@ -593,7 +593,6 @@ CONFIG_CRYPTO_ECDH=m
CONFIG_CRYPTO_MANAGER=y
CONFIG_CRYPTO_USER=m
CONFIG_CRYPTO_CRYPTD=m
CONFIG_CRYPTO_MCRYPTD=m
CONFIG_CRYPTO_TEST=m
CONFIG_CRYPTO_CHACHA20POLY1305=m
CONFIG_CRYPTO_AEGIS128=m
@ -629,7 +628,6 @@ CONFIG_CRYPTO_SALSA20=m
CONFIG_CRYPTO_SEED=m
CONFIG_CRYPTO_SERPENT=m
CONFIG_CRYPTO_SM4=m
CONFIG_CRYPTO_SPECK=m
CONFIG_CRYPTO_TEA=m
CONFIG_CRYPTO_TWOFISH=m
CONFIG_CRYPTO_LZO=m

View File

@ -571,7 +571,6 @@ CONFIG_CRYPTO_ECDH=m
CONFIG_CRYPTO_MANAGER=y
CONFIG_CRYPTO_USER=m
CONFIG_CRYPTO_CRYPTD=m
CONFIG_CRYPTO_MCRYPTD=m
CONFIG_CRYPTO_TEST=m
CONFIG_CRYPTO_CHACHA20POLY1305=m
CONFIG_CRYPTO_AEGIS128=m
@ -607,7 +606,6 @@ CONFIG_CRYPTO_SALSA20=m
CONFIG_CRYPTO_SEED=m
CONFIG_CRYPTO_SERPENT=m
CONFIG_CRYPTO_SM4=m
CONFIG_CRYPTO_SPECK=m
CONFIG_CRYPTO_TEA=m
CONFIG_CRYPTO_TWOFISH=m
CONFIG_CRYPTO_LZO=m

View File

@ -572,7 +572,6 @@ CONFIG_CRYPTO_ECDH=m
CONFIG_CRYPTO_MANAGER=y
CONFIG_CRYPTO_USER=m
CONFIG_CRYPTO_CRYPTD=m
CONFIG_CRYPTO_MCRYPTD=m
CONFIG_CRYPTO_TEST=m
CONFIG_CRYPTO_CHACHA20POLY1305=m
CONFIG_CRYPTO_AEGIS128=m
@ -608,7 +607,6 @@ CONFIG_CRYPTO_SALSA20=m
CONFIG_CRYPTO_SEED=m
CONFIG_CRYPTO_SERPENT=m
CONFIG_CRYPTO_SM4=m
CONFIG_CRYPTO_SPECK=m
CONFIG_CRYPTO_TEA=m
CONFIG_CRYPTO_TWOFISH=m
CONFIG_CRYPTO_LZO=m

View File

@ -668,7 +668,6 @@ CONFIG_CRYPTO_USER=m
# CONFIG_CRYPTO_MANAGER_DISABLE_TESTS is not set
CONFIG_CRYPTO_PCRYPT=m
CONFIG_CRYPTO_CRYPTD=m
CONFIG_CRYPTO_MCRYPTD=m
CONFIG_CRYPTO_TEST=m
CONFIG_CRYPTO_CHACHA20POLY1305=m
CONFIG_CRYPTO_LRW=m

View File

@ -610,7 +610,6 @@ CONFIG_CRYPTO_USER=m
# CONFIG_CRYPTO_MANAGER_DISABLE_TESTS is not set
CONFIG_CRYPTO_PCRYPT=m
CONFIG_CRYPTO_CRYPTD=m
CONFIG_CRYPTO_MCRYPTD=m
CONFIG_CRYPTO_TEST=m
CONFIG_CRYPTO_CHACHA20POLY1305=m
CONFIG_CRYPTO_LRW=m

View File

@ -44,7 +44,7 @@ struct s390_aes_ctx {
int key_len;
unsigned long fc;
union {
struct crypto_skcipher *blk;
struct crypto_sync_skcipher *blk;
struct crypto_cipher *cip;
} fallback;
};
@ -54,7 +54,7 @@ struct s390_xts_ctx {
u8 pcc_key[32];
int key_len;
unsigned long fc;
struct crypto_skcipher *fallback;
struct crypto_sync_skcipher *fallback;
};
struct gcm_sg_walk {
@ -184,14 +184,15 @@ static int setkey_fallback_blk(struct crypto_tfm *tfm, const u8 *key,
struct s390_aes_ctx *sctx = crypto_tfm_ctx(tfm);
unsigned int ret;
crypto_skcipher_clear_flags(sctx->fallback.blk, CRYPTO_TFM_REQ_MASK);
crypto_skcipher_set_flags(sctx->fallback.blk, tfm->crt_flags &
crypto_sync_skcipher_clear_flags(sctx->fallback.blk,
CRYPTO_TFM_REQ_MASK);
crypto_sync_skcipher_set_flags(sctx->fallback.blk, tfm->crt_flags &
CRYPTO_TFM_REQ_MASK);
ret = crypto_skcipher_setkey(sctx->fallback.blk, key, len);
ret = crypto_sync_skcipher_setkey(sctx->fallback.blk, key, len);
tfm->crt_flags &= ~CRYPTO_TFM_RES_MASK;
tfm->crt_flags |= crypto_skcipher_get_flags(sctx->fallback.blk) &
tfm->crt_flags |= crypto_sync_skcipher_get_flags(sctx->fallback.blk) &
CRYPTO_TFM_RES_MASK;
return ret;
@ -204,9 +205,9 @@ static int fallback_blk_dec(struct blkcipher_desc *desc,
unsigned int ret;
struct crypto_blkcipher *tfm = desc->tfm;
struct s390_aes_ctx *sctx = crypto_blkcipher_ctx(tfm);
SKCIPHER_REQUEST_ON_STACK(req, sctx->fallback.blk);
SYNC_SKCIPHER_REQUEST_ON_STACK(req, sctx->fallback.blk);
skcipher_request_set_tfm(req, sctx->fallback.blk);
skcipher_request_set_sync_tfm(req, sctx->fallback.blk);
skcipher_request_set_callback(req, desc->flags, NULL, NULL);
skcipher_request_set_crypt(req, src, dst, nbytes, desc->info);
@ -223,9 +224,9 @@ static int fallback_blk_enc(struct blkcipher_desc *desc,
unsigned int ret;
struct crypto_blkcipher *tfm = desc->tfm;
struct s390_aes_ctx *sctx = crypto_blkcipher_ctx(tfm);
SKCIPHER_REQUEST_ON_STACK(req, sctx->fallback.blk);
SYNC_SKCIPHER_REQUEST_ON_STACK(req, sctx->fallback.blk);
skcipher_request_set_tfm(req, sctx->fallback.blk);
skcipher_request_set_sync_tfm(req, sctx->fallback.blk);
skcipher_request_set_callback(req, desc->flags, NULL, NULL);
skcipher_request_set_crypt(req, src, dst, nbytes, desc->info);
@ -306,8 +307,7 @@ static int fallback_init_blk(struct crypto_tfm *tfm)
const char *name = tfm->__crt_alg->cra_name;
struct s390_aes_ctx *sctx = crypto_tfm_ctx(tfm);
sctx->fallback.blk = crypto_alloc_skcipher(name, 0,
CRYPTO_ALG_ASYNC |
sctx->fallback.blk = crypto_alloc_sync_skcipher(name, 0,
CRYPTO_ALG_NEED_FALLBACK);
if (IS_ERR(sctx->fallback.blk)) {
@ -323,7 +323,7 @@ static void fallback_exit_blk(struct crypto_tfm *tfm)
{
struct s390_aes_ctx *sctx = crypto_tfm_ctx(tfm);
crypto_free_skcipher(sctx->fallback.blk);
crypto_free_sync_skcipher(sctx->fallback.blk);
}
static struct crypto_alg ecb_aes_alg = {
@ -453,14 +453,15 @@ static int xts_fallback_setkey(struct crypto_tfm *tfm, const u8 *key,
struct s390_xts_ctx *xts_ctx = crypto_tfm_ctx(tfm);
unsigned int ret;
crypto_skcipher_clear_flags(xts_ctx->fallback, CRYPTO_TFM_REQ_MASK);
crypto_skcipher_set_flags(xts_ctx->fallback, tfm->crt_flags &
crypto_sync_skcipher_clear_flags(xts_ctx->fallback,
CRYPTO_TFM_REQ_MASK);
crypto_sync_skcipher_set_flags(xts_ctx->fallback, tfm->crt_flags &
CRYPTO_TFM_REQ_MASK);
ret = crypto_skcipher_setkey(xts_ctx->fallback, key, len);
ret = crypto_sync_skcipher_setkey(xts_ctx->fallback, key, len);
tfm->crt_flags &= ~CRYPTO_TFM_RES_MASK;
tfm->crt_flags |= crypto_skcipher_get_flags(xts_ctx->fallback) &
tfm->crt_flags |= crypto_sync_skcipher_get_flags(xts_ctx->fallback) &
CRYPTO_TFM_RES_MASK;
return ret;
@ -472,10 +473,10 @@ static int xts_fallback_decrypt(struct blkcipher_desc *desc,
{
struct crypto_blkcipher *tfm = desc->tfm;
struct s390_xts_ctx *xts_ctx = crypto_blkcipher_ctx(tfm);
SKCIPHER_REQUEST_ON_STACK(req, xts_ctx->fallback);
SYNC_SKCIPHER_REQUEST_ON_STACK(req, xts_ctx->fallback);
unsigned int ret;
skcipher_request_set_tfm(req, xts_ctx->fallback);
skcipher_request_set_sync_tfm(req, xts_ctx->fallback);
skcipher_request_set_callback(req, desc->flags, NULL, NULL);
skcipher_request_set_crypt(req, src, dst, nbytes, desc->info);
@ -491,10 +492,10 @@ static int xts_fallback_encrypt(struct blkcipher_desc *desc,
{
struct crypto_blkcipher *tfm = desc->tfm;
struct s390_xts_ctx *xts_ctx = crypto_blkcipher_ctx(tfm);
SKCIPHER_REQUEST_ON_STACK(req, xts_ctx->fallback);
SYNC_SKCIPHER_REQUEST_ON_STACK(req, xts_ctx->fallback);
unsigned int ret;
skcipher_request_set_tfm(req, xts_ctx->fallback);
skcipher_request_set_sync_tfm(req, xts_ctx->fallback);
skcipher_request_set_callback(req, desc->flags, NULL, NULL);
skcipher_request_set_crypt(req, src, dst, nbytes, desc->info);
@ -611,8 +612,7 @@ static int xts_fallback_init(struct crypto_tfm *tfm)
const char *name = tfm->__crt_alg->cra_name;
struct s390_xts_ctx *xts_ctx = crypto_tfm_ctx(tfm);
xts_ctx->fallback = crypto_alloc_skcipher(name, 0,
CRYPTO_ALG_ASYNC |
xts_ctx->fallback = crypto_alloc_sync_skcipher(name, 0,
CRYPTO_ALG_NEED_FALLBACK);
if (IS_ERR(xts_ctx->fallback)) {
@ -627,7 +627,7 @@ static void xts_fallback_exit(struct crypto_tfm *tfm)
{
struct s390_xts_ctx *xts_ctx = crypto_tfm_ctx(tfm);
crypto_free_skcipher(xts_ctx->fallback);
crypto_free_sync_skcipher(xts_ctx->fallback);
}
static struct crypto_alg xts_aes_alg = {

View File

@ -221,7 +221,6 @@ CONFIG_CRYPTO_SALSA20=m
CONFIG_CRYPTO_SEED=m
CONFIG_CRYPTO_SERPENT=m
CONFIG_CRYPTO_SM4=m
CONFIG_CRYPTO_SPECK=m
CONFIG_CRYPTO_TEA=m
CONFIG_CRYPTO_TWOFISH=m
CONFIG_CRYPTO_DEFLATE=m

View File

@ -60,9 +60,6 @@ endif
ifeq ($(avx2_supported),yes)
obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX2_X86_64) += camellia-aesni-avx2.o
obj-$(CONFIG_CRYPTO_SERPENT_AVX2_X86_64) += serpent-avx2.o
obj-$(CONFIG_CRYPTO_SHA1_MB) += sha1-mb/
obj-$(CONFIG_CRYPTO_SHA256_MB) += sha256-mb/
obj-$(CONFIG_CRYPTO_SHA512_MB) += sha512-mb/
obj-$(CONFIG_CRYPTO_MORUS1280_AVX2) += morus1280-avx2.o
endif
@ -106,7 +103,7 @@ ifeq ($(avx2_supported),yes)
morus1280-avx2-y := morus1280-avx2-asm.o morus1280-avx2-glue.o
endif
aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o
aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o
aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o aes_ctrby8_avx-x86_64.o
ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o

View File

@ -102,9 +102,6 @@ asmlinkage void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *out,
asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out,
const u8 *in, unsigned int len, u8 *iv);
int crypto_fpu_init(void);
void crypto_fpu_exit(void);
#define AVX_GEN2_OPTSIZE 640
#define AVX_GEN4_OPTSIZE 4096
@ -817,7 +814,7 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req,
/* Linearize assoc, if not already linear */
if (req->src->length >= assoclen && req->src->length &&
(!PageHighMem(sg_page(req->src)) ||
req->src->offset + req->src->length < PAGE_SIZE)) {
req->src->offset + req->src->length <= PAGE_SIZE)) {
scatterwalk_start(&assoc_sg_walk, req->src);
assoc = scatterwalk_map(&assoc_sg_walk);
} else {
@ -1253,22 +1250,6 @@ static struct skcipher_alg aesni_skciphers[] = {
static
struct simd_skcipher_alg *aesni_simd_skciphers[ARRAY_SIZE(aesni_skciphers)];
static struct {
const char *algname;
const char *drvname;
const char *basename;
struct simd_skcipher_alg *simd;
} aesni_simd_skciphers2[] = {
#if (defined(MODULE) && IS_ENABLED(CONFIG_CRYPTO_PCBC)) || \
IS_BUILTIN(CONFIG_CRYPTO_PCBC)
{
.algname = "pcbc(aes)",
.drvname = "pcbc-aes-aesni",
.basename = "fpu(pcbc(__aes-aesni))",
},
#endif
};
#ifdef CONFIG_X86_64
static int generic_gcmaes_set_key(struct crypto_aead *aead, const u8 *key,
unsigned int key_len)
@ -1422,10 +1403,6 @@ static void aesni_free_simds(void)
for (i = 0; i < ARRAY_SIZE(aesni_simd_skciphers) &&
aesni_simd_skciphers[i]; i++)
simd_skcipher_free(aesni_simd_skciphers[i]);
for (i = 0; i < ARRAY_SIZE(aesni_simd_skciphers2); i++)
if (aesni_simd_skciphers2[i].simd)
simd_skcipher_free(aesni_simd_skciphers2[i].simd);
}
static int __init aesni_init(void)
@ -1469,13 +1446,9 @@ static int __init aesni_init(void)
#endif
#endif
err = crypto_fpu_init();
if (err)
return err;
err = crypto_register_algs(aesni_algs, ARRAY_SIZE(aesni_algs));
if (err)
goto fpu_exit;
return err;
err = crypto_register_skciphers(aesni_skciphers,
ARRAY_SIZE(aesni_skciphers));
@ -1499,18 +1472,6 @@ static int __init aesni_init(void)
aesni_simd_skciphers[i] = simd;
}
for (i = 0; i < ARRAY_SIZE(aesni_simd_skciphers2); i++) {
algname = aesni_simd_skciphers2[i].algname;
drvname = aesni_simd_skciphers2[i].drvname;
basename = aesni_simd_skciphers2[i].basename;
simd = simd_skcipher_create_compat(algname, drvname, basename);
err = PTR_ERR(simd);
if (IS_ERR(simd))
continue;
aesni_simd_skciphers2[i].simd = simd;
}
return 0;
unregister_simds:
@ -1521,8 +1482,6 @@ static int __init aesni_init(void)
ARRAY_SIZE(aesni_skciphers));
unregister_algs:
crypto_unregister_algs(aesni_algs, ARRAY_SIZE(aesni_algs));
fpu_exit:
crypto_fpu_exit();
return err;
}
@ -1533,8 +1492,6 @@ static void __exit aesni_exit(void)
crypto_unregister_skciphers(aesni_skciphers,
ARRAY_SIZE(aesni_skciphers));
crypto_unregister_algs(aesni_algs, ARRAY_SIZE(aesni_algs));
crypto_fpu_exit();
}
late_initcall(aesni_init);

View File

@ -1,207 +0,0 @@
/*
* FPU: Wrapper for blkcipher touching fpu
*
* Copyright (c) Intel Corp.
* Author: Huang Ying <ying.huang@intel.com>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
*/
#include <crypto/internal/skcipher.h>
#include <linux/err.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <asm/fpu/api.h>
struct crypto_fpu_ctx {
struct crypto_skcipher *child;
};
static int crypto_fpu_setkey(struct crypto_skcipher *parent, const u8 *key,
unsigned int keylen)
{
struct crypto_fpu_ctx *ctx = crypto_skcipher_ctx(parent);
struct crypto_skcipher *child = ctx->child;
int err;
crypto_skcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK);
crypto_skcipher_set_flags(child, crypto_skcipher_get_flags(parent) &
CRYPTO_TFM_REQ_MASK);
err = crypto_skcipher_setkey(child, key, keylen);
crypto_skcipher_set_flags(parent, crypto_skcipher_get_flags(child) &
CRYPTO_TFM_RES_MASK);
return err;
}
static int crypto_fpu_encrypt(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct crypto_fpu_ctx *ctx = crypto_skcipher_ctx(tfm);
struct crypto_skcipher *child = ctx->child;
SKCIPHER_REQUEST_ON_STACK(subreq, child);
int err;
skcipher_request_set_tfm(subreq, child);
skcipher_request_set_callback(subreq, 0, NULL, NULL);
skcipher_request_set_crypt(subreq, req->src, req->dst, req->cryptlen,
req->iv);
kernel_fpu_begin();
err = crypto_skcipher_encrypt(subreq);
kernel_fpu_end();
skcipher_request_zero(subreq);
return err;
}
static int crypto_fpu_decrypt(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct crypto_fpu_ctx *ctx = crypto_skcipher_ctx(tfm);
struct crypto_skcipher *child = ctx->child;
SKCIPHER_REQUEST_ON_STACK(subreq, child);
int err;
skcipher_request_set_tfm(subreq, child);
skcipher_request_set_callback(subreq, 0, NULL, NULL);
skcipher_request_set_crypt(subreq, req->src, req->dst, req->cryptlen,
req->iv);
kernel_fpu_begin();
err = crypto_skcipher_decrypt(subreq);
kernel_fpu_end();
skcipher_request_zero(subreq);
return err;
}
static int crypto_fpu_init_tfm(struct crypto_skcipher *tfm)
{
struct skcipher_instance *inst = skcipher_alg_instance(tfm);
struct crypto_fpu_ctx *ctx = crypto_skcipher_ctx(tfm);
struct crypto_skcipher_spawn *spawn;
struct crypto_skcipher *cipher;
spawn = skcipher_instance_ctx(inst);
cipher = crypto_spawn_skcipher(spawn);
if (IS_ERR(cipher))
return PTR_ERR(cipher);
ctx->child = cipher;
return 0;
}
static void crypto_fpu_exit_tfm(struct crypto_skcipher *tfm)
{
struct crypto_fpu_ctx *ctx = crypto_skcipher_ctx(tfm);
crypto_free_skcipher(ctx->child);
}
static void crypto_fpu_free(struct skcipher_instance *inst)
{
crypto_drop_skcipher(skcipher_instance_ctx(inst));
kfree(inst);
}
static int crypto_fpu_create(struct crypto_template *tmpl, struct rtattr **tb)
{
struct crypto_skcipher_spawn *spawn;
struct skcipher_instance *inst;
struct crypto_attr_type *algt;
struct skcipher_alg *alg;
const char *cipher_name;
int err;
algt = crypto_get_attr_type(tb);
if (IS_ERR(algt))
return PTR_ERR(algt);
if ((algt->type ^ (CRYPTO_ALG_INTERNAL | CRYPTO_ALG_TYPE_SKCIPHER)) &
algt->mask)
return -EINVAL;
if (!(algt->mask & CRYPTO_ALG_INTERNAL))
return -EINVAL;
cipher_name = crypto_attr_alg_name(tb[1]);
if (IS_ERR(cipher_name))
return PTR_ERR(cipher_name);
inst = kzalloc(sizeof(*inst) + sizeof(*spawn), GFP_KERNEL);
if (!inst)
return -ENOMEM;
spawn = skcipher_instance_ctx(inst);
crypto_set_skcipher_spawn(spawn, skcipher_crypto_instance(inst));
err = crypto_grab_skcipher(spawn, cipher_name, CRYPTO_ALG_INTERNAL,
CRYPTO_ALG_INTERNAL | CRYPTO_ALG_ASYNC);
if (err)
goto out_free_inst;
alg = crypto_skcipher_spawn_alg(spawn);
err = crypto_inst_setname(skcipher_crypto_instance(inst), "fpu",
&alg->base);
if (err)
goto out_drop_skcipher;
inst->alg.base.cra_flags = CRYPTO_ALG_INTERNAL;
inst->alg.base.cra_priority = alg->base.cra_priority;
inst->alg.base.cra_blocksize = alg->base.cra_blocksize;
inst->alg.base.cra_alignmask = alg->base.cra_alignmask;
inst->alg.ivsize = crypto_skcipher_alg_ivsize(alg);
inst->alg.min_keysize = crypto_skcipher_alg_min_keysize(alg);
inst->alg.max_keysize = crypto_skcipher_alg_max_keysize(alg);
inst->alg.base.cra_ctxsize = sizeof(struct crypto_fpu_ctx);
inst->alg.init = crypto_fpu_init_tfm;
inst->alg.exit = crypto_fpu_exit_tfm;
inst->alg.setkey = crypto_fpu_setkey;
inst->alg.encrypt = crypto_fpu_encrypt;
inst->alg.decrypt = crypto_fpu_decrypt;
inst->free = crypto_fpu_free;
err = skcipher_register_instance(tmpl, inst);
if (err)
goto out_drop_skcipher;
out:
return err;
out_drop_skcipher:
crypto_drop_skcipher(spawn);
out_free_inst:
kfree(inst);
goto out;
}
static struct crypto_template crypto_fpu_tmpl = {
.name = "fpu",
.create = crypto_fpu_create,
.module = THIS_MODULE,
};
int __init crypto_fpu_init(void)
{
return crypto_register_template(&crypto_fpu_tmpl);
}
void crypto_fpu_exit(void)
{
crypto_unregister_template(&crypto_fpu_tmpl);
}
MODULE_ALIAS_CRYPTO("fpu");

View File

@ -1,14 +0,0 @@
# SPDX-License-Identifier: GPL-2.0
#
# Arch-specific CryptoAPI modules.
#
OBJECT_FILES_NON_STANDARD := y
avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\
$(comma)4)$(comma)%ymm2,yes,no)
ifeq ($(avx2_supported),yes)
obj-$(CONFIG_CRYPTO_SHA1_MB) += sha1-mb.o
sha1-mb-y := sha1_mb.o sha1_mb_mgr_flush_avx2.o \
sha1_mb_mgr_init_avx2.o sha1_mb_mgr_submit_avx2.o sha1_x8_avx2.o
endif

File diff suppressed because it is too large Load Diff

View File

@ -1,134 +0,0 @@
/*
* Header file for multi buffer SHA context
*
* This file is provided under a dual BSD/GPLv2 license. When using or
* redistributing this file, you may do so under either license.
*
* GPL LICENSE SUMMARY
*
* Copyright(c) 2014 Intel Corporation.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of version 2 of the GNU General Public License as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* Contact Information:
* Tim Chen <tim.c.chen@linux.intel.com>
*
* BSD LICENSE
*
* Copyright(c) 2014 Intel Corporation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _SHA_MB_CTX_INTERNAL_H
#define _SHA_MB_CTX_INTERNAL_H
#include "sha1_mb_mgr.h"
#define HASH_UPDATE 0x00
#define HASH_LAST 0x01
#define HASH_DONE 0x02
#define HASH_FINAL 0x04
#define HASH_CTX_STS_IDLE 0x00
#define HASH_CTX_STS_PROCESSING 0x01
#define HASH_CTX_STS_LAST 0x02
#define HASH_CTX_STS_COMPLETE 0x04
enum hash_ctx_error {
HASH_CTX_ERROR_NONE = 0,
HASH_CTX_ERROR_INVALID_FLAGS = -1,
HASH_CTX_ERROR_ALREADY_PROCESSING = -2,
HASH_CTX_ERROR_ALREADY_COMPLETED = -3,
#ifdef HASH_CTX_DEBUG
HASH_CTX_ERROR_DEBUG_DIGEST_MISMATCH = -4,
#endif
};
#define hash_ctx_user_data(ctx) ((ctx)->user_data)
#define hash_ctx_digest(ctx) ((ctx)->job.result_digest)
#define hash_ctx_processing(ctx) ((ctx)->status & HASH_CTX_STS_PROCESSING)
#define hash_ctx_complete(ctx) ((ctx)->status == HASH_CTX_STS_COMPLETE)
#define hash_ctx_status(ctx) ((ctx)->status)
#define hash_ctx_error(ctx) ((ctx)->error)
#define hash_ctx_init(ctx) \
do { \
(ctx)->error = HASH_CTX_ERROR_NONE; \
(ctx)->status = HASH_CTX_STS_COMPLETE; \
} while (0)
/* Hash Constants and Typedefs */
#define SHA1_DIGEST_LENGTH 5
#define SHA1_LOG2_BLOCK_SIZE 6
#define SHA1_PADLENGTHFIELD_SIZE 8
#ifdef SHA_MB_DEBUG
#define assert(expr) \
do { \
if (unlikely(!(expr))) { \
printk(KERN_ERR "Assertion failed! %s,%s,%s,line=%d\n", \
#expr, __FILE__, __func__, __LINE__); \
} \
} while (0)
#else
#define assert(expr) do {} while (0)
#endif
struct sha1_ctx_mgr {
struct sha1_mb_mgr mgr;
};
/* typedef struct sha1_ctx_mgr sha1_ctx_mgr; */
struct sha1_hash_ctx {
/* Must be at struct offset 0 */
struct job_sha1 job;
/* status flag */
int status;
/* error flag */
int error;
uint64_t total_length;
const void *incoming_buffer;
uint32_t incoming_buffer_length;
uint8_t partial_block_buffer[SHA1_BLOCK_SIZE * 2];
uint32_t partial_block_buffer_length;
void *user_data;
};
#endif

View File

@ -1,110 +0,0 @@
/*
* Header file for multi buffer SHA1 algorithm manager
*
* This file is provided under a dual BSD/GPLv2 license. When using or
* redistributing this file, you may do so under either license.
*
* GPL LICENSE SUMMARY
*
* Copyright(c) 2014 Intel Corporation.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of version 2 of the GNU General Public License as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* Contact Information:
* James Guilford <james.guilford@intel.com>
* Tim Chen <tim.c.chen@linux.intel.com>
*
* BSD LICENSE
*
* Copyright(c) 2014 Intel Corporation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __SHA_MB_MGR_H
#define __SHA_MB_MGR_H
#include <linux/types.h>
#define NUM_SHA1_DIGEST_WORDS 5
enum job_sts { STS_UNKNOWN = 0,
STS_BEING_PROCESSED = 1,
STS_COMPLETED = 2,
STS_INTERNAL_ERROR = 3,
STS_ERROR = 4
};
struct job_sha1 {
u8 *buffer;
u32 len;
u32 result_digest[NUM_SHA1_DIGEST_WORDS] __aligned(32);
enum job_sts status;
void *user_data;
};
/* SHA1 out-of-order scheduler */
/* typedef uint32_t sha1_digest_array[5][8]; */
struct sha1_args_x8 {
uint32_t digest[5][8];
uint8_t *data_ptr[8];
};
struct sha1_lane_data {
struct job_sha1 *job_in_lane;
};
struct sha1_mb_mgr {
struct sha1_args_x8 args;
uint32_t lens[8];
/* each byte is index (0...7) of unused lanes */
uint64_t unused_lanes;
/* byte 4 is set to FF as a flag */
struct sha1_lane_data ldata[8];
};
#define SHA1_MB_MGR_NUM_LANES_AVX2 8
void sha1_mb_mgr_init_avx2(struct sha1_mb_mgr *state);
struct job_sha1 *sha1_mb_mgr_submit_avx2(struct sha1_mb_mgr *state,
struct job_sha1 *job);
struct job_sha1 *sha1_mb_mgr_flush_avx2(struct sha1_mb_mgr *state);
struct job_sha1 *sha1_mb_mgr_get_comp_job_avx2(struct sha1_mb_mgr *state);
#endif

View File

@ -1,287 +0,0 @@
/*
* Header file for multi buffer SHA1 algorithm data structure
*
* This file is provided under a dual BSD/GPLv2 license. When using or
* redistributing this file, you may do so under either license.
*
* GPL LICENSE SUMMARY
*
* Copyright(c) 2014 Intel Corporation.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of version 2 of the GNU General Public License as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* Contact Information:
* James Guilford <james.guilford@intel.com>
* Tim Chen <tim.c.chen@linux.intel.com>
*
* BSD LICENSE
*
* Copyright(c) 2014 Intel Corporation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
# Macros for defining data structures
# Usage example
#START_FIELDS # JOB_AES
### name size align
#FIELD _plaintext, 8, 8 # pointer to plaintext
#FIELD _ciphertext, 8, 8 # pointer to ciphertext
#FIELD _IV, 16, 8 # IV
#FIELD _keys, 8, 8 # pointer to keys
#FIELD _len, 4, 4 # length in bytes
#FIELD _status, 4, 4 # status enumeration
#FIELD _user_data, 8, 8 # pointer to user data
#UNION _union, size1, align1, \
# size2, align2, \
# size3, align3, \
# ...
#END_FIELDS
#%assign _JOB_AES_size _FIELD_OFFSET
#%assign _JOB_AES_align _STRUCT_ALIGN
#########################################################################
# Alternate "struc-like" syntax:
# STRUCT job_aes2
# RES_Q .plaintext, 1
# RES_Q .ciphertext, 1
# RES_DQ .IV, 1
# RES_B .nested, _JOB_AES_SIZE, _JOB_AES_ALIGN
# RES_U .union, size1, align1, \
# size2, align2, \
# ...
# ENDSTRUCT
# # Following only needed if nesting
# %assign job_aes2_size _FIELD_OFFSET
# %assign job_aes2_align _STRUCT_ALIGN
#
# RES_* macros take a name, a count and an optional alignment.
# The count in in terms of the base size of the macro, and the
# default alignment is the base size.
# The macros are:
# Macro Base size
# RES_B 1
# RES_W 2
# RES_D 4
# RES_Q 8
# RES_DQ 16
# RES_Y 32
# RES_Z 64
#
# RES_U defines a union. It's arguments are a name and two or more
# pairs of "size, alignment"
#
# The two assigns are only needed if this structure is being nested
# within another. Even if the assigns are not done, one can still use
# STRUCT_NAME_size as the size of the structure.
#
# Note that for nesting, you still need to assign to STRUCT_NAME_size.
#
# The differences between this and using "struc" directly are that each
# type is implicitly aligned to its natural length (although this can be
# over-ridden with an explicit third parameter), and that the structure
# is padded at the end to its overall alignment.
#
#########################################################################
#ifndef _SHA1_MB_MGR_DATASTRUCT_ASM_
#define _SHA1_MB_MGR_DATASTRUCT_ASM_
## START_FIELDS
.macro START_FIELDS
_FIELD_OFFSET = 0
_STRUCT_ALIGN = 0
.endm
## FIELD name size align
.macro FIELD name size align
_FIELD_OFFSET = (_FIELD_OFFSET + (\align) - 1) & (~ ((\align)-1))
\name = _FIELD_OFFSET
_FIELD_OFFSET = _FIELD_OFFSET + (\size)
.if (\align > _STRUCT_ALIGN)
_STRUCT_ALIGN = \align
.endif
.endm
## END_FIELDS
.macro END_FIELDS
_FIELD_OFFSET = (_FIELD_OFFSET + _STRUCT_ALIGN-1) & (~ (_STRUCT_ALIGN-1))
.endm
########################################################################
.macro STRUCT p1
START_FIELDS
.struc \p1
.endm
.macro ENDSTRUCT
tmp = _FIELD_OFFSET
END_FIELDS
tmp = (_FIELD_OFFSET - %%tmp)
.if (tmp > 0)
.lcomm tmp
.endif
.endstruc
.endm
## RES_int name size align
.macro RES_int p1 p2 p3
name = \p1
size = \p2
align = .\p3
_FIELD_OFFSET = (_FIELD_OFFSET + (align) - 1) & (~ ((align)-1))
.align align
.lcomm name size
_FIELD_OFFSET = _FIELD_OFFSET + (size)
.if (align > _STRUCT_ALIGN)
_STRUCT_ALIGN = align
.endif
.endm
# macro RES_B name, size [, align]
.macro RES_B _name, _size, _align=1
RES_int _name _size _align
.endm
# macro RES_W name, size [, align]
.macro RES_W _name, _size, _align=2
RES_int _name 2*(_size) _align
.endm
# macro RES_D name, size [, align]
.macro RES_D _name, _size, _align=4
RES_int _name 4*(_size) _align
.endm
# macro RES_Q name, size [, align]
.macro RES_Q _name, _size, _align=8
RES_int _name 8*(_size) _align
.endm
# macro RES_DQ name, size [, align]
.macro RES_DQ _name, _size, _align=16
RES_int _name 16*(_size) _align
.endm
# macro RES_Y name, size [, align]
.macro RES_Y _name, _size, _align=32
RES_int _name 32*(_size) _align
.endm
# macro RES_Z name, size [, align]
.macro RES_Z _name, _size, _align=64
RES_int _name 64*(_size) _align
.endm
#endif
########################################################################
#### Define constants
########################################################################
########################################################################
#### Define SHA1 Out Of Order Data Structures
########################################################################
START_FIELDS # LANE_DATA
### name size align
FIELD _job_in_lane, 8, 8 # pointer to job object
END_FIELDS
_LANE_DATA_size = _FIELD_OFFSET
_LANE_DATA_align = _STRUCT_ALIGN
########################################################################
START_FIELDS # SHA1_ARGS_X8
### name size align
FIELD _digest, 4*5*8, 16 # transposed digest
FIELD _data_ptr, 8*8, 8 # array of pointers to data
END_FIELDS
_SHA1_ARGS_X4_size = _FIELD_OFFSET
_SHA1_ARGS_X4_align = _STRUCT_ALIGN
_SHA1_ARGS_X8_size = _FIELD_OFFSET
_SHA1_ARGS_X8_align = _STRUCT_ALIGN
########################################################################
START_FIELDS # MB_MGR
### name size align
FIELD _args, _SHA1_ARGS_X4_size, _SHA1_ARGS_X4_align
FIELD _lens, 4*8, 8
FIELD _unused_lanes, 8, 8
FIELD _ldata, _LANE_DATA_size*8, _LANE_DATA_align
END_FIELDS
_MB_MGR_size = _FIELD_OFFSET
_MB_MGR_align = _STRUCT_ALIGN
_args_digest = _args + _digest
_args_data_ptr = _args + _data_ptr
########################################################################
#### Define constants
########################################################################
#define STS_UNKNOWN 0
#define STS_BEING_PROCESSED 1
#define STS_COMPLETED 2
########################################################################
#### Define JOB_SHA1 structure
########################################################################
START_FIELDS # JOB_SHA1
### name size align
FIELD _buffer, 8, 8 # pointer to buffer
FIELD _len, 4, 4 # length in bytes
FIELD _result_digest, 5*4, 32 # Digest (output)
FIELD _status, 4, 4
FIELD _user_data, 8, 8
END_FIELDS
_JOB_SHA1_size = _FIELD_OFFSET
_JOB_SHA1_align = _STRUCT_ALIGN

View File

@ -1,304 +0,0 @@
/*
* Flush routine for SHA1 multibuffer
*
* This file is provided under a dual BSD/GPLv2 license. When using or
* redistributing this file, you may do so under either license.
*
* GPL LICENSE SUMMARY
*
* Copyright(c) 2014 Intel Corporation.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of version 2 of the GNU General Public License as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* Contact Information:
* James Guilford <james.guilford@intel.com>
* Tim Chen <tim.c.chen@linux.intel.com>
*
* BSD LICENSE
*
* Copyright(c) 2014 Intel Corporation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <linux/linkage.h>
#include <asm/frame.h>
#include "sha1_mb_mgr_datastruct.S"
.extern sha1_x8_avx2
# LINUX register definitions
#define arg1 %rdi
#define arg2 %rsi
# Common definitions
#define state arg1
#define job arg2
#define len2 arg2
# idx must be a register not clobbered by sha1_x8_avx2
#define idx %r8
#define DWORD_idx %r8d
#define unused_lanes %rbx
#define lane_data %rbx
#define tmp2 %rbx
#define tmp2_w %ebx
#define job_rax %rax
#define tmp1 %rax
#define size_offset %rax
#define tmp %rax
#define start_offset %rax
#define tmp3 %arg1
#define extra_blocks %arg2
#define p %arg2
.macro LABEL prefix n
\prefix\n\():
.endm
.macro JNE_SKIP i
jne skip_\i
.endm
.altmacro
.macro SET_OFFSET _offset
offset = \_offset
.endm
.noaltmacro
# JOB* sha1_mb_mgr_flush_avx2(MB_MGR *state)
# arg 1 : rcx : state
ENTRY(sha1_mb_mgr_flush_avx2)
FRAME_BEGIN
push %rbx
# If bit (32+3) is set, then all lanes are empty
mov _unused_lanes(state), unused_lanes
bt $32+3, unused_lanes
jc return_null
# find a lane with a non-null job
xor idx, idx
offset = (_ldata + 1 * _LANE_DATA_size + _job_in_lane)
cmpq $0, offset(state)
cmovne one(%rip), idx
offset = (_ldata + 2 * _LANE_DATA_size + _job_in_lane)
cmpq $0, offset(state)
cmovne two(%rip), idx
offset = (_ldata + 3 * _LANE_DATA_size + _job_in_lane)
cmpq $0, offset(state)
cmovne three(%rip), idx
offset = (_ldata + 4 * _LANE_DATA_size + _job_in_lane)
cmpq $0, offset(state)
cmovne four(%rip), idx
offset = (_ldata + 5 * _LANE_DATA_size + _job_in_lane)
cmpq $0, offset(state)
cmovne five(%rip), idx
offset = (_ldata + 6 * _LANE_DATA_size + _job_in_lane)
cmpq $0, offset(state)
cmovne six(%rip), idx
offset = (_ldata + 7 * _LANE_DATA_size + _job_in_lane)
cmpq $0, offset(state)
cmovne seven(%rip), idx
# copy idx to empty lanes
copy_lane_data:
offset = (_args + _data_ptr)
mov offset(state,idx,8), tmp
I = 0
.rep 8
offset = (_ldata + I * _LANE_DATA_size + _job_in_lane)
cmpq $0, offset(state)
.altmacro
JNE_SKIP %I
offset = (_args + _data_ptr + 8*I)
mov tmp, offset(state)
offset = (_lens + 4*I)
movl $0xFFFFFFFF, offset(state)
LABEL skip_ %I
I = (I+1)
.noaltmacro
.endr
# Find min length
vmovdqu _lens+0*16(state), %xmm0
vmovdqu _lens+1*16(state), %xmm1
vpminud %xmm1, %xmm0, %xmm2 # xmm2 has {D,C,B,A}
vpalignr $8, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,D,C}
vpminud %xmm3, %xmm2, %xmm2 # xmm2 has {x,x,E,F}
vpalignr $4, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,x,E}
vpminud %xmm3, %xmm2, %xmm2 # xmm2 has min value in low dword
vmovd %xmm2, DWORD_idx
mov idx, len2
and $0xF, idx
shr $4, len2
jz len_is_0
vpand clear_low_nibble(%rip), %xmm2, %xmm2
vpshufd $0, %xmm2, %xmm2
vpsubd %xmm2, %xmm0, %xmm0
vpsubd %xmm2, %xmm1, %xmm1
vmovdqu %xmm0, _lens+0*16(state)
vmovdqu %xmm1, _lens+1*16(state)
# "state" and "args" are the same address, arg1
# len is arg2
call sha1_x8_avx2
# state and idx are intact
len_is_0:
# process completed job "idx"
imul $_LANE_DATA_size, idx, lane_data
lea _ldata(state, lane_data), lane_data
mov _job_in_lane(lane_data), job_rax
movq $0, _job_in_lane(lane_data)
movl $STS_COMPLETED, _status(job_rax)
mov _unused_lanes(state), unused_lanes
shl $4, unused_lanes
or idx, unused_lanes
mov unused_lanes, _unused_lanes(state)
movl $0xFFFFFFFF, _lens(state, idx, 4)
vmovd _args_digest(state , idx, 4) , %xmm0
vpinsrd $1, _args_digest+1*32(state, idx, 4), %xmm0, %xmm0
vpinsrd $2, _args_digest+2*32(state, idx, 4), %xmm0, %xmm0
vpinsrd $3, _args_digest+3*32(state, idx, 4), %xmm0, %xmm0
movl _args_digest+4*32(state, idx, 4), tmp2_w
vmovdqu %xmm0, _result_digest(job_rax)
offset = (_result_digest + 1*16)
mov tmp2_w, offset(job_rax)
return:
pop %rbx
FRAME_END
ret
return_null:
xor job_rax, job_rax
jmp return
ENDPROC(sha1_mb_mgr_flush_avx2)
#################################################################
.align 16
ENTRY(sha1_mb_mgr_get_comp_job_avx2)
push %rbx
## if bit 32+3 is set, then all lanes are empty
mov _unused_lanes(state), unused_lanes
bt $(32+3), unused_lanes
jc .return_null
# Find min length
vmovdqu _lens(state), %xmm0
vmovdqu _lens+1*16(state), %xmm1
vpminud %xmm1, %xmm0, %xmm2 # xmm2 has {D,C,B,A}
vpalignr $8, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,D,C}
vpminud %xmm3, %xmm2, %xmm2 # xmm2 has {x,x,E,F}
vpalignr $4, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,x,E}
vpminud %xmm3, %xmm2, %xmm2 # xmm2 has min value in low dword
vmovd %xmm2, DWORD_idx
test $~0xF, idx
jnz .return_null
# process completed job "idx"
imul $_LANE_DATA_size, idx, lane_data
lea _ldata(state, lane_data), lane_data
mov _job_in_lane(lane_data), job_rax
movq $0, _job_in_lane(lane_data)
movl $STS_COMPLETED, _status(job_rax)
mov _unused_lanes(state), unused_lanes
shl $4, unused_lanes
or idx, unused_lanes
mov unused_lanes, _unused_lanes(state)
movl $0xFFFFFFFF, _lens(state, idx, 4)
vmovd _args_digest(state, idx, 4), %xmm0
vpinsrd $1, _args_digest+1*32(state, idx, 4), %xmm0, %xmm0
vpinsrd $2, _args_digest+2*32(state, idx, 4), %xmm0, %xmm0
vpinsrd $3, _args_digest+3*32(state, idx, 4), %xmm0, %xmm0
movl _args_digest+4*32(state, idx, 4), tmp2_w
vmovdqu %xmm0, _result_digest(job_rax)
movl tmp2_w, _result_digest+1*16(job_rax)
pop %rbx
ret
.return_null:
xor job_rax, job_rax
pop %rbx
ret
ENDPROC(sha1_mb_mgr_get_comp_job_avx2)
.section .rodata.cst16.clear_low_nibble, "aM", @progbits, 16
.align 16
clear_low_nibble:
.octa 0x000000000000000000000000FFFFFFF0
.section .rodata.cst8, "aM", @progbits, 8
.align 8
one:
.quad 1
two:
.quad 2
three:
.quad 3
four:
.quad 4
five:
.quad 5
six:
.quad 6
seven:
.quad 7

View File

@ -1,64 +0,0 @@
/*
* Initialization code for multi buffer SHA1 algorithm for AVX2
*
* This file is provided under a dual BSD/GPLv2 license. When using or
* redistributing this file, you may do so under either license.
*
* GPL LICENSE SUMMARY
*
* Copyright(c) 2014 Intel Corporation.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of version 2 of the GNU General Public License as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* Contact Information:
* Tim Chen <tim.c.chen@linux.intel.com>
*
* BSD LICENSE
*
* Copyright(c) 2014 Intel Corporation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "sha1_mb_mgr.h"
void sha1_mb_mgr_init_avx2(struct sha1_mb_mgr *state)
{
unsigned int j;
state->unused_lanes = 0xF76543210ULL;
for (j = 0; j < 8; j++) {
state->lens[j] = 0xFFFFFFFF;
state->ldata[j].job_in_lane = NULL;
}
}

View File

@ -1,209 +0,0 @@
/*
* Buffer submit code for multi buffer SHA1 algorithm
*
* This file is provided under a dual BSD/GPLv2 license. When using or
* redistributing this file, you may do so under either license.
*
* GPL LICENSE SUMMARY
*
* Copyright(c) 2014 Intel Corporation.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of version 2 of the GNU General Public License as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* Contact Information:
* James Guilford <james.guilford@intel.com>
* Tim Chen <tim.c.chen@linux.intel.com>
*
* BSD LICENSE
*
* Copyright(c) 2014 Intel Corporation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <linux/linkage.h>
#include <asm/frame.h>
#include "sha1_mb_mgr_datastruct.S"
.extern sha1_x8_avx
# LINUX register definitions
arg1 = %rdi
arg2 = %rsi
size_offset = %rcx
tmp2 = %rcx
extra_blocks = %rdx
# Common definitions
#define state arg1
#define job %rsi
#define len2 arg2
#define p2 arg2
# idx must be a register not clobberred by sha1_x8_avx2
idx = %r8
DWORD_idx = %r8d
last_len = %r8
p = %r11
start_offset = %r11
unused_lanes = %rbx
BYTE_unused_lanes = %bl
job_rax = %rax
len = %rax
DWORD_len = %eax
lane = %r12
tmp3 = %r12
tmp = %r9
DWORD_tmp = %r9d
lane_data = %r10
# JOB* submit_mb_mgr_submit_avx2(MB_MGR *state, job_sha1 *job)
# arg 1 : rcx : state
# arg 2 : rdx : job
ENTRY(sha1_mb_mgr_submit_avx2)
FRAME_BEGIN
push %rbx
push %r12
mov _unused_lanes(state), unused_lanes
mov unused_lanes, lane
and $0xF, lane
shr $4, unused_lanes
imul $_LANE_DATA_size, lane, lane_data
movl $STS_BEING_PROCESSED, _status(job)
lea _ldata(state, lane_data), lane_data
mov unused_lanes, _unused_lanes(state)
movl _len(job), DWORD_len
mov job, _job_in_lane(lane_data)
shl $4, len
or lane, len
movl DWORD_len, _lens(state , lane, 4)
# Load digest words from result_digest
vmovdqu _result_digest(job), %xmm0
mov _result_digest+1*16(job), DWORD_tmp
vmovd %xmm0, _args_digest(state, lane, 4)
vpextrd $1, %xmm0, _args_digest+1*32(state , lane, 4)
vpextrd $2, %xmm0, _args_digest+2*32(state , lane, 4)
vpextrd $3, %xmm0, _args_digest+3*32(state , lane, 4)
movl DWORD_tmp, _args_digest+4*32(state , lane, 4)
mov _buffer(job), p
mov p, _args_data_ptr(state, lane, 8)
cmp $0xF, unused_lanes
jne return_null
start_loop:
# Find min length
vmovdqa _lens(state), %xmm0
vmovdqa _lens+1*16(state), %xmm1
vpminud %xmm1, %xmm0, %xmm2 # xmm2 has {D,C,B,A}
vpalignr $8, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,D,C}
vpminud %xmm3, %xmm2, %xmm2 # xmm2 has {x,x,E,F}
vpalignr $4, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,x,E}
vpminud %xmm3, %xmm2, %xmm2 # xmm2 has min value in low dword
vmovd %xmm2, DWORD_idx
mov idx, len2
and $0xF, idx
shr $4, len2
jz len_is_0
vpand clear_low_nibble(%rip), %xmm2, %xmm2
vpshufd $0, %xmm2, %xmm2
vpsubd %xmm2, %xmm0, %xmm0
vpsubd %xmm2, %xmm1, %xmm1
vmovdqa %xmm0, _lens + 0*16(state)
vmovdqa %xmm1, _lens + 1*16(state)
# "state" and "args" are the same address, arg1
# len is arg2
call sha1_x8_avx2
# state and idx are intact
len_is_0:
# process completed job "idx"
imul $_LANE_DATA_size, idx, lane_data
lea _ldata(state, lane_data), lane_data
mov _job_in_lane(lane_data), job_rax
mov _unused_lanes(state), unused_lanes
movq $0, _job_in_lane(lane_data)
movl $STS_COMPLETED, _status(job_rax)
shl $4, unused_lanes
or idx, unused_lanes
mov unused_lanes, _unused_lanes(state)
movl $0xFFFFFFFF, _lens(state, idx, 4)
vmovd _args_digest(state, idx, 4), %xmm0
vpinsrd $1, _args_digest+1*32(state , idx, 4), %xmm0, %xmm0
vpinsrd $2, _args_digest+2*32(state , idx, 4), %xmm0, %xmm0
vpinsrd $3, _args_digest+3*32(state , idx, 4), %xmm0, %xmm0
movl _args_digest+4*32(state, idx, 4), DWORD_tmp
vmovdqu %xmm0, _result_digest(job_rax)
movl DWORD_tmp, _result_digest+1*16(job_rax)
return:
pop %r12
pop %rbx
FRAME_END
ret
return_null:
xor job_rax, job_rax
jmp return
ENDPROC(sha1_mb_mgr_submit_avx2)
.section .rodata.cst16.clear_low_nibble, "aM", @progbits, 16
.align 16
clear_low_nibble:
.octa 0x000000000000000000000000FFFFFFF0

View File

@ -1,492 +0,0 @@
/*
* Multi-buffer SHA1 algorithm hash compute routine
*
* This file is provided under a dual BSD/GPLv2 license. When using or
* redistributing this file, you may do so under either license.
*
* GPL LICENSE SUMMARY
*
* Copyright(c) 2014 Intel Corporation.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of version 2 of the GNU General Public License as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* Contact Information:
* James Guilford <james.guilford@intel.com>
* Tim Chen <tim.c.chen@linux.intel.com>
*
* BSD LICENSE
*
* Copyright(c) 2014 Intel Corporation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <linux/linkage.h>
#include "sha1_mb_mgr_datastruct.S"
## code to compute oct SHA1 using SSE-256
## outer calling routine takes care of save and restore of XMM registers
## Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15# ymm0-15
##
## Linux clobbers: rax rbx rcx rdx rsi r9 r10 r11 r12 r13 r14 r15
## Linux preserves: rdi rbp r8
##
## clobbers ymm0-15
# TRANSPOSE8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
# "transpose" data in {r0...r7} using temps {t0...t1}
# Input looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
# r0 = {a7 a6 a5 a4 a3 a2 a1 a0}
# r1 = {b7 b6 b5 b4 b3 b2 b1 b0}
# r2 = {c7 c6 c5 c4 c3 c2 c1 c0}
# r3 = {d7 d6 d5 d4 d3 d2 d1 d0}
# r4 = {e7 e6 e5 e4 e3 e2 e1 e0}
# r5 = {f7 f6 f5 f4 f3 f2 f1 f0}
# r6 = {g7 g6 g5 g4 g3 g2 g1 g0}
# r7 = {h7 h6 h5 h4 h3 h2 h1 h0}
#
# Output looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
# r0 = {h0 g0 f0 e0 d0 c0 b0 a0}
# r1 = {h1 g1 f1 e1 d1 c1 b1 a1}
# r2 = {h2 g2 f2 e2 d2 c2 b2 a2}
# r3 = {h3 g3 f3 e3 d3 c3 b3 a3}
# r4 = {h4 g4 f4 e4 d4 c4 b4 a4}
# r5 = {h5 g5 f5 e5 d5 c5 b5 a5}
# r6 = {h6 g6 f6 e6 d6 c6 b6 a6}
# r7 = {h7 g7 f7 e7 d7 c7 b7 a7}
#
.macro TRANSPOSE8 r0 r1 r2 r3 r4 r5 r6 r7 t0 t1
# process top half (r0..r3) {a...d}
vshufps $0x44, \r1, \r0, \t0 # t0 = {b5 b4 a5 a4 b1 b0 a1 a0}
vshufps $0xEE, \r1, \r0, \r0 # r0 = {b7 b6 a7 a6 b3 b2 a3 a2}
vshufps $0x44, \r3, \r2, \t1 # t1 = {d5 d4 c5 c4 d1 d0 c1 c0}
vshufps $0xEE, \r3, \r2, \r2 # r2 = {d7 d6 c7 c6 d3 d2 c3 c2}
vshufps $0xDD, \t1, \t0, \r3 # r3 = {d5 c5 b5 a5 d1 c1 b1 a1}
vshufps $0x88, \r2, \r0, \r1 # r1 = {d6 c6 b6 a6 d2 c2 b2 a2}
vshufps $0xDD, \r2, \r0, \r0 # r0 = {d7 c7 b7 a7 d3 c3 b3 a3}
vshufps $0x88, \t1, \t0, \t0 # t0 = {d4 c4 b4 a4 d0 c0 b0 a0}
# use r2 in place of t0
# process bottom half (r4..r7) {e...h}
vshufps $0x44, \r5, \r4, \r2 # r2 = {f5 f4 e5 e4 f1 f0 e1 e0}
vshufps $0xEE, \r5, \r4, \r4 # r4 = {f7 f6 e7 e6 f3 f2 e3 e2}
vshufps $0x44, \r7, \r6, \t1 # t1 = {h5 h4 g5 g4 h1 h0 g1 g0}
vshufps $0xEE, \r7, \r6, \r6 # r6 = {h7 h6 g7 g6 h3 h2 g3 g2}
vshufps $0xDD, \t1, \r2, \r7 # r7 = {h5 g5 f5 e5 h1 g1 f1 e1}
vshufps $0x88, \r6, \r4, \r5 # r5 = {h6 g6 f6 e6 h2 g2 f2 e2}
vshufps $0xDD, \r6, \r4, \r4 # r4 = {h7 g7 f7 e7 h3 g3 f3 e3}
vshufps $0x88, \t1, \r2, \t1 # t1 = {h4 g4 f4 e4 h0 g0 f0 e0}
vperm2f128 $0x13, \r1, \r5, \r6 # h6...a6
vperm2f128 $0x02, \r1, \r5, \r2 # h2...a2
vperm2f128 $0x13, \r3, \r7, \r5 # h5...a5
vperm2f128 $0x02, \r3, \r7, \r1 # h1...a1
vperm2f128 $0x13, \r0, \r4, \r7 # h7...a7
vperm2f128 $0x02, \r0, \r4, \r3 # h3...a3
vperm2f128 $0x13, \t0, \t1, \r4 # h4...a4
vperm2f128 $0x02, \t0, \t1, \r0 # h0...a0
.endm
##
## Magic functions defined in FIPS 180-1
##
# macro MAGIC_F0 F,B,C,D,T ## F = (D ^ (B & (C ^ D)))
.macro MAGIC_F0 regF regB regC regD regT
vpxor \regD, \regC, \regF
vpand \regB, \regF, \regF
vpxor \regD, \regF, \regF
.endm
# macro MAGIC_F1 F,B,C,D,T ## F = (B ^ C ^ D)
.macro MAGIC_F1 regF regB regC regD regT
vpxor \regC, \regD, \regF
vpxor \regB, \regF, \regF
.endm
# macro MAGIC_F2 F,B,C,D,T ## F = ((B & C) | (B & D) | (C & D))
.macro MAGIC_F2 regF regB regC regD regT
vpor \regC, \regB, \regF
vpand \regC, \regB, \regT
vpand \regD, \regF, \regF
vpor \regT, \regF, \regF
.endm
# macro MAGIC_F3 F,B,C,D,T ## F = (B ^ C ^ D)
.macro MAGIC_F3 regF regB regC regD regT
MAGIC_F1 \regF,\regB,\regC,\regD,\regT
.endm
# PROLD reg, imm, tmp
.macro PROLD reg imm tmp
vpsrld $(32-\imm), \reg, \tmp
vpslld $\imm, \reg, \reg
vpor \tmp, \reg, \reg
.endm
.macro PROLD_nd reg imm tmp src
vpsrld $(32-\imm), \src, \tmp
vpslld $\imm, \src, \reg
vpor \tmp, \reg, \reg
.endm
.macro SHA1_STEP_00_15 regA regB regC regD regE regT regF memW immCNT MAGIC
vpaddd \immCNT, \regE, \regE
vpaddd \memW*32(%rsp), \regE, \regE
PROLD_nd \regT, 5, \regF, \regA
vpaddd \regT, \regE, \regE
\MAGIC \regF, \regB, \regC, \regD, \regT
PROLD \regB, 30, \regT
vpaddd \regF, \regE, \regE
.endm
.macro SHA1_STEP_16_79 regA regB regC regD regE regT regF memW immCNT MAGIC
vpaddd \immCNT, \regE, \regE
offset = ((\memW - 14) & 15) * 32
vmovdqu offset(%rsp), W14
vpxor W14, W16, W16
offset = ((\memW - 8) & 15) * 32
vpxor offset(%rsp), W16, W16
offset = ((\memW - 3) & 15) * 32
vpxor offset(%rsp), W16, W16
vpsrld $(32-1), W16, \regF
vpslld $1, W16, W16
vpor W16, \regF, \regF
ROTATE_W
offset = ((\memW - 0) & 15) * 32
vmovdqu \regF, offset(%rsp)
vpaddd \regF, \regE, \regE
PROLD_nd \regT, 5, \regF, \regA
vpaddd \regT, \regE, \regE
\MAGIC \regF,\regB,\regC,\regD,\regT ## FUN = MAGIC_Fi(B,C,D)
PROLD \regB,30, \regT
vpaddd \regF, \regE, \regE
.endm
########################################################################
########################################################################
########################################################################
## FRAMESZ plus pushes must be an odd multiple of 8
YMM_SAVE = (15-15)*32
FRAMESZ = 32*16 + YMM_SAVE
_YMM = FRAMESZ - YMM_SAVE
#define VMOVPS vmovups
IDX = %rax
inp0 = %r9
inp1 = %r10
inp2 = %r11
inp3 = %r12
inp4 = %r13
inp5 = %r14
inp6 = %r15
inp7 = %rcx
arg1 = %rdi
arg2 = %rsi
RSP_SAVE = %rdx
# ymm0 A
# ymm1 B
# ymm2 C
# ymm3 D
# ymm4 E
# ymm5 F AA
# ymm6 T0 BB
# ymm7 T1 CC
# ymm8 T2 DD
# ymm9 T3 EE
# ymm10 T4 TMP
# ymm11 T5 FUN
# ymm12 T6 K
# ymm13 T7 W14
# ymm14 T8 W15
# ymm15 T9 W16
A = %ymm0
B = %ymm1
C = %ymm2
D = %ymm3
E = %ymm4
F = %ymm5
T0 = %ymm6
T1 = %ymm7
T2 = %ymm8
T3 = %ymm9
T4 = %ymm10
T5 = %ymm11
T6 = %ymm12
T7 = %ymm13
T8 = %ymm14
T9 = %ymm15
AA = %ymm5
BB = %ymm6
CC = %ymm7
DD = %ymm8
EE = %ymm9
TMP = %ymm10
FUN = %ymm11
K = %ymm12
W14 = %ymm13
W15 = %ymm14
W16 = %ymm15
.macro ROTATE_ARGS
TMP_ = E
E = D
D = C
C = B
B = A
A = TMP_
.endm
.macro ROTATE_W
TMP_ = W16
W16 = W15
W15 = W14
W14 = TMP_
.endm
# 8 streams x 5 32bit words per digest x 4 bytes per word
#define DIGEST_SIZE (8*5*4)
.align 32
# void sha1_x8_avx2(void **input_data, UINT128 *digest, UINT32 size)
# arg 1 : pointer to array[4] of pointer to input data
# arg 2 : size (in blocks) ;; assumed to be >= 1
#
ENTRY(sha1_x8_avx2)
# save callee-saved clobbered registers to comply with C function ABI
push %r12
push %r13
push %r14
push %r15
#save rsp
mov %rsp, RSP_SAVE
sub $FRAMESZ, %rsp
#align rsp to 32 Bytes
and $~0x1F, %rsp
## Initialize digests
vmovdqu 0*32(arg1), A
vmovdqu 1*32(arg1), B
vmovdqu 2*32(arg1), C
vmovdqu 3*32(arg1), D
vmovdqu 4*32(arg1), E
## transpose input onto stack
mov _data_ptr+0*8(arg1),inp0
mov _data_ptr+1*8(arg1),inp1
mov _data_ptr+2*8(arg1),inp2
mov _data_ptr+3*8(arg1),inp3
mov _data_ptr+4*8(arg1),inp4
mov _data_ptr+5*8(arg1),inp5
mov _data_ptr+6*8(arg1),inp6
mov _data_ptr+7*8(arg1),inp7
xor IDX, IDX
lloop:
vmovdqu PSHUFFLE_BYTE_FLIP_MASK(%rip), F
I=0
.rep 2
VMOVPS (inp0, IDX), T0
VMOVPS (inp1, IDX), T1
VMOVPS (inp2, IDX), T2
VMOVPS (inp3, IDX), T3
VMOVPS (inp4, IDX), T4
VMOVPS (inp5, IDX), T5
VMOVPS (inp6, IDX), T6
VMOVPS (inp7, IDX), T7
TRANSPOSE8 T0, T1, T2, T3, T4, T5, T6, T7, T8, T9
vpshufb F, T0, T0
vmovdqu T0, (I*8)*32(%rsp)
vpshufb F, T1, T1
vmovdqu T1, (I*8+1)*32(%rsp)
vpshufb F, T2, T2
vmovdqu T2, (I*8+2)*32(%rsp)
vpshufb F, T3, T3
vmovdqu T3, (I*8+3)*32(%rsp)
vpshufb F, T4, T4
vmovdqu T4, (I*8+4)*32(%rsp)
vpshufb F, T5, T5
vmovdqu T5, (I*8+5)*32(%rsp)
vpshufb F, T6, T6
vmovdqu T6, (I*8+6)*32(%rsp)
vpshufb F, T7, T7
vmovdqu T7, (I*8+7)*32(%rsp)
add $32, IDX
I = (I+1)
.endr
# save old digests
vmovdqu A,AA
vmovdqu B,BB
vmovdqu C,CC
vmovdqu D,DD
vmovdqu E,EE
##
## perform 0-79 steps
##
vmovdqu K00_19(%rip), K
## do rounds 0...15
I = 0
.rep 16
SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
ROTATE_ARGS
I = (I+1)
.endr
## do rounds 16...19
vmovdqu ((16 - 16) & 15) * 32 (%rsp), W16
vmovdqu ((16 - 15) & 15) * 32 (%rsp), W15
.rep 4
SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
ROTATE_ARGS
I = (I+1)
.endr
## do rounds 20...39
vmovdqu K20_39(%rip), K
.rep 20
SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1
ROTATE_ARGS
I = (I+1)
.endr
## do rounds 40...59
vmovdqu K40_59(%rip), K
.rep 20
SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2
ROTATE_ARGS
I = (I+1)
.endr
## do rounds 60...79
vmovdqu K60_79(%rip), K
.rep 20
SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3
ROTATE_ARGS
I = (I+1)
.endr
vpaddd AA,A,A
vpaddd BB,B,B
vpaddd CC,C,C
vpaddd DD,D,D
vpaddd EE,E,E
sub $1, arg2
jne lloop
# write out digests
vmovdqu A, 0*32(arg1)
vmovdqu B, 1*32(arg1)
vmovdqu C, 2*32(arg1)
vmovdqu D, 3*32(arg1)
vmovdqu E, 4*32(arg1)
# update input pointers
add IDX, inp0
add IDX, inp1
add IDX, inp2
add IDX, inp3
add IDX, inp4
add IDX, inp5
add IDX, inp6
add IDX, inp7
mov inp0, _data_ptr (arg1)
mov inp1, _data_ptr + 1*8(arg1)
mov inp2, _data_ptr + 2*8(arg1)
mov inp3, _data_ptr + 3*8(arg1)
mov inp4, _data_ptr + 4*8(arg1)
mov inp5, _data_ptr + 5*8(arg1)
mov inp6, _data_ptr + 6*8(arg1)
mov inp7, _data_ptr + 7*8(arg1)
################
## Postamble
mov RSP_SAVE, %rsp
# restore callee-saved clobbered registers
pop %r15
pop %r14
pop %r13
pop %r12
ret
ENDPROC(sha1_x8_avx2)
.section .rodata.cst32.K00_19, "aM", @progbits, 32
.align 32
K00_19:
.octa 0x5A8279995A8279995A8279995A827999
.octa 0x5A8279995A8279995A8279995A827999
.section .rodata.cst32.K20_39, "aM", @progbits, 32
.align 32
K20_39:
.octa 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1
.octa 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1
.section .rodata.cst32.K40_59, "aM", @progbits, 32
.align 32
K40_59:
.octa 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC
.octa 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC
.section .rodata.cst32.K60_79, "aM", @progbits, 32
.align 32
K60_79:
.octa 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6
.octa 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6
.section .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
.align 32
PSHUFFLE_BYTE_FLIP_MASK:
.octa 0x0c0d0e0f08090a0b0405060700010203
.octa 0x0c0d0e0f08090a0b0405060700010203

View File

@ -1,14 +0,0 @@
# SPDX-License-Identifier: GPL-2.0
#
# Arch-specific CryptoAPI modules.
#
OBJECT_FILES_NON_STANDARD := y
avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\
$(comma)4)$(comma)%ymm2,yes,no)
ifeq ($(avx2_supported),yes)
obj-$(CONFIG_CRYPTO_SHA256_MB) += sha256-mb.o
sha256-mb-y := sha256_mb.o sha256_mb_mgr_flush_avx2.o \
sha256_mb_mgr_init_avx2.o sha256_mb_mgr_submit_avx2.o sha256_x8_avx2.o
endif

File diff suppressed because it is too large Load Diff

View File

@ -1,134 +0,0 @@
/*
* Header file for multi buffer SHA256 context
*
* This file is provided under a dual BSD/GPLv2 license. When using or
* redistributing this file, you may do so under either license.
*
* GPL LICENSE SUMMARY
*
* Copyright(c) 2016 Intel Corporation.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of version 2 of the GNU General Public License as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* Contact Information:
* Megha Dey <megha.dey@linux.intel.com>
*
* BSD LICENSE
*
* Copyright(c) 2016 Intel Corporation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _SHA_MB_CTX_INTERNAL_H
#define _SHA_MB_CTX_INTERNAL_H
#include "sha256_mb_mgr.h"
#define HASH_UPDATE 0x00
#define HASH_LAST 0x01
#define HASH_DONE 0x02
#define HASH_FINAL 0x04
#define HASH_CTX_STS_IDLE 0x00
#define HASH_CTX_STS_PROCESSING 0x01
#define HASH_CTX_STS_LAST 0x02
#define HASH_CTX_STS_COMPLETE 0x04
enum hash_ctx_error {
HASH_CTX_ERROR_NONE = 0,
HASH_CTX_ERROR_INVALID_FLAGS = -1,
HASH_CTX_ERROR_ALREADY_PROCESSING = -2,
HASH_CTX_ERROR_ALREADY_COMPLETED = -3,
#ifdef HASH_CTX_DEBUG
HASH_CTX_ERROR_DEBUG_DIGEST_MISMATCH = -4,
#endif
};
#define hash_ctx_user_data(ctx) ((ctx)->user_data)
#define hash_ctx_digest(ctx) ((ctx)->job.result_digest)
#define hash_ctx_processing(ctx) ((ctx)->status & HASH_CTX_STS_PROCESSING)
#define hash_ctx_complete(ctx) ((ctx)->status == HASH_CTX_STS_COMPLETE)
#define hash_ctx_status(ctx) ((ctx)->status)
#define hash_ctx_error(ctx) ((ctx)->error)
#define hash_ctx_init(ctx) \
do { \
(ctx)->error = HASH_CTX_ERROR_NONE; \
(ctx)->status = HASH_CTX_STS_COMPLETE; \
} while (0)
/* Hash Constants and Typedefs */
#define SHA256_DIGEST_LENGTH 8
#define SHA256_LOG2_BLOCK_SIZE 6
#define SHA256_PADLENGTHFIELD_SIZE 8
#ifdef SHA_MB_DEBUG
#define assert(expr) \
do { \
if (unlikely(!(expr))) { \
printk(KERN_ERR "Assertion failed! %s,%s,%s,line=%d\n", \
#expr, __FILE__, __func__, __LINE__); \
} \
} while (0)
#else
#define assert(expr) do {} while (0)
#endif
struct sha256_ctx_mgr {
struct sha256_mb_mgr mgr;
};
/* typedef struct sha256_ctx_mgr sha256_ctx_mgr; */
struct sha256_hash_ctx {
/* Must be at struct offset 0 */
struct job_sha256 job;
/* status flag */
int status;
/* error flag */
int error;
uint64_t total_length;
const void *incoming_buffer;
uint32_t incoming_buffer_length;
uint8_t partial_block_buffer[SHA256_BLOCK_SIZE * 2];
uint32_t partial_block_buffer_length;
void *user_data;
};
#endif

View File

@ -1,108 +0,0 @@
/*
* Header file for multi buffer SHA256 algorithm manager
*
* This file is provided under a dual BSD/GPLv2 license. When using or
* redistributing this file, you may do so under either license.
*
* GPL LICENSE SUMMARY
*
* Copyright(c) 2016 Intel Corporation.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of version 2 of the GNU General Public License as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* Contact Information:
* Megha Dey <megha.dey@linux.intel.com>
*
* BSD LICENSE
*
* Copyright(c) 2016 Intel Corporation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __SHA_MB_MGR_H
#define __SHA_MB_MGR_H
#include <linux/types.h>
#define NUM_SHA256_DIGEST_WORDS 8
enum job_sts { STS_UNKNOWN = 0,
STS_BEING_PROCESSED = 1,
STS_COMPLETED = 2,
STS_INTERNAL_ERROR = 3,
STS_ERROR = 4
};
struct job_sha256 {
u8 *buffer;
u32 len;
u32 result_digest[NUM_SHA256_DIGEST_WORDS] __aligned(32);
enum job_sts status;
void *user_data;
};
/* SHA256 out-of-order scheduler */
/* typedef uint32_t sha8_digest_array[8][8]; */
struct sha256_args_x8 {
uint32_t digest[8][8];
uint8_t *data_ptr[8];
};
struct sha256_lane_data {
struct job_sha256 *job_in_lane;
};
struct sha256_mb_mgr {
struct sha256_args_x8 args;
uint32_t lens[8];
/* each byte is index (0...7) of unused lanes */
uint64_t unused_lanes;
/* byte 4 is set to FF as a flag */
struct sha256_lane_data ldata[8];
};
#define SHA256_MB_MGR_NUM_LANES_AVX2 8
void sha256_mb_mgr_init_avx2(struct sha256_mb_mgr *state);
struct job_sha256 *sha256_mb_mgr_submit_avx2(struct sha256_mb_mgr *state,
struct job_sha256 *job);
struct job_sha256 *sha256_mb_mgr_flush_avx2(struct sha256_mb_mgr *state);
struct job_sha256 *sha256_mb_mgr_get_comp_job_avx2(struct sha256_mb_mgr *state);
#endif

View File

@ -1,304 +0,0 @@
/*
* Header file for multi buffer SHA256 algorithm data structure
*
* This file is provided under a dual BSD/GPLv2 license. When using or
* redistributing this file, you may do so under either license.
*
* GPL LICENSE SUMMARY
*
* Copyright(c) 2016 Intel Corporation.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of version 2 of the GNU General Public License as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* Contact Information:
* Megha Dey <megha.dey@linux.intel.com>
*
* BSD LICENSE
*
* Copyright(c) 2016 Intel Corporation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
# Macros for defining data structures
# Usage example
#START_FIELDS # JOB_AES
### name size align
#FIELD _plaintext, 8, 8 # pointer to plaintext
#FIELD _ciphertext, 8, 8 # pointer to ciphertext
#FIELD _IV, 16, 8 # IV
#FIELD _keys, 8, 8 # pointer to keys
#FIELD _len, 4, 4 # length in bytes
#FIELD _status, 4, 4 # status enumeration
#FIELD _user_data, 8, 8 # pointer to user data
#UNION _union, size1, align1, \
# size2, align2, \
# size3, align3, \
# ...
#END_FIELDS
#%assign _JOB_AES_size _FIELD_OFFSET
#%assign _JOB_AES_align _STRUCT_ALIGN
#########################################################################
# Alternate "struc-like" syntax:
# STRUCT job_aes2
# RES_Q .plaintext, 1
# RES_Q .ciphertext, 1
# RES_DQ .IV, 1
# RES_B .nested, _JOB_AES_SIZE, _JOB_AES_ALIGN
# RES_U .union, size1, align1, \
# size2, align2, \
# ...
# ENDSTRUCT
# # Following only needed if nesting
# %assign job_aes2_size _FIELD_OFFSET
# %assign job_aes2_align _STRUCT_ALIGN
#
# RES_* macros take a name, a count and an optional alignment.
# The count in in terms of the base size of the macro, and the
# default alignment is the base size.
# The macros are:
# Macro Base size
# RES_B 1
# RES_W 2
# RES_D 4
# RES_Q 8
# RES_DQ 16
# RES_Y 32
# RES_Z 64
#
# RES_U defines a union. It's arguments are a name and two or more
# pairs of "size, alignment"
#
# The two assigns are only needed if this structure is being nested
# within another. Even if the assigns are not done, one can still use
# STRUCT_NAME_size as the size of the structure.
#
# Note that for nesting, you still need to assign to STRUCT_NAME_size.
#
# The differences between this and using "struc" directly are that each
# type is implicitly aligned to its natural length (although this can be
# over-ridden with an explicit third parameter), and that the structure
# is padded at the end to its overall alignment.
#
#########################################################################
#ifndef _DATASTRUCT_ASM_
#define _DATASTRUCT_ASM_
#define SZ8 8*SHA256_DIGEST_WORD_SIZE
#define ROUNDS 64*SZ8
#define PTR_SZ 8
#define SHA256_DIGEST_WORD_SIZE 4
#define MAX_SHA256_LANES 8
#define SHA256_DIGEST_WORDS 8
#define SHA256_DIGEST_ROW_SIZE (MAX_SHA256_LANES * SHA256_DIGEST_WORD_SIZE)
#define SHA256_DIGEST_SIZE (SHA256_DIGEST_ROW_SIZE * SHA256_DIGEST_WORDS)
#define SHA256_BLK_SZ 64
# START_FIELDS
.macro START_FIELDS
_FIELD_OFFSET = 0
_STRUCT_ALIGN = 0
.endm
# FIELD name size align
.macro FIELD name size align
_FIELD_OFFSET = (_FIELD_OFFSET + (\align) - 1) & (~ ((\align)-1))
\name = _FIELD_OFFSET
_FIELD_OFFSET = _FIELD_OFFSET + (\size)
.if (\align > _STRUCT_ALIGN)
_STRUCT_ALIGN = \align
.endif
.endm
# END_FIELDS
.macro END_FIELDS
_FIELD_OFFSET = (_FIELD_OFFSET + _STRUCT_ALIGN-1) & (~ (_STRUCT_ALIGN-1))
.endm
########################################################################
.macro STRUCT p1
START_FIELDS
.struc \p1
.endm
.macro ENDSTRUCT
tmp = _FIELD_OFFSET
END_FIELDS
tmp = (_FIELD_OFFSET - %%tmp)
.if (tmp > 0)
.lcomm tmp
.endif
.endstruc
.endm
## RES_int name size align
.macro RES_int p1 p2 p3
name = \p1
size = \p2
align = .\p3
_FIELD_OFFSET = (_FIELD_OFFSET + (align) - 1) & (~ ((align)-1))
.align align
.lcomm name size
_FIELD_OFFSET = _FIELD_OFFSET + (size)
.if (align > _STRUCT_ALIGN)
_STRUCT_ALIGN = align
.endif
.endm
# macro RES_B name, size [, align]
.macro RES_B _name, _size, _align=1
RES_int _name _size _align
.endm
# macro RES_W name, size [, align]
.macro RES_W _name, _size, _align=2
RES_int _name 2*(_size) _align
.endm
# macro RES_D name, size [, align]
.macro RES_D _name, _size, _align=4
RES_int _name 4*(_size) _align
.endm
# macro RES_Q name, size [, align]
.macro RES_Q _name, _size, _align=8
RES_int _name 8*(_size) _align
.endm
# macro RES_DQ name, size [, align]
.macro RES_DQ _name, _size, _align=16
RES_int _name 16*(_size) _align
.endm
# macro RES_Y name, size [, align]
.macro RES_Y _name, _size, _align=32
RES_int _name 32*(_size) _align
.endm
# macro RES_Z name, size [, align]
.macro RES_Z _name, _size, _align=64
RES_int _name 64*(_size) _align
.endm
#endif
########################################################################
#### Define SHA256 Out Of Order Data Structures
########################################################################
START_FIELDS # LANE_DATA
### name size align
FIELD _job_in_lane, 8, 8 # pointer to job object
END_FIELDS
_LANE_DATA_size = _FIELD_OFFSET
_LANE_DATA_align = _STRUCT_ALIGN
########################################################################
START_FIELDS # SHA256_ARGS_X4
### name size align
FIELD _digest, 4*8*8, 4 # transposed digest
FIELD _data_ptr, 8*8, 8 # array of pointers to data
END_FIELDS
_SHA256_ARGS_X4_size = _FIELD_OFFSET
_SHA256_ARGS_X4_align = _STRUCT_ALIGN
_SHA256_ARGS_X8_size = _FIELD_OFFSET
_SHA256_ARGS_X8_align = _STRUCT_ALIGN
#######################################################################
START_FIELDS # MB_MGR
### name size align
FIELD _args, _SHA256_ARGS_X4_size, _SHA256_ARGS_X4_align
FIELD _lens, 4*8, 8
FIELD _unused_lanes, 8, 8
FIELD _ldata, _LANE_DATA_size*8, _LANE_DATA_align
END_FIELDS
_MB_MGR_size = _FIELD_OFFSET
_MB_MGR_align = _STRUCT_ALIGN
_args_digest = _args + _digest
_args_data_ptr = _args + _data_ptr
#######################################################################
START_FIELDS #STACK_FRAME
### name size align
FIELD _data, 16*SZ8, 1 # transposed digest
FIELD _digest, 8*SZ8, 1 # array of pointers to data
FIELD _ytmp, 4*SZ8, 1
FIELD _rsp, 8, 1
END_FIELDS
_STACK_FRAME_size = _FIELD_OFFSET
_STACK_FRAME_align = _STRUCT_ALIGN
#######################################################################
########################################################################
#### Define constants
########################################################################
#define STS_UNKNOWN 0
#define STS_BEING_PROCESSED 1
#define STS_COMPLETED 2
########################################################################
#### Define JOB_SHA256 structure
########################################################################
START_FIELDS # JOB_SHA256
### name size align
FIELD _buffer, 8, 8 # pointer to buffer
FIELD _len, 8, 8 # length in bytes
FIELD _result_digest, 8*4, 32 # Digest (output)
FIELD _status, 4, 4
FIELD _user_data, 8, 8
END_FIELDS
_JOB_SHA256_size = _FIELD_OFFSET
_JOB_SHA256_align = _STRUCT_ALIGN

View File

@ -1,307 +0,0 @@
/*
* Flush routine for SHA256 multibuffer
*
* This file is provided under a dual BSD/GPLv2 license. When using or
* redistributing this file, you may do so under either license.
*
* GPL LICENSE SUMMARY
*
* Copyright(c) 2016 Intel Corporation.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of version 2 of the GNU General Public License as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* Contact Information:
* Megha Dey <megha.dey@linux.intel.com>
*
* BSD LICENSE
*
* Copyright(c) 2016 Intel Corporation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <linux/linkage.h>
#include <asm/frame.h>
#include "sha256_mb_mgr_datastruct.S"
.extern sha256_x8_avx2
#LINUX register definitions
#define arg1 %rdi
#define arg2 %rsi
# Common register definitions
#define state arg1
#define job arg2
#define len2 arg2
# idx must be a register not clobberred by sha1_mult
#define idx %r8
#define DWORD_idx %r8d
#define unused_lanes %rbx
#define lane_data %rbx
#define tmp2 %rbx
#define tmp2_w %ebx
#define job_rax %rax
#define tmp1 %rax
#define size_offset %rax
#define tmp %rax
#define start_offset %rax
#define tmp3 %arg1
#define extra_blocks %arg2
#define p %arg2
.macro LABEL prefix n
\prefix\n\():
.endm
.macro JNE_SKIP i
jne skip_\i
.endm
.altmacro
.macro SET_OFFSET _offset
offset = \_offset
.endm
.noaltmacro
# JOB_SHA256* sha256_mb_mgr_flush_avx2(MB_MGR *state)
# arg 1 : rcx : state
ENTRY(sha256_mb_mgr_flush_avx2)
FRAME_BEGIN
push %rbx
# If bit (32+3) is set, then all lanes are empty
mov _unused_lanes(state), unused_lanes
bt $32+3, unused_lanes
jc return_null
# find a lane with a non-null job
xor idx, idx
offset = (_ldata + 1 * _LANE_DATA_size + _job_in_lane)
cmpq $0, offset(state)
cmovne one(%rip), idx
offset = (_ldata + 2 * _LANE_DATA_size + _job_in_lane)
cmpq $0, offset(state)
cmovne two(%rip), idx
offset = (_ldata + 3 * _LANE_DATA_size + _job_in_lane)
cmpq $0, offset(state)
cmovne three(%rip), idx
offset = (_ldata + 4 * _LANE_DATA_size + _job_in_lane)
cmpq $0, offset(state)
cmovne four(%rip), idx
offset = (_ldata + 5 * _LANE_DATA_size + _job_in_lane)
cmpq $0, offset(state)
cmovne five(%rip), idx
offset = (_ldata + 6 * _LANE_DATA_size + _job_in_lane)
cmpq $0, offset(state)
cmovne six(%rip), idx
offset = (_ldata + 7 * _LANE_DATA_size + _job_in_lane)
cmpq $0, offset(state)
cmovne seven(%rip), idx
# copy idx to empty lanes
copy_lane_data:
offset = (_args + _data_ptr)
mov offset(state,idx,8), tmp
I = 0
.rep 8
offset = (_ldata + I * _LANE_DATA_size + _job_in_lane)
cmpq $0, offset(state)
.altmacro
JNE_SKIP %I
offset = (_args + _data_ptr + 8*I)
mov tmp, offset(state)
offset = (_lens + 4*I)
movl $0xFFFFFFFF, offset(state)
LABEL skip_ %I
I = (I+1)
.noaltmacro
.endr
# Find min length
vmovdqu _lens+0*16(state), %xmm0
vmovdqu _lens+1*16(state), %xmm1
vpminud %xmm1, %xmm0, %xmm2 # xmm2 has {D,C,B,A}
vpalignr $8, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,D,C}
vpminud %xmm3, %xmm2, %xmm2 # xmm2 has {x,x,E,F}
vpalignr $4, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,x,E}
vpminud %xmm3, %xmm2, %xmm2 # xmm2 has min val in low dword
vmovd %xmm2, DWORD_idx
mov idx, len2
and $0xF, idx
shr $4, len2
jz len_is_0
vpand clear_low_nibble(%rip), %xmm2, %xmm2
vpshufd $0, %xmm2, %xmm2
vpsubd %xmm2, %xmm0, %xmm0
vpsubd %xmm2, %xmm1, %xmm1
vmovdqu %xmm0, _lens+0*16(state)
vmovdqu %xmm1, _lens+1*16(state)
# "state" and "args" are the same address, arg1
# len is arg2
call sha256_x8_avx2
# state and idx are intact
len_is_0:
# process completed job "idx"
imul $_LANE_DATA_size, idx, lane_data
lea _ldata(state, lane_data), lane_data
mov _job_in_lane(lane_data), job_rax
movq $0, _job_in_lane(lane_data)
movl $STS_COMPLETED, _status(job_rax)
mov _unused_lanes(state), unused_lanes
shl $4, unused_lanes
or idx, unused_lanes
mov unused_lanes, _unused_lanes(state)
movl $0xFFFFFFFF, _lens(state,idx,4)
vmovd _args_digest(state , idx, 4) , %xmm0
vpinsrd $1, _args_digest+1*32(state, idx, 4), %xmm0, %xmm0
vpinsrd $2, _args_digest+2*32(state, idx, 4), %xmm0, %xmm0
vpinsrd $3, _args_digest+3*32(state, idx, 4), %xmm0, %xmm0
vmovd _args_digest+4*32(state, idx, 4), %xmm1
vpinsrd $1, _args_digest+5*32(state, idx, 4), %xmm1, %xmm1
vpinsrd $2, _args_digest+6*32(state, idx, 4), %xmm1, %xmm1
vpinsrd $3, _args_digest+7*32(state, idx, 4), %xmm1, %xmm1
vmovdqu %xmm0, _result_digest(job_rax)
offset = (_result_digest + 1*16)
vmovdqu %xmm1, offset(job_rax)
return:
pop %rbx
FRAME_END
ret
return_null:
xor job_rax, job_rax
jmp return
ENDPROC(sha256_mb_mgr_flush_avx2)
##############################################################################
.align 16
ENTRY(sha256_mb_mgr_get_comp_job_avx2)
push %rbx
## if bit 32+3 is set, then all lanes are empty
mov _unused_lanes(state), unused_lanes
bt $(32+3), unused_lanes
jc .return_null
# Find min length
vmovdqu _lens(state), %xmm0
vmovdqu _lens+1*16(state), %xmm1
vpminud %xmm1, %xmm0, %xmm2 # xmm2 has {D,C,B,A}
vpalignr $8, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,D,C}
vpminud %xmm3, %xmm2, %xmm2 # xmm2 has {x,x,E,F}
vpalignr $4, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,x,E}
vpminud %xmm3, %xmm2, %xmm2 # xmm2 has min val in low dword
vmovd %xmm2, DWORD_idx
test $~0xF, idx
jnz .return_null
# process completed job "idx"
imul $_LANE_DATA_size, idx, lane_data
lea _ldata(state, lane_data), lane_data
mov _job_in_lane(lane_data), job_rax
movq $0, _job_in_lane(lane_data)
movl $STS_COMPLETED, _status(job_rax)
mov _unused_lanes(state), unused_lanes
shl $4, unused_lanes
or idx, unused_lanes
mov unused_lanes, _unused_lanes(state)
movl $0xFFFFFFFF, _lens(state, idx, 4)
vmovd _args_digest(state, idx, 4), %xmm0
vpinsrd $1, _args_digest+1*32(state, idx, 4), %xmm0, %xmm0
vpinsrd $2, _args_digest+2*32(state, idx, 4), %xmm0, %xmm0
vpinsrd $3, _args_digest+3*32(state, idx, 4), %xmm0, %xmm0
vmovd _args_digest+4*32(state, idx, 4), %xmm1
vpinsrd $1, _args_digest+5*32(state, idx, 4), %xmm1, %xmm1
vpinsrd $2, _args_digest+6*32(state, idx, 4), %xmm1, %xmm1
vpinsrd $3, _args_digest+7*32(state, idx, 4), %xmm1, %xmm1
vmovdqu %xmm0, _result_digest(job_rax)
offset = (_result_digest + 1*16)
vmovdqu %xmm1, offset(job_rax)
pop %rbx
ret
.return_null:
xor job_rax, job_rax
pop %rbx
ret
ENDPROC(sha256_mb_mgr_get_comp_job_avx2)
.section .rodata.cst16.clear_low_nibble, "aM", @progbits, 16
.align 16
clear_low_nibble:
.octa 0x000000000000000000000000FFFFFFF0
.section .rodata.cst8, "aM", @progbits, 8
.align 8
one:
.quad 1
two:
.quad 2
three:
.quad 3
four:
.quad 4
five:
.quad 5
six:
.quad 6
seven:
.quad 7

View File

@ -1,65 +0,0 @@
/*
* Initialization code for multi buffer SHA256 algorithm for AVX2
*
* This file is provided under a dual BSD/GPLv2 license. When using or
* redistributing this file, you may do so under either license.
*
* GPL LICENSE SUMMARY
*
* Copyright(c) 2016 Intel Corporation.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of version 2 of the GNU General Public License as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* Contact Information:
* Megha Dey <megha.dey@linux.intel.com>
*
* BSD LICENSE
*
* Copyright(c) 2016 Intel Corporation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "sha256_mb_mgr.h"
void sha256_mb_mgr_init_avx2(struct sha256_mb_mgr *state)
{
unsigned int j;
state->unused_lanes = 0xF76543210ULL;
for (j = 0; j < 8; j++) {
state->lens[j] = 0xFFFFFFFF;
state->ldata[j].job_in_lane = NULL;
}
}

View File

@ -1,214 +0,0 @@
/*
* Buffer submit code for multi buffer SHA256 algorithm
*
* This file is provided under a dual BSD/GPLv2 license. When using or
* redistributing this file, you may do so under either license.
*
* GPL LICENSE SUMMARY
*
* Copyright(c) 2016 Intel Corporation.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of version 2 of the GNU General Public License as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* Contact Information:
* Megha Dey <megha.dey@linux.intel.com>
*
* BSD LICENSE
*
* Copyright(c) 2016 Intel Corporation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <linux/linkage.h>
#include <asm/frame.h>
#include "sha256_mb_mgr_datastruct.S"
.extern sha256_x8_avx2
# LINUX register definitions
arg1 = %rdi
arg2 = %rsi
size_offset = %rcx
tmp2 = %rcx
extra_blocks = %rdx
# Common definitions
#define state arg1
#define job %rsi
#define len2 arg2
#define p2 arg2
# idx must be a register not clobberred by sha1_x8_avx2
idx = %r8
DWORD_idx = %r8d
last_len = %r8
p = %r11
start_offset = %r11
unused_lanes = %rbx
BYTE_unused_lanes = %bl
job_rax = %rax
len = %rax
DWORD_len = %eax
lane = %r12
tmp3 = %r12
tmp = %r9
DWORD_tmp = %r9d
lane_data = %r10
# JOB* sha256_mb_mgr_submit_avx2(MB_MGR *state, JOB_SHA256 *job)
# arg 1 : rcx : state
# arg 2 : rdx : job
ENTRY(sha256_mb_mgr_submit_avx2)
FRAME_BEGIN
push %rbx
push %r12
mov _unused_lanes(state), unused_lanes
mov unused_lanes, lane
and $0xF, lane
shr $4, unused_lanes
imul $_LANE_DATA_size, lane, lane_data
movl $STS_BEING_PROCESSED, _status(job)
lea _ldata(state, lane_data), lane_data
mov unused_lanes, _unused_lanes(state)
movl _len(job), DWORD_len
mov job, _job_in_lane(lane_data)
shl $4, len
or lane, len
movl DWORD_len, _lens(state , lane, 4)
# Load digest words from result_digest
vmovdqu _result_digest(job), %xmm0
vmovdqu _result_digest+1*16(job), %xmm1
vmovd %xmm0, _args_digest(state, lane, 4)
vpextrd $1, %xmm0, _args_digest+1*32(state , lane, 4)
vpextrd $2, %xmm0, _args_digest+2*32(state , lane, 4)
vpextrd $3, %xmm0, _args_digest+3*32(state , lane, 4)
vmovd %xmm1, _args_digest+4*32(state , lane, 4)
vpextrd $1, %xmm1, _args_digest+5*32(state , lane, 4)
vpextrd $2, %xmm1, _args_digest+6*32(state , lane, 4)
vpextrd $3, %xmm1, _args_digest+7*32(state , lane, 4)
mov _buffer(job), p
mov p, _args_data_ptr(state, lane, 8)
cmp $0xF, unused_lanes
jne return_null
start_loop:
# Find min length
vmovdqa _lens(state), %xmm0
vmovdqa _lens+1*16(state), %xmm1
vpminud %xmm1, %xmm0, %xmm2 # xmm2 has {D,C,B,A}
vpalignr $8, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,D,C}
vpminud %xmm3, %xmm2, %xmm2 # xmm2 has {x,x,E,F}
vpalignr $4, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,x,E}
vpminud %xmm3, %xmm2, %xmm2 # xmm2 has min val in low dword
vmovd %xmm2, DWORD_idx
mov idx, len2
and $0xF, idx
shr $4, len2
jz len_is_0
vpand clear_low_nibble(%rip), %xmm2, %xmm2
vpshufd $0, %xmm2, %xmm2
vpsubd %xmm2, %xmm0, %xmm0
vpsubd %xmm2, %xmm1, %xmm1
vmovdqa %xmm0, _lens + 0*16(state)
vmovdqa %xmm1, _lens + 1*16(state)
# "state" and "args" are the same address, arg1
# len is arg2
call sha256_x8_avx2
# state and idx are intact
len_is_0:
# process completed job "idx"
imul $_LANE_DATA_size, idx, lane_data
lea _ldata(state, lane_data), lane_data
mov _job_in_lane(lane_data), job_rax
mov _unused_lanes(state), unused_lanes
movq $0, _job_in_lane(lane_data)
movl $STS_COMPLETED, _status(job_rax)
shl $4, unused_lanes
or idx, unused_lanes
mov unused_lanes, _unused_lanes(state)
movl $0xFFFFFFFF, _lens(state,idx,4)
vmovd _args_digest(state, idx, 4), %xmm0
vpinsrd $1, _args_digest+1*32(state , idx, 4), %xmm0, %xmm0
vpinsrd $2, _args_digest+2*32(state , idx, 4), %xmm0, %xmm0
vpinsrd $3, _args_digest+3*32(state , idx, 4), %xmm0, %xmm0
vmovd _args_digest+4*32(state, idx, 4), %xmm1
vpinsrd $1, _args_digest+5*32(state , idx, 4), %xmm1, %xmm1
vpinsrd $2, _args_digest+6*32(state , idx, 4), %xmm1, %xmm1
vpinsrd $3, _args_digest+7*32(state , idx, 4), %xmm1, %xmm1
vmovdqu %xmm0, _result_digest(job_rax)
vmovdqu %xmm1, _result_digest+1*16(job_rax)
return:
pop %r12
pop %rbx
FRAME_END
ret
return_null:
xor job_rax, job_rax
jmp return
ENDPROC(sha256_mb_mgr_submit_avx2)
.section .rodata.cst16.clear_low_nibble, "aM", @progbits, 16
.align 16
clear_low_nibble:
.octa 0x000000000000000000000000FFFFFFF0

View File

@ -1,598 +0,0 @@
/*
* Multi-buffer SHA256 algorithm hash compute routine
*
* This file is provided under a dual BSD/GPLv2 license. When using or
* redistributing this file, you may do so under either license.
*
* GPL LICENSE SUMMARY
*
* Copyright(c) 2016 Intel Corporation.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of version 2 of the GNU General Public License as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* Contact Information:
* Megha Dey <megha.dey@linux.intel.com>
*
* BSD LICENSE
*
* Copyright(c) 2016 Intel Corporation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <linux/linkage.h>
#include "sha256_mb_mgr_datastruct.S"
## code to compute oct SHA256 using SSE-256
## outer calling routine takes care of save and restore of XMM registers
## Logic designed/laid out by JDG
## Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; %ymm0-15
## Linux clobbers: rax rbx rcx rdx rsi r9 r10 r11 r12 r13 r14 r15
## Linux preserves: rdi rbp r8
##
## clobbers %ymm0-15
arg1 = %rdi
arg2 = %rsi
reg3 = %rcx
reg4 = %rdx
# Common definitions
STATE = arg1
INP_SIZE = arg2
IDX = %rax
ROUND = %rbx
TBL = reg3
inp0 = %r9
inp1 = %r10
inp2 = %r11
inp3 = %r12
inp4 = %r13
inp5 = %r14
inp6 = %r15
inp7 = reg4
a = %ymm0
b = %ymm1
c = %ymm2
d = %ymm3
e = %ymm4
f = %ymm5
g = %ymm6
h = %ymm7
T1 = %ymm8
a0 = %ymm12
a1 = %ymm13
a2 = %ymm14
TMP = %ymm15
TMP0 = %ymm6
TMP1 = %ymm7
TT0 = %ymm8
TT1 = %ymm9
TT2 = %ymm10
TT3 = %ymm11
TT4 = %ymm12
TT5 = %ymm13
TT6 = %ymm14
TT7 = %ymm15
# Define stack usage
# Assume stack aligned to 32 bytes before call
# Therefore FRAMESZ mod 32 must be 32-8 = 24
#define FRAMESZ 0x388
#define VMOVPS vmovups
# TRANSPOSE8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
# "transpose" data in {r0...r7} using temps {t0...t1}
# Input looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
# r0 = {a7 a6 a5 a4 a3 a2 a1 a0}
# r1 = {b7 b6 b5 b4 b3 b2 b1 b0}
# r2 = {c7 c6 c5 c4 c3 c2 c1 c0}
# r3 = {d7 d6 d5 d4 d3 d2 d1 d0}
# r4 = {e7 e6 e5 e4 e3 e2 e1 e0}
# r5 = {f7 f6 f5 f4 f3 f2 f1 f0}
# r6 = {g7 g6 g5 g4 g3 g2 g1 g0}
# r7 = {h7 h6 h5 h4 h3 h2 h1 h0}
#
# Output looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
# r0 = {h0 g0 f0 e0 d0 c0 b0 a0}
# r1 = {h1 g1 f1 e1 d1 c1 b1 a1}
# r2 = {h2 g2 f2 e2 d2 c2 b2 a2}
# r3 = {h3 g3 f3 e3 d3 c3 b3 a3}
# r4 = {h4 g4 f4 e4 d4 c4 b4 a4}
# r5 = {h5 g5 f5 e5 d5 c5 b5 a5}
# r6 = {h6 g6 f6 e6 d6 c6 b6 a6}
# r7 = {h7 g7 f7 e7 d7 c7 b7 a7}
#
.macro TRANSPOSE8 r0 r1 r2 r3 r4 r5 r6 r7 t0 t1
# process top half (r0..r3) {a...d}
vshufps $0x44, \r1, \r0, \t0 # t0 = {b5 b4 a5 a4 b1 b0 a1 a0}
vshufps $0xEE, \r1, \r0, \r0 # r0 = {b7 b6 a7 a6 b3 b2 a3 a2}
vshufps $0x44, \r3, \r2, \t1 # t1 = {d5 d4 c5 c4 d1 d0 c1 c0}
vshufps $0xEE, \r3, \r2, \r2 # r2 = {d7 d6 c7 c6 d3 d2 c3 c2}
vshufps $0xDD, \t1, \t0, \r3 # r3 = {d5 c5 b5 a5 d1 c1 b1 a1}
vshufps $0x88, \r2, \r0, \r1 # r1 = {d6 c6 b6 a6 d2 c2 b2 a2}
vshufps $0xDD, \r2, \r0, \r0 # r0 = {d7 c7 b7 a7 d3 c3 b3 a3}
vshufps $0x88, \t1, \t0, \t0 # t0 = {d4 c4 b4 a4 d0 c0 b0 a0}
# use r2 in place of t0
# process bottom half (r4..r7) {e...h}
vshufps $0x44, \r5, \r4, \r2 # r2 = {f5 f4 e5 e4 f1 f0 e1 e0}
vshufps $0xEE, \r5, \r4, \r4 # r4 = {f7 f6 e7 e6 f3 f2 e3 e2}
vshufps $0x44, \r7, \r6, \t1 # t1 = {h5 h4 g5 g4 h1 h0 g1 g0}
vshufps $0xEE, \r7, \r6, \r6 # r6 = {h7 h6 g7 g6 h3 h2 g3 g2}
vshufps $0xDD, \t1, \r2, \r7 # r7 = {h5 g5 f5 e5 h1 g1 f1 e1}
vshufps $0x88, \r6, \r4, \r5 # r5 = {h6 g6 f6 e6 h2 g2 f2 e2}
vshufps $0xDD, \r6, \r4, \r4 # r4 = {h7 g7 f7 e7 h3 g3 f3 e3}
vshufps $0x88, \t1, \r2, \t1 # t1 = {h4 g4 f4 e4 h0 g0 f0 e0}
vperm2f128 $0x13, \r1, \r5, \r6 # h6...a6
vperm2f128 $0x02, \r1, \r5, \r2 # h2...a2
vperm2f128 $0x13, \r3, \r7, \r5 # h5...a5
vperm2f128 $0x02, \r3, \r7, \r1 # h1...a1
vperm2f128 $0x13, \r0, \r4, \r7 # h7...a7
vperm2f128 $0x02, \r0, \r4, \r3 # h3...a3
vperm2f128 $0x13, \t0, \t1, \r4 # h4...a4
vperm2f128 $0x02, \t0, \t1, \r0 # h0...a0
.endm
.macro ROTATE_ARGS
TMP_ = h
h = g
g = f
f = e
e = d
d = c
c = b
b = a
a = TMP_
.endm
.macro _PRORD reg imm tmp
vpslld $(32-\imm),\reg,\tmp
vpsrld $\imm,\reg, \reg
vpor \tmp,\reg, \reg
.endm
# PRORD_nd reg, imm, tmp, src
.macro _PRORD_nd reg imm tmp src
vpslld $(32-\imm), \src, \tmp
vpsrld $\imm, \src, \reg
vpor \tmp, \reg, \reg
.endm
# PRORD dst/src, amt
.macro PRORD reg imm
_PRORD \reg,\imm,TMP
.endm
# PRORD_nd dst, src, amt
.macro PRORD_nd reg tmp imm
_PRORD_nd \reg, \imm, TMP, \tmp
.endm
# arguments passed implicitly in preprocessor symbols i, a...h
.macro ROUND_00_15 _T1 i
PRORD_nd a0,e,5 # sig1: a0 = (e >> 5)
vpxor g, f, a2 # ch: a2 = f^g
vpand e,a2, a2 # ch: a2 = (f^g)&e
vpxor g, a2, a2 # a2 = ch
PRORD_nd a1,e,25 # sig1: a1 = (e >> 25)
vmovdqu \_T1,(SZ8*(\i & 0xf))(%rsp)
vpaddd (TBL,ROUND,1), \_T1, \_T1 # T1 = W + K
vpxor e,a0, a0 # sig1: a0 = e ^ (e >> 5)
PRORD a0, 6 # sig1: a0 = (e >> 6) ^ (e >> 11)
vpaddd a2, h, h # h = h + ch
PRORD_nd a2,a,11 # sig0: a2 = (a >> 11)
vpaddd \_T1,h, h # h = h + ch + W + K
vpxor a1, a0, a0 # a0 = sigma1
PRORD_nd a1,a,22 # sig0: a1 = (a >> 22)
vpxor c, a, \_T1 # maj: T1 = a^c
add $SZ8, ROUND # ROUND++
vpand b, \_T1, \_T1 # maj: T1 = (a^c)&b
vpaddd a0, h, h
vpaddd h, d, d
vpxor a, a2, a2 # sig0: a2 = a ^ (a >> 11)
PRORD a2,2 # sig0: a2 = (a >> 2) ^ (a >> 13)
vpxor a1, a2, a2 # a2 = sig0
vpand c, a, a1 # maj: a1 = a&c
vpor \_T1, a1, a1 # a1 = maj
vpaddd a1, h, h # h = h + ch + W + K + maj
vpaddd a2, h, h # h = h + ch + W + K + maj + sigma0
ROTATE_ARGS
.endm
# arguments passed implicitly in preprocessor symbols i, a...h
.macro ROUND_16_XX _T1 i
vmovdqu (SZ8*((\i-15)&0xf))(%rsp), \_T1
vmovdqu (SZ8*((\i-2)&0xf))(%rsp), a1
vmovdqu \_T1, a0
PRORD \_T1,11
vmovdqu a1, a2
PRORD a1,2
vpxor a0, \_T1, \_T1
PRORD \_T1, 7
vpxor a2, a1, a1
PRORD a1, 17
vpsrld $3, a0, a0
vpxor a0, \_T1, \_T1
vpsrld $10, a2, a2
vpxor a2, a1, a1
vpaddd (SZ8*((\i-16)&0xf))(%rsp), \_T1, \_T1
vpaddd (SZ8*((\i-7)&0xf))(%rsp), a1, a1
vpaddd a1, \_T1, \_T1
ROUND_00_15 \_T1,\i
.endm
# SHA256_ARGS:
# UINT128 digest[8]; // transposed digests
# UINT8 *data_ptr[4];
# void sha256_x8_avx2(SHA256_ARGS *args, UINT64 bytes);
# arg 1 : STATE : pointer to array of pointers to input data
# arg 2 : INP_SIZE : size of input in blocks
# general registers preserved in outer calling routine
# outer calling routine saves all the XMM registers
# save rsp, allocate 32-byte aligned for local variables
ENTRY(sha256_x8_avx2)
# save callee-saved clobbered registers to comply with C function ABI
push %r12
push %r13
push %r14
push %r15
mov %rsp, IDX
sub $FRAMESZ, %rsp
and $~0x1F, %rsp
mov IDX, _rsp(%rsp)
# Load the pre-transposed incoming digest.
vmovdqu 0*SHA256_DIGEST_ROW_SIZE(STATE),a
vmovdqu 1*SHA256_DIGEST_ROW_SIZE(STATE),b
vmovdqu 2*SHA256_DIGEST_ROW_SIZE(STATE),c
vmovdqu 3*SHA256_DIGEST_ROW_SIZE(STATE),d
vmovdqu 4*SHA256_DIGEST_ROW_SIZE(STATE),e
vmovdqu 5*SHA256_DIGEST_ROW_SIZE(STATE),f
vmovdqu 6*SHA256_DIGEST_ROW_SIZE(STATE),g
vmovdqu 7*SHA256_DIGEST_ROW_SIZE(STATE),h
lea K256_8(%rip),TBL
# load the address of each of the 4 message lanes
# getting ready to transpose input onto stack
mov _args_data_ptr+0*PTR_SZ(STATE),inp0
mov _args_data_ptr+1*PTR_SZ(STATE),inp1
mov _args_data_ptr+2*PTR_SZ(STATE),inp2
mov _args_data_ptr+3*PTR_SZ(STATE),inp3
mov _args_data_ptr+4*PTR_SZ(STATE),inp4
mov _args_data_ptr+5*PTR_SZ(STATE),inp5
mov _args_data_ptr+6*PTR_SZ(STATE),inp6
mov _args_data_ptr+7*PTR_SZ(STATE),inp7
xor IDX, IDX
lloop:
xor ROUND, ROUND
# save old digest
vmovdqu a, _digest(%rsp)
vmovdqu b, _digest+1*SZ8(%rsp)
vmovdqu c, _digest+2*SZ8(%rsp)
vmovdqu d, _digest+3*SZ8(%rsp)
vmovdqu e, _digest+4*SZ8(%rsp)
vmovdqu f, _digest+5*SZ8(%rsp)
vmovdqu g, _digest+6*SZ8(%rsp)
vmovdqu h, _digest+7*SZ8(%rsp)
i = 0
.rep 2
VMOVPS i*32(inp0, IDX), TT0
VMOVPS i*32(inp1, IDX), TT1
VMOVPS i*32(inp2, IDX), TT2
VMOVPS i*32(inp3, IDX), TT3
VMOVPS i*32(inp4, IDX), TT4
VMOVPS i*32(inp5, IDX), TT5
VMOVPS i*32(inp6, IDX), TT6
VMOVPS i*32(inp7, IDX), TT7
vmovdqu g, _ytmp(%rsp)
vmovdqu h, _ytmp+1*SZ8(%rsp)
TRANSPOSE8 TT0, TT1, TT2, TT3, TT4, TT5, TT6, TT7, TMP0, TMP1
vmovdqu PSHUFFLE_BYTE_FLIP_MASK(%rip), TMP1
vmovdqu _ytmp(%rsp), g
vpshufb TMP1, TT0, TT0
vpshufb TMP1, TT1, TT1
vpshufb TMP1, TT2, TT2
vpshufb TMP1, TT3, TT3
vpshufb TMP1, TT4, TT4
vpshufb TMP1, TT5, TT5
vpshufb TMP1, TT6, TT6
vpshufb TMP1, TT7, TT7
vmovdqu _ytmp+1*SZ8(%rsp), h
vmovdqu TT4, _ytmp(%rsp)
vmovdqu TT5, _ytmp+1*SZ8(%rsp)
vmovdqu TT6, _ytmp+2*SZ8(%rsp)
vmovdqu TT7, _ytmp+3*SZ8(%rsp)
ROUND_00_15 TT0,(i*8+0)
vmovdqu _ytmp(%rsp), TT0
ROUND_00_15 TT1,(i*8+1)
vmovdqu _ytmp+1*SZ8(%rsp), TT1
ROUND_00_15 TT2,(i*8+2)
vmovdqu _ytmp+2*SZ8(%rsp), TT2
ROUND_00_15 TT3,(i*8+3)
vmovdqu _ytmp+3*SZ8(%rsp), TT3
ROUND_00_15 TT0,(i*8+4)
ROUND_00_15 TT1,(i*8+5)
ROUND_00_15 TT2,(i*8+6)
ROUND_00_15 TT3,(i*8+7)
i = (i+1)
.endr
add $64, IDX
i = (i*8)
jmp Lrounds_16_xx
.align 16
Lrounds_16_xx:
.rep 16
ROUND_16_XX T1, i
i = (i+1)
.endr
cmp $ROUNDS,ROUND
jb Lrounds_16_xx
# add old digest
vpaddd _digest+0*SZ8(%rsp), a, a
vpaddd _digest+1*SZ8(%rsp), b, b
vpaddd _digest+2*SZ8(%rsp), c, c
vpaddd _digest+3*SZ8(%rsp), d, d
vpaddd _digest+4*SZ8(%rsp), e, e
vpaddd _digest+5*SZ8(%rsp), f, f
vpaddd _digest+6*SZ8(%rsp), g, g
vpaddd _digest+7*SZ8(%rsp), h, h
sub $1, INP_SIZE # unit is blocks
jne lloop
# write back to memory (state object) the transposed digest
vmovdqu a, 0*SHA256_DIGEST_ROW_SIZE(STATE)
vmovdqu b, 1*SHA256_DIGEST_ROW_SIZE(STATE)
vmovdqu c, 2*SHA256_DIGEST_ROW_SIZE(STATE)
vmovdqu d, 3*SHA256_DIGEST_ROW_SIZE(STATE)
vmovdqu e, 4*SHA256_DIGEST_ROW_SIZE(STATE)
vmovdqu f, 5*SHA256_DIGEST_ROW_SIZE(STATE)
vmovdqu g, 6*SHA256_DIGEST_ROW_SIZE(STATE)
vmovdqu h, 7*SHA256_DIGEST_ROW_SIZE(STATE)
# update input pointers
add IDX, inp0
mov inp0, _args_data_ptr+0*8(STATE)
add IDX, inp1
mov inp1, _args_data_ptr+1*8(STATE)
add IDX, inp2
mov inp2, _args_data_ptr+2*8(STATE)
add IDX, inp3
mov inp3, _args_data_ptr+3*8(STATE)
add IDX, inp4
mov inp4, _args_data_ptr+4*8(STATE)
add IDX, inp5
mov inp5, _args_data_ptr+5*8(STATE)
add IDX, inp6
mov inp6, _args_data_ptr+6*8(STATE)
add IDX, inp7
mov inp7, _args_data_ptr+7*8(STATE)
# Postamble
mov _rsp(%rsp), %rsp
# restore callee-saved clobbered registers
pop %r15
pop %r14
pop %r13
pop %r12
ret
ENDPROC(sha256_x8_avx2)
.section .rodata.K256_8, "a", @progbits
.align 64
K256_8:
.octa 0x428a2f98428a2f98428a2f98428a2f98
.octa 0x428a2f98428a2f98428a2f98428a2f98
.octa 0x71374491713744917137449171374491
.octa 0x71374491713744917137449171374491
.octa 0xb5c0fbcfb5c0fbcfb5c0fbcfb5c0fbcf
.octa 0xb5c0fbcfb5c0fbcfb5c0fbcfb5c0fbcf
.octa 0xe9b5dba5e9b5dba5e9b5dba5e9b5dba5
.octa 0xe9b5dba5e9b5dba5e9b5dba5e9b5dba5
.octa 0x3956c25b3956c25b3956c25b3956c25b
.octa 0x3956c25b3956c25b3956c25b3956c25b
.octa 0x59f111f159f111f159f111f159f111f1
.octa 0x59f111f159f111f159f111f159f111f1
.octa 0x923f82a4923f82a4923f82a4923f82a4
.octa 0x923f82a4923f82a4923f82a4923f82a4
.octa 0xab1c5ed5ab1c5ed5ab1c5ed5ab1c5ed5
.octa 0xab1c5ed5ab1c5ed5ab1c5ed5ab1c5ed5
.octa 0xd807aa98d807aa98d807aa98d807aa98
.octa 0xd807aa98d807aa98d807aa98d807aa98
.octa 0x12835b0112835b0112835b0112835b01
.octa 0x12835b0112835b0112835b0112835b01
.octa 0x243185be243185be243185be243185be
.octa 0x243185be243185be243185be243185be
.octa 0x550c7dc3550c7dc3550c7dc3550c7dc3
.octa 0x550c7dc3550c7dc3550c7dc3550c7dc3
.octa 0x72be5d7472be5d7472be5d7472be5d74
.octa 0x72be5d7472be5d7472be5d7472be5d74
.octa 0x80deb1fe80deb1fe80deb1fe80deb1fe
.octa 0x80deb1fe80deb1fe80deb1fe80deb1fe
.octa 0x9bdc06a79bdc06a79bdc06a79bdc06a7
.octa 0x9bdc06a79bdc06a79bdc06a79bdc06a7
.octa 0xc19bf174c19bf174c19bf174c19bf174
.octa 0xc19bf174c19bf174c19bf174c19bf174
.octa 0xe49b69c1e49b69c1e49b69c1e49b69c1
.octa 0xe49b69c1e49b69c1e49b69c1e49b69c1
.octa 0xefbe4786efbe4786efbe4786efbe4786
.octa 0xefbe4786efbe4786efbe4786efbe4786
.octa 0x0fc19dc60fc19dc60fc19dc60fc19dc6
.octa 0x0fc19dc60fc19dc60fc19dc60fc19dc6
.octa 0x240ca1cc240ca1cc240ca1cc240ca1cc
.octa 0x240ca1cc240ca1cc240ca1cc240ca1cc
.octa 0x2de92c6f2de92c6f2de92c6f2de92c6f
.octa 0x2de92c6f2de92c6f2de92c6f2de92c6f
.octa 0x4a7484aa4a7484aa4a7484aa4a7484aa
.octa 0x4a7484aa4a7484aa4a7484aa4a7484aa
.octa 0x5cb0a9dc5cb0a9dc5cb0a9dc5cb0a9dc
.octa 0x5cb0a9dc5cb0a9dc5cb0a9dc5cb0a9dc
.octa 0x76f988da76f988da76f988da76f988da
.octa 0x76f988da76f988da76f988da76f988da
.octa 0x983e5152983e5152983e5152983e5152
.octa 0x983e5152983e5152983e5152983e5152
.octa 0xa831c66da831c66da831c66da831c66d
.octa 0xa831c66da831c66da831c66da831c66d
.octa 0xb00327c8b00327c8b00327c8b00327c8
.octa 0xb00327c8b00327c8b00327c8b00327c8
.octa 0xbf597fc7bf597fc7bf597fc7bf597fc7
.octa 0xbf597fc7bf597fc7bf597fc7bf597fc7
.octa 0xc6e00bf3c6e00bf3c6e00bf3c6e00bf3
.octa 0xc6e00bf3c6e00bf3c6e00bf3c6e00bf3
.octa 0xd5a79147d5a79147d5a79147d5a79147
.octa 0xd5a79147d5a79147d5a79147d5a79147
.octa 0x06ca635106ca635106ca635106ca6351
.octa 0x06ca635106ca635106ca635106ca6351
.octa 0x14292967142929671429296714292967
.octa 0x14292967142929671429296714292967
.octa 0x27b70a8527b70a8527b70a8527b70a85
.octa 0x27b70a8527b70a8527b70a8527b70a85
.octa 0x2e1b21382e1b21382e1b21382e1b2138
.octa 0x2e1b21382e1b21382e1b21382e1b2138
.octa 0x4d2c6dfc4d2c6dfc4d2c6dfc4d2c6dfc
.octa 0x4d2c6dfc4d2c6dfc4d2c6dfc4d2c6dfc
.octa 0x53380d1353380d1353380d1353380d13
.octa 0x53380d1353380d1353380d1353380d13
.octa 0x650a7354650a7354650a7354650a7354
.octa 0x650a7354650a7354650a7354650a7354
.octa 0x766a0abb766a0abb766a0abb766a0abb
.octa 0x766a0abb766a0abb766a0abb766a0abb
.octa 0x81c2c92e81c2c92e81c2c92e81c2c92e
.octa 0x81c2c92e81c2c92e81c2c92e81c2c92e
.octa 0x92722c8592722c8592722c8592722c85
.octa 0x92722c8592722c8592722c8592722c85
.octa 0xa2bfe8a1a2bfe8a1a2bfe8a1a2bfe8a1
.octa 0xa2bfe8a1a2bfe8a1a2bfe8a1a2bfe8a1
.octa 0xa81a664ba81a664ba81a664ba81a664b
.octa 0xa81a664ba81a664ba81a664ba81a664b
.octa 0xc24b8b70c24b8b70c24b8b70c24b8b70
.octa 0xc24b8b70c24b8b70c24b8b70c24b8b70
.octa 0xc76c51a3c76c51a3c76c51a3c76c51a3
.octa 0xc76c51a3c76c51a3c76c51a3c76c51a3
.octa 0xd192e819d192e819d192e819d192e819
.octa 0xd192e819d192e819d192e819d192e819
.octa 0xd6990624d6990624d6990624d6990624
.octa 0xd6990624d6990624d6990624d6990624
.octa 0xf40e3585f40e3585f40e3585f40e3585
.octa 0xf40e3585f40e3585f40e3585f40e3585
.octa 0x106aa070106aa070106aa070106aa070
.octa 0x106aa070106aa070106aa070106aa070
.octa 0x19a4c11619a4c11619a4c11619a4c116
.octa 0x19a4c11619a4c11619a4c11619a4c116
.octa 0x1e376c081e376c081e376c081e376c08
.octa 0x1e376c081e376c081e376c081e376c08
.octa 0x2748774c2748774c2748774c2748774c
.octa 0x2748774c2748774c2748774c2748774c
.octa 0x34b0bcb534b0bcb534b0bcb534b0bcb5
.octa 0x34b0bcb534b0bcb534b0bcb534b0bcb5
.octa 0x391c0cb3391c0cb3391c0cb3391c0cb3
.octa 0x391c0cb3391c0cb3391c0cb3391c0cb3
.octa 0x4ed8aa4a4ed8aa4a4ed8aa4a4ed8aa4a
.octa 0x4ed8aa4a4ed8aa4a4ed8aa4a4ed8aa4a
.octa 0x5b9cca4f5b9cca4f5b9cca4f5b9cca4f
.octa 0x5b9cca4f5b9cca4f5b9cca4f5b9cca4f
.octa 0x682e6ff3682e6ff3682e6ff3682e6ff3
.octa 0x682e6ff3682e6ff3682e6ff3682e6ff3
.octa 0x748f82ee748f82ee748f82ee748f82ee
.octa 0x748f82ee748f82ee748f82ee748f82ee
.octa 0x78a5636f78a5636f78a5636f78a5636f
.octa 0x78a5636f78a5636f78a5636f78a5636f
.octa 0x84c8781484c8781484c8781484c87814
.octa 0x84c8781484c8781484c8781484c87814
.octa 0x8cc702088cc702088cc702088cc70208
.octa 0x8cc702088cc702088cc702088cc70208
.octa 0x90befffa90befffa90befffa90befffa
.octa 0x90befffa90befffa90befffa90befffa
.octa 0xa4506ceba4506ceba4506ceba4506ceb
.octa 0xa4506ceba4506ceba4506ceba4506ceb
.octa 0xbef9a3f7bef9a3f7bef9a3f7bef9a3f7
.octa 0xbef9a3f7bef9a3f7bef9a3f7bef9a3f7
.octa 0xc67178f2c67178f2c67178f2c67178f2
.octa 0xc67178f2c67178f2c67178f2c67178f2
.section .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
.align 32
PSHUFFLE_BYTE_FLIP_MASK:
.octa 0x0c0d0e0f08090a0b0405060700010203
.octa 0x0c0d0e0f08090a0b0405060700010203
.section .rodata.cst256.K256, "aM", @progbits, 256
.align 64
.global K256
K256:
.int 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.int 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.int 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.int 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.int 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.int 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.int 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.int 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.int 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.int 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.int 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.int 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.int 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.int 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.int 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.int 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

View File

@ -1,12 +0,0 @@
# SPDX-License-Identifier: GPL-2.0
#
# Arch-specific CryptoAPI modules.
#
avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\
$(comma)4)$(comma)%ymm2,yes,no)
ifeq ($(avx2_supported),yes)
obj-$(CONFIG_CRYPTO_SHA512_MB) += sha512-mb.o
sha512-mb-y := sha512_mb.o sha512_mb_mgr_flush_avx2.o \
sha512_mb_mgr_init_avx2.o sha512_mb_mgr_submit_avx2.o sha512_x4_avx2.o
endif

File diff suppressed because it is too large Load Diff

View File

@ -1,128 +0,0 @@
/*
* Header file for multi buffer SHA512 context
*
* This file is provided under a dual BSD/GPLv2 license. When using or
* redistributing this file, you may do so under either license.
*
* GPL LICENSE SUMMARY
*
* Copyright(c) 2016 Intel Corporation.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of version 2 of the GNU General Public License as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* Contact Information:
* Megha Dey <megha.dey@linux.intel.com>
*
* BSD LICENSE
*
* Copyright(c) 2016 Intel Corporation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _SHA_MB_CTX_INTERNAL_H
#define _SHA_MB_CTX_INTERNAL_H
#include "sha512_mb_mgr.h"
#define HASH_UPDATE 0x00
#define HASH_LAST 0x01
#define HASH_DONE 0x02
#define HASH_FINAL 0x04
#define HASH_CTX_STS_IDLE 0x00
#define HASH_CTX_STS_PROCESSING 0x01
#define HASH_CTX_STS_LAST 0x02
#define HASH_CTX_STS_COMPLETE 0x04
enum hash_ctx_error {
HASH_CTX_ERROR_NONE = 0,
HASH_CTX_ERROR_INVALID_FLAGS = -1,
HASH_CTX_ERROR_ALREADY_PROCESSING = -2,
HASH_CTX_ERROR_ALREADY_COMPLETED = -3,
};
#define hash_ctx_user_data(ctx) ((ctx)->user_data)
#define hash_ctx_digest(ctx) ((ctx)->job.result_digest)
#define hash_ctx_processing(ctx) ((ctx)->status & HASH_CTX_STS_PROCESSING)
#define hash_ctx_complete(ctx) ((ctx)->status == HASH_CTX_STS_COMPLETE)
#define hash_ctx_status(ctx) ((ctx)->status)
#define hash_ctx_error(ctx) ((ctx)->error)
#define hash_ctx_init(ctx) \
do { \
(ctx)->error = HASH_CTX_ERROR_NONE; \
(ctx)->status = HASH_CTX_STS_COMPLETE; \
} while (0)
/* Hash Constants and Typedefs */
#define SHA512_DIGEST_LENGTH 8
#define SHA512_LOG2_BLOCK_SIZE 7
#define SHA512_PADLENGTHFIELD_SIZE 16
#ifdef SHA_MB_DEBUG
#define assert(expr) \
do { \
if (unlikely(!(expr))) { \
printk(KERN_ERR "Assertion failed! %s,%s,%s,line=%d\n", \
#expr, __FILE__, __func__, __LINE__); \
} \
} while (0)
#else
#define assert(expr) do {} while (0)
#endif
struct sha512_ctx_mgr {
struct sha512_mb_mgr mgr;
};
/* typedef struct sha512_ctx_mgr sha512_ctx_mgr; */
struct sha512_hash_ctx {
/* Must be at struct offset 0 */
struct job_sha512 job;
/* status flag */
int status;
/* error flag */
int error;
uint64_t total_length;
const void *incoming_buffer;
uint32_t incoming_buffer_length;
uint8_t partial_block_buffer[SHA512_BLOCK_SIZE * 2];
uint32_t partial_block_buffer_length;
void *user_data;
};
#endif

View File

@ -1,104 +0,0 @@
/*
* Header file for multi buffer SHA512 algorithm manager
*
* This file is provided under a dual BSD/GPLv2 license. When using or
* redistributing this file, you may do so under either license.
*
* GPL LICENSE SUMMARY
*
* Copyright(c) 2016 Intel Corporation.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of version 2 of the GNU General Public License as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* Contact Information:
* Megha Dey <megha.dey@linux.intel.com>
*
* BSD LICENSE
*
* Copyright(c) 2016 Intel Corporation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __SHA_MB_MGR_H
#define __SHA_MB_MGR_H
#include <linux/types.h>
#define NUM_SHA512_DIGEST_WORDS 8
enum job_sts {STS_UNKNOWN = 0,
STS_BEING_PROCESSED = 1,
STS_COMPLETED = 2,
STS_INTERNAL_ERROR = 3,
STS_ERROR = 4
};
struct job_sha512 {
u8 *buffer;
u64 len;
u64 result_digest[NUM_SHA512_DIGEST_WORDS] __aligned(32);
enum job_sts status;
void *user_data;
};
struct sha512_args_x4 {
uint64_t digest[8][4];
uint8_t *data_ptr[4];
};
struct sha512_lane_data {
struct job_sha512 *job_in_lane;
};
struct sha512_mb_mgr {
struct sha512_args_x4 args;
uint64_t lens[4];
/* each byte is index (0...7) of unused lanes */
uint64_t unused_lanes;
/* byte 4 is set to FF as a flag */
struct sha512_lane_data ldata[4];
};
#define SHA512_MB_MGR_NUM_LANES_AVX2 4
void sha512_mb_mgr_init_avx2(struct sha512_mb_mgr *state);
struct job_sha512 *sha512_mb_mgr_submit_avx2(struct sha512_mb_mgr *state,
struct job_sha512 *job);
struct job_sha512 *sha512_mb_mgr_flush_avx2(struct sha512_mb_mgr *state);
struct job_sha512 *sha512_mb_mgr_get_comp_job_avx2(struct sha512_mb_mgr *state);
#endif

View File

@ -1,281 +0,0 @@
/*
* Header file for multi buffer SHA256 algorithm data structure
*
* This file is provided under a dual BSD/GPLv2 license. When using or
* redistributing this file, you may do so under either license.
*
* GPL LICENSE SUMMARY
*
* Copyright(c) 2016 Intel Corporation.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of version 2 of the GNU General Public License as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* Contact Information:
* Megha Dey <megha.dey@linux.intel.com>
*
* BSD LICENSE
*
* Copyright(c) 2016 Intel Corporation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
# Macros for defining data structures
# Usage example
#START_FIELDS # JOB_AES
### name size align
#FIELD _plaintext, 8, 8 # pointer to plaintext
#FIELD _ciphertext, 8, 8 # pointer to ciphertext
#FIELD _IV, 16, 8 # IV
#FIELD _keys, 8, 8 # pointer to keys
#FIELD _len, 4, 4 # length in bytes
#FIELD _status, 4, 4 # status enumeration
#FIELD _user_data, 8, 8 # pointer to user data
#UNION _union, size1, align1, \
# size2, align2, \
# size3, align3, \
# ...
#END_FIELDS
#%assign _JOB_AES_size _FIELD_OFFSET
#%assign _JOB_AES_align _STRUCT_ALIGN
#########################################################################
# Alternate "struc-like" syntax:
# STRUCT job_aes2
# RES_Q .plaintext, 1
# RES_Q .ciphertext, 1
# RES_DQ .IV, 1
# RES_B .nested, _JOB_AES_SIZE, _JOB_AES_ALIGN
# RES_U .union, size1, align1, \
# size2, align2, \
# ...
# ENDSTRUCT
# # Following only needed if nesting
# %assign job_aes2_size _FIELD_OFFSET
# %assign job_aes2_align _STRUCT_ALIGN
#
# RES_* macros take a name, a count and an optional alignment.
# The count in in terms of the base size of the macro, and the
# default alignment is the base size.
# The macros are:
# Macro Base size
# RES_B 1
# RES_W 2
# RES_D 4
# RES_Q 8
# RES_DQ 16
# RES_Y 32
# RES_Z 64
#
# RES_U defines a union. It's arguments are a name and two or more
# pairs of "size, alignment"
#
# The two assigns are only needed if this structure is being nested
# within another. Even if the assigns are not done, one can still use
# STRUCT_NAME_size as the size of the structure.
#
# Note that for nesting, you still need to assign to STRUCT_NAME_size.
#
# The differences between this and using "struc" directly are that each
# type is implicitly aligned to its natural length (although this can be
# over-ridden with an explicit third parameter), and that the structure
# is padded at the end to its overall alignment.
#
#########################################################################
#ifndef _DATASTRUCT_ASM_
#define _DATASTRUCT_ASM_
#define PTR_SZ 8
#define SHA512_DIGEST_WORD_SIZE 8
#define SHA512_MB_MGR_NUM_LANES_AVX2 4
#define NUM_SHA512_DIGEST_WORDS 8
#define SZ4 4*SHA512_DIGEST_WORD_SIZE
#define ROUNDS 80*SZ4
#define SHA512_DIGEST_ROW_SIZE (SHA512_MB_MGR_NUM_LANES_AVX2 * 8)
# START_FIELDS
.macro START_FIELDS
_FIELD_OFFSET = 0
_STRUCT_ALIGN = 0
.endm
# FIELD name size align
.macro FIELD name size align
_FIELD_OFFSET = (_FIELD_OFFSET + (\align) - 1) & (~ ((\align)-1))
\name = _FIELD_OFFSET
_FIELD_OFFSET = _FIELD_OFFSET + (\size)
.if (\align > _STRUCT_ALIGN)
_STRUCT_ALIGN = \align
.endif
.endm
# END_FIELDS
.macro END_FIELDS
_FIELD_OFFSET = (_FIELD_OFFSET + _STRUCT_ALIGN-1) & (~ (_STRUCT_ALIGN-1))
.endm
.macro STRUCT p1
START_FIELDS
.struc \p1
.endm
.macro ENDSTRUCT
tmp = _FIELD_OFFSET
END_FIELDS
tmp = (_FIELD_OFFSET - ##tmp)
.if (tmp > 0)
.lcomm tmp
.endm
## RES_int name size align
.macro RES_int p1 p2 p3
name = \p1
size = \p2
align = .\p3
_FIELD_OFFSET = (_FIELD_OFFSET + (align) - 1) & (~ ((align)-1))
.align align
.lcomm name size
_FIELD_OFFSET = _FIELD_OFFSET + (size)
.if (align > _STRUCT_ALIGN)
_STRUCT_ALIGN = align
.endif
.endm
# macro RES_B name, size [, align]
.macro RES_B _name, _size, _align=1
RES_int _name _size _align
.endm
# macro RES_W name, size [, align]
.macro RES_W _name, _size, _align=2
RES_int _name 2*(_size) _align
.endm
# macro RES_D name, size [, align]
.macro RES_D _name, _size, _align=4
RES_int _name 4*(_size) _align
.endm
# macro RES_Q name, size [, align]
.macro RES_Q _name, _size, _align=8
RES_int _name 8*(_size) _align
.endm
# macro RES_DQ name, size [, align]
.macro RES_DQ _name, _size, _align=16
RES_int _name 16*(_size) _align
.endm
# macro RES_Y name, size [, align]
.macro RES_Y _name, _size, _align=32
RES_int _name 32*(_size) _align
.endm
# macro RES_Z name, size [, align]
.macro RES_Z _name, _size, _align=64
RES_int _name 64*(_size) _align
.endm
#endif
###################################################################
### Define SHA512 Out Of Order Data Structures
###################################################################
START_FIELDS # LANE_DATA
### name size align
FIELD _job_in_lane, 8, 8 # pointer to job object
END_FIELDS
_LANE_DATA_size = _FIELD_OFFSET
_LANE_DATA_align = _STRUCT_ALIGN
####################################################################
START_FIELDS # SHA512_ARGS_X4
### name size align
FIELD _digest, 8*8*4, 4 # transposed digest
FIELD _data_ptr, 8*4, 8 # array of pointers to data
END_FIELDS
_SHA512_ARGS_X4_size = _FIELD_OFFSET
_SHA512_ARGS_X4_align = _STRUCT_ALIGN
#####################################################################
START_FIELDS # MB_MGR
### name size align
FIELD _args, _SHA512_ARGS_X4_size, _SHA512_ARGS_X4_align
FIELD _lens, 8*4, 8
FIELD _unused_lanes, 8, 8
FIELD _ldata, _LANE_DATA_size*4, _LANE_DATA_align
END_FIELDS
_MB_MGR_size = _FIELD_OFFSET
_MB_MGR_align = _STRUCT_ALIGN
_args_digest = _args + _digest
_args_data_ptr = _args + _data_ptr
#######################################################################
#######################################################################
#### Define constants
#######################################################################
#define STS_UNKNOWN 0
#define STS_BEING_PROCESSED 1
#define STS_COMPLETED 2
#######################################################################
#### Define JOB_SHA512 structure
#######################################################################
START_FIELDS # JOB_SHA512
### name size align
FIELD _buffer, 8, 8 # pointer to buffer
FIELD _len, 8, 8 # length in bytes
FIELD _result_digest, 8*8, 32 # Digest (output)
FIELD _status, 4, 4
FIELD _user_data, 8, 8
END_FIELDS
_JOB_SHA512_size = _FIELD_OFFSET
_JOB_SHA512_align = _STRUCT_ALIGN

View File

@ -1,297 +0,0 @@
/*
* Flush routine for SHA512 multibuffer
*
* This file is provided under a dual BSD/GPLv2 license. When using or
* redistributing this file, you may do so under either license.
*
* GPL LICENSE SUMMARY
*
* Copyright(c) 2016 Intel Corporation.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of version 2 of the GNU General Public License as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* Contact Information:
* Megha Dey <megha.dey@linux.intel.com>
*
* BSD LICENSE
*
* Copyright(c) 2016 Intel Corporation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <linux/linkage.h>
#include <asm/frame.h>
#include "sha512_mb_mgr_datastruct.S"
.extern sha512_x4_avx2
# LINUX register definitions
#define arg1 %rdi
#define arg2 %rsi
# idx needs to be other than arg1, arg2, rbx, r12
#define idx %rdx
# Common definitions
#define state arg1
#define job arg2
#define len2 arg2
#define unused_lanes %rbx
#define lane_data %rbx
#define tmp2 %rbx
#define job_rax %rax
#define tmp1 %rax
#define size_offset %rax
#define tmp %rax
#define start_offset %rax
#define tmp3 arg1
#define extra_blocks arg2
#define p arg2
#define tmp4 %r8
#define lens0 %r8
#define lens1 %r9
#define lens2 %r10
#define lens3 %r11
.macro LABEL prefix n
\prefix\n\():
.endm
.macro JNE_SKIP i
jne skip_\i
.endm
.altmacro
.macro SET_OFFSET _offset
offset = \_offset
.endm
.noaltmacro
# JOB* sha512_mb_mgr_flush_avx2(MB_MGR *state)
# arg 1 : rcx : state
ENTRY(sha512_mb_mgr_flush_avx2)
FRAME_BEGIN
push %rbx
# If bit (32+3) is set, then all lanes are empty
mov _unused_lanes(state), unused_lanes
bt $32+7, unused_lanes
jc return_null
# find a lane with a non-null job
xor idx, idx
offset = (_ldata + 1*_LANE_DATA_size + _job_in_lane)
cmpq $0, offset(state)
cmovne one(%rip), idx
offset = (_ldata + 2*_LANE_DATA_size + _job_in_lane)
cmpq $0, offset(state)
cmovne two(%rip), idx
offset = (_ldata + 3*_LANE_DATA_size + _job_in_lane)
cmpq $0, offset(state)
cmovne three(%rip), idx
# copy idx to empty lanes
copy_lane_data:
offset = (_args + _data_ptr)
mov offset(state,idx,8), tmp
I = 0
.rep 4
offset = (_ldata + I * _LANE_DATA_size + _job_in_lane)
cmpq $0, offset(state)
.altmacro
JNE_SKIP %I
offset = (_args + _data_ptr + 8*I)
mov tmp, offset(state)
offset = (_lens + 8*I +4)
movl $0xFFFFFFFF, offset(state)
LABEL skip_ %I
I = (I+1)
.noaltmacro
.endr
# Find min length
mov _lens + 0*8(state),lens0
mov lens0,idx
mov _lens + 1*8(state),lens1
cmp idx,lens1
cmovb lens1,idx
mov _lens + 2*8(state),lens2
cmp idx,lens2
cmovb lens2,idx
mov _lens + 3*8(state),lens3
cmp idx,lens3
cmovb lens3,idx
mov idx,len2
and $0xF,idx
and $~0xFF,len2
jz len_is_0
sub len2, lens0
sub len2, lens1
sub len2, lens2
sub len2, lens3
shr $32,len2
mov lens0, _lens + 0*8(state)
mov lens1, _lens + 1*8(state)
mov lens2, _lens + 2*8(state)
mov lens3, _lens + 3*8(state)
# "state" and "args" are the same address, arg1
# len is arg2
call sha512_x4_avx2
# state and idx are intact
len_is_0:
# process completed job "idx"
imul $_LANE_DATA_size, idx, lane_data
lea _ldata(state, lane_data), lane_data
mov _job_in_lane(lane_data), job_rax
movq $0, _job_in_lane(lane_data)
movl $STS_COMPLETED, _status(job_rax)
mov _unused_lanes(state), unused_lanes
shl $8, unused_lanes
or idx, unused_lanes
mov unused_lanes, _unused_lanes(state)
movl $0xFFFFFFFF, _lens+4(state, idx, 8)
vmovq _args_digest+0*32(state, idx, 8), %xmm0
vpinsrq $1, _args_digest+1*32(state, idx, 8), %xmm0, %xmm0
vmovq _args_digest+2*32(state, idx, 8), %xmm1
vpinsrq $1, _args_digest+3*32(state, idx, 8), %xmm1, %xmm1
vmovq _args_digest+4*32(state, idx, 8), %xmm2
vpinsrq $1, _args_digest+5*32(state, idx, 8), %xmm2, %xmm2
vmovq _args_digest+6*32(state, idx, 8), %xmm3
vpinsrq $1, _args_digest+7*32(state, idx, 8), %xmm3, %xmm3
vmovdqu %xmm0, _result_digest(job_rax)
vmovdqu %xmm1, _result_digest+1*16(job_rax)
vmovdqu %xmm2, _result_digest+2*16(job_rax)
vmovdqu %xmm3, _result_digest+3*16(job_rax)
return:
pop %rbx
FRAME_END
ret
return_null:
xor job_rax, job_rax
jmp return
ENDPROC(sha512_mb_mgr_flush_avx2)
.align 16
ENTRY(sha512_mb_mgr_get_comp_job_avx2)
push %rbx
mov _unused_lanes(state), unused_lanes
bt $(32+7), unused_lanes
jc .return_null
# Find min length
mov _lens(state),lens0
mov lens0,idx
mov _lens+1*8(state),lens1
cmp idx,lens1
cmovb lens1,idx
mov _lens+2*8(state),lens2
cmp idx,lens2
cmovb lens2,idx
mov _lens+3*8(state),lens3
cmp idx,lens3
cmovb lens3,idx
test $~0xF,idx
jnz .return_null
and $0xF,idx
#process completed job "idx"
imul $_LANE_DATA_size, idx, lane_data
lea _ldata(state, lane_data), lane_data
mov _job_in_lane(lane_data), job_rax
movq $0, _job_in_lane(lane_data)
movl $STS_COMPLETED, _status(job_rax)
mov _unused_lanes(state), unused_lanes
shl $8, unused_lanes
or idx, unused_lanes
mov unused_lanes, _unused_lanes(state)
movl $0xFFFFFFFF, _lens+4(state, idx, 8)
vmovq _args_digest(state, idx, 8), %xmm0
vpinsrq $1, _args_digest+1*32(state, idx, 8), %xmm0, %xmm0
vmovq _args_digest+2*32(state, idx, 8), %xmm1
vpinsrq $1, _args_digest+3*32(state, idx, 8), %xmm1, %xmm1
vmovq _args_digest+4*32(state, idx, 8), %xmm2
vpinsrq $1, _args_digest+5*32(state, idx, 8), %xmm2, %xmm2
vmovq _args_digest+6*32(state, idx, 8), %xmm3
vpinsrq $1, _args_digest+7*32(state, idx, 8), %xmm3, %xmm3
vmovdqu %xmm0, _result_digest+0*16(job_rax)
vmovdqu %xmm1, _result_digest+1*16(job_rax)
vmovdqu %xmm2, _result_digest+2*16(job_rax)
vmovdqu %xmm3, _result_digest+3*16(job_rax)
pop %rbx
ret
.return_null:
xor job_rax, job_rax
pop %rbx
ret
ENDPROC(sha512_mb_mgr_get_comp_job_avx2)
.section .rodata.cst8.one, "aM", @progbits, 8
.align 8
one:
.quad 1
.section .rodata.cst8.two, "aM", @progbits, 8
.align 8
two:
.quad 2
.section .rodata.cst8.three, "aM", @progbits, 8
.align 8
three:
.quad 3

View File

@ -1,69 +0,0 @@
/*
* Initialization code for multi buffer SHA256 algorithm for AVX2
*
* This file is provided under a dual BSD/GPLv2 license. When using or
* redistributing this file, you may do so under either license.
*
* GPL LICENSE SUMMARY
*
* Copyright(c) 2016 Intel Corporation.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of version 2 of the GNU General Public License as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* Contact Information:
* Megha Dey <megha.dey@linux.intel.com>
*
* BSD LICENSE
*
* Copyright(c) 2016 Intel Corporation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "sha512_mb_mgr.h"
void sha512_mb_mgr_init_avx2(struct sha512_mb_mgr *state)
{
unsigned int j;
/* initially all lanes are unused */
state->lens[0] = 0xFFFFFFFF00000000;
state->lens[1] = 0xFFFFFFFF00000001;
state->lens[2] = 0xFFFFFFFF00000002;
state->lens[3] = 0xFFFFFFFF00000003;
state->unused_lanes = 0xFF03020100;
for (j = 0; j < 4; j++)
state->ldata[j].job_in_lane = NULL;
}

View File

@ -1,224 +0,0 @@
/*
* Buffer submit code for multi buffer SHA512 algorithm
*
* This file is provided under a dual BSD/GPLv2 license. When using or
* redistributing this file, you may do so under either license.
*
* GPL LICENSE SUMMARY
*
* Copyright(c) 2016 Intel Corporation.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of version 2 of the GNU General Public License as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* Contact Information:
* Megha Dey <megha.dey@linux.intel.com>
*
* BSD LICENSE
*
* Copyright(c) 2016 Intel Corporation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <linux/linkage.h>
#include <asm/frame.h>
#include "sha512_mb_mgr_datastruct.S"
.extern sha512_x4_avx2
#define arg1 %rdi
#define arg2 %rsi
#define idx %rdx
#define last_len %rdx
#define size_offset %rcx
#define tmp2 %rcx
# Common definitions
#define state arg1
#define job arg2
#define len2 arg2
#define p2 arg2
#define p %r11
#define start_offset %r11
#define unused_lanes %rbx
#define job_rax %rax
#define len %rax
#define lane %r12
#define tmp3 %r12
#define lens3 %r12
#define extra_blocks %r8
#define lens0 %r8
#define tmp %r9
#define lens1 %r9
#define lane_data %r10
#define lens2 %r10
#define DWORD_len %eax
# JOB* sha512_mb_mgr_submit_avx2(MB_MGR *state, JOB *job)
# arg 1 : rcx : state
# arg 2 : rdx : job
ENTRY(sha512_mb_mgr_submit_avx2)
FRAME_BEGIN
push %rbx
push %r12
mov _unused_lanes(state), unused_lanes
movzb %bl,lane
shr $8, unused_lanes
imul $_LANE_DATA_size, lane,lane_data
movl $STS_BEING_PROCESSED, _status(job)
lea _ldata(state, lane_data), lane_data
mov unused_lanes, _unused_lanes(state)
movl _len(job), DWORD_len
mov job, _job_in_lane(lane_data)
movl DWORD_len,_lens+4(state , lane, 8)
# Load digest words from result_digest
vmovdqu _result_digest+0*16(job), %xmm0
vmovdqu _result_digest+1*16(job), %xmm1
vmovdqu _result_digest+2*16(job), %xmm2
vmovdqu _result_digest+3*16(job), %xmm3
vmovq %xmm0, _args_digest(state, lane, 8)
vpextrq $1, %xmm0, _args_digest+1*32(state , lane, 8)
vmovq %xmm1, _args_digest+2*32(state , lane, 8)
vpextrq $1, %xmm1, _args_digest+3*32(state , lane, 8)
vmovq %xmm2, _args_digest+4*32(state , lane, 8)
vpextrq $1, %xmm2, _args_digest+5*32(state , lane, 8)
vmovq %xmm3, _args_digest+6*32(state , lane, 8)
vpextrq $1, %xmm3, _args_digest+7*32(state , lane, 8)
mov _buffer(job), p
mov p, _args_data_ptr(state, lane, 8)
cmp $0xFF, unused_lanes
jne return_null
start_loop:
# Find min length
mov _lens+0*8(state),lens0
mov lens0,idx
mov _lens+1*8(state),lens1
cmp idx,lens1
cmovb lens1, idx
mov _lens+2*8(state),lens2
cmp idx,lens2
cmovb lens2,idx
mov _lens+3*8(state),lens3
cmp idx,lens3
cmovb lens3,idx
mov idx,len2
and $0xF,idx
and $~0xFF,len2
jz len_is_0
sub len2,lens0
sub len2,lens1
sub len2,lens2
sub len2,lens3
shr $32,len2
mov lens0, _lens + 0*8(state)
mov lens1, _lens + 1*8(state)
mov lens2, _lens + 2*8(state)
mov lens3, _lens + 3*8(state)
# "state" and "args" are the same address, arg1
# len is arg2
call sha512_x4_avx2
# state and idx are intact
len_is_0:
# process completed job "idx"
imul $_LANE_DATA_size, idx, lane_data
lea _ldata(state, lane_data), lane_data
mov _job_in_lane(lane_data), job_rax
mov _unused_lanes(state), unused_lanes
movq $0, _job_in_lane(lane_data)
movl $STS_COMPLETED, _status(job_rax)
shl $8, unused_lanes
or idx, unused_lanes
mov unused_lanes, _unused_lanes(state)
movl $0xFFFFFFFF,_lens+4(state,idx,8)
vmovq _args_digest+0*32(state , idx, 8), %xmm0
vpinsrq $1, _args_digest+1*32(state , idx, 8), %xmm0, %xmm0
vmovq _args_digest+2*32(state , idx, 8), %xmm1
vpinsrq $1, _args_digest+3*32(state , idx, 8), %xmm1, %xmm1
vmovq _args_digest+4*32(state , idx, 8), %xmm2
vpinsrq $1, _args_digest+5*32(state , idx, 8), %xmm2, %xmm2
vmovq _args_digest+6*32(state , idx, 8), %xmm3
vpinsrq $1, _args_digest+7*32(state , idx, 8), %xmm3, %xmm3
vmovdqu %xmm0, _result_digest + 0*16(job_rax)
vmovdqu %xmm1, _result_digest + 1*16(job_rax)
vmovdqu %xmm2, _result_digest + 2*16(job_rax)
vmovdqu %xmm3, _result_digest + 3*16(job_rax)
return:
pop %r12
pop %rbx
FRAME_END
ret
return_null:
xor job_rax, job_rax
jmp return
ENDPROC(sha512_mb_mgr_submit_avx2)
/* UNUSED?
.section .rodata.cst16, "aM", @progbits, 16
.align 16
H0: .int 0x6a09e667
H1: .int 0xbb67ae85
H2: .int 0x3c6ef372
H3: .int 0xa54ff53a
H4: .int 0x510e527f
H5: .int 0x9b05688c
H6: .int 0x1f83d9ab
H7: .int 0x5be0cd19
*/

View File

@ -1,531 +0,0 @@
/*
* Multi-buffer SHA512 algorithm hash compute routine
*
* This file is provided under a dual BSD/GPLv2 license. When using or
* redistributing this file, you may do so under either license.
*
* GPL LICENSE SUMMARY
*
* Copyright(c) 2016 Intel Corporation.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of version 2 of the GNU General Public License as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* Contact Information:
* Megha Dey <megha.dey@linux.intel.com>
*
* BSD LICENSE
*
* Copyright(c) 2016 Intel Corporation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
# code to compute quad SHA512 using AVX2
# use YMMs to tackle the larger digest size
# outer calling routine takes care of save and restore of XMM registers
# Logic designed/laid out by JDG
# Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; ymm0-15
# Stack must be aligned to 32 bytes before call
# Linux clobbers: rax rbx rcx rsi r8 r9 r10 r11 r12
# Linux preserves: rcx rdx rdi rbp r13 r14 r15
# clobbers ymm0-15
#include <linux/linkage.h>
#include "sha512_mb_mgr_datastruct.S"
arg1 = %rdi
arg2 = %rsi
# Common definitions
STATE = arg1
INP_SIZE = arg2
IDX = %rax
ROUND = %rbx
TBL = %r8
inp0 = %r9
inp1 = %r10
inp2 = %r11
inp3 = %r12
a = %ymm0
b = %ymm1
c = %ymm2
d = %ymm3
e = %ymm4
f = %ymm5
g = %ymm6
h = %ymm7
a0 = %ymm8
a1 = %ymm9
a2 = %ymm10
TT0 = %ymm14
TT1 = %ymm13
TT2 = %ymm12
TT3 = %ymm11
TT4 = %ymm10
TT5 = %ymm9
T1 = %ymm14
TMP = %ymm15
# Define stack usage
STACK_SPACE1 = SZ4*16 + NUM_SHA512_DIGEST_WORDS*SZ4 + 24
#define VMOVPD vmovupd
_digest = SZ4*16
# transpose r0, r1, r2, r3, t0, t1
# "transpose" data in {r0..r3} using temps {t0..t3}
# Input looks like: {r0 r1 r2 r3}
# r0 = {a7 a6 a5 a4 a3 a2 a1 a0}
# r1 = {b7 b6 b5 b4 b3 b2 b1 b0}
# r2 = {c7 c6 c5 c4 c3 c2 c1 c0}
# r3 = {d7 d6 d5 d4 d3 d2 d1 d0}
#
# output looks like: {t0 r1 r0 r3}
# t0 = {d1 d0 c1 c0 b1 b0 a1 a0}
# r1 = {d3 d2 c3 c2 b3 b2 a3 a2}
# r0 = {d5 d4 c5 c4 b5 b4 a5 a4}
# r3 = {d7 d6 c7 c6 b7 b6 a7 a6}
.macro TRANSPOSE r0 r1 r2 r3 t0 t1
vshufps $0x44, \r1, \r0, \t0 # t0 = {b5 b4 a5 a4 b1 b0 a1 a0}
vshufps $0xEE, \r1, \r0, \r0 # r0 = {b7 b6 a7 a6 b3 b2 a3 a2}
vshufps $0x44, \r3, \r2, \t1 # t1 = {d5 d4 c5 c4 d1 d0 c1 c0}
vshufps $0xEE, \r3, \r2, \r2 # r2 = {d7 d6 c7 c6 d3 d2 c3 c2}
vperm2f128 $0x20, \r2, \r0, \r1 # h6...a6
vperm2f128 $0x31, \r2, \r0, \r3 # h2...a2
vperm2f128 $0x31, \t1, \t0, \r0 # h5...a5
vperm2f128 $0x20, \t1, \t0, \t0 # h1...a1
.endm
.macro ROTATE_ARGS
TMP_ = h
h = g
g = f
f = e
e = d
d = c
c = b
b = a
a = TMP_
.endm
# PRORQ reg, imm, tmp
# packed-rotate-right-double
# does a rotate by doing two shifts and an or
.macro _PRORQ reg imm tmp
vpsllq $(64-\imm),\reg,\tmp
vpsrlq $\imm,\reg, \reg
vpor \tmp,\reg, \reg
.endm
# non-destructive
# PRORQ_nd reg, imm, tmp, src
.macro _PRORQ_nd reg imm tmp src
vpsllq $(64-\imm), \src, \tmp
vpsrlq $\imm, \src, \reg
vpor \tmp, \reg, \reg
.endm
# PRORQ dst/src, amt
.macro PRORQ reg imm
_PRORQ \reg, \imm, TMP
.endm
# PRORQ_nd dst, src, amt
.macro PRORQ_nd reg tmp imm
_PRORQ_nd \reg, \imm, TMP, \tmp
.endm
#; arguments passed implicitly in preprocessor symbols i, a...h
.macro ROUND_00_15 _T1 i
PRORQ_nd a0, e, (18-14) # sig1: a0 = (e >> 4)
vpxor g, f, a2 # ch: a2 = f^g
vpand e,a2, a2 # ch: a2 = (f^g)&e
vpxor g, a2, a2 # a2 = ch
PRORQ_nd a1,e,41 # sig1: a1 = (e >> 25)
offset = SZ4*(\i & 0xf)
vmovdqu \_T1,offset(%rsp)
vpaddq (TBL,ROUND,1), \_T1, \_T1 # T1 = W + K
vpxor e,a0, a0 # sig1: a0 = e ^ (e >> 5)
PRORQ a0, 14 # sig1: a0 = (e >> 6) ^ (e >> 11)
vpaddq a2, h, h # h = h + ch
PRORQ_nd a2,a,6 # sig0: a2 = (a >> 11)
vpaddq \_T1,h, h # h = h + ch + W + K
vpxor a1, a0, a0 # a0 = sigma1
vmovdqu a,\_T1
PRORQ_nd a1,a,39 # sig0: a1 = (a >> 22)
vpxor c, \_T1, \_T1 # maj: T1 = a^c
add $SZ4, ROUND # ROUND++
vpand b, \_T1, \_T1 # maj: T1 = (a^c)&b
vpaddq a0, h, h
vpaddq h, d, d
vpxor a, a2, a2 # sig0: a2 = a ^ (a >> 11)
PRORQ a2,28 # sig0: a2 = (a >> 2) ^ (a >> 13)
vpxor a1, a2, a2 # a2 = sig0
vpand c, a, a1 # maj: a1 = a&c
vpor \_T1, a1, a1 # a1 = maj
vpaddq a1, h, h # h = h + ch + W + K + maj
vpaddq a2, h, h # h = h + ch + W + K + maj + sigma0
ROTATE_ARGS
.endm
#; arguments passed implicitly in preprocessor symbols i, a...h
.macro ROUND_16_XX _T1 i
vmovdqu SZ4*((\i-15)&0xf)(%rsp), \_T1
vmovdqu SZ4*((\i-2)&0xf)(%rsp), a1
vmovdqu \_T1, a0
PRORQ \_T1,7
vmovdqu a1, a2
PRORQ a1,42
vpxor a0, \_T1, \_T1
PRORQ \_T1, 1
vpxor a2, a1, a1
PRORQ a1, 19
vpsrlq $7, a0, a0
vpxor a0, \_T1, \_T1
vpsrlq $6, a2, a2
vpxor a2, a1, a1
vpaddq SZ4*((\i-16)&0xf)(%rsp), \_T1, \_T1
vpaddq SZ4*((\i-7)&0xf)(%rsp), a1, a1
vpaddq a1, \_T1, \_T1
ROUND_00_15 \_T1,\i
.endm
# void sha512_x4_avx2(void *STATE, const int INP_SIZE)
# arg 1 : STATE : pointer to input data
# arg 2 : INP_SIZE : size of data in blocks (assumed >= 1)
ENTRY(sha512_x4_avx2)
# general registers preserved in outer calling routine
# outer calling routine saves all the XMM registers
# save callee-saved clobbered registers to comply with C function ABI
push %r12
push %r13
push %r14
push %r15
sub $STACK_SPACE1, %rsp
# Load the pre-transposed incoming digest.
vmovdqu 0*SHA512_DIGEST_ROW_SIZE(STATE),a
vmovdqu 1*SHA512_DIGEST_ROW_SIZE(STATE),b
vmovdqu 2*SHA512_DIGEST_ROW_SIZE(STATE),c
vmovdqu 3*SHA512_DIGEST_ROW_SIZE(STATE),d
vmovdqu 4*SHA512_DIGEST_ROW_SIZE(STATE),e
vmovdqu 5*SHA512_DIGEST_ROW_SIZE(STATE),f
vmovdqu 6*SHA512_DIGEST_ROW_SIZE(STATE),g
vmovdqu 7*SHA512_DIGEST_ROW_SIZE(STATE),h
lea K512_4(%rip),TBL
# load the address of each of the 4 message lanes
# getting ready to transpose input onto stack
mov _data_ptr+0*PTR_SZ(STATE),inp0
mov _data_ptr+1*PTR_SZ(STATE),inp1
mov _data_ptr+2*PTR_SZ(STATE),inp2
mov _data_ptr+3*PTR_SZ(STATE),inp3
xor IDX, IDX
lloop:
xor ROUND, ROUND
# save old digest
vmovdqu a, _digest(%rsp)
vmovdqu b, _digest+1*SZ4(%rsp)
vmovdqu c, _digest+2*SZ4(%rsp)
vmovdqu d, _digest+3*SZ4(%rsp)
vmovdqu e, _digest+4*SZ4(%rsp)
vmovdqu f, _digest+5*SZ4(%rsp)
vmovdqu g, _digest+6*SZ4(%rsp)
vmovdqu h, _digest+7*SZ4(%rsp)
i = 0
.rep 4
vmovdqu PSHUFFLE_BYTE_FLIP_MASK(%rip), TMP
VMOVPD i*32(inp0, IDX), TT2
VMOVPD i*32(inp1, IDX), TT1
VMOVPD i*32(inp2, IDX), TT4
VMOVPD i*32(inp3, IDX), TT3
TRANSPOSE TT2, TT1, TT4, TT3, TT0, TT5
vpshufb TMP, TT0, TT0
vpshufb TMP, TT1, TT1
vpshufb TMP, TT2, TT2
vpshufb TMP, TT3, TT3
ROUND_00_15 TT0,(i*4+0)
ROUND_00_15 TT1,(i*4+1)
ROUND_00_15 TT2,(i*4+2)
ROUND_00_15 TT3,(i*4+3)
i = (i+1)
.endr
add $128, IDX
i = (i*4)
jmp Lrounds_16_xx
.align 16
Lrounds_16_xx:
.rep 16
ROUND_16_XX T1, i
i = (i+1)
.endr
cmp $0xa00,ROUND
jb Lrounds_16_xx
# add old digest
vpaddq _digest(%rsp), a, a
vpaddq _digest+1*SZ4(%rsp), b, b
vpaddq _digest+2*SZ4(%rsp), c, c
vpaddq _digest+3*SZ4(%rsp), d, d
vpaddq _digest+4*SZ4(%rsp), e, e
vpaddq _digest+5*SZ4(%rsp), f, f
vpaddq _digest+6*SZ4(%rsp), g, g
vpaddq _digest+7*SZ4(%rsp), h, h
sub $1, INP_SIZE # unit is blocks
jne lloop
# write back to memory (state object) the transposed digest
vmovdqu a, 0*SHA512_DIGEST_ROW_SIZE(STATE)
vmovdqu b, 1*SHA512_DIGEST_ROW_SIZE(STATE)
vmovdqu c, 2*SHA512_DIGEST_ROW_SIZE(STATE)
vmovdqu d, 3*SHA512_DIGEST_ROW_SIZE(STATE)
vmovdqu e, 4*SHA512_DIGEST_ROW_SIZE(STATE)
vmovdqu f, 5*SHA512_DIGEST_ROW_SIZE(STATE)
vmovdqu g, 6*SHA512_DIGEST_ROW_SIZE(STATE)
vmovdqu h, 7*SHA512_DIGEST_ROW_SIZE(STATE)
# update input data pointers
add IDX, inp0
mov inp0, _data_ptr+0*PTR_SZ(STATE)
add IDX, inp1
mov inp1, _data_ptr+1*PTR_SZ(STATE)
add IDX, inp2
mov inp2, _data_ptr+2*PTR_SZ(STATE)
add IDX, inp3
mov inp3, _data_ptr+3*PTR_SZ(STATE)
#;;;;;;;;;;;;;;;
#; Postamble
add $STACK_SPACE1, %rsp
# restore callee-saved clobbered registers
pop %r15
pop %r14
pop %r13
pop %r12
# outer calling routine restores XMM and other GP registers
ret
ENDPROC(sha512_x4_avx2)
.section .rodata.K512_4, "a", @progbits
.align 64
K512_4:
.octa 0x428a2f98d728ae22428a2f98d728ae22,\
0x428a2f98d728ae22428a2f98d728ae22
.octa 0x7137449123ef65cd7137449123ef65cd,\
0x7137449123ef65cd7137449123ef65cd
.octa 0xb5c0fbcfec4d3b2fb5c0fbcfec4d3b2f,\
0xb5c0fbcfec4d3b2fb5c0fbcfec4d3b2f
.octa 0xe9b5dba58189dbbce9b5dba58189dbbc,\
0xe9b5dba58189dbbce9b5dba58189dbbc
.octa 0x3956c25bf348b5383956c25bf348b538,\
0x3956c25bf348b5383956c25bf348b538
.octa 0x59f111f1b605d01959f111f1b605d019,\
0x59f111f1b605d01959f111f1b605d019
.octa 0x923f82a4af194f9b923f82a4af194f9b,\
0x923f82a4af194f9b923f82a4af194f9b
.octa 0xab1c5ed5da6d8118ab1c5ed5da6d8118,\
0xab1c5ed5da6d8118ab1c5ed5da6d8118
.octa 0xd807aa98a3030242d807aa98a3030242,\
0xd807aa98a3030242d807aa98a3030242
.octa 0x12835b0145706fbe12835b0145706fbe,\
0x12835b0145706fbe12835b0145706fbe
.octa 0x243185be4ee4b28c243185be4ee4b28c,\
0x243185be4ee4b28c243185be4ee4b28c
.octa 0x550c7dc3d5ffb4e2550c7dc3d5ffb4e2,\
0x550c7dc3d5ffb4e2550c7dc3d5ffb4e2
.octa 0x72be5d74f27b896f72be5d74f27b896f,\
0x72be5d74f27b896f72be5d74f27b896f
.octa 0x80deb1fe3b1696b180deb1fe3b1696b1,\
0x80deb1fe3b1696b180deb1fe3b1696b1
.octa 0x9bdc06a725c712359bdc06a725c71235,\
0x9bdc06a725c712359bdc06a725c71235
.octa 0xc19bf174cf692694c19bf174cf692694,\
0xc19bf174cf692694c19bf174cf692694
.octa 0xe49b69c19ef14ad2e49b69c19ef14ad2,\
0xe49b69c19ef14ad2e49b69c19ef14ad2
.octa 0xefbe4786384f25e3efbe4786384f25e3,\
0xefbe4786384f25e3efbe4786384f25e3
.octa 0x0fc19dc68b8cd5b50fc19dc68b8cd5b5,\
0x0fc19dc68b8cd5b50fc19dc68b8cd5b5
.octa 0x240ca1cc77ac9c65240ca1cc77ac9c65,\
0x240ca1cc77ac9c65240ca1cc77ac9c65
.octa 0x2de92c6f592b02752de92c6f592b0275,\
0x2de92c6f592b02752de92c6f592b0275
.octa 0x4a7484aa6ea6e4834a7484aa6ea6e483,\
0x4a7484aa6ea6e4834a7484aa6ea6e483
.octa 0x5cb0a9dcbd41fbd45cb0a9dcbd41fbd4,\
0x5cb0a9dcbd41fbd45cb0a9dcbd41fbd4
.octa 0x76f988da831153b576f988da831153b5,\
0x76f988da831153b576f988da831153b5
.octa 0x983e5152ee66dfab983e5152ee66dfab,\
0x983e5152ee66dfab983e5152ee66dfab
.octa 0xa831c66d2db43210a831c66d2db43210,\
0xa831c66d2db43210a831c66d2db43210
.octa 0xb00327c898fb213fb00327c898fb213f,\
0xb00327c898fb213fb00327c898fb213f
.octa 0xbf597fc7beef0ee4bf597fc7beef0ee4,\
0xbf597fc7beef0ee4bf597fc7beef0ee4
.octa 0xc6e00bf33da88fc2c6e00bf33da88fc2,\
0xc6e00bf33da88fc2c6e00bf33da88fc2
.octa 0xd5a79147930aa725d5a79147930aa725,\
0xd5a79147930aa725d5a79147930aa725
.octa 0x06ca6351e003826f06ca6351e003826f,\
0x06ca6351e003826f06ca6351e003826f
.octa 0x142929670a0e6e70142929670a0e6e70,\
0x142929670a0e6e70142929670a0e6e70
.octa 0x27b70a8546d22ffc27b70a8546d22ffc,\
0x27b70a8546d22ffc27b70a8546d22ffc
.octa 0x2e1b21385c26c9262e1b21385c26c926,\
0x2e1b21385c26c9262e1b21385c26c926
.octa 0x4d2c6dfc5ac42aed4d2c6dfc5ac42aed,\
0x4d2c6dfc5ac42aed4d2c6dfc5ac42aed
.octa 0x53380d139d95b3df53380d139d95b3df,\
0x53380d139d95b3df53380d139d95b3df
.octa 0x650a73548baf63de650a73548baf63de,\
0x650a73548baf63de650a73548baf63de
.octa 0x766a0abb3c77b2a8766a0abb3c77b2a8,\
0x766a0abb3c77b2a8766a0abb3c77b2a8
.octa 0x81c2c92e47edaee681c2c92e47edaee6,\
0x81c2c92e47edaee681c2c92e47edaee6
.octa 0x92722c851482353b92722c851482353b,\
0x92722c851482353b92722c851482353b
.octa 0xa2bfe8a14cf10364a2bfe8a14cf10364,\
0xa2bfe8a14cf10364a2bfe8a14cf10364
.octa 0xa81a664bbc423001a81a664bbc423001,\
0xa81a664bbc423001a81a664bbc423001
.octa 0xc24b8b70d0f89791c24b8b70d0f89791,\
0xc24b8b70d0f89791c24b8b70d0f89791
.octa 0xc76c51a30654be30c76c51a30654be30,\
0xc76c51a30654be30c76c51a30654be30
.octa 0xd192e819d6ef5218d192e819d6ef5218,\
0xd192e819d6ef5218d192e819d6ef5218
.octa 0xd69906245565a910d69906245565a910,\
0xd69906245565a910d69906245565a910
.octa 0xf40e35855771202af40e35855771202a,\
0xf40e35855771202af40e35855771202a
.octa 0x106aa07032bbd1b8106aa07032bbd1b8,\
0x106aa07032bbd1b8106aa07032bbd1b8
.octa 0x19a4c116b8d2d0c819a4c116b8d2d0c8,\
0x19a4c116b8d2d0c819a4c116b8d2d0c8
.octa 0x1e376c085141ab531e376c085141ab53,\
0x1e376c085141ab531e376c085141ab53
.octa 0x2748774cdf8eeb992748774cdf8eeb99,\
0x2748774cdf8eeb992748774cdf8eeb99
.octa 0x34b0bcb5e19b48a834b0bcb5e19b48a8,\
0x34b0bcb5e19b48a834b0bcb5e19b48a8
.octa 0x391c0cb3c5c95a63391c0cb3c5c95a63,\
0x391c0cb3c5c95a63391c0cb3c5c95a63
.octa 0x4ed8aa4ae3418acb4ed8aa4ae3418acb,\
0x4ed8aa4ae3418acb4ed8aa4ae3418acb
.octa 0x5b9cca4f7763e3735b9cca4f7763e373,\
0x5b9cca4f7763e3735b9cca4f7763e373
.octa 0x682e6ff3d6b2b8a3682e6ff3d6b2b8a3,\
0x682e6ff3d6b2b8a3682e6ff3d6b2b8a3
.octa 0x748f82ee5defb2fc748f82ee5defb2fc,\
0x748f82ee5defb2fc748f82ee5defb2fc
.octa 0x78a5636f43172f6078a5636f43172f60,\
0x78a5636f43172f6078a5636f43172f60
.octa 0x84c87814a1f0ab7284c87814a1f0ab72,\
0x84c87814a1f0ab7284c87814a1f0ab72
.octa 0x8cc702081a6439ec8cc702081a6439ec,\
0x8cc702081a6439ec8cc702081a6439ec
.octa 0x90befffa23631e2890befffa23631e28,\
0x90befffa23631e2890befffa23631e28
.octa 0xa4506cebde82bde9a4506cebde82bde9,\
0xa4506cebde82bde9a4506cebde82bde9
.octa 0xbef9a3f7b2c67915bef9a3f7b2c67915,\
0xbef9a3f7b2c67915bef9a3f7b2c67915
.octa 0xc67178f2e372532bc67178f2e372532b,\
0xc67178f2e372532bc67178f2e372532b
.octa 0xca273eceea26619cca273eceea26619c,\
0xca273eceea26619cca273eceea26619c
.octa 0xd186b8c721c0c207d186b8c721c0c207,\
0xd186b8c721c0c207d186b8c721c0c207
.octa 0xeada7dd6cde0eb1eeada7dd6cde0eb1e,\
0xeada7dd6cde0eb1eeada7dd6cde0eb1e
.octa 0xf57d4f7fee6ed178f57d4f7fee6ed178,\
0xf57d4f7fee6ed178f57d4f7fee6ed178
.octa 0x06f067aa72176fba06f067aa72176fba,\
0x06f067aa72176fba06f067aa72176fba
.octa 0x0a637dc5a2c898a60a637dc5a2c898a6,\
0x0a637dc5a2c898a60a637dc5a2c898a6
.octa 0x113f9804bef90dae113f9804bef90dae,\
0x113f9804bef90dae113f9804bef90dae
.octa 0x1b710b35131c471b1b710b35131c471b,\
0x1b710b35131c471b1b710b35131c471b
.octa 0x28db77f523047d8428db77f523047d84,\
0x28db77f523047d8428db77f523047d84
.octa 0x32caab7b40c7249332caab7b40c72493,\
0x32caab7b40c7249332caab7b40c72493
.octa 0x3c9ebe0a15c9bebc3c9ebe0a15c9bebc,\
0x3c9ebe0a15c9bebc3c9ebe0a15c9bebc
.octa 0x431d67c49c100d4c431d67c49c100d4c,\
0x431d67c49c100d4c431d67c49c100d4c
.octa 0x4cc5d4becb3e42b64cc5d4becb3e42b6,\
0x4cc5d4becb3e42b64cc5d4becb3e42b6
.octa 0x597f299cfc657e2a597f299cfc657e2a,\
0x597f299cfc657e2a597f299cfc657e2a
.octa 0x5fcb6fab3ad6faec5fcb6fab3ad6faec,\
0x5fcb6fab3ad6faec5fcb6fab3ad6faec
.octa 0x6c44198c4a4758176c44198c4a475817,\
0x6c44198c4a4758176c44198c4a475817
.section .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
.align 32
PSHUFFLE_BYTE_FLIP_MASK: .octa 0x08090a0b0c0d0e0f0001020304050607
.octa 0x18191a1b1c1d1e1f1011121314151617

View File

@ -213,20 +213,6 @@ config CRYPTO_CRYPTD
converts an arbitrary synchronous software crypto algorithm
into an asynchronous algorithm that executes in a kernel thread.
config CRYPTO_MCRYPTD
tristate "Software async multi-buffer crypto daemon"
select CRYPTO_BLKCIPHER
select CRYPTO_HASH
select CRYPTO_MANAGER
select CRYPTO_WORKQUEUE
help
This is a generic software asynchronous crypto daemon that
provides the kernel thread to assist multi-buffer crypto
algorithms for submitting jobs and flushing jobs in multi-buffer
crypto algorithms. Multi-buffer crypto algorithms are executed
in the context of this kernel thread and drivers can post
their crypto request asynchronously to be processed by this daemon.
config CRYPTO_AUTHENC
tristate "Authenc support"
select CRYPTO_AEAD
@ -470,6 +456,18 @@ config CRYPTO_LRW
The first 128, 192 or 256 bits in the key are used for AES and the
rest is used to tie each cipher block to its logical position.
config CRYPTO_OFB
tristate "OFB support"
select CRYPTO_BLKCIPHER
select CRYPTO_MANAGER
help
OFB: the Output Feedback mode makes a block cipher into a synchronous
stream cipher. It generates keystream blocks, which are then XORed
with the plaintext blocks to get the ciphertext. Flipping a bit in the
ciphertext produces a flipped bit in the plaintext at the same
location. This property allows many error correcting codes to function
normally even when applied before encryption.
config CRYPTO_PCBC
tristate "PCBC support"
select CRYPTO_BLKCIPHER
@ -848,54 +846,6 @@ config CRYPTO_SHA1_PPC_SPE
SHA-1 secure hash standard (DFIPS 180-4) implemented
using powerpc SPE SIMD instruction set.
config CRYPTO_SHA1_MB
tristate "SHA1 digest algorithm (x86_64 Multi-Buffer, Experimental)"
depends on X86 && 64BIT
select CRYPTO_SHA1
select CRYPTO_HASH
select CRYPTO_MCRYPTD
help
SHA-1 secure hash standard (FIPS 180-1/DFIPS 180-2) implemented
using multi-buffer technique. This algorithm computes on
multiple data lanes concurrently with SIMD instructions for
better throughput. It should not be enabled by default but
used when there is significant amount of work to keep the keep
the data lanes filled to get performance benefit. If the data
lanes remain unfilled, a flush operation will be initiated to
process the crypto jobs, adding a slight latency.
config CRYPTO_SHA256_MB
tristate "SHA256 digest algorithm (x86_64 Multi-Buffer, Experimental)"
depends on X86 && 64BIT
select CRYPTO_SHA256
select CRYPTO_HASH
select CRYPTO_MCRYPTD
help
SHA-256 secure hash standard (FIPS 180-1/DFIPS 180-2) implemented
using multi-buffer technique. This algorithm computes on
multiple data lanes concurrently with SIMD instructions for
better throughput. It should not be enabled by default but
used when there is significant amount of work to keep the keep
the data lanes filled to get performance benefit. If the data
lanes remain unfilled, a flush operation will be initiated to
process the crypto jobs, adding a slight latency.
config CRYPTO_SHA512_MB
tristate "SHA512 digest algorithm (x86_64 Multi-Buffer, Experimental)"
depends on X86 && 64BIT
select CRYPTO_SHA512
select CRYPTO_HASH
select CRYPTO_MCRYPTD
help
SHA-512 secure hash standard (FIPS 180-1/DFIPS 180-2) implemented
using multi-buffer technique. This algorithm computes on
multiple data lanes concurrently with SIMD instructions for
better throughput. It should not be enabled by default but
used when there is significant amount of work to keep the keep
the data lanes filled to get performance benefit. If the data
lanes remain unfilled, a flush operation will be initiated to
process the crypto jobs, adding a slight latency.
config CRYPTO_SHA256
tristate "SHA224 and SHA256 digest algorithm"
select CRYPTO_HASH
@ -1133,7 +1083,7 @@ config CRYPTO_AES_NI_INTEL
In addition to AES cipher algorithm support, the acceleration
for some popular block cipher mode is supported too, including
ECB, CBC, LRW, PCBC, XTS. The 64 bit version has additional
ECB, CBC, LRW, XTS. The 64 bit version has additional
acceleration for CTR.
config CRYPTO_AES_SPARC64
@ -1590,20 +1540,6 @@ config CRYPTO_SM4
If unsure, say N.
config CRYPTO_SPECK
tristate "Speck cipher algorithm"
select CRYPTO_ALGAPI
help
Speck is a lightweight block cipher that is tuned for optimal
performance in software (rather than hardware).
Speck may not be as secure as AES, and should only be used on systems
where AES is not fast enough.
See also: <https://eprint.iacr.org/2013/404.pdf>
If unsure, say N.
config CRYPTO_TEA
tristate "TEA, XTEA and XETA cipher algorithms"
select CRYPTO_ALGAPI
@ -1875,6 +1811,17 @@ config CRYPTO_USER_API_AEAD
This option enables the user-spaces interface for AEAD
cipher algorithms.
config CRYPTO_STATS
bool "Crypto usage statistics for User-space"
help
This option enables the gathering of crypto stats.
This will collect:
- encrypt/decrypt size and numbers of symmeric operations
- compress/decompress size and numbers of compress operations
- size and numbers of hash operations
- encrypt/decrypt/sign/verify numbers for asymmetric operations
- generate/seed numbers for rng operations
config CRYPTO_HASH_INFO
bool

View File

@ -54,6 +54,7 @@ cryptomgr-y := algboss.o testmgr.o
obj-$(CONFIG_CRYPTO_MANAGER2) += cryptomgr.o
obj-$(CONFIG_CRYPTO_USER) += crypto_user.o
crypto_user-y := crypto_user_base.o crypto_user_stat.o
obj-$(CONFIG_CRYPTO_CMAC) += cmac.o
obj-$(CONFIG_CRYPTO_HMAC) += hmac.o
obj-$(CONFIG_CRYPTO_VMAC) += vmac.o
@ -93,7 +94,6 @@ obj-$(CONFIG_CRYPTO_MORUS640) += morus640.o
obj-$(CONFIG_CRYPTO_MORUS1280) += morus1280.o
obj-$(CONFIG_CRYPTO_PCRYPT) += pcrypt.o
obj-$(CONFIG_CRYPTO_CRYPTD) += cryptd.o
obj-$(CONFIG_CRYPTO_MCRYPTD) += mcryptd.o
obj-$(CONFIG_CRYPTO_DES) += des_generic.o
obj-$(CONFIG_CRYPTO_FCRYPT) += fcrypt.o
obj-$(CONFIG_CRYPTO_BLOWFISH) += blowfish_generic.o
@ -115,7 +115,6 @@ obj-$(CONFIG_CRYPTO_TEA) += tea.o
obj-$(CONFIG_CRYPTO_KHAZAD) += khazad.o
obj-$(CONFIG_CRYPTO_ANUBIS) += anubis.o
obj-$(CONFIG_CRYPTO_SEED) += seed.o
obj-$(CONFIG_CRYPTO_SPECK) += speck.o
obj-$(CONFIG_CRYPTO_SALSA20) += salsa20_generic.o
obj-$(CONFIG_CRYPTO_CHACHA20) += chacha20_generic.o
obj-$(CONFIG_CRYPTO_POLY1305) += poly1305_generic.o
@ -143,6 +142,7 @@ obj-$(CONFIG_CRYPTO_USER_API_SKCIPHER) += algif_skcipher.o
obj-$(CONFIG_CRYPTO_USER_API_RNG) += algif_rng.o
obj-$(CONFIG_CRYPTO_USER_API_AEAD) += algif_aead.o
obj-$(CONFIG_CRYPTO_ZSTD) += zstd.o
obj-$(CONFIG_CRYPTO_OFB) += ofb.o
ecdh_generic-y := ecc.o
ecdh_generic-y += ecdh.o

View File

@ -21,7 +21,7 @@
union aegis_block {
__le64 words64[AEGIS_BLOCK_SIZE / sizeof(__le64)];
u32 words32[AEGIS_BLOCK_SIZE / sizeof(u32)];
__le32 words32[AEGIS_BLOCK_SIZE / sizeof(__le32)];
u8 bytes[AEGIS_BLOCK_SIZE];
};
@ -57,24 +57,22 @@ static void crypto_aegis_aesenc(union aegis_block *dst,
const union aegis_block *src,
const union aegis_block *key)
{
u32 *d = dst->words32;
const u8 *s = src->bytes;
const u32 *k = key->words32;
const u32 *t0 = crypto_ft_tab[0];
const u32 *t1 = crypto_ft_tab[1];
const u32 *t2 = crypto_ft_tab[2];
const u32 *t3 = crypto_ft_tab[3];
u32 d0, d1, d2, d3;
d0 = t0[s[ 0]] ^ t1[s[ 5]] ^ t2[s[10]] ^ t3[s[15]] ^ k[0];
d1 = t0[s[ 4]] ^ t1[s[ 9]] ^ t2[s[14]] ^ t3[s[ 3]] ^ k[1];
d2 = t0[s[ 8]] ^ t1[s[13]] ^ t2[s[ 2]] ^ t3[s[ 7]] ^ k[2];
d3 = t0[s[12]] ^ t1[s[ 1]] ^ t2[s[ 6]] ^ t3[s[11]] ^ k[3];
d0 = t0[s[ 0]] ^ t1[s[ 5]] ^ t2[s[10]] ^ t3[s[15]];
d1 = t0[s[ 4]] ^ t1[s[ 9]] ^ t2[s[14]] ^ t3[s[ 3]];
d2 = t0[s[ 8]] ^ t1[s[13]] ^ t2[s[ 2]] ^ t3[s[ 7]];
d3 = t0[s[12]] ^ t1[s[ 1]] ^ t2[s[ 6]] ^ t3[s[11]];
d[0] = d0;
d[1] = d1;
d[2] = d2;
d[3] = d3;
dst->words32[0] = cpu_to_le32(d0) ^ key->words32[0];
dst->words32[1] = cpu_to_le32(d1) ^ key->words32[1];
dst->words32[2] = cpu_to_le32(d2) ^ key->words32[2];
dst->words32[3] = cpu_to_le32(d3) ^ key->words32[3];
}
#endif /* _CRYPTO_AEGIS_H */

View File

@ -364,24 +364,35 @@ static int crypto_ahash_op(struct ahash_request *req,
int crypto_ahash_final(struct ahash_request *req)
{
return crypto_ahash_op(req, crypto_ahash_reqtfm(req)->final);
int ret;
ret = crypto_ahash_op(req, crypto_ahash_reqtfm(req)->final);
crypto_stat_ahash_final(req, ret);
return ret;
}
EXPORT_SYMBOL_GPL(crypto_ahash_final);
int crypto_ahash_finup(struct ahash_request *req)
{
return crypto_ahash_op(req, crypto_ahash_reqtfm(req)->finup);
int ret;
ret = crypto_ahash_op(req, crypto_ahash_reqtfm(req)->finup);
crypto_stat_ahash_final(req, ret);
return ret;
}
EXPORT_SYMBOL_GPL(crypto_ahash_finup);
int crypto_ahash_digest(struct ahash_request *req)
{
struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
int ret;
if (crypto_ahash_get_flags(tfm) & CRYPTO_TFM_NEED_KEY)
return -ENOKEY;
return crypto_ahash_op(req, tfm->digest);
ret = -ENOKEY;
else
ret = crypto_ahash_op(req, tfm->digest);
crypto_stat_ahash_final(req, ret);
return ret;
}
EXPORT_SYMBOL_GPL(crypto_ahash_digest);
@ -550,8 +561,8 @@ static int ahash_prepare_alg(struct ahash_alg *alg)
{
struct crypto_alg *base = &alg->halg.base;
if (alg->halg.digestsize > PAGE_SIZE / 8 ||
alg->halg.statesize > PAGE_SIZE / 8 ||
if (alg->halg.digestsize > HASH_MAX_DIGESTSIZE ||
alg->halg.statesize > HASH_MAX_STATESIZE ||
alg->halg.statesize == 0)
return -EINVAL;

View File

@ -57,9 +57,14 @@ static int crypto_check_alg(struct crypto_alg *alg)
if (alg->cra_alignmask & (alg->cra_alignmask + 1))
return -EINVAL;
if (alg->cra_blocksize > PAGE_SIZE / 8)
/* General maximums for all algs. */
if (alg->cra_alignmask > MAX_ALGAPI_ALIGNMASK)
return -EINVAL;
if (alg->cra_blocksize > MAX_ALGAPI_BLOCKSIZE)
return -EINVAL;
/* Lower maximums for specific alg types. */
if (!alg->cra_type && (alg->cra_flags & CRYPTO_ALG_TYPE_MASK) ==
CRYPTO_ALG_TYPE_CIPHER) {
if (alg->cra_alignmask > MAX_CIPHER_ALIGNMASK)
@ -253,6 +258,14 @@ static struct crypto_larval *__crypto_register_alg(struct crypto_alg *alg)
list_add(&alg->cra_list, &crypto_alg_list);
list_add(&larval->alg.cra_list, &crypto_alg_list);
atomic_set(&alg->encrypt_cnt, 0);
atomic_set(&alg->decrypt_cnt, 0);
atomic64_set(&alg->encrypt_tlen, 0);
atomic64_set(&alg->decrypt_tlen, 0);
atomic_set(&alg->verify_cnt, 0);
atomic_set(&alg->cipher_err_cnt, 0);
atomic_set(&alg->sign_cnt, 0);
out:
return larval;
@ -367,6 +380,8 @@ static void crypto_wait_for_test(struct crypto_larval *larval)
err = wait_for_completion_killable(&larval->completion);
WARN_ON(err);
if (!err)
crypto_probing_notify(CRYPTO_MSG_ALG_LOADED, larval);
out:
crypto_larval_kill(&larval->alg);

View File

@ -274,6 +274,8 @@ static int cryptomgr_notify(struct notifier_block *this, unsigned long msg,
return cryptomgr_schedule_probe(data);
case CRYPTO_MSG_ALG_REGISTER:
return cryptomgr_schedule_test(data);
case CRYPTO_MSG_ALG_LOADED:
break;
}
return NOTIFY_DONE;

View File

@ -42,7 +42,7 @@
struct aead_tfm {
struct crypto_aead *aead;
struct crypto_skcipher *null_tfm;
struct crypto_sync_skcipher *null_tfm;
};
static inline bool aead_sufficient_data(struct sock *sk)
@ -75,13 +75,13 @@ static int aead_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
return af_alg_sendmsg(sock, msg, size, ivsize);
}
static int crypto_aead_copy_sgl(struct crypto_skcipher *null_tfm,
static int crypto_aead_copy_sgl(struct crypto_sync_skcipher *null_tfm,
struct scatterlist *src,
struct scatterlist *dst, unsigned int len)
{
SKCIPHER_REQUEST_ON_STACK(skreq, null_tfm);
SYNC_SKCIPHER_REQUEST_ON_STACK(skreq, null_tfm);
skcipher_request_set_tfm(skreq, null_tfm);
skcipher_request_set_sync_tfm(skreq, null_tfm);
skcipher_request_set_callback(skreq, CRYPTO_TFM_REQ_MAY_BACKLOG,
NULL, NULL);
skcipher_request_set_crypt(skreq, src, dst, len, NULL);
@ -99,7 +99,7 @@ static int _aead_recvmsg(struct socket *sock, struct msghdr *msg,
struct af_alg_ctx *ctx = ask->private;
struct aead_tfm *aeadc = pask->private;
struct crypto_aead *tfm = aeadc->aead;
struct crypto_skcipher *null_tfm = aeadc->null_tfm;
struct crypto_sync_skcipher *null_tfm = aeadc->null_tfm;
unsigned int i, as = crypto_aead_authsize(tfm);
struct af_alg_async_req *areq;
struct af_alg_tsgl *tsgl, *tmp;
@ -478,7 +478,7 @@ static void *aead_bind(const char *name, u32 type, u32 mask)
{
struct aead_tfm *tfm;
struct crypto_aead *aead;
struct crypto_skcipher *null_tfm;
struct crypto_sync_skcipher *null_tfm;
tfm = kzalloc(sizeof(*tfm), GFP_KERNEL);
if (!tfm)

View File

@ -239,7 +239,7 @@ static int hash_accept(struct socket *sock, struct socket *newsock, int flags,
struct alg_sock *ask = alg_sk(sk);
struct hash_ctx *ctx = ask->private;
struct ahash_request *req = &ctx->req;
char state[crypto_ahash_statesize(crypto_ahash_reqtfm(req)) ? : 1];
char state[HASH_MAX_STATESIZE];
struct sock *sk2;
struct alg_sock *ask2;
struct hash_ctx *ctx2;

View File

@ -33,7 +33,7 @@ struct authenc_instance_ctx {
struct crypto_authenc_ctx {
struct crypto_ahash *auth;
struct crypto_skcipher *enc;
struct crypto_skcipher *null;
struct crypto_sync_skcipher *null;
};
struct authenc_request_ctx {
@ -185,9 +185,9 @@ static int crypto_authenc_copy_assoc(struct aead_request *req)
{
struct crypto_aead *authenc = crypto_aead_reqtfm(req);
struct crypto_authenc_ctx *ctx = crypto_aead_ctx(authenc);
SKCIPHER_REQUEST_ON_STACK(skreq, ctx->null);
SYNC_SKCIPHER_REQUEST_ON_STACK(skreq, ctx->null);
skcipher_request_set_tfm(skreq, ctx->null);
skcipher_request_set_sync_tfm(skreq, ctx->null);
skcipher_request_set_callback(skreq, aead_request_flags(req),
NULL, NULL);
skcipher_request_set_crypt(skreq, req->src, req->dst, req->assoclen,
@ -318,7 +318,7 @@ static int crypto_authenc_init_tfm(struct crypto_aead *tfm)
struct crypto_authenc_ctx *ctx = crypto_aead_ctx(tfm);
struct crypto_ahash *auth;
struct crypto_skcipher *enc;
struct crypto_skcipher *null;
struct crypto_sync_skcipher *null;
int err;
auth = crypto_spawn_ahash(&ictx->auth);

View File

@ -36,7 +36,7 @@ struct crypto_authenc_esn_ctx {
unsigned int reqoff;
struct crypto_ahash *auth;
struct crypto_skcipher *enc;
struct crypto_skcipher *null;
struct crypto_sync_skcipher *null;
};
struct authenc_esn_request_ctx {
@ -183,9 +183,9 @@ static int crypto_authenc_esn_copy(struct aead_request *req, unsigned int len)
{
struct crypto_aead *authenc_esn = crypto_aead_reqtfm(req);
struct crypto_authenc_esn_ctx *ctx = crypto_aead_ctx(authenc_esn);
SKCIPHER_REQUEST_ON_STACK(skreq, ctx->null);
SYNC_SKCIPHER_REQUEST_ON_STACK(skreq, ctx->null);
skcipher_request_set_tfm(skreq, ctx->null);
skcipher_request_set_sync_tfm(skreq, ctx->null);
skcipher_request_set_callback(skreq, aead_request_flags(req),
NULL, NULL);
skcipher_request_set_crypt(skreq, req->src, req->dst, len, NULL);
@ -341,7 +341,7 @@ static int crypto_authenc_esn_init_tfm(struct crypto_aead *tfm)
struct crypto_authenc_esn_ctx *ctx = crypto_aead_ctx(tfm);
struct crypto_ahash *auth;
struct crypto_skcipher *enc;
struct crypto_skcipher *null;
struct crypto_sync_skcipher *null;
int err;
auth = crypto_spawn_ahash(&ictx->auth);

View File

@ -50,7 +50,10 @@ struct crypto_ccm_req_priv_ctx {
u32 flags;
struct scatterlist src[3];
struct scatterlist dst[3];
struct skcipher_request skreq;
union {
struct ahash_request ahreq;
struct skcipher_request skreq;
};
};
struct cbcmac_tfm_ctx {
@ -181,7 +184,7 @@ static int crypto_ccm_auth(struct aead_request *req, struct scatterlist *plain,
struct crypto_ccm_req_priv_ctx *pctx = crypto_ccm_reqctx(req);
struct crypto_aead *aead = crypto_aead_reqtfm(req);
struct crypto_ccm_ctx *ctx = crypto_aead_ctx(aead);
AHASH_REQUEST_ON_STACK(ahreq, ctx->mac);
struct ahash_request *ahreq = &pctx->ahreq;
unsigned int assoclen = req->assoclen;
struct scatterlist sg[3];
u8 *odata = pctx->odata;
@ -427,7 +430,7 @@ static int crypto_ccm_init_tfm(struct crypto_aead *tfm)
crypto_aead_set_reqsize(
tfm,
align + sizeof(struct crypto_ccm_req_priv_ctx) +
crypto_skcipher_reqsize(ctr));
max(crypto_ahash_reqsize(mac), crypto_skcipher_reqsize(ctr)));
return 0;

View File

@ -18,20 +18,21 @@
static void chacha20_docrypt(u32 *state, u8 *dst, const u8 *src,
unsigned int bytes)
{
u32 stream[CHACHA20_BLOCK_WORDS];
/* aligned to potentially speed up crypto_xor() */
u8 stream[CHACHA20_BLOCK_SIZE] __aligned(sizeof(long));
if (dst != src)
memcpy(dst, src, bytes);
while (bytes >= CHACHA20_BLOCK_SIZE) {
chacha20_block(state, stream);
crypto_xor(dst, (const u8 *)stream, CHACHA20_BLOCK_SIZE);
crypto_xor(dst, stream, CHACHA20_BLOCK_SIZE);
bytes -= CHACHA20_BLOCK_SIZE;
dst += CHACHA20_BLOCK_SIZE;
}
if (bytes) {
chacha20_block(state, stream);
crypto_xor(dst, (const u8 *)stream, bytes);
crypto_xor(dst, stream, bytes);
}
}

View File

@ -76,7 +76,7 @@ struct cryptd_blkcipher_request_ctx {
struct cryptd_skcipher_ctx {
atomic_t refcnt;
struct crypto_skcipher *child;
struct crypto_sync_skcipher *child;
};
struct cryptd_skcipher_request_ctx {
@ -449,14 +449,16 @@ static int cryptd_skcipher_setkey(struct crypto_skcipher *parent,
const u8 *key, unsigned int keylen)
{
struct cryptd_skcipher_ctx *ctx = crypto_skcipher_ctx(parent);
struct crypto_skcipher *child = ctx->child;
struct crypto_sync_skcipher *child = ctx->child;
int err;
crypto_skcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK);
crypto_skcipher_set_flags(child, crypto_skcipher_get_flags(parent) &
crypto_sync_skcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK);
crypto_sync_skcipher_set_flags(child,
crypto_skcipher_get_flags(parent) &
CRYPTO_TFM_REQ_MASK);
err = crypto_skcipher_setkey(child, key, keylen);
crypto_skcipher_set_flags(parent, crypto_skcipher_get_flags(child) &
err = crypto_sync_skcipher_setkey(child, key, keylen);
crypto_skcipher_set_flags(parent,
crypto_sync_skcipher_get_flags(child) &
CRYPTO_TFM_RES_MASK);
return err;
}
@ -483,13 +485,13 @@ static void cryptd_skcipher_encrypt(struct crypto_async_request *base,
struct cryptd_skcipher_request_ctx *rctx = skcipher_request_ctx(req);
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct cryptd_skcipher_ctx *ctx = crypto_skcipher_ctx(tfm);
struct crypto_skcipher *child = ctx->child;
SKCIPHER_REQUEST_ON_STACK(subreq, child);
struct crypto_sync_skcipher *child = ctx->child;
SYNC_SKCIPHER_REQUEST_ON_STACK(subreq, child);
if (unlikely(err == -EINPROGRESS))
goto out;
skcipher_request_set_tfm(subreq, child);
skcipher_request_set_sync_tfm(subreq, child);
skcipher_request_set_callback(subreq, CRYPTO_TFM_REQ_MAY_SLEEP,
NULL, NULL);
skcipher_request_set_crypt(subreq, req->src, req->dst, req->cryptlen,
@ -511,13 +513,13 @@ static void cryptd_skcipher_decrypt(struct crypto_async_request *base,
struct cryptd_skcipher_request_ctx *rctx = skcipher_request_ctx(req);
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct cryptd_skcipher_ctx *ctx = crypto_skcipher_ctx(tfm);
struct crypto_skcipher *child = ctx->child;
SKCIPHER_REQUEST_ON_STACK(subreq, child);
struct crypto_sync_skcipher *child = ctx->child;
SYNC_SKCIPHER_REQUEST_ON_STACK(subreq, child);
if (unlikely(err == -EINPROGRESS))
goto out;
skcipher_request_set_tfm(subreq, child);
skcipher_request_set_sync_tfm(subreq, child);
skcipher_request_set_callback(subreq, CRYPTO_TFM_REQ_MAY_SLEEP,
NULL, NULL);
skcipher_request_set_crypt(subreq, req->src, req->dst, req->cryptlen,
@ -568,7 +570,7 @@ static int cryptd_skcipher_init_tfm(struct crypto_skcipher *tfm)
if (IS_ERR(cipher))
return PTR_ERR(cipher);
ctx->child = cipher;
ctx->child = (struct crypto_sync_skcipher *)cipher;
crypto_skcipher_set_reqsize(
tfm, sizeof(struct cryptd_skcipher_request_ctx));
return 0;
@ -578,7 +580,7 @@ static void cryptd_skcipher_exit_tfm(struct crypto_skcipher *tfm)
{
struct cryptd_skcipher_ctx *ctx = crypto_skcipher_ctx(tfm);
crypto_free_skcipher(ctx->child);
crypto_free_sync_skcipher(ctx->child);
}
static void cryptd_skcipher_free(struct skcipher_instance *inst)
@ -1243,7 +1245,7 @@ struct crypto_skcipher *cryptd_skcipher_child(struct cryptd_skcipher *tfm)
{
struct cryptd_skcipher_ctx *ctx = crypto_skcipher_ctx(&tfm->base);
return ctx->child;
return &ctx->child->base;
}
EXPORT_SYMBOL_GPL(cryptd_skcipher_child);

View File

@ -26,7 +26,7 @@
#include <linux/string.h>
static DEFINE_MUTEX(crypto_default_null_skcipher_lock);
static struct crypto_skcipher *crypto_default_null_skcipher;
static struct crypto_sync_skcipher *crypto_default_null_skcipher;
static int crypto_default_null_skcipher_refcnt;
static int null_compress(struct crypto_tfm *tfm, const u8 *src,
@ -152,16 +152,15 @@ MODULE_ALIAS_CRYPTO("compress_null");
MODULE_ALIAS_CRYPTO("digest_null");
MODULE_ALIAS_CRYPTO("cipher_null");
struct crypto_skcipher *crypto_get_default_null_skcipher(void)
struct crypto_sync_skcipher *crypto_get_default_null_skcipher(void)
{
struct crypto_skcipher *tfm;
struct crypto_sync_skcipher *tfm;
mutex_lock(&crypto_default_null_skcipher_lock);
tfm = crypto_default_null_skcipher;
if (!tfm) {
tfm = crypto_alloc_skcipher("ecb(cipher_null)",
0, CRYPTO_ALG_ASYNC);
tfm = crypto_alloc_sync_skcipher("ecb(cipher_null)", 0, 0);
if (IS_ERR(tfm))
goto unlock;
@ -181,7 +180,7 @@ void crypto_put_default_null_skcipher(void)
{
mutex_lock(&crypto_default_null_skcipher_lock);
if (!--crypto_default_null_skcipher_refcnt) {
crypto_free_skcipher(crypto_default_null_skcipher);
crypto_free_sync_skcipher(crypto_default_null_skcipher);
crypto_default_null_skcipher = NULL;
}
mutex_unlock(&crypto_default_null_skcipher_lock);

View File

@ -29,6 +29,7 @@
#include <crypto/internal/rng.h>
#include <crypto/akcipher.h>
#include <crypto/kpp.h>
#include <crypto/internal/cryptouser.h>
#include "internal.h"
@ -37,7 +38,7 @@
static DEFINE_MUTEX(crypto_cfg_mutex);
/* The crypto netlink socket */
static struct sock *crypto_nlsk;
struct sock *crypto_nlsk;
struct crypto_dump_info {
struct sk_buff *in_skb;
@ -46,7 +47,7 @@ struct crypto_dump_info {
u16 nlmsg_flags;
};
static struct crypto_alg *crypto_alg_match(struct crypto_user_alg *p, int exact)
struct crypto_alg *crypto_alg_match(struct crypto_user_alg *p, int exact)
{
struct crypto_alg *q, *alg = NULL;
@ -461,6 +462,7 @@ static const int crypto_msg_min[CRYPTO_NR_MSGTYPES] = {
[CRYPTO_MSG_UPDATEALG - CRYPTO_MSG_BASE] = MSGSIZE(crypto_user_alg),
[CRYPTO_MSG_GETALG - CRYPTO_MSG_BASE] = MSGSIZE(crypto_user_alg),
[CRYPTO_MSG_DELRNG - CRYPTO_MSG_BASE] = 0,
[CRYPTO_MSG_GETSTAT - CRYPTO_MSG_BASE] = MSGSIZE(crypto_user_alg),
};
static const struct nla_policy crypto_policy[CRYPTOCFGA_MAX+1] = {
@ -481,6 +483,9 @@ static const struct crypto_link {
.dump = crypto_dump_report,
.done = crypto_dump_report_done},
[CRYPTO_MSG_DELRNG - CRYPTO_MSG_BASE] = { .doit = crypto_del_rng },
[CRYPTO_MSG_GETSTAT - CRYPTO_MSG_BASE] = { .doit = crypto_reportstat,
.dump = crypto_dump_reportstat,
.done = crypto_dump_reportstat_done},
};
static int crypto_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,

463
crypto/crypto_user_stat.c Normal file
View File

@ -0,0 +1,463 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Crypto user configuration API.
*
* Copyright (C) 2017-2018 Corentin Labbe <clabbe@baylibre.com>
*
*/
#include <linux/crypto.h>
#include <linux/cryptouser.h>
#include <linux/sched.h>
#include <net/netlink.h>
#include <crypto/internal/skcipher.h>
#include <crypto/internal/rng.h>
#include <crypto/akcipher.h>
#include <crypto/kpp.h>
#include <crypto/internal/cryptouser.h>
#include "internal.h"
#define null_terminated(x) (strnlen(x, sizeof(x)) < sizeof(x))
static DEFINE_MUTEX(crypto_cfg_mutex);
extern struct sock *crypto_nlsk;
struct crypto_dump_info {
struct sk_buff *in_skb;
struct sk_buff *out_skb;
u32 nlmsg_seq;
u16 nlmsg_flags;
};
static int crypto_report_aead(struct sk_buff *skb, struct crypto_alg *alg)
{
struct crypto_stat raead;
u64 v64;
u32 v32;
strncpy(raead.type, "aead", sizeof(raead.type));
v32 = atomic_read(&alg->encrypt_cnt);
raead.stat_encrypt_cnt = v32;
v64 = atomic64_read(&alg->encrypt_tlen);
raead.stat_encrypt_tlen = v64;
v32 = atomic_read(&alg->decrypt_cnt);
raead.stat_decrypt_cnt = v32;
v64 = atomic64_read(&alg->decrypt_tlen);
raead.stat_decrypt_tlen = v64;
v32 = atomic_read(&alg->aead_err_cnt);
raead.stat_aead_err_cnt = v32;
if (nla_put(skb, CRYPTOCFGA_STAT_AEAD,
sizeof(struct crypto_stat), &raead))
goto nla_put_failure;
return 0;
nla_put_failure:
return -EMSGSIZE;
}
static int crypto_report_cipher(struct sk_buff *skb, struct crypto_alg *alg)
{
struct crypto_stat rcipher;
u64 v64;
u32 v32;
strlcpy(rcipher.type, "cipher", sizeof(rcipher.type));
v32 = atomic_read(&alg->encrypt_cnt);
rcipher.stat_encrypt_cnt = v32;
v64 = atomic64_read(&alg->encrypt_tlen);
rcipher.stat_encrypt_tlen = v64;
v32 = atomic_read(&alg->decrypt_cnt);
rcipher.stat_decrypt_cnt = v32;
v64 = atomic64_read(&alg->decrypt_tlen);
rcipher.stat_decrypt_tlen = v64;
v32 = atomic_read(&alg->cipher_err_cnt);
rcipher.stat_cipher_err_cnt = v32;
if (nla_put(skb, CRYPTOCFGA_STAT_CIPHER,
sizeof(struct crypto_stat), &rcipher))
goto nla_put_failure;
return 0;
nla_put_failure:
return -EMSGSIZE;
}
static int crypto_report_comp(struct sk_buff *skb, struct crypto_alg *alg)
{
struct crypto_stat rcomp;
u64 v64;
u32 v32;
strlcpy(rcomp.type, "compression", sizeof(rcomp.type));
v32 = atomic_read(&alg->compress_cnt);
rcomp.stat_compress_cnt = v32;
v64 = atomic64_read(&alg->compress_tlen);
rcomp.stat_compress_tlen = v64;
v32 = atomic_read(&alg->decompress_cnt);
rcomp.stat_decompress_cnt = v32;
v64 = atomic64_read(&alg->decompress_tlen);
rcomp.stat_decompress_tlen = v64;
v32 = atomic_read(&alg->cipher_err_cnt);
rcomp.stat_compress_err_cnt = v32;
if (nla_put(skb, CRYPTOCFGA_STAT_COMPRESS,
sizeof(struct crypto_stat), &rcomp))
goto nla_put_failure;
return 0;
nla_put_failure:
return -EMSGSIZE;
}
static int crypto_report_acomp(struct sk_buff *skb, struct crypto_alg *alg)
{
struct crypto_stat racomp;
u64 v64;
u32 v32;
strlcpy(racomp.type, "acomp", sizeof(racomp.type));
v32 = atomic_read(&alg->compress_cnt);
racomp.stat_compress_cnt = v32;
v64 = atomic64_read(&alg->compress_tlen);
racomp.stat_compress_tlen = v64;
v32 = atomic_read(&alg->decompress_cnt);
racomp.stat_decompress_cnt = v32;
v64 = atomic64_read(&alg->decompress_tlen);
racomp.stat_decompress_tlen = v64;
v32 = atomic_read(&alg->cipher_err_cnt);
racomp.stat_compress_err_cnt = v32;
if (nla_put(skb, CRYPTOCFGA_STAT_ACOMP,
sizeof(struct crypto_stat), &racomp))
goto nla_put_failure;
return 0;
nla_put_failure:
return -EMSGSIZE;
}
static int crypto_report_akcipher(struct sk_buff *skb, struct crypto_alg *alg)
{
struct crypto_stat rakcipher;
u64 v64;
u32 v32;
strncpy(rakcipher.type, "akcipher", sizeof(rakcipher.type));
v32 = atomic_read(&alg->encrypt_cnt);
rakcipher.stat_encrypt_cnt = v32;
v64 = atomic64_read(&alg->encrypt_tlen);
rakcipher.stat_encrypt_tlen = v64;
v32 = atomic_read(&alg->decrypt_cnt);
rakcipher.stat_decrypt_cnt = v32;
v64 = atomic64_read(&alg->decrypt_tlen);
rakcipher.stat_decrypt_tlen = v64;
v32 = atomic_read(&alg->sign_cnt);
rakcipher.stat_sign_cnt = v32;
v32 = atomic_read(&alg->verify_cnt);
rakcipher.stat_verify_cnt = v32;
v32 = atomic_read(&alg->akcipher_err_cnt);
rakcipher.stat_akcipher_err_cnt = v32;
if (nla_put(skb, CRYPTOCFGA_STAT_AKCIPHER,
sizeof(struct crypto_stat), &rakcipher))
goto nla_put_failure;
return 0;
nla_put_failure:
return -EMSGSIZE;
}
static int crypto_report_kpp(struct sk_buff *skb, struct crypto_alg *alg)
{
struct crypto_stat rkpp;
u32 v;
strlcpy(rkpp.type, "kpp", sizeof(rkpp.type));
v = atomic_read(&alg->setsecret_cnt);
rkpp.stat_setsecret_cnt = v;
v = atomic_read(&alg->generate_public_key_cnt);
rkpp.stat_generate_public_key_cnt = v;
v = atomic_read(&alg->compute_shared_secret_cnt);
rkpp.stat_compute_shared_secret_cnt = v;
v = atomic_read(&alg->kpp_err_cnt);
rkpp.stat_kpp_err_cnt = v;
if (nla_put(skb, CRYPTOCFGA_STAT_KPP,
sizeof(struct crypto_stat), &rkpp))
goto nla_put_failure;
return 0;
nla_put_failure:
return -EMSGSIZE;
}
static int crypto_report_ahash(struct sk_buff *skb, struct crypto_alg *alg)
{
struct crypto_stat rhash;
u64 v64;
u32 v32;
strncpy(rhash.type, "ahash", sizeof(rhash.type));
v32 = atomic_read(&alg->hash_cnt);
rhash.stat_hash_cnt = v32;
v64 = atomic64_read(&alg->hash_tlen);
rhash.stat_hash_tlen = v64;
v32 = atomic_read(&alg->hash_err_cnt);
rhash.stat_hash_err_cnt = v32;
if (nla_put(skb, CRYPTOCFGA_STAT_HASH,
sizeof(struct crypto_stat), &rhash))
goto nla_put_failure;
return 0;
nla_put_failure:
return -EMSGSIZE;
}
static int crypto_report_shash(struct sk_buff *skb, struct crypto_alg *alg)
{
struct crypto_stat rhash;
u64 v64;
u32 v32;
strncpy(rhash.type, "shash", sizeof(rhash.type));
v32 = atomic_read(&alg->hash_cnt);
rhash.stat_hash_cnt = v32;
v64 = atomic64_read(&alg->hash_tlen);
rhash.stat_hash_tlen = v64;
v32 = atomic_read(&alg->hash_err_cnt);
rhash.stat_hash_err_cnt = v32;
if (nla_put(skb, CRYPTOCFGA_STAT_HASH,
sizeof(struct crypto_stat), &rhash))
goto nla_put_failure;
return 0;
nla_put_failure:
return -EMSGSIZE;
}
static int crypto_report_rng(struct sk_buff *skb, struct crypto_alg *alg)
{
struct crypto_stat rrng;
u64 v64;
u32 v32;
strncpy(rrng.type, "rng", sizeof(rrng.type));
v32 = atomic_read(&alg->generate_cnt);
rrng.stat_generate_cnt = v32;
v64 = atomic64_read(&alg->generate_tlen);
rrng.stat_generate_tlen = v64;
v32 = atomic_read(&alg->seed_cnt);
rrng.stat_seed_cnt = v32;
v32 = atomic_read(&alg->hash_err_cnt);
rrng.stat_rng_err_cnt = v32;
if (nla_put(skb, CRYPTOCFGA_STAT_RNG,
sizeof(struct crypto_stat), &rrng))
goto nla_put_failure;
return 0;
nla_put_failure:
return -EMSGSIZE;
}
static int crypto_reportstat_one(struct crypto_alg *alg,
struct crypto_user_alg *ualg,
struct sk_buff *skb)
{
strlcpy(ualg->cru_name, alg->cra_name, sizeof(ualg->cru_name));
strlcpy(ualg->cru_driver_name, alg->cra_driver_name,
sizeof(ualg->cru_driver_name));
strlcpy(ualg->cru_module_name, module_name(alg->cra_module),
sizeof(ualg->cru_module_name));
ualg->cru_type = 0;
ualg->cru_mask = 0;
ualg->cru_flags = alg->cra_flags;
ualg->cru_refcnt = refcount_read(&alg->cra_refcnt);
if (nla_put_u32(skb, CRYPTOCFGA_PRIORITY_VAL, alg->cra_priority))
goto nla_put_failure;
if (alg->cra_flags & CRYPTO_ALG_LARVAL) {
struct crypto_stat rl;
strlcpy(rl.type, "larval", sizeof(rl.type));
if (nla_put(skb, CRYPTOCFGA_STAT_LARVAL,
sizeof(struct crypto_stat), &rl))
goto nla_put_failure;
goto out;
}
switch (alg->cra_flags & (CRYPTO_ALG_TYPE_MASK | CRYPTO_ALG_LARVAL)) {
case CRYPTO_ALG_TYPE_AEAD:
if (crypto_report_aead(skb, alg))
goto nla_put_failure;
break;
case CRYPTO_ALG_TYPE_SKCIPHER:
if (crypto_report_cipher(skb, alg))
goto nla_put_failure;
break;
case CRYPTO_ALG_TYPE_BLKCIPHER:
if (crypto_report_cipher(skb, alg))
goto nla_put_failure;
break;
case CRYPTO_ALG_TYPE_CIPHER:
if (crypto_report_cipher(skb, alg))
goto nla_put_failure;
break;
case CRYPTO_ALG_TYPE_COMPRESS:
if (crypto_report_comp(skb, alg))
goto nla_put_failure;
break;
case CRYPTO_ALG_TYPE_ACOMPRESS:
if (crypto_report_acomp(skb, alg))
goto nla_put_failure;
break;
case CRYPTO_ALG_TYPE_SCOMPRESS:
if (crypto_report_acomp(skb, alg))
goto nla_put_failure;
break;
case CRYPTO_ALG_TYPE_AKCIPHER:
if (crypto_report_akcipher(skb, alg))
goto nla_put_failure;
break;
case CRYPTO_ALG_TYPE_KPP:
if (crypto_report_kpp(skb, alg))
goto nla_put_failure;
break;
case CRYPTO_ALG_TYPE_AHASH:
if (crypto_report_ahash(skb, alg))
goto nla_put_failure;
break;
case CRYPTO_ALG_TYPE_HASH:
if (crypto_report_shash(skb, alg))
goto nla_put_failure;
break;
case CRYPTO_ALG_TYPE_RNG:
if (crypto_report_rng(skb, alg))
goto nla_put_failure;
break;
default:
pr_err("ERROR: Unhandled alg %d in %s\n",
alg->cra_flags & (CRYPTO_ALG_TYPE_MASK | CRYPTO_ALG_LARVAL),
__func__);
}
out:
return 0;
nla_put_failure:
return -EMSGSIZE;
}
static int crypto_reportstat_alg(struct crypto_alg *alg,
struct crypto_dump_info *info)
{
struct sk_buff *in_skb = info->in_skb;
struct sk_buff *skb = info->out_skb;
struct nlmsghdr *nlh;
struct crypto_user_alg *ualg;
int err = 0;
nlh = nlmsg_put(skb, NETLINK_CB(in_skb).portid, info->nlmsg_seq,
CRYPTO_MSG_GETSTAT, sizeof(*ualg), info->nlmsg_flags);
if (!nlh) {
err = -EMSGSIZE;
goto out;
}
ualg = nlmsg_data(nlh);
err = crypto_reportstat_one(alg, ualg, skb);
if (err) {
nlmsg_cancel(skb, nlh);
goto out;
}
nlmsg_end(skb, nlh);
out:
return err;
}
int crypto_reportstat(struct sk_buff *in_skb, struct nlmsghdr *in_nlh,
struct nlattr **attrs)
{
struct crypto_user_alg *p = nlmsg_data(in_nlh);
struct crypto_alg *alg;
struct sk_buff *skb;
struct crypto_dump_info info;
int err;
if (!null_terminated(p->cru_name) || !null_terminated(p->cru_driver_name))
return -EINVAL;
alg = crypto_alg_match(p, 0);
if (!alg)
return -ENOENT;
err = -ENOMEM;
skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
if (!skb)
goto drop_alg;
info.in_skb = in_skb;
info.out_skb = skb;
info.nlmsg_seq = in_nlh->nlmsg_seq;
info.nlmsg_flags = 0;
err = crypto_reportstat_alg(alg, &info);
drop_alg:
crypto_mod_put(alg);
if (err)
return err;
return nlmsg_unicast(crypto_nlsk, skb, NETLINK_CB(in_skb).portid);
}
int crypto_dump_reportstat(struct sk_buff *skb, struct netlink_callback *cb)
{
struct crypto_alg *alg;
struct crypto_dump_info info;
int err;
if (cb->args[0])
goto out;
cb->args[0] = 1;
info.in_skb = cb->skb;
info.out_skb = skb;
info.nlmsg_seq = cb->nlh->nlmsg_seq;
info.nlmsg_flags = NLM_F_MULTI;
list_for_each_entry(alg, &crypto_alg_list, cra_list) {
err = crypto_reportstat_alg(alg, &info);
if (err)
goto out_err;
}
out:
return skb->len;
out_err:
return err;
}
int crypto_dump_reportstat_done(struct netlink_callback *cb)
{
return 0;
}
MODULE_LICENSE("GPL");

View File

@ -47,9 +47,9 @@ static int echainiv_encrypt(struct aead_request *req)
info = req->iv;
if (req->src != req->dst) {
SKCIPHER_REQUEST_ON_STACK(nreq, ctx->sknull);
SYNC_SKCIPHER_REQUEST_ON_STACK(nreq, ctx->sknull);
skcipher_request_set_tfm(nreq, ctx->sknull);
skcipher_request_set_sync_tfm(nreq, ctx->sknull);
skcipher_request_set_callback(nreq, req->base.flags,
NULL, NULL);
skcipher_request_set_crypt(nreq, req->src, req->dst,

View File

@ -50,7 +50,7 @@ struct crypto_rfc4543_instance_ctx {
struct crypto_rfc4543_ctx {
struct crypto_aead *child;
struct crypto_skcipher *null;
struct crypto_sync_skcipher *null;
u8 nonce[4];
};
@ -1067,9 +1067,9 @@ static int crypto_rfc4543_copy_src_to_dst(struct aead_request *req, bool enc)
unsigned int authsize = crypto_aead_authsize(aead);
unsigned int nbytes = req->assoclen + req->cryptlen -
(enc ? 0 : authsize);
SKCIPHER_REQUEST_ON_STACK(nreq, ctx->null);
SYNC_SKCIPHER_REQUEST_ON_STACK(nreq, ctx->null);
skcipher_request_set_tfm(nreq, ctx->null);
skcipher_request_set_sync_tfm(nreq, ctx->null);
skcipher_request_set_callback(nreq, req->base.flags, NULL, NULL);
skcipher_request_set_crypt(nreq, req->src, req->dst, nbytes, NULL);
@ -1093,7 +1093,7 @@ static int crypto_rfc4543_init_tfm(struct crypto_aead *tfm)
struct crypto_aead_spawn *spawn = &ictx->aead;
struct crypto_rfc4543_ctx *ctx = crypto_aead_ctx(tfm);
struct crypto_aead *aead;
struct crypto_skcipher *null;
struct crypto_sync_skcipher *null;
unsigned long align;
int err = 0;

View File

@ -26,12 +26,6 @@
#include <linux/rwsem.h>
#include <linux/slab.h>
/* Crypto notification events. */
enum {
CRYPTO_MSG_ALG_REQUEST,
CRYPTO_MSG_ALG_REGISTER,
};
struct crypto_instance;
struct crypto_template;
@ -90,8 +84,6 @@ struct crypto_alg *crypto_find_alg(const char *alg_name,
void *crypto_alloc_tfm(const char *alg_name,
const struct crypto_type *frontend, u32 type, u32 mask);
int crypto_register_notifier(struct notifier_block *nb);
int crypto_unregister_notifier(struct notifier_block *nb);
int crypto_probing_notify(unsigned long val, void *v);
unsigned int crypto_alg_extsize(struct crypto_alg *alg);

View File

@ -29,8 +29,6 @@
#include <crypto/b128ops.h>
#include <crypto/gf128mul.h>
#define LRW_BUFFER_SIZE 128u
#define LRW_BLOCK_SIZE 16
struct priv {
@ -56,19 +54,7 @@ struct priv {
};
struct rctx {
be128 buf[LRW_BUFFER_SIZE / sizeof(be128)];
be128 t;
be128 *ext;
struct scatterlist srcbuf[2];
struct scatterlist dstbuf[2];
struct scatterlist *src;
struct scatterlist *dst;
unsigned int left;
struct skcipher_request subreq;
};
@ -120,112 +106,68 @@ static int setkey(struct crypto_skcipher *parent, const u8 *key,
return 0;
}
static inline void inc(be128 *iv)
/*
* Returns the number of trailing '1' bits in the words of the counter, which is
* represented by 4 32-bit words, arranged from least to most significant.
* At the same time, increments the counter by one.
*
* For example:
*
* u32 counter[4] = { 0xFFFFFFFF, 0x1, 0x0, 0x0 };
* int i = next_index(&counter);
* // i == 33, counter == { 0x0, 0x2, 0x0, 0x0 }
*/
static int next_index(u32 *counter)
{
be64_add_cpu(&iv->b, 1);
if (!iv->b)
be64_add_cpu(&iv->a, 1);
}
int i, res = 0;
/* this returns the number of consequative 1 bits starting
* from the right, get_index128(00 00 00 00 00 00 ... 00 00 10 FB) = 2 */
static inline int get_index128(be128 *block)
{
int x;
__be32 *p = (__be32 *) block;
for (i = 0; i < 4; i++) {
if (counter[i] + 1 != 0)
return res + ffz(counter[i]++);
for (p += 3, x = 0; x < 128; p--, x += 32) {
u32 val = be32_to_cpup(p);
if (!~val)
continue;
return x + ffz(val);
counter[i] = 0;
res += 32;
}
return x;
/*
* If we get here, then x == 128 and we are incrementing the counter
* from all ones to all zeros. This means we must return index 127, i.e.
* the one corresponding to key2*{ 1,...,1 }.
*/
return 127;
}
static int post_crypt(struct skcipher_request *req)
/*
* We compute the tweak masks twice (both before and after the ECB encryption or
* decryption) to avoid having to allocate a temporary buffer and/or make
* mutliple calls to the 'ecb(..)' instance, which usually would be slower than
* just doing the next_index() calls again.
*/
static int xor_tweak(struct skcipher_request *req, bool second_pass)
{
struct rctx *rctx = skcipher_request_ctx(req);
be128 *buf = rctx->ext ?: rctx->buf;
struct skcipher_request *subreq;
const int bs = LRW_BLOCK_SIZE;
struct skcipher_walk w;
struct scatterlist *sg;
unsigned offset;
int err;
subreq = &rctx->subreq;
err = skcipher_walk_virt(&w, subreq, false);
while (w.nbytes) {
unsigned int avail = w.nbytes;
be128 *wdst;
wdst = w.dst.virt.addr;
do {
be128_xor(wdst, buf++, wdst);
wdst++;
} while ((avail -= bs) >= bs);
err = skcipher_walk_done(&w, avail);
}
rctx->left -= subreq->cryptlen;
if (err || !rctx->left)
goto out;
rctx->dst = rctx->dstbuf;
scatterwalk_done(&w.out, 0, 1);
sg = w.out.sg;
offset = w.out.offset;
if (rctx->dst != sg) {
rctx->dst[0] = *sg;
sg_unmark_end(rctx->dst);
scatterwalk_crypto_chain(rctx->dst, sg_next(sg), 2);
}
rctx->dst[0].length -= offset - sg->offset;
rctx->dst[0].offset = offset;
out:
return err;
}
static int pre_crypt(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct rctx *rctx = skcipher_request_ctx(req);
struct priv *ctx = crypto_skcipher_ctx(tfm);
be128 *buf = rctx->ext ?: rctx->buf;
struct skcipher_request *subreq;
const int bs = LRW_BLOCK_SIZE;
struct rctx *rctx = skcipher_request_ctx(req);
be128 t = rctx->t;
struct skcipher_walk w;
struct scatterlist *sg;
unsigned cryptlen;
unsigned offset;
be128 *iv;
bool more;
__be32 *iv;
u32 counter[4];
int err;
subreq = &rctx->subreq;
skcipher_request_set_tfm(subreq, tfm);
if (second_pass) {
req = &rctx->subreq;
/* set to our TFM to enforce correct alignment: */
skcipher_request_set_tfm(req, tfm);
}
cryptlen = subreq->cryptlen;
more = rctx->left > cryptlen;
if (!more)
cryptlen = rctx->left;
err = skcipher_walk_virt(&w, req, false);
iv = (__be32 *)w.iv;
skcipher_request_set_crypt(subreq, rctx->src, rctx->dst,
cryptlen, req->iv);
err = skcipher_walk_virt(&w, subreq, false);
iv = w.iv;
counter[0] = be32_to_cpu(iv[3]);
counter[1] = be32_to_cpu(iv[2]);
counter[2] = be32_to_cpu(iv[1]);
counter[3] = be32_to_cpu(iv[0]);
while (w.nbytes) {
unsigned int avail = w.nbytes;
@ -236,188 +178,85 @@ static int pre_crypt(struct skcipher_request *req)
wdst = w.dst.virt.addr;
do {
*buf++ = rctx->t;
be128_xor(wdst++, &rctx->t, wsrc++);
be128_xor(wdst++, &t, wsrc++);
/* T <- I*Key2, using the optimization
* discussed in the specification */
be128_xor(&rctx->t, &rctx->t,
&ctx->mulinc[get_index128(iv)]);
inc(iv);
be128_xor(&t, &t, &ctx->mulinc[next_index(counter)]);
} while ((avail -= bs) >= bs);
if (second_pass && w.nbytes == w.total) {
iv[0] = cpu_to_be32(counter[3]);
iv[1] = cpu_to_be32(counter[2]);
iv[2] = cpu_to_be32(counter[1]);
iv[3] = cpu_to_be32(counter[0]);
}
err = skcipher_walk_done(&w, avail);
}
skcipher_request_set_tfm(subreq, ctx->child);
skcipher_request_set_crypt(subreq, rctx->dst, rctx->dst,
cryptlen, NULL);
if (err || !more)
goto out;
rctx->src = rctx->srcbuf;
scatterwalk_done(&w.in, 0, 1);
sg = w.in.sg;
offset = w.in.offset;
if (rctx->src != sg) {
rctx->src[0] = *sg;
sg_unmark_end(rctx->src);
scatterwalk_crypto_chain(rctx->src, sg_next(sg), 2);
}
rctx->src[0].length -= offset - sg->offset;
rctx->src[0].offset = offset;
out:
return err;
}
static int init_crypt(struct skcipher_request *req, crypto_completion_t done)
static int xor_tweak_pre(struct skcipher_request *req)
{
return xor_tweak(req, false);
}
static int xor_tweak_post(struct skcipher_request *req)
{
return xor_tweak(req, true);
}
static void crypt_done(struct crypto_async_request *areq, int err)
{
struct skcipher_request *req = areq->data;
if (!err)
err = xor_tweak_post(req);
skcipher_request_complete(req, err);
}
static void init_crypt(struct skcipher_request *req)
{
struct priv *ctx = crypto_skcipher_ctx(crypto_skcipher_reqtfm(req));
struct rctx *rctx = skcipher_request_ctx(req);
struct skcipher_request *subreq;
gfp_t gfp;
struct skcipher_request *subreq = &rctx->subreq;
subreq = &rctx->subreq;
skcipher_request_set_callback(subreq, req->base.flags, done, req);
gfp = req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP ? GFP_KERNEL :
GFP_ATOMIC;
rctx->ext = NULL;
subreq->cryptlen = LRW_BUFFER_SIZE;
if (req->cryptlen > LRW_BUFFER_SIZE) {
unsigned int n = min(req->cryptlen, (unsigned int)PAGE_SIZE);
rctx->ext = kmalloc(n, gfp);
if (rctx->ext)
subreq->cryptlen = n;
}
rctx->src = req->src;
rctx->dst = req->dst;
rctx->left = req->cryptlen;
skcipher_request_set_tfm(subreq, ctx->child);
skcipher_request_set_callback(subreq, req->base.flags, crypt_done, req);
/* pass req->iv as IV (will be used by xor_tweak, ECB will ignore it) */
skcipher_request_set_crypt(subreq, req->dst, req->dst,
req->cryptlen, req->iv);
/* calculate first value of T */
memcpy(&rctx->t, req->iv, sizeof(rctx->t));
/* T <- I*Key2 */
gf128mul_64k_bbe(&rctx->t, ctx->table);
return 0;
}
static void exit_crypt(struct skcipher_request *req)
{
struct rctx *rctx = skcipher_request_ctx(req);
rctx->left = 0;
if (rctx->ext)
kzfree(rctx->ext);
}
static int do_encrypt(struct skcipher_request *req, int err)
{
struct rctx *rctx = skcipher_request_ctx(req);
struct skcipher_request *subreq;
subreq = &rctx->subreq;
while (!err && rctx->left) {
err = pre_crypt(req) ?:
crypto_skcipher_encrypt(subreq) ?:
post_crypt(req);
if (err == -EINPROGRESS || err == -EBUSY)
return err;
}
exit_crypt(req);
return err;
}
static void encrypt_done(struct crypto_async_request *areq, int err)
{
struct skcipher_request *req = areq->data;
struct skcipher_request *subreq;
struct rctx *rctx;
rctx = skcipher_request_ctx(req);
if (err == -EINPROGRESS) {
if (rctx->left != req->cryptlen)
return;
goto out;
}
subreq = &rctx->subreq;
subreq->base.flags &= CRYPTO_TFM_REQ_MAY_BACKLOG;
err = do_encrypt(req, err ?: post_crypt(req));
if (rctx->left)
return;
out:
skcipher_request_complete(req, err);
}
static int encrypt(struct skcipher_request *req)
{
return do_encrypt(req, init_crypt(req, encrypt_done));
}
static int do_decrypt(struct skcipher_request *req, int err)
{
struct rctx *rctx = skcipher_request_ctx(req);
struct skcipher_request *subreq;
struct skcipher_request *subreq = &rctx->subreq;
subreq = &rctx->subreq;
while (!err && rctx->left) {
err = pre_crypt(req) ?:
crypto_skcipher_decrypt(subreq) ?:
post_crypt(req);
if (err == -EINPROGRESS || err == -EBUSY)
return err;
}
exit_crypt(req);
return err;
}
static void decrypt_done(struct crypto_async_request *areq, int err)
{
struct skcipher_request *req = areq->data;
struct skcipher_request *subreq;
struct rctx *rctx;
rctx = skcipher_request_ctx(req);
if (err == -EINPROGRESS) {
if (rctx->left != req->cryptlen)
return;
goto out;
}
subreq = &rctx->subreq;
subreq->base.flags &= CRYPTO_TFM_REQ_MAY_BACKLOG;
err = do_decrypt(req, err ?: post_crypt(req));
if (rctx->left)
return;
out:
skcipher_request_complete(req, err);
init_crypt(req);
return xor_tweak_pre(req) ?:
crypto_skcipher_encrypt(subreq) ?:
xor_tweak_post(req);
}
static int decrypt(struct skcipher_request *req)
{
return do_decrypt(req, init_crypt(req, decrypt_done));
struct rctx *rctx = skcipher_request_ctx(req);
struct skcipher_request *subreq = &rctx->subreq;
init_crypt(req);
return xor_tweak_pre(req) ?:
crypto_skcipher_decrypt(subreq) ?:
xor_tweak_post(req);
}
static int init_tfm(struct crypto_skcipher *tfm)
@ -543,7 +382,7 @@ static int create(struct crypto_template *tmpl, struct rtattr **tb)
inst->alg.base.cra_priority = alg->base.cra_priority;
inst->alg.base.cra_blocksize = LRW_BLOCK_SIZE;
inst->alg.base.cra_alignmask = alg->base.cra_alignmask |
(__alignof__(u64) - 1);
(__alignof__(__be32) - 1);
inst->alg.ivsize = LRW_BLOCK_SIZE;
inst->alg.min_keysize = crypto_skcipher_alg_min_keysize(alg) +

View File

@ -1,675 +0,0 @@
/*
* Software multibuffer async crypto daemon.
*
* Copyright (c) 2014 Tim Chen <tim.c.chen@linux.intel.com>
*
* Adapted from crypto daemon.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
*/
#include <crypto/algapi.h>
#include <crypto/internal/hash.h>
#include <crypto/internal/aead.h>
#include <crypto/mcryptd.h>
#include <crypto/crypto_wq.h>
#include <linux/err.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/module.h>
#include <linux/scatterlist.h>
#include <linux/sched.h>
#include <linux/sched/stat.h>
#include <linux/slab.h>
#define MCRYPTD_MAX_CPU_QLEN 100
#define MCRYPTD_BATCH 9
static void *mcryptd_alloc_instance(struct crypto_alg *alg, unsigned int head,
unsigned int tail);
struct mcryptd_flush_list {
struct list_head list;
struct mutex lock;
};
static struct mcryptd_flush_list __percpu *mcryptd_flist;
struct hashd_instance_ctx {
struct crypto_ahash_spawn spawn;
struct mcryptd_queue *queue;
};
static void mcryptd_queue_worker(struct work_struct *work);
void mcryptd_arm_flusher(struct mcryptd_alg_cstate *cstate, unsigned long delay)
{
struct mcryptd_flush_list *flist;
if (!cstate->flusher_engaged) {
/* put the flusher on the flush list */
flist = per_cpu_ptr(mcryptd_flist, smp_processor_id());
mutex_lock(&flist->lock);
list_add_tail(&cstate->flush_list, &flist->list);
cstate->flusher_engaged = true;
cstate->next_flush = jiffies + delay;
queue_delayed_work_on(smp_processor_id(), kcrypto_wq,
&cstate->flush, delay);
mutex_unlock(&flist->lock);
}
}
EXPORT_SYMBOL(mcryptd_arm_flusher);
static int mcryptd_init_queue(struct mcryptd_queue *queue,
unsigned int max_cpu_qlen)
{
int cpu;
struct mcryptd_cpu_queue *cpu_queue;
queue->cpu_queue = alloc_percpu(struct mcryptd_cpu_queue);
pr_debug("mqueue:%p mcryptd_cpu_queue %p\n", queue, queue->cpu_queue);
if (!queue->cpu_queue)
return -ENOMEM;
for_each_possible_cpu(cpu) {
cpu_queue = per_cpu_ptr(queue->cpu_queue, cpu);
pr_debug("cpu_queue #%d %p\n", cpu, queue->cpu_queue);
crypto_init_queue(&cpu_queue->queue, max_cpu_qlen);
INIT_WORK(&cpu_queue->work, mcryptd_queue_worker);
spin_lock_init(&cpu_queue->q_lock);
}
return 0;
}
static void mcryptd_fini_queue(struct mcryptd_queue *queue)
{
int cpu;
struct mcryptd_cpu_queue *cpu_queue;
for_each_possible_cpu(cpu) {
cpu_queue = per_cpu_ptr(queue->cpu_queue, cpu);
BUG_ON(cpu_queue->queue.qlen);
}
free_percpu(queue->cpu_queue);
}
static int mcryptd_enqueue_request(struct mcryptd_queue *queue,
struct crypto_async_request *request,
struct mcryptd_hash_request_ctx *rctx)
{
int cpu, err;
struct mcryptd_cpu_queue *cpu_queue;
cpu_queue = raw_cpu_ptr(queue->cpu_queue);
spin_lock(&cpu_queue->q_lock);
cpu = smp_processor_id();
rctx->tag.cpu = smp_processor_id();
err = crypto_enqueue_request(&cpu_queue->queue, request);
pr_debug("enqueue request: cpu %d cpu_queue %p request %p\n",
cpu, cpu_queue, request);
spin_unlock(&cpu_queue->q_lock);
queue_work_on(cpu, kcrypto_wq, &cpu_queue->work);
return err;
}
/*
* Try to opportunisticlly flush the partially completed jobs if
* crypto daemon is the only task running.
*/
static void mcryptd_opportunistic_flush(void)
{
struct mcryptd_flush_list *flist;
struct mcryptd_alg_cstate *cstate;
flist = per_cpu_ptr(mcryptd_flist, smp_processor_id());
while (single_task_running()) {
mutex_lock(&flist->lock);
cstate = list_first_entry_or_null(&flist->list,
struct mcryptd_alg_cstate, flush_list);
if (!cstate || !cstate->flusher_engaged) {
mutex_unlock(&flist->lock);
return;
}
list_del(&cstate->flush_list);
cstate->flusher_engaged = false;
mutex_unlock(&flist->lock);
cstate->alg_state->flusher(cstate);
}
}
/*
* Called in workqueue context, do one real cryption work (via
* req->complete) and reschedule itself if there are more work to
* do.
*/
static void mcryptd_queue_worker(struct work_struct *work)
{
struct mcryptd_cpu_queue *cpu_queue;
struct crypto_async_request *req, *backlog;
int i;
/*
* Need to loop through more than once for multi-buffer to
* be effective.
*/
cpu_queue = container_of(work, struct mcryptd_cpu_queue, work);
i = 0;
while (i < MCRYPTD_BATCH || single_task_running()) {
spin_lock_bh(&cpu_queue->q_lock);
backlog = crypto_get_backlog(&cpu_queue->queue);
req = crypto_dequeue_request(&cpu_queue->queue);
spin_unlock_bh(&cpu_queue->q_lock);
if (!req) {
mcryptd_opportunistic_flush();
return;
}
if (backlog)
backlog->complete(backlog, -EINPROGRESS);
req->complete(req, 0);
if (!cpu_queue->queue.qlen)
return;
++i;
}
if (cpu_queue->queue.qlen)
queue_work_on(smp_processor_id(), kcrypto_wq, &cpu_queue->work);
}
void mcryptd_flusher(struct work_struct *__work)
{
struct mcryptd_alg_cstate *alg_cpu_state;
struct mcryptd_alg_state *alg_state;
struct mcryptd_flush_list *flist;
int cpu;
cpu = smp_processor_id();
alg_cpu_state = container_of(to_delayed_work(__work),
struct mcryptd_alg_cstate, flush);
alg_state = alg_cpu_state->alg_state;
if (alg_cpu_state->cpu != cpu)
pr_debug("mcryptd error: work on cpu %d, should be cpu %d\n",
cpu, alg_cpu_state->cpu);
if (alg_cpu_state->flusher_engaged) {
flist = per_cpu_ptr(mcryptd_flist, cpu);
mutex_lock(&flist->lock);
list_del(&alg_cpu_state->flush_list);
alg_cpu_state->flusher_engaged = false;
mutex_unlock(&flist->lock);
alg_state->flusher(alg_cpu_state);
}
}
EXPORT_SYMBOL_GPL(mcryptd_flusher);
static inline struct mcryptd_queue *mcryptd_get_queue(struct crypto_tfm *tfm)
{
struct crypto_instance *inst = crypto_tfm_alg_instance(tfm);
struct mcryptd_instance_ctx *ictx = crypto_instance_ctx(inst);
return ictx->queue;
}
static void *mcryptd_alloc_instance(struct crypto_alg *alg, unsigned int head,
unsigned int tail)
{
char *p;
struct crypto_instance *inst;
int err;
p = kzalloc(head + sizeof(*inst) + tail, GFP_KERNEL);
if (!p)
return ERR_PTR(-ENOMEM);
inst = (void *)(p + head);
err = -ENAMETOOLONG;
if (snprintf(inst->alg.cra_driver_name, CRYPTO_MAX_ALG_NAME,
"mcryptd(%s)", alg->cra_driver_name) >= CRYPTO_MAX_ALG_NAME)
goto out_free_inst;
memcpy(inst->alg.cra_name, alg->cra_name, CRYPTO_MAX_ALG_NAME);
inst->alg.cra_priority = alg->cra_priority + 50;
inst->alg.cra_blocksize = alg->cra_blocksize;
inst->alg.cra_alignmask = alg->cra_alignmask;
out:
return p;
out_free_inst:
kfree(p);
p = ERR_PTR(err);
goto out;
}
static inline bool mcryptd_check_internal(struct rtattr **tb, u32 *type,
u32 *mask)
{
struct crypto_attr_type *algt;
algt = crypto_get_attr_type(tb);
if (IS_ERR(algt))
return false;
*type |= algt->type & CRYPTO_ALG_INTERNAL;
*mask |= algt->mask & CRYPTO_ALG_INTERNAL;
if (*type & *mask & CRYPTO_ALG_INTERNAL)
return true;
else
return false;
}
static int mcryptd_hash_init_tfm(struct crypto_tfm *tfm)
{
struct crypto_instance *inst = crypto_tfm_alg_instance(tfm);
struct hashd_instance_ctx *ictx = crypto_instance_ctx(inst);
struct crypto_ahash_spawn *spawn = &ictx->spawn;
struct mcryptd_hash_ctx *ctx = crypto_tfm_ctx(tfm);
struct crypto_ahash *hash;
hash = crypto_spawn_ahash(spawn);
if (IS_ERR(hash))
return PTR_ERR(hash);
ctx->child = hash;
crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm),
sizeof(struct mcryptd_hash_request_ctx) +
crypto_ahash_reqsize(hash));
return 0;
}
static void mcryptd_hash_exit_tfm(struct crypto_tfm *tfm)
{
struct mcryptd_hash_ctx *ctx = crypto_tfm_ctx(tfm);
crypto_free_ahash(ctx->child);
}
static int mcryptd_hash_setkey(struct crypto_ahash *parent,
const u8 *key, unsigned int keylen)
{
struct mcryptd_hash_ctx *ctx = crypto_ahash_ctx(parent);
struct crypto_ahash *child = ctx->child;
int err;
crypto_ahash_clear_flags(child, CRYPTO_TFM_REQ_MASK);
crypto_ahash_set_flags(child, crypto_ahash_get_flags(parent) &
CRYPTO_TFM_REQ_MASK);
err = crypto_ahash_setkey(child, key, keylen);
crypto_ahash_set_flags(parent, crypto_ahash_get_flags(child) &
CRYPTO_TFM_RES_MASK);
return err;
}
static int mcryptd_hash_enqueue(struct ahash_request *req,
crypto_completion_t complete)
{
int ret;
struct mcryptd_hash_request_ctx *rctx = ahash_request_ctx(req);
struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
struct mcryptd_queue *queue =
mcryptd_get_queue(crypto_ahash_tfm(tfm));
rctx->complete = req->base.complete;
req->base.complete = complete;
ret = mcryptd_enqueue_request(queue, &req->base, rctx);
return ret;
}
static void mcryptd_hash_init(struct crypto_async_request *req_async, int err)
{
struct mcryptd_hash_ctx *ctx = crypto_tfm_ctx(req_async->tfm);
struct crypto_ahash *child = ctx->child;
struct ahash_request *req = ahash_request_cast(req_async);
struct mcryptd_hash_request_ctx *rctx = ahash_request_ctx(req);
struct ahash_request *desc = &rctx->areq;
if (unlikely(err == -EINPROGRESS))
goto out;
ahash_request_set_tfm(desc, child);
ahash_request_set_callback(desc, CRYPTO_TFM_REQ_MAY_SLEEP,
rctx->complete, req_async);
rctx->out = req->result;
err = crypto_ahash_init(desc);
out:
local_bh_disable();
rctx->complete(&req->base, err);
local_bh_enable();
}
static int mcryptd_hash_init_enqueue(struct ahash_request *req)
{
return mcryptd_hash_enqueue(req, mcryptd_hash_init);
}
static void mcryptd_hash_update(struct crypto_async_request *req_async, int err)
{
struct ahash_request *req = ahash_request_cast(req_async);
struct mcryptd_hash_request_ctx *rctx = ahash_request_ctx(req);
if (unlikely(err == -EINPROGRESS))
goto out;
rctx->out = req->result;
err = crypto_ahash_update(&rctx->areq);
if (err) {
req->base.complete = rctx->complete;
goto out;
}
return;
out:
local_bh_disable();
rctx->complete(&req->base, err);
local_bh_enable();
}
static int mcryptd_hash_update_enqueue(struct ahash_request *req)
{
return mcryptd_hash_enqueue(req, mcryptd_hash_update);
}
static void mcryptd_hash_final(struct crypto_async_request *req_async, int err)
{
struct ahash_request *req = ahash_request_cast(req_async);
struct mcryptd_hash_request_ctx *rctx = ahash_request_ctx(req);
if (unlikely(err == -EINPROGRESS))
goto out;
rctx->out = req->result;
err = crypto_ahash_final(&rctx->areq);
if (err) {
req->base.complete = rctx->complete;
goto out;
}
return;
out:
local_bh_disable();
rctx->complete(&req->base, err);
local_bh_enable();
}
static int mcryptd_hash_final_enqueue(struct ahash_request *req)
{
return mcryptd_hash_enqueue(req, mcryptd_hash_final);
}
static void mcryptd_hash_finup(struct crypto_async_request *req_async, int err)
{
struct ahash_request *req = ahash_request_cast(req_async);
struct mcryptd_hash_request_ctx *rctx = ahash_request_ctx(req);
if (unlikely(err == -EINPROGRESS))
goto out;
rctx->out = req->result;
err = crypto_ahash_finup(&rctx->areq);
if (err) {
req->base.complete = rctx->complete;
goto out;
}
return;
out:
local_bh_disable();
rctx->complete(&req->base, err);
local_bh_enable();
}
static int mcryptd_hash_finup_enqueue(struct ahash_request *req)
{
return mcryptd_hash_enqueue(req, mcryptd_hash_finup);
}
static void mcryptd_hash_digest(struct crypto_async_request *req_async, int err)
{
struct mcryptd_hash_ctx *ctx = crypto_tfm_ctx(req_async->tfm);
struct crypto_ahash *child = ctx->child;
struct ahash_request *req = ahash_request_cast(req_async);
struct mcryptd_hash_request_ctx *rctx = ahash_request_ctx(req);
struct ahash_request *desc = &rctx->areq;
if (unlikely(err == -EINPROGRESS))
goto out;
ahash_request_set_tfm(desc, child);
ahash_request_set_callback(desc, CRYPTO_TFM_REQ_MAY_SLEEP,
rctx->complete, req_async);
rctx->out = req->result;
err = crypto_ahash_init(desc) ?: crypto_ahash_finup(desc);
out:
local_bh_disable();
rctx->complete(&req->base, err);
local_bh_enable();
}
static int mcryptd_hash_digest_enqueue(struct ahash_request *req)
{
return mcryptd_hash_enqueue(req, mcryptd_hash_digest);
}
static int mcryptd_hash_export(struct ahash_request *req, void *out)
{
struct mcryptd_hash_request_ctx *rctx = ahash_request_ctx(req);
return crypto_ahash_export(&rctx->areq, out);
}
static int mcryptd_hash_import(struct ahash_request *req, const void *in)
{
struct mcryptd_hash_request_ctx *rctx = ahash_request_ctx(req);
return crypto_ahash_import(&rctx->areq, in);
}
static int mcryptd_create_hash(struct crypto_template *tmpl, struct rtattr **tb,
struct mcryptd_queue *queue)
{
struct hashd_instance_ctx *ctx;
struct ahash_instance *inst;
struct hash_alg_common *halg;
struct crypto_alg *alg;
u32 type = 0;
u32 mask = 0;
int err;
if (!mcryptd_check_internal(tb, &type, &mask))
return -EINVAL;
halg = ahash_attr_alg(tb[1], type, mask);
if (IS_ERR(halg))
return PTR_ERR(halg);
alg = &halg->base;
pr_debug("crypto: mcryptd hash alg: %s\n", alg->cra_name);
inst = mcryptd_alloc_instance(alg, ahash_instance_headroom(),
sizeof(*ctx));
err = PTR_ERR(inst);
if (IS_ERR(inst))
goto out_put_alg;
ctx = ahash_instance_ctx(inst);
ctx->queue = queue;
err = crypto_init_ahash_spawn(&ctx->spawn, halg,
ahash_crypto_instance(inst));
if (err)
goto out_free_inst;
inst->alg.halg.base.cra_flags = CRYPTO_ALG_ASYNC |
(alg->cra_flags & (CRYPTO_ALG_INTERNAL |
CRYPTO_ALG_OPTIONAL_KEY));
inst->alg.halg.digestsize = halg->digestsize;
inst->alg.halg.statesize = halg->statesize;
inst->alg.halg.base.cra_ctxsize = sizeof(struct mcryptd_hash_ctx);
inst->alg.halg.base.cra_init = mcryptd_hash_init_tfm;
inst->alg.halg.base.cra_exit = mcryptd_hash_exit_tfm;
inst->alg.init = mcryptd_hash_init_enqueue;
inst->alg.update = mcryptd_hash_update_enqueue;
inst->alg.final = mcryptd_hash_final_enqueue;
inst->alg.finup = mcryptd_hash_finup_enqueue;
inst->alg.export = mcryptd_hash_export;
inst->alg.import = mcryptd_hash_import;
if (crypto_hash_alg_has_setkey(halg))
inst->alg.setkey = mcryptd_hash_setkey;
inst->alg.digest = mcryptd_hash_digest_enqueue;
err = ahash_register_instance(tmpl, inst);
if (err) {
crypto_drop_ahash(&ctx->spawn);
out_free_inst:
kfree(inst);
}
out_put_alg:
crypto_mod_put(alg);
return err;
}
static struct mcryptd_queue mqueue;
static int mcryptd_create(struct crypto_template *tmpl, struct rtattr **tb)
{
struct crypto_attr_type *algt;
algt = crypto_get_attr_type(tb);
if (IS_ERR(algt))
return PTR_ERR(algt);
switch (algt->type & algt->mask & CRYPTO_ALG_TYPE_MASK) {
case CRYPTO_ALG_TYPE_DIGEST:
return mcryptd_create_hash(tmpl, tb, &mqueue);
break;
}
return -EINVAL;
}
static void mcryptd_free(struct crypto_instance *inst)
{
struct mcryptd_instance_ctx *ctx = crypto_instance_ctx(inst);
struct hashd_instance_ctx *hctx = crypto_instance_ctx(inst);
switch (inst->alg.cra_flags & CRYPTO_ALG_TYPE_MASK) {
case CRYPTO_ALG_TYPE_AHASH:
crypto_drop_ahash(&hctx->spawn);
kfree(ahash_instance(inst));
return;
default:
crypto_drop_spawn(&ctx->spawn);
kfree(inst);
}
}
static struct crypto_template mcryptd_tmpl = {
.name = "mcryptd",
.create = mcryptd_create,
.free = mcryptd_free,
.module = THIS_MODULE,
};
struct mcryptd_ahash *mcryptd_alloc_ahash(const char *alg_name,
u32 type, u32 mask)
{
char mcryptd_alg_name[CRYPTO_MAX_ALG_NAME];
struct crypto_ahash *tfm;
if (snprintf(mcryptd_alg_name, CRYPTO_MAX_ALG_NAME,
"mcryptd(%s)", alg_name) >= CRYPTO_MAX_ALG_NAME)
return ERR_PTR(-EINVAL);
tfm = crypto_alloc_ahash(mcryptd_alg_name, type, mask);
if (IS_ERR(tfm))
return ERR_CAST(tfm);
if (tfm->base.__crt_alg->cra_module != THIS_MODULE) {
crypto_free_ahash(tfm);
return ERR_PTR(-EINVAL);
}
return __mcryptd_ahash_cast(tfm);
}
EXPORT_SYMBOL_GPL(mcryptd_alloc_ahash);
struct crypto_ahash *mcryptd_ahash_child(struct mcryptd_ahash *tfm)
{
struct mcryptd_hash_ctx *ctx = crypto_ahash_ctx(&tfm->base);
return ctx->child;
}
EXPORT_SYMBOL_GPL(mcryptd_ahash_child);
struct ahash_request *mcryptd_ahash_desc(struct ahash_request *req)
{
struct mcryptd_hash_request_ctx *rctx = ahash_request_ctx(req);
return &rctx->areq;
}
EXPORT_SYMBOL_GPL(mcryptd_ahash_desc);
void mcryptd_free_ahash(struct mcryptd_ahash *tfm)
{
crypto_free_ahash(&tfm->base);
}
EXPORT_SYMBOL_GPL(mcryptd_free_ahash);
static int __init mcryptd_init(void)
{
int err, cpu;
struct mcryptd_flush_list *flist;
mcryptd_flist = alloc_percpu(struct mcryptd_flush_list);
for_each_possible_cpu(cpu) {
flist = per_cpu_ptr(mcryptd_flist, cpu);
INIT_LIST_HEAD(&flist->list);
mutex_init(&flist->lock);
}
err = mcryptd_init_queue(&mqueue, MCRYPTD_MAX_CPU_QLEN);
if (err) {
free_percpu(mcryptd_flist);
return err;
}
err = crypto_register_template(&mcryptd_tmpl);
if (err) {
mcryptd_fini_queue(&mqueue);
free_percpu(mcryptd_flist);
}
return err;
}
static void __exit mcryptd_exit(void)
{
mcryptd_fini_queue(&mqueue);
crypto_unregister_template(&mcryptd_tmpl);
free_percpu(mcryptd_flist);
}
subsys_initcall(mcryptd_init);
module_exit(mcryptd_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Software async multibuffer crypto daemon");
MODULE_ALIAS_CRYPTO("mcryptd");

View File

@ -385,14 +385,11 @@ static void crypto_morus1280_final(struct morus1280_state *state,
struct morus1280_block *tag_xor,
u64 assoclen, u64 cryptlen)
{
u64 assocbits = assoclen * 8;
u64 cryptbits = cryptlen * 8;
struct morus1280_block tmp;
unsigned int i;
tmp.words[0] = cpu_to_le64(assocbits);
tmp.words[1] = cpu_to_le64(cryptbits);
tmp.words[0] = assoclen * 8;
tmp.words[1] = cryptlen * 8;
tmp.words[2] = 0;
tmp.words[3] = 0;

View File

@ -384,21 +384,13 @@ static void crypto_morus640_final(struct morus640_state *state,
struct morus640_block *tag_xor,
u64 assoclen, u64 cryptlen)
{
u64 assocbits = assoclen * 8;
u64 cryptbits = cryptlen * 8;
u32 assocbits_lo = (u32)assocbits;
u32 assocbits_hi = (u32)(assocbits >> 32);
u32 cryptbits_lo = (u32)cryptbits;
u32 cryptbits_hi = (u32)(cryptbits >> 32);
struct morus640_block tmp;
unsigned int i;
tmp.words[0] = cpu_to_le32(assocbits_lo);
tmp.words[1] = cpu_to_le32(assocbits_hi);
tmp.words[2] = cpu_to_le32(cryptbits_lo);
tmp.words[3] = cpu_to_le32(cryptbits_hi);
tmp.words[0] = lower_32_bits(assoclen * 8);
tmp.words[1] = upper_32_bits(assoclen * 8);
tmp.words[2] = lower_32_bits(cryptlen * 8);
tmp.words[3] = upper_32_bits(cryptlen * 8);
for (i = 0; i < MORUS_BLOCK_WORDS; i++)
state->s[4].words[i] ^= state->s[0].words[i];

225
crypto/ofb.c Normal file
View File

@ -0,0 +1,225 @@
// SPDX-License-Identifier: GPL-2.0
/*
* OFB: Output FeedBack mode
*
* Copyright (C) 2018 ARM Limited or its affiliates.
* All rights reserved.
*
* Based loosely on public domain code gleaned from libtomcrypt
* (https://github.com/libtom/libtomcrypt).
*/
#include <crypto/algapi.h>
#include <crypto/internal/skcipher.h>
#include <linux/err.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/scatterlist.h>
#include <linux/slab.h>
struct crypto_ofb_ctx {
struct crypto_cipher *child;
int cnt;
};
static int crypto_ofb_setkey(struct crypto_skcipher *parent, const u8 *key,
unsigned int keylen)
{
struct crypto_ofb_ctx *ctx = crypto_skcipher_ctx(parent);
struct crypto_cipher *child = ctx->child;
int err;
crypto_cipher_clear_flags(child, CRYPTO_TFM_REQ_MASK);
crypto_cipher_set_flags(child, crypto_skcipher_get_flags(parent) &
CRYPTO_TFM_REQ_MASK);
err = crypto_cipher_setkey(child, key, keylen);
crypto_skcipher_set_flags(parent, crypto_cipher_get_flags(child) &
CRYPTO_TFM_RES_MASK);
return err;
}
static int crypto_ofb_encrypt_segment(struct crypto_ofb_ctx *ctx,
struct skcipher_walk *walk,
struct crypto_cipher *tfm)
{
int bsize = crypto_cipher_blocksize(tfm);
int nbytes = walk->nbytes;
u8 *src = walk->src.virt.addr;
u8 *dst = walk->dst.virt.addr;
u8 *iv = walk->iv;
do {
if (ctx->cnt == bsize) {
if (nbytes < bsize)
break;
crypto_cipher_encrypt_one(tfm, iv, iv);
ctx->cnt = 0;
}
*dst = *src ^ iv[ctx->cnt];
src++;
dst++;
ctx->cnt++;
} while (--nbytes);
return nbytes;
}
static int crypto_ofb_encrypt(struct skcipher_request *req)
{
struct skcipher_walk walk;
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
unsigned int bsize;
struct crypto_ofb_ctx *ctx = crypto_skcipher_ctx(tfm);
struct crypto_cipher *child = ctx->child;
int ret = 0;
bsize = crypto_cipher_blocksize(child);
ctx->cnt = bsize;
ret = skcipher_walk_virt(&walk, req, false);
while (walk.nbytes) {
ret = crypto_ofb_encrypt_segment(ctx, &walk, child);
ret = skcipher_walk_done(&walk, ret);
}
return ret;
}
/* OFB encrypt and decrypt are identical */
static int crypto_ofb_decrypt(struct skcipher_request *req)
{
return crypto_ofb_encrypt(req);
}
static int crypto_ofb_init_tfm(struct crypto_skcipher *tfm)
{
struct skcipher_instance *inst = skcipher_alg_instance(tfm);
struct crypto_spawn *spawn = skcipher_instance_ctx(inst);
struct crypto_ofb_ctx *ctx = crypto_skcipher_ctx(tfm);
struct crypto_cipher *cipher;
cipher = crypto_spawn_cipher(spawn);
if (IS_ERR(cipher))
return PTR_ERR(cipher);
ctx->child = cipher;
return 0;
}
static void crypto_ofb_exit_tfm(struct crypto_skcipher *tfm)
{
struct crypto_ofb_ctx *ctx = crypto_skcipher_ctx(tfm);
crypto_free_cipher(ctx->child);
}
static void crypto_ofb_free(struct skcipher_instance *inst)
{
crypto_drop_skcipher(skcipher_instance_ctx(inst));
kfree(inst);
}
static int crypto_ofb_create(struct crypto_template *tmpl, struct rtattr **tb)
{
struct skcipher_instance *inst;
struct crypto_attr_type *algt;
struct crypto_spawn *spawn;
struct crypto_alg *alg;
u32 mask;
int err;
err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_SKCIPHER);
if (err)
return err;
inst = kzalloc(sizeof(*inst) + sizeof(*spawn), GFP_KERNEL);
if (!inst)
return -ENOMEM;
algt = crypto_get_attr_type(tb);
err = PTR_ERR(algt);
if (IS_ERR(algt))
goto err_free_inst;
mask = CRYPTO_ALG_TYPE_MASK |
crypto_requires_off(algt->type, algt->mask,
CRYPTO_ALG_NEED_FALLBACK);
alg = crypto_get_attr_alg(tb, CRYPTO_ALG_TYPE_CIPHER, mask);
err = PTR_ERR(alg);
if (IS_ERR(alg))
goto err_free_inst;
spawn = skcipher_instance_ctx(inst);
err = crypto_init_spawn(spawn, alg, skcipher_crypto_instance(inst),
CRYPTO_ALG_TYPE_MASK);
crypto_mod_put(alg);
if (err)
goto err_free_inst;
err = crypto_inst_setname(skcipher_crypto_instance(inst), "ofb", alg);
if (err)
goto err_drop_spawn;
inst->alg.base.cra_priority = alg->cra_priority;
inst->alg.base.cra_blocksize = alg->cra_blocksize;
inst->alg.base.cra_alignmask = alg->cra_alignmask;
/* We access the data as u32s when xoring. */
inst->alg.base.cra_alignmask |= __alignof__(u32) - 1;
inst->alg.ivsize = alg->cra_blocksize;
inst->alg.min_keysize = alg->cra_cipher.cia_min_keysize;
inst->alg.max_keysize = alg->cra_cipher.cia_max_keysize;
inst->alg.base.cra_ctxsize = sizeof(struct crypto_ofb_ctx);
inst->alg.init = crypto_ofb_init_tfm;
inst->alg.exit = crypto_ofb_exit_tfm;
inst->alg.setkey = crypto_ofb_setkey;
inst->alg.encrypt = crypto_ofb_encrypt;
inst->alg.decrypt = crypto_ofb_decrypt;
inst->free = crypto_ofb_free;
err = skcipher_register_instance(tmpl, inst);
if (err)
goto err_drop_spawn;
out:
return err;
err_drop_spawn:
crypto_drop_spawn(spawn);
err_free_inst:
kfree(inst);
goto out;
}
static struct crypto_template crypto_ofb_tmpl = {
.name = "ofb",
.create = crypto_ofb_create,
.module = THIS_MODULE,
};
static int __init crypto_ofb_module_init(void)
{
return crypto_register_template(&crypto_ofb_tmpl);
}
static void __exit crypto_ofb_module_exit(void)
{
crypto_unregister_template(&crypto_ofb_tmpl);
}
module_init(crypto_ofb_module_init);
module_exit(crypto_ofb_module_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("OFB block cipher algorithm");
MODULE_ALIAS_CRYPTO("ofb");

View File

@ -50,6 +50,7 @@ int crypto_rng_reset(struct crypto_rng *tfm, const u8 *seed, unsigned int slen)
}
err = crypto_rng_alg(tfm)->seed(tfm, seed, slen);
crypto_stat_rng_seed(tfm, err);
out:
kzfree(buf);
return err;

View File

@ -261,15 +261,6 @@ static int pkcs1pad_encrypt(struct akcipher_request *req)
pkcs1pad_sg_set_buf(req_ctx->in_sg, req_ctx->in_buf,
ctx->key_size - 1 - req->src_len, req->src);
req_ctx->out_buf = kmalloc(ctx->key_size, GFP_KERNEL);
if (!req_ctx->out_buf) {
kfree(req_ctx->in_buf);
return -ENOMEM;
}
pkcs1pad_sg_set_buf(req_ctx->out_sg, req_ctx->out_buf,
ctx->key_size, NULL);
akcipher_request_set_tfm(&req_ctx->child_req, ctx->child);
akcipher_request_set_callback(&req_ctx->child_req, req->base.flags,
pkcs1pad_encrypt_sign_complete_cb, req);

View File

@ -73,9 +73,9 @@ static int seqiv_aead_encrypt(struct aead_request *req)
info = req->iv;
if (req->src != req->dst) {
SKCIPHER_REQUEST_ON_STACK(nreq, ctx->sknull);
SYNC_SKCIPHER_REQUEST_ON_STACK(nreq, ctx->sknull);
skcipher_request_set_tfm(nreq, ctx->sknull);
skcipher_request_set_sync_tfm(nreq, ctx->sknull);
skcipher_request_set_callback(nreq, req->base.flags,
NULL, NULL);
skcipher_request_set_crypt(nreq, req->src, req->dst,

View File

@ -73,13 +73,6 @@ int crypto_shash_setkey(struct crypto_shash *tfm, const u8 *key,
}
EXPORT_SYMBOL_GPL(crypto_shash_setkey);
static inline unsigned int shash_align_buffer_size(unsigned len,
unsigned long mask)
{
typedef u8 __aligned_largest u8_aligned;
return len + (mask & ~(__alignof__(u8_aligned) - 1));
}
static int shash_update_unaligned(struct shash_desc *desc, const u8 *data,
unsigned int len)
{
@ -88,11 +81,17 @@ static int shash_update_unaligned(struct shash_desc *desc, const u8 *data,
unsigned long alignmask = crypto_shash_alignmask(tfm);
unsigned int unaligned_len = alignmask + 1 -
((unsigned long)data & alignmask);
u8 ubuf[shash_align_buffer_size(unaligned_len, alignmask)]
__aligned_largest;
/*
* We cannot count on __aligned() working for large values:
* https://patchwork.kernel.org/patch/9507697/
*/
u8 ubuf[MAX_ALGAPI_ALIGNMASK * 2];
u8 *buf = PTR_ALIGN(&ubuf[0], alignmask + 1);
int err;
if (WARN_ON(buf + unaligned_len > ubuf + sizeof(ubuf)))
return -EINVAL;
if (unaligned_len > len)
unaligned_len = len;
@ -124,11 +123,17 @@ static int shash_final_unaligned(struct shash_desc *desc, u8 *out)
unsigned long alignmask = crypto_shash_alignmask(tfm);
struct shash_alg *shash = crypto_shash_alg(tfm);
unsigned int ds = crypto_shash_digestsize(tfm);
u8 ubuf[shash_align_buffer_size(ds, alignmask)]
__aligned_largest;
/*
* We cannot count on __aligned() working for large values:
* https://patchwork.kernel.org/patch/9507697/
*/
u8 ubuf[MAX_ALGAPI_ALIGNMASK + HASH_MAX_DIGESTSIZE];
u8 *buf = PTR_ALIGN(&ubuf[0], alignmask + 1);
int err;
if (WARN_ON(buf + ds > ubuf + sizeof(ubuf)))
return -EINVAL;
err = shash->final(desc, buf);
if (err)
goto out;
@ -458,9 +463,9 @@ static int shash_prepare_alg(struct shash_alg *alg)
{
struct crypto_alg *base = &alg->base;
if (alg->digestsize > PAGE_SIZE / 8 ||
alg->descsize > PAGE_SIZE / 8 ||
alg->statesize > PAGE_SIZE / 8)
if (alg->digestsize > HASH_MAX_DIGESTSIZE ||
alg->descsize > HASH_MAX_DESCSIZE ||
alg->statesize > HASH_MAX_STATESIZE)
return -EINVAL;
base->cra_type = &crypto_shash_type;

View File

@ -949,6 +949,30 @@ struct crypto_skcipher *crypto_alloc_skcipher(const char *alg_name,
}
EXPORT_SYMBOL_GPL(crypto_alloc_skcipher);
struct crypto_sync_skcipher *crypto_alloc_sync_skcipher(
const char *alg_name, u32 type, u32 mask)
{
struct crypto_skcipher *tfm;
/* Only sync algorithms allowed. */
mask |= CRYPTO_ALG_ASYNC;
tfm = crypto_alloc_tfm(alg_name, &crypto_skcipher_type2, type, mask);
/*
* Make sure we do not allocate something that might get used with
* an on-stack request: check the request size.
*/
if (!IS_ERR(tfm) && WARN_ON(crypto_skcipher_reqsize(tfm) >
MAX_SYNC_SKCIPHER_REQSIZE)) {
crypto_free_skcipher(tfm);
return ERR_PTR(-EINVAL);
}
return (struct crypto_sync_skcipher *)tfm;
}
EXPORT_SYMBOL_GPL(crypto_alloc_sync_skcipher);
int crypto_has_skcipher2(const char *alg_name, u32 type, u32 mask)
{
return crypto_type_has_alg(alg_name, &crypto_skcipher_type2,

View File

@ -1,307 +0,0 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Speck: a lightweight block cipher
*
* Copyright (c) 2018 Google, Inc
*
* Speck has 10 variants, including 5 block sizes. For now we only implement
* the variants Speck128/128, Speck128/192, Speck128/256, Speck64/96, and
* Speck64/128. Speck${B}/${K} denotes the variant with a block size of B bits
* and a key size of K bits. The Speck128 variants are believed to be the most
* secure variants, and they use the same block size and key sizes as AES. The
* Speck64 variants are less secure, but on 32-bit processors are usually
* faster. The remaining variants (Speck32, Speck48, and Speck96) are even less
* secure and/or not as well suited for implementation on either 32-bit or
* 64-bit processors, so are omitted.
*
* Reference: "The Simon and Speck Families of Lightweight Block Ciphers"
* https://eprint.iacr.org/2013/404.pdf
*
* In a correspondence, the Speck designers have also clarified that the words
* should be interpreted in little-endian format, and the words should be
* ordered such that the first word of each block is 'y' rather than 'x', and
* the first key word (rather than the last) becomes the first round key.
*/
#include <asm/unaligned.h>
#include <crypto/speck.h>
#include <linux/bitops.h>
#include <linux/crypto.h>
#include <linux/init.h>
#include <linux/module.h>
/* Speck128 */
static __always_inline void speck128_round(u64 *x, u64 *y, u64 k)
{
*x = ror64(*x, 8);
*x += *y;
*x ^= k;
*y = rol64(*y, 3);
*y ^= *x;
}
static __always_inline void speck128_unround(u64 *x, u64 *y, u64 k)
{
*y ^= *x;
*y = ror64(*y, 3);
*x ^= k;
*x -= *y;
*x = rol64(*x, 8);
}
void crypto_speck128_encrypt(const struct speck128_tfm_ctx *ctx,
u8 *out, const u8 *in)
{
u64 y = get_unaligned_le64(in);
u64 x = get_unaligned_le64(in + 8);
int i;
for (i = 0; i < ctx->nrounds; i++)
speck128_round(&x, &y, ctx->round_keys[i]);
put_unaligned_le64(y, out);
put_unaligned_le64(x, out + 8);
}
EXPORT_SYMBOL_GPL(crypto_speck128_encrypt);
static void speck128_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
{
crypto_speck128_encrypt(crypto_tfm_ctx(tfm), out, in);
}
void crypto_speck128_decrypt(const struct speck128_tfm_ctx *ctx,
u8 *out, const u8 *in)
{
u64 y = get_unaligned_le64(in);
u64 x = get_unaligned_le64(in + 8);
int i;
for (i = ctx->nrounds - 1; i >= 0; i--)
speck128_unround(&x, &y, ctx->round_keys[i]);
put_unaligned_le64(y, out);
put_unaligned_le64(x, out + 8);
}
EXPORT_SYMBOL_GPL(crypto_speck128_decrypt);
static void speck128_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
{
crypto_speck128_decrypt(crypto_tfm_ctx(tfm), out, in);
}
int crypto_speck128_setkey(struct speck128_tfm_ctx *ctx, const u8 *key,
unsigned int keylen)
{
u64 l[3];
u64 k;
int i;
switch (keylen) {
case SPECK128_128_KEY_SIZE:
k = get_unaligned_le64(key);
l[0] = get_unaligned_le64(key + 8);
ctx->nrounds = SPECK128_128_NROUNDS;
for (i = 0; i < ctx->nrounds; i++) {
ctx->round_keys[i] = k;
speck128_round(&l[0], &k, i);
}
break;
case SPECK128_192_KEY_SIZE:
k = get_unaligned_le64(key);
l[0] = get_unaligned_le64(key + 8);
l[1] = get_unaligned_le64(key + 16);
ctx->nrounds = SPECK128_192_NROUNDS;
for (i = 0; i < ctx->nrounds; i++) {
ctx->round_keys[i] = k;
speck128_round(&l[i % 2], &k, i);
}
break;
case SPECK128_256_KEY_SIZE:
k = get_unaligned_le64(key);
l[0] = get_unaligned_le64(key + 8);
l[1] = get_unaligned_le64(key + 16);
l[2] = get_unaligned_le64(key + 24);
ctx->nrounds = SPECK128_256_NROUNDS;
for (i = 0; i < ctx->nrounds; i++) {
ctx->round_keys[i] = k;
speck128_round(&l[i % 3], &k, i);
}
break;
default:
return -EINVAL;
}
return 0;
}
EXPORT_SYMBOL_GPL(crypto_speck128_setkey);
static int speck128_setkey(struct crypto_tfm *tfm, const u8 *key,
unsigned int keylen)
{
return crypto_speck128_setkey(crypto_tfm_ctx(tfm), key, keylen);
}
/* Speck64 */
static __always_inline void speck64_round(u32 *x, u32 *y, u32 k)
{
*x = ror32(*x, 8);
*x += *y;
*x ^= k;
*y = rol32(*y, 3);
*y ^= *x;
}
static __always_inline void speck64_unround(u32 *x, u32 *y, u32 k)
{
*y ^= *x;
*y = ror32(*y, 3);
*x ^= k;
*x -= *y;
*x = rol32(*x, 8);
}
void crypto_speck64_encrypt(const struct speck64_tfm_ctx *ctx,
u8 *out, const u8 *in)
{
u32 y = get_unaligned_le32(in);
u32 x = get_unaligned_le32(in + 4);
int i;
for (i = 0; i < ctx->nrounds; i++)
speck64_round(&x, &y, ctx->round_keys[i]);
put_unaligned_le32(y, out);
put_unaligned_le32(x, out + 4);
}
EXPORT_SYMBOL_GPL(crypto_speck64_encrypt);
static void speck64_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
{
crypto_speck64_encrypt(crypto_tfm_ctx(tfm), out, in);
}
void crypto_speck64_decrypt(const struct speck64_tfm_ctx *ctx,
u8 *out, const u8 *in)
{
u32 y = get_unaligned_le32(in);
u32 x = get_unaligned_le32(in + 4);
int i;
for (i = ctx->nrounds - 1; i >= 0; i--)
speck64_unround(&x, &y, ctx->round_keys[i]);
put_unaligned_le32(y, out);
put_unaligned_le32(x, out + 4);
}
EXPORT_SYMBOL_GPL(crypto_speck64_decrypt);
static void speck64_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
{
crypto_speck64_decrypt(crypto_tfm_ctx(tfm), out, in);
}
int crypto_speck64_setkey(struct speck64_tfm_ctx *ctx, const u8 *key,
unsigned int keylen)
{
u32 l[3];
u32 k;
int i;
switch (keylen) {
case SPECK64_96_KEY_SIZE:
k = get_unaligned_le32(key);
l[0] = get_unaligned_le32(key + 4);
l[1] = get_unaligned_le32(key + 8);
ctx->nrounds = SPECK64_96_NROUNDS;
for (i = 0; i < ctx->nrounds; i++) {
ctx->round_keys[i] = k;
speck64_round(&l[i % 2], &k, i);
}
break;
case SPECK64_128_KEY_SIZE:
k = get_unaligned_le32(key);
l[0] = get_unaligned_le32(key + 4);
l[1] = get_unaligned_le32(key + 8);
l[2] = get_unaligned_le32(key + 12);
ctx->nrounds = SPECK64_128_NROUNDS;
for (i = 0; i < ctx->nrounds; i++) {
ctx->round_keys[i] = k;
speck64_round(&l[i % 3], &k, i);
}
break;
default:
return -EINVAL;
}
return 0;
}
EXPORT_SYMBOL_GPL(crypto_speck64_setkey);
static int speck64_setkey(struct crypto_tfm *tfm, const u8 *key,
unsigned int keylen)
{
return crypto_speck64_setkey(crypto_tfm_ctx(tfm), key, keylen);
}
/* Algorithm definitions */
static struct crypto_alg speck_algs[] = {
{
.cra_name = "speck128",
.cra_driver_name = "speck128-generic",
.cra_priority = 100,
.cra_flags = CRYPTO_ALG_TYPE_CIPHER,
.cra_blocksize = SPECK128_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct speck128_tfm_ctx),
.cra_module = THIS_MODULE,
.cra_u = {
.cipher = {
.cia_min_keysize = SPECK128_128_KEY_SIZE,
.cia_max_keysize = SPECK128_256_KEY_SIZE,
.cia_setkey = speck128_setkey,
.cia_encrypt = speck128_encrypt,
.cia_decrypt = speck128_decrypt
}
}
}, {
.cra_name = "speck64",
.cra_driver_name = "speck64-generic",
.cra_priority = 100,
.cra_flags = CRYPTO_ALG_TYPE_CIPHER,
.cra_blocksize = SPECK64_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct speck64_tfm_ctx),
.cra_module = THIS_MODULE,
.cra_u = {
.cipher = {
.cia_min_keysize = SPECK64_96_KEY_SIZE,
.cia_max_keysize = SPECK64_128_KEY_SIZE,
.cia_setkey = speck64_setkey,
.cia_encrypt = speck64_encrypt,
.cia_decrypt = speck64_decrypt
}
}
}
};
static int __init speck_module_init(void)
{
return crypto_register_algs(speck_algs, ARRAY_SIZE(speck_algs));
}
static void __exit speck_module_exit(void)
{
crypto_unregister_algs(speck_algs, ARRAY_SIZE(speck_algs));
}
module_init(speck_module_init);
module_exit(speck_module_exit);
MODULE_DESCRIPTION("Speck block cipher (generic)");
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Eric Biggers <ebiggers@google.com>");
MODULE_ALIAS_CRYPTO("speck128");
MODULE_ALIAS_CRYPTO("speck128-generic");
MODULE_ALIAS_CRYPTO("speck64");
MODULE_ALIAS_CRYPTO("speck64-generic");

View File

@ -76,8 +76,7 @@ static char *check[] = {
"cast6", "arc4", "michael_mic", "deflate", "crc32c", "tea", "xtea",
"khazad", "wp512", "wp384", "wp256", "tnepres", "xeta", "fcrypt",
"camellia", "seed", "salsa20", "rmd128", "rmd160", "rmd256", "rmd320",
"lzo", "cts", "zlib", "sha3-224", "sha3-256", "sha3-384", "sha3-512",
NULL
"lzo", "cts", "sha3-224", "sha3-256", "sha3-384", "sha3-512", NULL
};
static u32 block_sizes[] = { 16, 64, 256, 1024, 8192, 0 };
@ -1103,6 +1102,9 @@ static void test_ahash_speed_common(const char *algo, unsigned int secs,
break;
}
if (speed[i].klen)
crypto_ahash_setkey(tfm, tvmem[0], speed[i].klen);
pr_info("test%3u "
"(%5u byte blocks,%5u bytes per update,%4u updates): ",
i, speed[i].blen, speed[i].plen, speed[i].blen / speed[i].plen);
@ -1733,6 +1735,7 @@ static int do_test(const char *alg, u32 type, u32 mask, int m, u32 num_mb)
ret += tcrypt_test("xts(aes)");
ret += tcrypt_test("ctr(aes)");
ret += tcrypt_test("rfc3686(ctr(aes))");
ret += tcrypt_test("ofb(aes)");
break;
case 11:
@ -1878,10 +1881,6 @@ static int do_test(const char *alg, u32 type, u32 mask, int m, u32 num_mb)
ret += tcrypt_test("ecb(seed)");
break;
case 44:
ret += tcrypt_test("zlib");
break;
case 45:
ret += tcrypt_test("rfc4309(ccm(aes))");
break;
@ -2033,6 +2032,8 @@ static int do_test(const char *alg, u32 type, u32 mask, int m, u32 num_mb)
break;
case 191:
ret += tcrypt_test("ecb(sm4)");
ret += tcrypt_test("cbc(sm4)");
ret += tcrypt_test("ctr(sm4)");
break;
case 200:
test_cipher_speed("ecb(aes)", ENCRYPT, sec, NULL, 0,
@ -2282,6 +2283,20 @@ static int do_test(const char *alg, u32 type, u32 mask, int m, u32 num_mb)
num_mb);
break;
case 218:
test_cipher_speed("ecb(sm4)", ENCRYPT, sec, NULL, 0,
speed_template_16);
test_cipher_speed("ecb(sm4)", DECRYPT, sec, NULL, 0,
speed_template_16);
test_cipher_speed("cbc(sm4)", ENCRYPT, sec, NULL, 0,
speed_template_16);
test_cipher_speed("cbc(sm4)", DECRYPT, sec, NULL, 0,
speed_template_16);
test_cipher_speed("ctr(sm4)", ENCRYPT, sec, NULL, 0,
speed_template_16);
test_cipher_speed("ctr(sm4)", DECRYPT, sec, NULL, 0,
speed_template_16);
break;
case 300:
if (alg) {
test_hash_speed(alg, sec, generic_hash_speed_template);

Some files were not shown because too many files have changed in this diff Show More