crypto: arm64/aes-ccm - Merge encrypt and decrypt tail handling

The encryption and decryption code paths are mostly identical, except for a small difference where the plaintext input into the MAC is taken from either the input or the output block. We can factor this in quite easily using a vector bit select, and a few additional XORs, without the need for branches. This way, we can use the same tail handling logic on the encrypt and decrypt code paths, allowing further consolidation of the asm helpers in a subsequent patch. (In the main loop, adding just a handful of ALU instructions results in a noticeable performance hit [around 5% on Apple M2], so those routines are kept separate) Signed-off-by: Ard Biesheuvel <ardb@kernel.org> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2025-01-06 05:06:29 +00:00 · 2024-01-18 18:06:36 +01:00 · 2024-01-18 18:06:36 +01:00 · 7150528849
commit 7150528849
parent 565def1542
1 changed files with 13 additions and 13 deletions
--- a/arch/arm64/crypto/aes-ce-ccm-core.S
+++ b/arch/arm64/crypto/aes-ce-ccm-core.S
@ -77,7 +77,7 @@ CPU_LE(	rev	x8, x8			)	/* keep swabbed ctr in reg */
 	aes_encrypt	v0, v1, w4

 	subs	w2, w2, #16
-	bmi	6f				/* partial block? */
+	bmi	ce_aes_ccm_crypt_tail
 	ld1	{v2.16b}, [x1], #16		/* load next input block */
 	.if	\enc == 1
 	eor	v2.16b, v2.16b, v5.16b		/* final round enc+mac */
@ -93,8 +93,10 @@ CPU_LE(	rev	x8, x8			)
 	st1	{v0.16b}, [x5]			/* store mac */
 	str	x8, [x6, #8]			/* store lsb end of ctr (BE) */
 5:	ret
+	.endm

-6:	eor	v0.16b, v0.16b, v5.16b		/* final round mac */
+SYM_FUNC_START_LOCAL(ce_aes_ccm_crypt_tail)
+	eor	v0.16b, v0.16b, v5.16b		/* final round mac */
 	eor	v1.16b, v1.16b, v5.16b		/* final round enc */

 	add	x1, x1, w2, sxtw		/* rewind the input pointer (w2 < 0) */
@ -108,20 +110,16 @@ CPU_LE(	rev	x8, x8			)

 	ld1	{v2.16b}, [x1]			/* load a full block of input */
 	tbl	v1.16b, {v1.16b}, v7.16b	/* move keystream to end of register */
-	.if	\enc == 1
-	tbl	v7.16b, {v2.16b}, v9.16b	/* copy plaintext to start of v7 */
-	eor	v2.16b, v2.16b, v1.16b		/* encrypt partial input block */
-	.else
-	eor	v2.16b, v2.16b, v1.16b		/* decrypt partial input block */
-	tbl	v7.16b, {v2.16b}, v9.16b	/* copy plaintext to start of v7 */
-	.endif
-	eor	v0.16b, v0.16b, v7.16b		/* fold plaintext into mac */
-	tbx	v2.16b, {v6.16b}, v8.16b	/* insert output from previous iteration */
+	eor	v7.16b, v2.16b, v1.16b		/* encrypt partial input block */
+	bif	v2.16b, v7.16b, v22.16b		/* select plaintext */
+	tbx	v7.16b, {v6.16b}, v8.16b	/* insert output from previous iteration */
+	tbl	v2.16b, {v2.16b}, v9.16b	/* copy plaintext to start of v2 */
+	eor	v0.16b, v0.16b, v2.16b		/* fold plaintext into mac */

 	st1	{v0.16b}, [x5]			/* store mac */
-	st1	{v2.16b}, [x0]			/* store output block */
+	st1	{v7.16b}, [x0]			/* store output block */
 	ret
-	.endm
+SYM_FUNC_END(ce_aes_ccm_crypt_tail)

 	/*
 	 * void ce_aes_ccm_encrypt(u8 out[], u8 const in[], u32 cbytes,
@ -132,10 +130,12 @@ CPU_LE(	rev	x8, x8			)
 	 * 			   u8 ctr[]);
 	 */
 SYM_FUNC_START(ce_aes_ccm_encrypt)
+	movi	v22.16b, #255
 	aes_ccm_do_crypt	1
 SYM_FUNC_END(ce_aes_ccm_encrypt)

 SYM_FUNC_START(ce_aes_ccm_decrypt)
+	movi	v22.16b, #0
 	aes_ccm_do_crypt	0
 SYM_FUNC_END(ce_aes_ccm_decrypt)