mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git
synced 2025-01-09 07:23:14 +00:00
Merge branch 'linus' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6
Pull crypto update from Herbert Xu:
 "API:
   - Try to catch hash output overrun in testmgr
   - Introduce walksize attribute for batched walking
   - Make crypto_xor() and crypto_inc() alignment agnostic

  Algorithms:
   - Add time-invariant AES algorithm
   - Add standalone CBCMAC algorithm

  Drivers:
   - Add NEON accelerated chacha20 on ARM/ARM64
   - Expose AES-CTR as synchronous skcipher on ARM64
   - Add scalar AES implementation on ARM64
   - Improve scalar AES implementation on ARM
   - Improve NEON AES implementation on ARM/ARM64
   - Merge CRC32 and PMULL instruction based drivers on ARM64
   - Add NEON accelerated CBCMAC/CMAC/XCBC AES on ARM64
   - Add IPsec AUTHENC implementation in atmel
   - Add Support for Octeon-tx CPT Engine
   - Add Broadcom SPU driver
   - Add MediaTek driver"

* 'linus' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6: (142 commits)
  crypto: xts - Add ECB dependency
  crypto: cavium - switch to pci_alloc_irq_vectors
  crypto: cavium - switch to pci_alloc_irq_vectors
  crypto: cavium - remove dead MSI-X related define
  crypto: brcm - Avoid double free in ahash_finup()
  crypto: cavium - fix Kconfig dependencies
  crypto: cavium - cpt_bind_vq_to_grp could return an error code
  crypto: doc - fix typo
  hwrng: omap - update Kconfig help description
  crypto: ccm - drop unnecessary minimum 32-bit alignment
  crypto: ccm - honour alignmask of subordinate MAC cipher
  crypto: caam - fix state buffer DMA (un)mapping
  crypto: caam - abstract ahash request double buffering
  crypto: caam - fix error path for ctx_dma mapping failure
  crypto: caam - fix DMA API leaks for multiple setkey() calls
  crypto: caam - don't dma_map key for hash algorithms
  crypto: caam - use dma_map_sg() return code
  crypto: caam - replace sg_count() with sg_nents_for_len()
  crypto: caam - check sg_count() return value
  crypto: caam - fix HW S/G in ablkcipher_giv_edesc_alloc()
  ...
This commit is contained in: commit 5bcbe22ca4
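The pull summary above notes that crypto_xor() and crypto_inc() were made alignment agnostic, and several of the ARM AES-CTR changes further down switch the final partial block to a copy-then-XOR-with-keystream pattern. Below is a minimal, standalone C sketch of that tail-block pattern; keystream_block() and xor_bytes() are illustrative stand-ins, not the kernel's ce_aes_ctr_encrypt() or crypto_xor().

/*
 * Sketch of CTR tail-block handling: the last (partial) block is produced
 * by copying the remaining plaintext bytes and XORing them with one block
 * of keystream. Stand-in helpers only, not kernel code.
 */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define BLOCK_SIZE 16

/* Stand-in for a real block cipher producing E_k(counter). */
static void keystream_block(uint8_t out[BLOCK_SIZE], uint64_t counter)
{
	for (int i = 0; i < BLOCK_SIZE; i++)
		out[i] = (uint8_t)(counter * 31 + i * 7 + 1);
}

/* Byte-wise XOR, the role crypto_xor() plays in the kernel. */
static void xor_bytes(uint8_t *dst, const uint8_t *src, size_t n)
{
	for (size_t i = 0; i < n; i++)
		dst[i] ^= src[i];
}

/* Encrypt the final partial block: copy the tail, then XOR the keystream. */
static void ctr_encrypt_tail(uint8_t *dst, const uint8_t *src, size_t nbytes,
			     uint64_t counter)
{
	uint8_t ks[BLOCK_SIZE];

	keystream_block(ks, counter);
	if (dst != src)
		memcpy(dst, src, nbytes);
	xor_bytes(dst, ks, nbytes);	/* only nbytes of keystream are used */
}

int main(void)
{
	uint8_t pt[] = { 't', 'a', 'i', 'l', 0 };
	uint8_t ct[sizeof(pt)];

	ctr_encrypt_tail(ct, pt, sizeof(pt), 42);
	ctr_encrypt_tail(ct, ct, sizeof(ct), 42);	/* CTR decryption is the same operation */
	printf("%s\n", memcmp(ct, pt, sizeof(pt)) == 0 ? "round-trip ok" : "mismatch");
	return 0;
}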
@ -14,7 +14,7 @@ Asynchronous Message Digest API
:doc: Asynchronous Message Digest API

.. kernel-doc:: include/crypto/hash.h

:functions: crypto_alloc_ahash crypto_free_ahash crypto_ahash_init crypto_ahash_digestsize crypto_ahash_reqtfm crypto_ahash_reqsize crypto_ahash_setkey crypto_ahash_finup crypto_ahash_final crypto_ahash_digest crypto_ahash_export crypto_ahash_import
:functions: crypto_alloc_ahash crypto_free_ahash crypto_ahash_init crypto_ahash_digestsize crypto_ahash_reqtfm crypto_ahash_reqsize crypto_ahash_statesize crypto_ahash_setkey crypto_ahash_finup crypto_ahash_final crypto_ahash_digest crypto_ahash_export crypto_ahash_import

Asynchronous Hash Request Handle
--------------------------------
@ -59,4 +59,4 @@ Synchronous Block Cipher API - Deprecated
:doc: Synchronous Block Cipher API

.. kernel-doc:: include/linux/crypto.h

:functions: crypto_alloc_blkcipher rypto_free_blkcipher crypto_has_blkcipher crypto_blkcipher_name crypto_blkcipher_ivsize crypto_blkcipher_blocksize crypto_blkcipher_setkey crypto_blkcipher_encrypt crypto_blkcipher_encrypt_iv crypto_blkcipher_decrypt crypto_blkcipher_decrypt_iv crypto_blkcipher_set_iv crypto_blkcipher_get_iv
:functions: crypto_alloc_blkcipher crypto_free_blkcipher crypto_has_blkcipher crypto_blkcipher_name crypto_blkcipher_ivsize crypto_blkcipher_blocksize crypto_blkcipher_setkey crypto_blkcipher_encrypt crypto_blkcipher_encrypt_iv crypto_blkcipher_decrypt crypto_blkcipher_decrypt_iv crypto_blkcipher_set_iv crypto_blkcipher_get_iv
22 Documentation/devicetree/bindings/crypto/brcm,spu-crypto.txt (new file)
@ -0,0 +1,22 @@
The Broadcom Secure Processing Unit (SPU) hardware supports symmetric
cryptographic offload for Broadcom SoCs. A SoC may have multiple SPU hardware
blocks.

Required properties:
- compatible: Should be one of the following:
  brcm,spum-crypto - for devices with SPU-M hardware
  brcm,spu2-crypto - for devices with SPU2 hardware
  brcm,spu2-v2-crypto - for devices with enhanced SPU2 hardware features like SHA3
  and Rabin Fingerprint support
  brcm,spum-nsp-crypto - for the Northstar Plus variant of the SPU-M hardware

- reg: Should contain SPU registers location and length.
- mboxes: The mailbox channel to be used to communicate with the SPU.
  Mailbox channels correspond to DMA rings on the device.

Example:
crypto@612d0000 {
	compatible = "brcm,spum-crypto";
	reg = <0 0x612d0000 0 0x900>;
	mboxes = <&pdc0 0>;
};
27 Documentation/devicetree/bindings/crypto/mediatek-crypto.txt (new file)
@ -0,0 +1,27 @@
MediaTek cryptographic accelerators

Required properties:
- compatible: Should be "mediatek,eip97-crypto"
- reg: Address and length of the register set for the device
- interrupts: Should contain the five crypto engines interrupts in numeric
  order. These are global system and four descriptor rings.
- clocks: the clock used by the core
- clock-names: the names of the clock listed in the clocks property. These are
  "ethif", "cryp"
- power-domains: Must contain a reference to the PM domain.

Example:
crypto: crypto@1b240000 {
	compatible = "mediatek,eip97-crypto";
	reg = <0 0x1b240000 0 0x20000>;
	interrupts = <GIC_SPI 82 IRQ_TYPE_LEVEL_LOW>,
		     <GIC_SPI 83 IRQ_TYPE_LEVEL_LOW>,
		     <GIC_SPI 84 IRQ_TYPE_LEVEL_LOW>,
		     <GIC_SPI 91 IRQ_TYPE_LEVEL_LOW>,
		     <GIC_SPI 97 IRQ_TYPE_LEVEL_LOW>;
	clocks = <&topckgen CLK_TOP_ETHIF_SEL>,
		 <&ethsys CLK_ETHSYS_CRYPTO>;
	clock-names = "ethif","cryp";
	power-domains = <&scpsys MT2701_POWER_DOMAIN_ETH>;
};
@ -3031,6 +3031,13 @@ W: http://www.cavium.com
S: Supported
F: drivers/net/ethernet/cavium/liquidio/

CAVIUM OCTEON-TX CRYPTO DRIVER
M: George Cherian <george.cherian@cavium.com>
L: linux-crypto@vger.kernel.org
W: http://www.cavium.com
S: Supported
F: drivers/crypto/cavium/cpt/

CC2520 IEEE-802.15.4 RADIO DRIVER
M: Varka Bhadram <varkabhadram@gmail.com>
L: linux-wpan@vger.kernel.org
@ -62,35 +62,18 @@ config CRYPTO_SHA512_ARM
	  using optimized ARM assembler and NEON, when available.

config CRYPTO_AES_ARM
	tristate "AES cipher algorithms (ARM-asm)"
	depends on ARM
	tristate "Scalar AES cipher for ARM"
	select CRYPTO_ALGAPI
	select CRYPTO_AES
	help
	  Use optimized AES assembler routines for ARM platforms.

	  AES cipher algorithms (FIPS-197). AES uses the Rijndael
	  algorithm.

	  Rijndael appears to be consistently a very good performer in
	  both hardware and software across a wide range of computing
	  environments regardless of its use in feedback or non-feedback
	  modes. Its key setup time is excellent, and its key agility is
	  good. Rijndael's very low memory requirements make it very well
	  suited for restricted-space environments, in which it also
	  demonstrates excellent performance. Rijndael's operations are
	  among the easiest to defend against power and timing attacks.

	  The AES specifies three key sizes: 128, 192 and 256 bits

	  See <http://csrc.nist.gov/encryption/aes/> for more information.

config CRYPTO_AES_ARM_BS
	tristate "Bit sliced AES using NEON instructions"
	depends on KERNEL_MODE_NEON
	select CRYPTO_AES_ARM
	select CRYPTO_BLKCIPHER
	select CRYPTO_SIMD
	select CRYPTO_AES_ARM
	help
	  Use a faster and more secure NEON based implementation of AES in CBC,
	  CTR and XTS modes
@ -130,4 +113,10 @@ config CRYPTO_CRC32_ARM_CE
	depends on KERNEL_MODE_NEON && CRC32
	select CRYPTO_HASH

config CRYPTO_CHACHA20_NEON
	tristate "NEON accelerated ChaCha20 symmetric cipher"
	depends on KERNEL_MODE_NEON
	select CRYPTO_BLKCIPHER
	select CRYPTO_CHACHA20

endif
@ -8,6 +8,7 @@ obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o
|
||||
obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o
|
||||
obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o
|
||||
obj-$(CONFIG_CRYPTO_SHA512_ARM) += sha512-arm.o
|
||||
obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha20-neon.o
|
||||
|
||||
ce-obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o
|
||||
ce-obj-$(CONFIG_CRYPTO_SHA1_ARM_CE) += sha1-arm-ce.o
|
||||
@ -26,8 +27,8 @@ $(warning $(ce-obj-y) $(ce-obj-m))
|
||||
endif
|
||||
endif
|
||||
|
||||
aes-arm-y := aes-armv4.o aes_glue.o
|
||||
aes-arm-bs-y := aesbs-core.o aesbs-glue.o
|
||||
aes-arm-y := aes-cipher-core.o aes-cipher-glue.o
|
||||
aes-arm-bs-y := aes-neonbs-core.o aes-neonbs-glue.o
|
||||
sha1-arm-y := sha1-armv4-large.o sha1_glue.o
|
||||
sha1-arm-neon-y := sha1-armv7-neon.o sha1_neon_glue.o
|
||||
sha256-arm-neon-$(CONFIG_KERNEL_MODE_NEON) := sha256_neon_glue.o
|
||||
@ -40,17 +41,15 @@ aes-arm-ce-y := aes-ce-core.o aes-ce-glue.o
|
||||
ghash-arm-ce-y := ghash-ce-core.o ghash-ce-glue.o
|
||||
crct10dif-arm-ce-y := crct10dif-ce-core.o crct10dif-ce-glue.o
|
||||
crc32-arm-ce-y:= crc32-ce-core.o crc32-ce-glue.o
|
||||
chacha20-neon-y := chacha20-neon-core.o chacha20-neon-glue.o
|
||||
|
||||
quiet_cmd_perl = PERL $@
|
||||
cmd_perl = $(PERL) $(<) > $(@)
|
||||
|
||||
$(src)/aesbs-core.S_shipped: $(src)/bsaes-armv7.pl
|
||||
$(call cmd,perl)
|
||||
|
||||
$(src)/sha256-core.S_shipped: $(src)/sha256-armv4.pl
|
||||
$(call cmd,perl)
|
||||
|
||||
$(src)/sha512-core.S_shipped: $(src)/sha512-armv4.pl
|
||||
$(call cmd,perl)
|
||||
|
||||
.PRECIOUS: $(obj)/aesbs-core.S $(obj)/sha256-core.S $(obj)/sha512-core.S
|
||||
.PRECIOUS: $(obj)/sha256-core.S $(obj)/sha512-core.S
|
||||
|
File diff suppressed because it is too large.
@ -169,19 +169,19 @@ ENTRY(ce_aes_ecb_encrypt)
|
||||
.Lecbencloop3x:
|
||||
subs r4, r4, #3
|
||||
bmi .Lecbenc1x
|
||||
vld1.8 {q0-q1}, [r1, :64]!
|
||||
vld1.8 {q2}, [r1, :64]!
|
||||
vld1.8 {q0-q1}, [r1]!
|
||||
vld1.8 {q2}, [r1]!
|
||||
bl aes_encrypt_3x
|
||||
vst1.8 {q0-q1}, [r0, :64]!
|
||||
vst1.8 {q2}, [r0, :64]!
|
||||
vst1.8 {q0-q1}, [r0]!
|
||||
vst1.8 {q2}, [r0]!
|
||||
b .Lecbencloop3x
|
||||
.Lecbenc1x:
|
||||
adds r4, r4, #3
|
||||
beq .Lecbencout
|
||||
.Lecbencloop:
|
||||
vld1.8 {q0}, [r1, :64]!
|
||||
vld1.8 {q0}, [r1]!
|
||||
bl aes_encrypt
|
||||
vst1.8 {q0}, [r0, :64]!
|
||||
vst1.8 {q0}, [r0]!
|
||||
subs r4, r4, #1
|
||||
bne .Lecbencloop
|
||||
.Lecbencout:
|
||||
@ -195,19 +195,19 @@ ENTRY(ce_aes_ecb_decrypt)
|
||||
.Lecbdecloop3x:
|
||||
subs r4, r4, #3
|
||||
bmi .Lecbdec1x
|
||||
vld1.8 {q0-q1}, [r1, :64]!
|
||||
vld1.8 {q2}, [r1, :64]!
|
||||
vld1.8 {q0-q1}, [r1]!
|
||||
vld1.8 {q2}, [r1]!
|
||||
bl aes_decrypt_3x
|
||||
vst1.8 {q0-q1}, [r0, :64]!
|
||||
vst1.8 {q2}, [r0, :64]!
|
||||
vst1.8 {q0-q1}, [r0]!
|
||||
vst1.8 {q2}, [r0]!
|
||||
b .Lecbdecloop3x
|
||||
.Lecbdec1x:
|
||||
adds r4, r4, #3
|
||||
beq .Lecbdecout
|
||||
.Lecbdecloop:
|
||||
vld1.8 {q0}, [r1, :64]!
|
||||
vld1.8 {q0}, [r1]!
|
||||
bl aes_decrypt
|
||||
vst1.8 {q0}, [r0, :64]!
|
||||
vst1.8 {q0}, [r0]!
|
||||
subs r4, r4, #1
|
||||
bne .Lecbdecloop
|
||||
.Lecbdecout:
|
||||
@ -226,10 +226,10 @@ ENTRY(ce_aes_cbc_encrypt)
|
||||
vld1.8 {q0}, [r5]
|
||||
prepare_key r2, r3
|
||||
.Lcbcencloop:
|
||||
vld1.8 {q1}, [r1, :64]! @ get next pt block
|
||||
vld1.8 {q1}, [r1]! @ get next pt block
|
||||
veor q0, q0, q1 @ ..and xor with iv
|
||||
bl aes_encrypt
|
||||
vst1.8 {q0}, [r0, :64]!
|
||||
vst1.8 {q0}, [r0]!
|
||||
subs r4, r4, #1
|
||||
bne .Lcbcencloop
|
||||
vst1.8 {q0}, [r5]
|
||||
@ -244,8 +244,8 @@ ENTRY(ce_aes_cbc_decrypt)
|
||||
.Lcbcdecloop3x:
|
||||
subs r4, r4, #3
|
||||
bmi .Lcbcdec1x
|
||||
vld1.8 {q0-q1}, [r1, :64]!
|
||||
vld1.8 {q2}, [r1, :64]!
|
||||
vld1.8 {q0-q1}, [r1]!
|
||||
vld1.8 {q2}, [r1]!
|
||||
vmov q3, q0
|
||||
vmov q4, q1
|
||||
vmov q5, q2
|
||||
@ -254,19 +254,19 @@ ENTRY(ce_aes_cbc_decrypt)
|
||||
veor q1, q1, q3
|
||||
veor q2, q2, q4
|
||||
vmov q6, q5
|
||||
vst1.8 {q0-q1}, [r0, :64]!
|
||||
vst1.8 {q2}, [r0, :64]!
|
||||
vst1.8 {q0-q1}, [r0]!
|
||||
vst1.8 {q2}, [r0]!
|
||||
b .Lcbcdecloop3x
|
||||
.Lcbcdec1x:
|
||||
adds r4, r4, #3
|
||||
beq .Lcbcdecout
|
||||
vmov q15, q14 @ preserve last round key
|
||||
.Lcbcdecloop:
|
||||
vld1.8 {q0}, [r1, :64]! @ get next ct block
|
||||
vld1.8 {q0}, [r1]! @ get next ct block
|
||||
veor q14, q15, q6 @ combine prev ct with last key
|
||||
vmov q6, q0
|
||||
bl aes_decrypt
|
||||
vst1.8 {q0}, [r0, :64]!
|
||||
vst1.8 {q0}, [r0]!
|
||||
subs r4, r4, #1
|
||||
bne .Lcbcdecloop
|
||||
.Lcbcdecout:
|
||||
@ -300,15 +300,15 @@ ENTRY(ce_aes_ctr_encrypt)
|
||||
rev ip, r6
|
||||
add r6, r6, #1
|
||||
vmov s11, ip
|
||||
vld1.8 {q3-q4}, [r1, :64]!
|
||||
vld1.8 {q5}, [r1, :64]!
|
||||
vld1.8 {q3-q4}, [r1]!
|
||||
vld1.8 {q5}, [r1]!
|
||||
bl aes_encrypt_3x
|
||||
veor q0, q0, q3
|
||||
veor q1, q1, q4
|
||||
veor q2, q2, q5
|
||||
rev ip, r6
|
||||
vst1.8 {q0-q1}, [r0, :64]!
|
||||
vst1.8 {q2}, [r0, :64]!
|
||||
vst1.8 {q0-q1}, [r0]!
|
||||
vst1.8 {q2}, [r0]!
|
||||
vmov s27, ip
|
||||
b .Lctrloop3x
|
||||
.Lctr1x:
|
||||
@ -318,10 +318,10 @@ ENTRY(ce_aes_ctr_encrypt)
|
||||
vmov q0, q6
|
||||
bl aes_encrypt
|
||||
subs r4, r4, #1
|
||||
bmi .Lctrhalfblock @ blocks < 0 means 1/2 block
|
||||
vld1.8 {q3}, [r1, :64]!
|
||||
bmi .Lctrtailblock @ blocks < 0 means tail block
|
||||
vld1.8 {q3}, [r1]!
|
||||
veor q3, q0, q3
|
||||
vst1.8 {q3}, [r0, :64]!
|
||||
vst1.8 {q3}, [r0]!
|
||||
|
||||
adds r6, r6, #1 @ increment BE ctr
|
||||
rev ip, r6
|
||||
@ -333,10 +333,8 @@ ENTRY(ce_aes_ctr_encrypt)
|
||||
vst1.8 {q6}, [r5]
|
||||
pop {r4-r6, pc}
|
||||
|
||||
.Lctrhalfblock:
|
||||
vld1.8 {d1}, [r1, :64]
|
||||
veor d0, d0, d1
|
||||
vst1.8 {d0}, [r0, :64]
|
||||
.Lctrtailblock:
|
||||
vst1.8 {q0}, [r0, :64] @ return just the key stream
|
||||
pop {r4-r6, pc}
|
||||
|
||||
.Lctrcarry:
|
||||
@ -405,8 +403,8 @@ ENTRY(ce_aes_xts_encrypt)
|
||||
.Lxtsenc3x:
|
||||
subs r4, r4, #3
|
||||
bmi .Lxtsenc1x
|
||||
vld1.8 {q0-q1}, [r1, :64]! @ get 3 pt blocks
|
||||
vld1.8 {q2}, [r1, :64]!
|
||||
vld1.8 {q0-q1}, [r1]! @ get 3 pt blocks
|
||||
vld1.8 {q2}, [r1]!
|
||||
next_tweak q4, q3, q7, q6
|
||||
veor q0, q0, q3
|
||||
next_tweak q5, q4, q7, q6
|
||||
@ -416,8 +414,8 @@ ENTRY(ce_aes_xts_encrypt)
|
||||
veor q0, q0, q3
|
||||
veor q1, q1, q4
|
||||
veor q2, q2, q5
|
||||
vst1.8 {q0-q1}, [r0, :64]! @ write 3 ct blocks
|
||||
vst1.8 {q2}, [r0, :64]!
|
||||
vst1.8 {q0-q1}, [r0]! @ write 3 ct blocks
|
||||
vst1.8 {q2}, [r0]!
|
||||
vmov q3, q5
|
||||
teq r4, #0
|
||||
beq .Lxtsencout
|
||||
@ -426,11 +424,11 @@ ENTRY(ce_aes_xts_encrypt)
|
||||
adds r4, r4, #3
|
||||
beq .Lxtsencout
|
||||
.Lxtsencloop:
|
||||
vld1.8 {q0}, [r1, :64]!
|
||||
vld1.8 {q0}, [r1]!
|
||||
veor q0, q0, q3
|
||||
bl aes_encrypt
|
||||
veor q0, q0, q3
|
||||
vst1.8 {q0}, [r0, :64]!
|
||||
vst1.8 {q0}, [r0]!
|
||||
subs r4, r4, #1
|
||||
beq .Lxtsencout
|
||||
next_tweak q3, q3, q7, q6
|
||||
@ -456,8 +454,8 @@ ENTRY(ce_aes_xts_decrypt)
|
||||
.Lxtsdec3x:
|
||||
subs r4, r4, #3
|
||||
bmi .Lxtsdec1x
|
||||
vld1.8 {q0-q1}, [r1, :64]! @ get 3 ct blocks
|
||||
vld1.8 {q2}, [r1, :64]!
|
||||
vld1.8 {q0-q1}, [r1]! @ get 3 ct blocks
|
||||
vld1.8 {q2}, [r1]!
|
||||
next_tweak q4, q3, q7, q6
|
||||
veor q0, q0, q3
|
||||
next_tweak q5, q4, q7, q6
|
||||
@ -467,8 +465,8 @@ ENTRY(ce_aes_xts_decrypt)
|
||||
veor q0, q0, q3
|
||||
veor q1, q1, q4
|
||||
veor q2, q2, q5
|
||||
vst1.8 {q0-q1}, [r0, :64]! @ write 3 pt blocks
|
||||
vst1.8 {q2}, [r0, :64]!
|
||||
vst1.8 {q0-q1}, [r0]! @ write 3 pt blocks
|
||||
vst1.8 {q2}, [r0]!
|
||||
vmov q3, q5
|
||||
teq r4, #0
|
||||
beq .Lxtsdecout
|
||||
@ -477,12 +475,12 @@ ENTRY(ce_aes_xts_decrypt)
|
||||
adds r4, r4, #3
|
||||
beq .Lxtsdecout
|
||||
.Lxtsdecloop:
|
||||
vld1.8 {q0}, [r1, :64]!
|
||||
vld1.8 {q0}, [r1]!
|
||||
veor q0, q0, q3
|
||||
add ip, r2, #32 @ 3rd round key
|
||||
bl aes_decrypt
|
||||
veor q0, q0, q3
|
||||
vst1.8 {q0}, [r0, :64]!
|
||||
vst1.8 {q0}, [r0]!
|
||||
subs r4, r4, #1
|
||||
beq .Lxtsdecout
|
||||
next_tweak q3, q3, q7, q6
|
||||
|
@ -278,14 +278,15 @@ static int ctr_encrypt(struct skcipher_request *req)
|
||||
u8 *tsrc = walk.src.virt.addr;
|
||||
|
||||
/*
|
||||
* Minimum alignment is 8 bytes, so if nbytes is <= 8, we need
|
||||
* to tell aes_ctr_encrypt() to only read half a block.
|
||||
* Tell aes_ctr_encrypt() to process a tail block.
|
||||
*/
|
||||
blocks = (nbytes <= 8) ? -1 : 1;
|
||||
blocks = -1;
|
||||
|
||||
ce_aes_ctr_encrypt(tail, tsrc, (u8 *)ctx->key_enc,
|
||||
ce_aes_ctr_encrypt(tail, NULL, (u8 *)ctx->key_enc,
|
||||
num_rounds(ctx), blocks, walk.iv);
|
||||
memcpy(tdst, tail, nbytes);
|
||||
if (tdst != tsrc)
|
||||
memcpy(tdst, tsrc, nbytes);
|
||||
crypto_xor(tdst, tail, nbytes);
|
||||
err = skcipher_walk_done(&walk, 0);
|
||||
}
|
||||
kernel_neon_end();
|
||||
@ -345,7 +346,6 @@ static struct skcipher_alg aes_algs[] = { {
|
||||
.cra_flags = CRYPTO_ALG_INTERNAL,
|
||||
.cra_blocksize = AES_BLOCK_SIZE,
|
||||
.cra_ctxsize = sizeof(struct crypto_aes_ctx),
|
||||
.cra_alignmask = 7,
|
||||
.cra_module = THIS_MODULE,
|
||||
},
|
||||
.min_keysize = AES_MIN_KEY_SIZE,
|
||||
@ -361,7 +361,6 @@ static struct skcipher_alg aes_algs[] = { {
|
||||
.cra_flags = CRYPTO_ALG_INTERNAL,
|
||||
.cra_blocksize = AES_BLOCK_SIZE,
|
||||
.cra_ctxsize = sizeof(struct crypto_aes_ctx),
|
||||
.cra_alignmask = 7,
|
||||
.cra_module = THIS_MODULE,
|
||||
},
|
||||
.min_keysize = AES_MIN_KEY_SIZE,
|
||||
@ -378,7 +377,6 @@ static struct skcipher_alg aes_algs[] = { {
|
||||
.cra_flags = CRYPTO_ALG_INTERNAL,
|
||||
.cra_blocksize = 1,
|
||||
.cra_ctxsize = sizeof(struct crypto_aes_ctx),
|
||||
.cra_alignmask = 7,
|
||||
.cra_module = THIS_MODULE,
|
||||
},
|
||||
.min_keysize = AES_MIN_KEY_SIZE,
|
||||
@ -396,7 +394,6 @@ static struct skcipher_alg aes_algs[] = { {
|
||||
.cra_flags = CRYPTO_ALG_INTERNAL,
|
||||
.cra_blocksize = AES_BLOCK_SIZE,
|
||||
.cra_ctxsize = sizeof(struct crypto_aes_xts_ctx),
|
||||
.cra_alignmask = 7,
|
||||
.cra_module = THIS_MODULE,
|
||||
},
|
||||
.min_keysize = 2 * AES_MIN_KEY_SIZE,
|
||||
|
179 arch/arm/crypto/aes-cipher-core.S (new file)
@ -0,0 +1,179 @@
|
||||
/*
|
||||
* Scalar AES core transform
|
||||
*
|
||||
* Copyright (C) 2017 Linaro Ltd.
|
||||
* Author: Ard Biesheuvel <ard.biesheuvel@linaro.org>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as
|
||||
* published by the Free Software Foundation.
|
||||
*/
|
||||
|
||||
#include <linux/linkage.h>
|
||||
|
||||
.text
|
||||
.align 5
|
||||
|
||||
rk .req r0
|
||||
rounds .req r1
|
||||
in .req r2
|
||||
out .req r3
|
||||
ttab .req ip
|
||||
|
||||
t0 .req lr
|
||||
t1 .req r2
|
||||
t2 .req r3
|
||||
|
||||
.macro __select, out, in, idx
|
||||
.if __LINUX_ARM_ARCH__ < 7
|
||||
and \out, \in, #0xff << (8 * \idx)
|
||||
.else
|
||||
ubfx \out, \in, #(8 * \idx), #8
|
||||
.endif
|
||||
.endm
|
||||
|
||||
.macro __load, out, in, idx
|
||||
.if __LINUX_ARM_ARCH__ < 7 && \idx > 0
|
||||
ldr \out, [ttab, \in, lsr #(8 * \idx) - 2]
|
||||
.else
|
||||
ldr \out, [ttab, \in, lsl #2]
|
||||
.endif
|
||||
.endm
|
||||
|
||||
.macro __hround, out0, out1, in0, in1, in2, in3, t3, t4, enc
|
||||
__select \out0, \in0, 0
|
||||
__select t0, \in1, 1
|
||||
__load \out0, \out0, 0
|
||||
__load t0, t0, 1
|
||||
|
||||
.if \enc
|
||||
__select \out1, \in1, 0
|
||||
__select t1, \in2, 1
|
||||
.else
|
||||
__select \out1, \in3, 0
|
||||
__select t1, \in0, 1
|
||||
.endif
|
||||
__load \out1, \out1, 0
|
||||
__select t2, \in2, 2
|
||||
__load t1, t1, 1
|
||||
__load t2, t2, 2
|
||||
|
||||
eor \out0, \out0, t0, ror #24
|
||||
|
||||
__select t0, \in3, 3
|
||||
.if \enc
|
||||
__select \t3, \in3, 2
|
||||
__select \t4, \in0, 3
|
||||
.else
|
||||
__select \t3, \in1, 2
|
||||
__select \t4, \in2, 3
|
||||
.endif
|
||||
__load \t3, \t3, 2
|
||||
__load t0, t0, 3
|
||||
__load \t4, \t4, 3
|
||||
|
||||
eor \out1, \out1, t1, ror #24
|
||||
eor \out0, \out0, t2, ror #16
|
||||
ldm rk!, {t1, t2}
|
||||
eor \out1, \out1, \t3, ror #16
|
||||
eor \out0, \out0, t0, ror #8
|
||||
eor \out1, \out1, \t4, ror #8
|
||||
eor \out0, \out0, t1
|
||||
eor \out1, \out1, t2
|
||||
.endm
|
||||
|
||||
.macro fround, out0, out1, out2, out3, in0, in1, in2, in3
|
||||
__hround \out0, \out1, \in0, \in1, \in2, \in3, \out2, \out3, 1
|
||||
__hround \out2, \out3, \in2, \in3, \in0, \in1, \in1, \in2, 1
|
||||
.endm
|
||||
|
||||
.macro iround, out0, out1, out2, out3, in0, in1, in2, in3
|
||||
__hround \out0, \out1, \in0, \in3, \in2, \in1, \out2, \out3, 0
|
||||
__hround \out2, \out3, \in2, \in1, \in0, \in3, \in1, \in0, 0
|
||||
.endm
|
||||
|
||||
.macro __rev, out, in
|
||||
.if __LINUX_ARM_ARCH__ < 6
|
||||
lsl t0, \in, #24
|
||||
and t1, \in, #0xff00
|
||||
and t2, \in, #0xff0000
|
||||
orr \out, t0, \in, lsr #24
|
||||
orr \out, \out, t1, lsl #8
|
||||
orr \out, \out, t2, lsr #8
|
||||
.else
|
||||
rev \out, \in
|
||||
.endif
|
||||
.endm
|
||||
|
||||
.macro __adrl, out, sym, c
|
||||
.if __LINUX_ARM_ARCH__ < 7
|
||||
ldr\c \out, =\sym
|
||||
.else
|
||||
movw\c \out, #:lower16:\sym
|
||||
movt\c \out, #:upper16:\sym
|
||||
.endif
|
||||
.endm
|
||||
|
||||
.macro do_crypt, round, ttab, ltab
|
||||
push {r3-r11, lr}
|
||||
|
||||
ldr r4, [in]
|
||||
ldr r5, [in, #4]
|
||||
ldr r6, [in, #8]
|
||||
ldr r7, [in, #12]
|
||||
|
||||
ldm rk!, {r8-r11}
|
||||
|
||||
#ifdef CONFIG_CPU_BIG_ENDIAN
|
||||
__rev r4, r4
|
||||
__rev r5, r5
|
||||
__rev r6, r6
|
||||
__rev r7, r7
|
||||
#endif
|
||||
|
||||
eor r4, r4, r8
|
||||
eor r5, r5, r9
|
||||
eor r6, r6, r10
|
||||
eor r7, r7, r11
|
||||
|
||||
__adrl ttab, \ttab
|
||||
|
||||
tst rounds, #2
|
||||
bne 1f
|
||||
|
||||
0: \round r8, r9, r10, r11, r4, r5, r6, r7
|
||||
\round r4, r5, r6, r7, r8, r9, r10, r11
|
||||
|
||||
1: subs rounds, rounds, #4
|
||||
\round r8, r9, r10, r11, r4, r5, r6, r7
|
||||
__adrl ttab, \ltab, ls
|
||||
\round r4, r5, r6, r7, r8, r9, r10, r11
|
||||
bhi 0b
|
||||
|
||||
#ifdef CONFIG_CPU_BIG_ENDIAN
|
||||
__rev r4, r4
|
||||
__rev r5, r5
|
||||
__rev r6, r6
|
||||
__rev r7, r7
|
||||
#endif
|
||||
|
||||
ldr out, [sp]
|
||||
|
||||
str r4, [out]
|
||||
str r5, [out, #4]
|
||||
str r6, [out, #8]
|
||||
str r7, [out, #12]
|
||||
|
||||
pop {r3-r11, pc}
|
||||
|
||||
.align 3
|
||||
.ltorg
|
||||
.endm
|
||||
|
||||
ENTRY(__aes_arm_encrypt)
|
||||
do_crypt fround, crypto_ft_tab, crypto_fl_tab
|
||||
ENDPROC(__aes_arm_encrypt)
|
||||
|
||||
ENTRY(__aes_arm_decrypt)
|
||||
do_crypt iround, crypto_it_tab, crypto_il_tab
|
||||
ENDPROC(__aes_arm_decrypt)
|
74 arch/arm/crypto/aes-cipher-glue.c (new file)
@ -0,0 +1,74 @@
|
||||
/*
|
||||
* Scalar AES core transform
|
||||
*
|
||||
* Copyright (C) 2017 Linaro Ltd.
|
||||
* Author: Ard Biesheuvel <ard.biesheuvel@linaro.org>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as
|
||||
* published by the Free Software Foundation.
|
||||
*/
|
||||
|
||||
#include <crypto/aes.h>
|
||||
#include <linux/crypto.h>
|
||||
#include <linux/module.h>
|
||||
|
||||
asmlinkage void __aes_arm_encrypt(u32 *rk, int rounds, const u8 *in, u8 *out);
|
||||
EXPORT_SYMBOL(__aes_arm_encrypt);
|
||||
|
||||
asmlinkage void __aes_arm_decrypt(u32 *rk, int rounds, const u8 *in, u8 *out);
|
||||
EXPORT_SYMBOL(__aes_arm_decrypt);
|
||||
|
||||
static void aes_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
|
||||
{
|
||||
struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
|
||||
int rounds = 6 + ctx->key_length / 4;
|
||||
|
||||
__aes_arm_encrypt(ctx->key_enc, rounds, in, out);
|
||||
}
|
||||
|
||||
static void aes_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
|
||||
{
|
||||
struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
|
||||
int rounds = 6 + ctx->key_length / 4;
|
||||
|
||||
__aes_arm_decrypt(ctx->key_dec, rounds, in, out);
|
||||
}
|
||||
|
||||
static struct crypto_alg aes_alg = {
|
||||
.cra_name = "aes",
|
||||
.cra_driver_name = "aes-arm",
|
||||
.cra_priority = 200,
|
||||
.cra_flags = CRYPTO_ALG_TYPE_CIPHER,
|
||||
.cra_blocksize = AES_BLOCK_SIZE,
|
||||
.cra_ctxsize = sizeof(struct crypto_aes_ctx),
|
||||
.cra_module = THIS_MODULE,
|
||||
|
||||
.cra_cipher.cia_min_keysize = AES_MIN_KEY_SIZE,
|
||||
.cra_cipher.cia_max_keysize = AES_MAX_KEY_SIZE,
|
||||
.cra_cipher.cia_setkey = crypto_aes_set_key,
|
||||
.cra_cipher.cia_encrypt = aes_encrypt,
|
||||
.cra_cipher.cia_decrypt = aes_decrypt,
|
||||
|
||||
#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
|
||||
.cra_alignmask = 3,
|
||||
#endif
|
||||
};
|
||||
|
||||
static int __init aes_init(void)
|
||||
{
|
||||
return crypto_register_alg(&aes_alg);
|
||||
}
|
||||
|
||||
static void __exit aes_fini(void)
|
||||
{
|
||||
crypto_unregister_alg(&aes_alg);
|
||||
}
|
||||
|
||||
module_init(aes_init);
|
||||
module_exit(aes_fini);
|
||||
|
||||
MODULE_DESCRIPTION("Scalar AES cipher for ARM");
|
||||
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
|
||||
MODULE_LICENSE("GPL v2");
|
||||
MODULE_ALIAS_CRYPTO("aes");
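As a quick illustration of the round-count computation used in the glue code above (rounds = 6 + key_length / 4, with key_length in bytes), here is a standalone C sketch; it is not kernel code, just a check of the arithmetic.

#include <stdio.h>

int main(void)
{
	int key_bytes[] = { 16, 24, 32 };	/* AES-128/192/256 */

	for (int i = 0; i < 3; i++)
		printf("%d-bit key -> %d rounds\n",
		       key_bytes[i] * 8, 6 + key_bytes[i] / 4);
	return 0;	/* prints 10, 12 and 14 rounds respectively */
}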
1023 arch/arm/crypto/aes-neonbs-core.S (new file; diff suppressed because it is too large)
406 arch/arm/crypto/aes-neonbs-glue.c (new file)
@ -0,0 +1,406 @@
|
||||
/*
|
||||
* Bit sliced AES using NEON instructions
|
||||
*
|
||||
* Copyright (C) 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as
|
||||
* published by the Free Software Foundation.
|
||||
*/
|
||||
|
||||
#include <asm/neon.h>
|
||||
#include <crypto/aes.h>
|
||||
#include <crypto/cbc.h>
|
||||
#include <crypto/internal/simd.h>
|
||||
#include <crypto/internal/skcipher.h>
|
||||
#include <crypto/xts.h>
|
||||
#include <linux/module.h>
|
||||
|
||||
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
|
||||
MODULE_LICENSE("GPL v2");
|
||||
|
||||
MODULE_ALIAS_CRYPTO("ecb(aes)");
|
||||
MODULE_ALIAS_CRYPTO("cbc(aes)");
|
||||
MODULE_ALIAS_CRYPTO("ctr(aes)");
|
||||
MODULE_ALIAS_CRYPTO("xts(aes)");
|
||||
|
||||
asmlinkage void aesbs_convert_key(u8 out[], u32 const rk[], int rounds);
|
||||
|
||||
asmlinkage void aesbs_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[],
|
||||
int rounds, int blocks);
|
||||
asmlinkage void aesbs_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[],
|
||||
int rounds, int blocks);
|
||||
|
||||
asmlinkage void aesbs_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[],
|
||||
int rounds, int blocks, u8 iv[]);
|
||||
|
||||
asmlinkage void aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
|
||||
int rounds, int blocks, u8 ctr[], u8 final[]);
|
||||
|
||||
asmlinkage void aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[],
|
||||
int rounds, int blocks, u8 iv[]);
|
||||
asmlinkage void aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[],
|
||||
int rounds, int blocks, u8 iv[]);
|
||||
|
||||
asmlinkage void __aes_arm_encrypt(const u32 rk[], int rounds, const u8 in[],
|
||||
u8 out[]);
|
||||
|
||||
struct aesbs_ctx {
|
||||
int rounds;
|
||||
u8 rk[13 * (8 * AES_BLOCK_SIZE) + 32] __aligned(AES_BLOCK_SIZE);
|
||||
};
|
||||
|
||||
struct aesbs_cbc_ctx {
|
||||
struct aesbs_ctx key;
|
||||
u32 enc[AES_MAX_KEYLENGTH_U32];
|
||||
};
|
||||
|
||||
struct aesbs_xts_ctx {
|
||||
struct aesbs_ctx key;
|
||||
u32 twkey[AES_MAX_KEYLENGTH_U32];
|
||||
};
|
||||
|
||||
static int aesbs_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
|
||||
unsigned int key_len)
|
||||
{
|
||||
struct aesbs_ctx *ctx = crypto_skcipher_ctx(tfm);
|
||||
struct crypto_aes_ctx rk;
|
||||
int err;
|
||||
|
||||
err = crypto_aes_expand_key(&rk, in_key, key_len);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
ctx->rounds = 6 + key_len / 4;
|
||||
|
||||
kernel_neon_begin();
|
||||
aesbs_convert_key(ctx->rk, rk.key_enc, ctx->rounds);
|
||||
kernel_neon_end();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int __ecb_crypt(struct skcipher_request *req,
|
||||
void (*fn)(u8 out[], u8 const in[], u8 const rk[],
|
||||
int rounds, int blocks))
|
||||
{
|
||||
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
|
||||
struct aesbs_ctx *ctx = crypto_skcipher_ctx(tfm);
|
||||
struct skcipher_walk walk;
|
||||
int err;
|
||||
|
||||
err = skcipher_walk_virt(&walk, req, true);
|
||||
|
||||
kernel_neon_begin();
|
||||
while (walk.nbytes >= AES_BLOCK_SIZE) {
|
||||
unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE;
|
||||
|
||||
if (walk.nbytes < walk.total)
|
||||
blocks = round_down(blocks,
|
||||
walk.stride / AES_BLOCK_SIZE);
|
||||
|
||||
fn(walk.dst.virt.addr, walk.src.virt.addr, ctx->rk,
|
||||
ctx->rounds, blocks);
|
||||
err = skcipher_walk_done(&walk,
|
||||
walk.nbytes - blocks * AES_BLOCK_SIZE);
|
||||
}
|
||||
kernel_neon_end();
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
static int ecb_encrypt(struct skcipher_request *req)
|
||||
{
|
||||
return __ecb_crypt(req, aesbs_ecb_encrypt);
|
||||
}
|
||||
|
||||
static int ecb_decrypt(struct skcipher_request *req)
|
||||
{
|
||||
return __ecb_crypt(req, aesbs_ecb_decrypt);
|
||||
}
|
||||
|
||||
static int aesbs_cbc_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
|
||||
unsigned int key_len)
|
||||
{
|
||||
struct aesbs_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
|
||||
struct crypto_aes_ctx rk;
|
||||
int err;
|
||||
|
||||
err = crypto_aes_expand_key(&rk, in_key, key_len);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
ctx->key.rounds = 6 + key_len / 4;
|
||||
|
||||
memcpy(ctx->enc, rk.key_enc, sizeof(ctx->enc));
|
||||
|
||||
kernel_neon_begin();
|
||||
aesbs_convert_key(ctx->key.rk, rk.key_enc, ctx->key.rounds);
|
||||
kernel_neon_end();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void cbc_encrypt_one(struct crypto_skcipher *tfm, const u8 *src, u8 *dst)
|
||||
{
|
||||
struct aesbs_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
|
||||
|
||||
__aes_arm_encrypt(ctx->enc, ctx->key.rounds, src, dst);
|
||||
}
|
||||
|
||||
static int cbc_encrypt(struct skcipher_request *req)
|
||||
{
|
||||
return crypto_cbc_encrypt_walk(req, cbc_encrypt_one);
|
||||
}
|
||||
|
||||
static int cbc_decrypt(struct skcipher_request *req)
|
||||
{
|
||||
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
|
||||
struct aesbs_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
|
||||
struct skcipher_walk walk;
|
||||
int err;
|
||||
|
||||
err = skcipher_walk_virt(&walk, req, true);
|
||||
|
||||
kernel_neon_begin();
|
||||
while (walk.nbytes >= AES_BLOCK_SIZE) {
|
||||
unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE;
|
||||
|
||||
if (walk.nbytes < walk.total)
|
||||
blocks = round_down(blocks,
|
||||
walk.stride / AES_BLOCK_SIZE);
|
||||
|
||||
aesbs_cbc_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
|
||||
ctx->key.rk, ctx->key.rounds, blocks,
|
||||
walk.iv);
|
||||
err = skcipher_walk_done(&walk,
|
||||
walk.nbytes - blocks * AES_BLOCK_SIZE);
|
||||
}
|
||||
kernel_neon_end();
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
static int ctr_encrypt(struct skcipher_request *req)
|
||||
{
|
||||
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
|
||||
struct aesbs_ctx *ctx = crypto_skcipher_ctx(tfm);
|
||||
struct skcipher_walk walk;
|
||||
u8 buf[AES_BLOCK_SIZE];
|
||||
int err;
|
||||
|
||||
err = skcipher_walk_virt(&walk, req, true);
|
||||
|
||||
kernel_neon_begin();
|
||||
while (walk.nbytes > 0) {
|
||||
unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE;
|
||||
u8 *final = (walk.total % AES_BLOCK_SIZE) ? buf : NULL;
|
||||
|
||||
if (walk.nbytes < walk.total) {
|
||||
blocks = round_down(blocks,
|
||||
walk.stride / AES_BLOCK_SIZE);
|
||||
final = NULL;
|
||||
}
|
||||
|
||||
aesbs_ctr_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
|
||||
ctx->rk, ctx->rounds, blocks, walk.iv, final);
|
||||
|
||||
if (final) {
|
||||
u8 *dst = walk.dst.virt.addr + blocks * AES_BLOCK_SIZE;
|
||||
u8 *src = walk.src.virt.addr + blocks * AES_BLOCK_SIZE;
|
||||
|
||||
if (dst != src)
|
||||
memcpy(dst, src, walk.total % AES_BLOCK_SIZE);
|
||||
crypto_xor(dst, final, walk.total % AES_BLOCK_SIZE);
|
||||
|
||||
err = skcipher_walk_done(&walk, 0);
|
||||
break;
|
||||
}
|
||||
err = skcipher_walk_done(&walk,
|
||||
walk.nbytes - blocks * AES_BLOCK_SIZE);
|
||||
}
|
||||
kernel_neon_end();
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
static int aesbs_xts_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
|
||||
unsigned int key_len)
|
||||
{
|
||||
struct aesbs_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
|
||||
struct crypto_aes_ctx rk;
|
||||
int err;
|
||||
|
||||
err = xts_verify_key(tfm, in_key, key_len);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
key_len /= 2;
|
||||
err = crypto_aes_expand_key(&rk, in_key + key_len, key_len);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
memcpy(ctx->twkey, rk.key_enc, sizeof(ctx->twkey));
|
||||
|
||||
return aesbs_setkey(tfm, in_key, key_len);
|
||||
}
|
||||
|
||||
static int __xts_crypt(struct skcipher_request *req,
|
||||
void (*fn)(u8 out[], u8 const in[], u8 const rk[],
|
||||
int rounds, int blocks, u8 iv[]))
|
||||
{
|
||||
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
|
||||
struct aesbs_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
|
||||
struct skcipher_walk walk;
|
||||
int err;
|
||||
|
||||
err = skcipher_walk_virt(&walk, req, true);
|
||||
|
||||
__aes_arm_encrypt(ctx->twkey, ctx->key.rounds, walk.iv, walk.iv);
|
||||
|
||||
kernel_neon_begin();
|
||||
while (walk.nbytes >= AES_BLOCK_SIZE) {
|
||||
unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE;
|
||||
|
||||
if (walk.nbytes < walk.total)
|
||||
blocks = round_down(blocks,
|
||||
walk.stride / AES_BLOCK_SIZE);
|
||||
|
||||
fn(walk.dst.virt.addr, walk.src.virt.addr, ctx->key.rk,
|
||||
ctx->key.rounds, blocks, walk.iv);
|
||||
err = skcipher_walk_done(&walk,
|
||||
walk.nbytes - blocks * AES_BLOCK_SIZE);
|
||||
}
|
||||
kernel_neon_end();
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
static int xts_encrypt(struct skcipher_request *req)
|
||||
{
|
||||
return __xts_crypt(req, aesbs_xts_encrypt);
|
||||
}
|
||||
|
||||
static int xts_decrypt(struct skcipher_request *req)
|
||||
{
|
||||
return __xts_crypt(req, aesbs_xts_decrypt);
|
||||
}
|
||||
|
||||
static struct skcipher_alg aes_algs[] = { {
|
||||
.base.cra_name = "__ecb(aes)",
|
||||
.base.cra_driver_name = "__ecb-aes-neonbs",
|
||||
.base.cra_priority = 250,
|
||||
.base.cra_blocksize = AES_BLOCK_SIZE,
|
||||
.base.cra_ctxsize = sizeof(struct aesbs_ctx),
|
||||
.base.cra_module = THIS_MODULE,
|
||||
.base.cra_flags = CRYPTO_ALG_INTERNAL,
|
||||
|
||||
.min_keysize = AES_MIN_KEY_SIZE,
|
||||
.max_keysize = AES_MAX_KEY_SIZE,
|
||||
.walksize = 8 * AES_BLOCK_SIZE,
|
||||
.setkey = aesbs_setkey,
|
||||
.encrypt = ecb_encrypt,
|
||||
.decrypt = ecb_decrypt,
|
||||
}, {
|
||||
.base.cra_name = "__cbc(aes)",
|
||||
.base.cra_driver_name = "__cbc-aes-neonbs",
|
||||
.base.cra_priority = 250,
|
||||
.base.cra_blocksize = AES_BLOCK_SIZE,
|
||||
.base.cra_ctxsize = sizeof(struct aesbs_cbc_ctx),
|
||||
.base.cra_module = THIS_MODULE,
|
||||
.base.cra_flags = CRYPTO_ALG_INTERNAL,
|
||||
|
||||
.min_keysize = AES_MIN_KEY_SIZE,
|
||||
.max_keysize = AES_MAX_KEY_SIZE,
|
||||
.walksize = 8 * AES_BLOCK_SIZE,
|
||||
.ivsize = AES_BLOCK_SIZE,
|
||||
.setkey = aesbs_cbc_setkey,
|
||||
.encrypt = cbc_encrypt,
|
||||
.decrypt = cbc_decrypt,
|
||||
}, {
|
||||
.base.cra_name = "__ctr(aes)",
|
||||
.base.cra_driver_name = "__ctr-aes-neonbs",
|
||||
.base.cra_priority = 250,
|
||||
.base.cra_blocksize = 1,
|
||||
.base.cra_ctxsize = sizeof(struct aesbs_ctx),
|
||||
.base.cra_module = THIS_MODULE,
|
||||
.base.cra_flags = CRYPTO_ALG_INTERNAL,
|
||||
|
||||
.min_keysize = AES_MIN_KEY_SIZE,
|
||||
.max_keysize = AES_MAX_KEY_SIZE,
|
||||
.chunksize = AES_BLOCK_SIZE,
|
||||
.walksize = 8 * AES_BLOCK_SIZE,
|
||||
.ivsize = AES_BLOCK_SIZE,
|
||||
.setkey = aesbs_setkey,
|
||||
.encrypt = ctr_encrypt,
|
||||
.decrypt = ctr_encrypt,
|
||||
}, {
|
||||
.base.cra_name = "__xts(aes)",
|
||||
.base.cra_driver_name = "__xts-aes-neonbs",
|
||||
.base.cra_priority = 250,
|
||||
.base.cra_blocksize = AES_BLOCK_SIZE,
|
||||
.base.cra_ctxsize = sizeof(struct aesbs_xts_ctx),
|
||||
.base.cra_module = THIS_MODULE,
|
||||
.base.cra_flags = CRYPTO_ALG_INTERNAL,
|
||||
|
||||
.min_keysize = 2 * AES_MIN_KEY_SIZE,
|
||||
.max_keysize = 2 * AES_MAX_KEY_SIZE,
|
||||
.walksize = 8 * AES_BLOCK_SIZE,
|
||||
.ivsize = AES_BLOCK_SIZE,
|
||||
.setkey = aesbs_xts_setkey,
|
||||
.encrypt = xts_encrypt,
|
||||
.decrypt = xts_decrypt,
|
||||
} };
|
||||
|
||||
static struct simd_skcipher_alg *aes_simd_algs[ARRAY_SIZE(aes_algs)];
|
||||
|
||||
static void aes_exit(void)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(aes_simd_algs); i++)
|
||||
if (aes_simd_algs[i])
|
||||
simd_skcipher_free(aes_simd_algs[i]);
|
||||
|
||||
crypto_unregister_skciphers(aes_algs, ARRAY_SIZE(aes_algs));
|
||||
}
|
||||
|
||||
static int __init aes_init(void)
|
||||
{
|
||||
struct simd_skcipher_alg *simd;
|
||||
const char *basename;
|
||||
const char *algname;
|
||||
const char *drvname;
|
||||
int err;
|
||||
int i;
|
||||
|
||||
if (!(elf_hwcap & HWCAP_NEON))
|
||||
return -ENODEV;
|
||||
|
||||
err = crypto_register_skciphers(aes_algs, ARRAY_SIZE(aes_algs));
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(aes_algs); i++) {
|
||||
if (!(aes_algs[i].base.cra_flags & CRYPTO_ALG_INTERNAL))
|
||||
continue;
|
||||
|
||||
algname = aes_algs[i].base.cra_name + 2;
|
||||
drvname = aes_algs[i].base.cra_driver_name + 2;
|
||||
basename = aes_algs[i].base.cra_driver_name;
|
||||
simd = simd_skcipher_create_compat(algname, drvname, basename);
|
||||
err = PTR_ERR(simd);
|
||||
if (IS_ERR(simd))
|
||||
goto unregister_simds;
|
||||
|
||||
aes_simd_algs[i] = simd;
|
||||
}
|
||||
return 0;
|
||||
|
||||
unregister_simds:
|
||||
aes_exit();
|
||||
return err;
|
||||
}
|
||||
|
||||
module_init(aes_init);
|
||||
module_exit(aes_exit);
|
@ -1,98 +0,0 @@
|
||||
/*
|
||||
* Glue Code for the asm optimized version of the AES Cipher Algorithm
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/crypto.h>
|
||||
#include <crypto/aes.h>
|
||||
|
||||
#include "aes_glue.h"
|
||||
|
||||
EXPORT_SYMBOL(AES_encrypt);
|
||||
EXPORT_SYMBOL(AES_decrypt);
|
||||
EXPORT_SYMBOL(private_AES_set_encrypt_key);
|
||||
EXPORT_SYMBOL(private_AES_set_decrypt_key);
|
||||
|
||||
static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
|
||||
{
|
||||
struct AES_CTX *ctx = crypto_tfm_ctx(tfm);
|
||||
AES_encrypt(src, dst, &ctx->enc_key);
|
||||
}
|
||||
|
||||
static void aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
|
||||
{
|
||||
struct AES_CTX *ctx = crypto_tfm_ctx(tfm);
|
||||
AES_decrypt(src, dst, &ctx->dec_key);
|
||||
}
|
||||
|
||||
static int aes_set_key(struct crypto_tfm *tfm, const u8 *in_key,
|
||||
unsigned int key_len)
|
||||
{
|
||||
struct AES_CTX *ctx = crypto_tfm_ctx(tfm);
|
||||
|
||||
switch (key_len) {
|
||||
case AES_KEYSIZE_128:
|
||||
key_len = 128;
|
||||
break;
|
||||
case AES_KEYSIZE_192:
|
||||
key_len = 192;
|
||||
break;
|
||||
case AES_KEYSIZE_256:
|
||||
key_len = 256;
|
||||
break;
|
||||
default:
|
||||
tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
if (private_AES_set_encrypt_key(in_key, key_len, &ctx->enc_key) == -1) {
|
||||
tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
|
||||
return -EINVAL;
|
||||
}
|
||||
/* private_AES_set_decrypt_key expects an encryption key as input */
|
||||
ctx->dec_key = ctx->enc_key;
|
||||
if (private_AES_set_decrypt_key(in_key, key_len, &ctx->dec_key) == -1) {
|
||||
tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
|
||||
return -EINVAL;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct crypto_alg aes_alg = {
|
||||
.cra_name = "aes",
|
||||
.cra_driver_name = "aes-asm",
|
||||
.cra_priority = 200,
|
||||
.cra_flags = CRYPTO_ALG_TYPE_CIPHER,
|
||||
.cra_blocksize = AES_BLOCK_SIZE,
|
||||
.cra_ctxsize = sizeof(struct AES_CTX),
|
||||
.cra_module = THIS_MODULE,
|
||||
.cra_list = LIST_HEAD_INIT(aes_alg.cra_list),
|
||||
.cra_u = {
|
||||
.cipher = {
|
||||
.cia_min_keysize = AES_MIN_KEY_SIZE,
|
||||
.cia_max_keysize = AES_MAX_KEY_SIZE,
|
||||
.cia_setkey = aes_set_key,
|
||||
.cia_encrypt = aes_encrypt,
|
||||
.cia_decrypt = aes_decrypt
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
static int __init aes_init(void)
|
||||
{
|
||||
return crypto_register_alg(&aes_alg);
|
||||
}
|
||||
|
||||
static void __exit aes_fini(void)
|
||||
{
|
||||
crypto_unregister_alg(&aes_alg);
|
||||
}
|
||||
|
||||
module_init(aes_init);
|
||||
module_exit(aes_fini);
|
||||
|
||||
MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm (ASM)");
|
||||
MODULE_LICENSE("GPL");
|
||||
MODULE_ALIAS_CRYPTO("aes");
|
||||
MODULE_ALIAS_CRYPTO("aes-asm");
|
||||
MODULE_AUTHOR("David McCullough <ucdevel@gmail.com>");
|
@ -1,19 +0,0 @@
|
||||
|
||||
#define AES_MAXNR 14
|
||||
|
||||
struct AES_KEY {
|
||||
unsigned int rd_key[4 * (AES_MAXNR + 1)];
|
||||
int rounds;
|
||||
};
|
||||
|
||||
struct AES_CTX {
|
||||
struct AES_KEY enc_key;
|
||||
struct AES_KEY dec_key;
|
||||
};
|
||||
|
||||
asmlinkage void AES_encrypt(const u8 *in, u8 *out, struct AES_KEY *ctx);
|
||||
asmlinkage void AES_decrypt(const u8 *in, u8 *out, struct AES_KEY *ctx);
|
||||
asmlinkage int private_AES_set_decrypt_key(const unsigned char *userKey,
|
||||
const int bits, struct AES_KEY *key);
|
||||
asmlinkage int private_AES_set_encrypt_key(const unsigned char *userKey,
|
||||
const int bits, struct AES_KEY *key);
|
File diff suppressed because it is too large.
@ -1,367 +0,0 @@
|
||||
/*
|
||||
* linux/arch/arm/crypto/aesbs-glue.c - glue code for NEON bit sliced AES
|
||||
*
|
||||
* Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as
|
||||
* published by the Free Software Foundation.
|
||||
*/
|
||||
|
||||
#include <asm/neon.h>
|
||||
#include <crypto/aes.h>
|
||||
#include <crypto/cbc.h>
|
||||
#include <crypto/internal/simd.h>
|
||||
#include <crypto/internal/skcipher.h>
|
||||
#include <linux/module.h>
|
||||
#include <crypto/xts.h>
|
||||
|
||||
#include "aes_glue.h"
|
||||
|
||||
#define BIT_SLICED_KEY_MAXSIZE (128 * (AES_MAXNR - 1) + 2 * AES_BLOCK_SIZE)
|
||||
|
||||
struct BS_KEY {
|
||||
struct AES_KEY rk;
|
||||
int converted;
|
||||
u8 __aligned(8) bs[BIT_SLICED_KEY_MAXSIZE];
|
||||
} __aligned(8);
|
||||
|
||||
asmlinkage void bsaes_enc_key_convert(u8 out[], struct AES_KEY const *in);
|
||||
asmlinkage void bsaes_dec_key_convert(u8 out[], struct AES_KEY const *in);
|
||||
|
||||
asmlinkage void bsaes_cbc_encrypt(u8 const in[], u8 out[], u32 bytes,
|
||||
struct BS_KEY *key, u8 iv[]);
|
||||
|
||||
asmlinkage void bsaes_ctr32_encrypt_blocks(u8 const in[], u8 out[], u32 blocks,
|
||||
struct BS_KEY *key, u8 const iv[]);
|
||||
|
||||
asmlinkage void bsaes_xts_encrypt(u8 const in[], u8 out[], u32 bytes,
|
||||
struct BS_KEY *key, u8 tweak[]);
|
||||
|
||||
asmlinkage void bsaes_xts_decrypt(u8 const in[], u8 out[], u32 bytes,
|
||||
struct BS_KEY *key, u8 tweak[]);
|
||||
|
||||
struct aesbs_cbc_ctx {
|
||||
struct AES_KEY enc;
|
||||
struct BS_KEY dec;
|
||||
};
|
||||
|
||||
struct aesbs_ctr_ctx {
|
||||
struct BS_KEY enc;
|
||||
};
|
||||
|
||||
struct aesbs_xts_ctx {
|
||||
struct BS_KEY enc;
|
||||
struct BS_KEY dec;
|
||||
struct AES_KEY twkey;
|
||||
};
|
||||
|
||||
static int aesbs_cbc_set_key(struct crypto_skcipher *tfm, const u8 *in_key,
|
||||
unsigned int key_len)
|
||||
{
|
||||
struct aesbs_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
|
||||
int bits = key_len * 8;
|
||||
|
||||
if (private_AES_set_encrypt_key(in_key, bits, &ctx->enc)) {
|
||||
crypto_skcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
|
||||
return -EINVAL;
|
||||
}
|
||||
ctx->dec.rk = ctx->enc;
|
||||
private_AES_set_decrypt_key(in_key, bits, &ctx->dec.rk);
|
||||
ctx->dec.converted = 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int aesbs_ctr_set_key(struct crypto_skcipher *tfm, const u8 *in_key,
|
||||
unsigned int key_len)
|
||||
{
|
||||
struct aesbs_ctr_ctx *ctx = crypto_skcipher_ctx(tfm);
|
||||
int bits = key_len * 8;
|
||||
|
||||
if (private_AES_set_encrypt_key(in_key, bits, &ctx->enc.rk)) {
|
||||
crypto_skcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
|
||||
return -EINVAL;
|
||||
}
|
||||
ctx->enc.converted = 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int aesbs_xts_set_key(struct crypto_skcipher *tfm, const u8 *in_key,
|
||||
unsigned int key_len)
|
||||
{
|
||||
struct aesbs_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
|
||||
int bits = key_len * 4;
|
||||
int err;
|
||||
|
||||
err = xts_verify_key(tfm, in_key, key_len);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
if (private_AES_set_encrypt_key(in_key, bits, &ctx->enc.rk)) {
|
||||
crypto_skcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
|
||||
return -EINVAL;
|
||||
}
|
||||
ctx->dec.rk = ctx->enc.rk;
|
||||
private_AES_set_decrypt_key(in_key, bits, &ctx->dec.rk);
|
||||
private_AES_set_encrypt_key(in_key + key_len / 2, bits, &ctx->twkey);
|
||||
ctx->enc.converted = ctx->dec.converted = 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline void aesbs_encrypt_one(struct crypto_skcipher *tfm,
|
||||
const u8 *src, u8 *dst)
|
||||
{
|
||||
struct aesbs_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
|
||||
|
||||
AES_encrypt(src, dst, &ctx->enc);
|
||||
}
|
||||
|
||||
static int aesbs_cbc_encrypt(struct skcipher_request *req)
|
||||
{
|
||||
return crypto_cbc_encrypt_walk(req, aesbs_encrypt_one);
|
||||
}
|
||||
|
||||
static inline void aesbs_decrypt_one(struct crypto_skcipher *tfm,
|
||||
const u8 *src, u8 *dst)
|
||||
{
|
||||
struct aesbs_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
|
||||
|
||||
AES_decrypt(src, dst, &ctx->dec.rk);
|
||||
}
|
||||
|
||||
static int aesbs_cbc_decrypt(struct skcipher_request *req)
|
||||
{
|
||||
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
|
||||
struct aesbs_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
|
||||
struct skcipher_walk walk;
|
||||
unsigned int nbytes;
|
||||
int err;
|
||||
|
||||
for (err = skcipher_walk_virt(&walk, req, false);
|
||||
(nbytes = walk.nbytes); err = skcipher_walk_done(&walk, nbytes)) {
|
||||
u32 blocks = nbytes / AES_BLOCK_SIZE;
|
||||
u8 *dst = walk.dst.virt.addr;
|
||||
u8 *src = walk.src.virt.addr;
|
||||
u8 *iv = walk.iv;
|
||||
|
||||
if (blocks >= 8) {
|
||||
kernel_neon_begin();
|
||||
bsaes_cbc_encrypt(src, dst, nbytes, &ctx->dec, iv);
|
||||
kernel_neon_end();
|
||||
nbytes %= AES_BLOCK_SIZE;
|
||||
continue;
|
||||
}
|
||||
|
||||
nbytes = crypto_cbc_decrypt_blocks(&walk, tfm,
|
||||
aesbs_decrypt_one);
|
||||
}
|
||||
return err;
|
||||
}
|
||||
|
||||
static void inc_be128_ctr(__be32 ctr[], u32 addend)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 3; i >= 0; i--, addend = 1) {
|
||||
u32 n = be32_to_cpu(ctr[i]) + addend;
|
||||
|
||||
ctr[i] = cpu_to_be32(n);
|
||||
if (n >= addend)
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
static int aesbs_ctr_encrypt(struct skcipher_request *req)
|
||||
{
|
||||
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
|
||||
struct aesbs_ctr_ctx *ctx = crypto_skcipher_ctx(tfm);
|
||||
struct skcipher_walk walk;
|
||||
u32 blocks;
|
||||
int err;
|
||||
|
||||
err = skcipher_walk_virt(&walk, req, false);
|
||||
|
||||
while ((blocks = walk.nbytes / AES_BLOCK_SIZE)) {
|
||||
u32 tail = walk.nbytes % AES_BLOCK_SIZE;
|
||||
__be32 *ctr = (__be32 *)walk.iv;
|
||||
u32 headroom = UINT_MAX - be32_to_cpu(ctr[3]);
|
||||
|
||||
/* avoid 32 bit counter overflow in the NEON code */
|
||||
if (unlikely(headroom < blocks)) {
|
||||
blocks = headroom + 1;
|
||||
tail = walk.nbytes - blocks * AES_BLOCK_SIZE;
|
||||
}
|
||||
kernel_neon_begin();
|
||||
bsaes_ctr32_encrypt_blocks(walk.src.virt.addr,
|
||||
walk.dst.virt.addr, blocks,
|
||||
&ctx->enc, walk.iv);
|
||||
kernel_neon_end();
|
||||
inc_be128_ctr(ctr, blocks);
|
||||
|
||||
err = skcipher_walk_done(&walk, tail);
|
||||
}
|
||||
if (walk.nbytes) {
|
||||
u8 *tdst = walk.dst.virt.addr + blocks * AES_BLOCK_SIZE;
|
||||
u8 *tsrc = walk.src.virt.addr + blocks * AES_BLOCK_SIZE;
|
||||
u8 ks[AES_BLOCK_SIZE];
|
||||
|
||||
AES_encrypt(walk.iv, ks, &ctx->enc.rk);
|
||||
if (tdst != tsrc)
|
||||
memcpy(tdst, tsrc, walk.nbytes);
|
||||
crypto_xor(tdst, ks, walk.nbytes);
|
||||
err = skcipher_walk_done(&walk, 0);
|
||||
}
|
||||
return err;
|
||||
}
|
||||
|
||||
static int aesbs_xts_encrypt(struct skcipher_request *req)
|
||||
{
|
||||
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
|
||||
struct aesbs_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
|
||||
struct skcipher_walk walk;
|
||||
int err;
|
||||
|
||||
err = skcipher_walk_virt(&walk, req, false);
|
||||
|
||||
/* generate the initial tweak */
|
||||
AES_encrypt(walk.iv, walk.iv, &ctx->twkey);
|
||||
|
||||
while (walk.nbytes) {
|
||||
kernel_neon_begin();
|
||||
bsaes_xts_encrypt(walk.src.virt.addr, walk.dst.virt.addr,
|
||||
walk.nbytes, &ctx->enc, walk.iv);
|
||||
kernel_neon_end();
|
||||
err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
|
||||
}
|
||||
return err;
|
||||
}
|
||||
|
||||
static int aesbs_xts_decrypt(struct skcipher_request *req)
|
||||
{
|
||||
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
|
||||
struct aesbs_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
|
||||
struct skcipher_walk walk;
|
||||
int err;
|
||||
|
||||
err = skcipher_walk_virt(&walk, req, false);
|
||||
|
||||
/* generate the initial tweak */
|
||||
AES_encrypt(walk.iv, walk.iv, &ctx->twkey);
|
||||
|
||||
while (walk.nbytes) {
|
||||
kernel_neon_begin();
|
||||
bsaes_xts_decrypt(walk.src.virt.addr, walk.dst.virt.addr,
|
||||
walk.nbytes, &ctx->dec, walk.iv);
|
||||
kernel_neon_end();
|
||||
err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
|
||||
}
|
||||
return err;
|
||||
}
|
||||
|
||||
static struct skcipher_alg aesbs_algs[] = { {
|
||||
.base = {
|
||||
.cra_name = "__cbc(aes)",
|
||||
.cra_driver_name = "__cbc-aes-neonbs",
|
||||
.cra_priority = 300,
|
||||
.cra_flags = CRYPTO_ALG_INTERNAL,
|
||||
.cra_blocksize = AES_BLOCK_SIZE,
|
||||
.cra_ctxsize = sizeof(struct aesbs_cbc_ctx),
|
||||
.cra_alignmask = 7,
|
||||
.cra_module = THIS_MODULE,
|
||||
},
|
||||
.min_keysize = AES_MIN_KEY_SIZE,
|
||||
.max_keysize = AES_MAX_KEY_SIZE,
|
||||
.ivsize = AES_BLOCK_SIZE,
|
||||
.setkey = aesbs_cbc_set_key,
|
||||
.encrypt = aesbs_cbc_encrypt,
|
||||
.decrypt = aesbs_cbc_decrypt,
|
||||
}, {
|
||||
.base = {
|
||||
.cra_name = "__ctr(aes)",
|
||||
.cra_driver_name = "__ctr-aes-neonbs",
|
||||
.cra_priority = 300,
|
||||
.cra_flags = CRYPTO_ALG_INTERNAL,
|
||||
.cra_blocksize = 1,
|
||||
.cra_ctxsize = sizeof(struct aesbs_ctr_ctx),
|
||||
.cra_alignmask = 7,
|
||||
.cra_module = THIS_MODULE,
|
||||
},
|
||||
.min_keysize = AES_MIN_KEY_SIZE,
|
||||
.max_keysize = AES_MAX_KEY_SIZE,
|
||||
.ivsize = AES_BLOCK_SIZE,
|
||||
.chunksize = AES_BLOCK_SIZE,
|
||||
.setkey = aesbs_ctr_set_key,
|
||||
.encrypt = aesbs_ctr_encrypt,
|
||||
.decrypt = aesbs_ctr_encrypt,
|
||||
}, {
|
||||
.base = {
|
||||
.cra_name = "__xts(aes)",
|
||||
.cra_driver_name = "__xts-aes-neonbs",
|
||||
.cra_priority = 300,
|
||||
.cra_flags = CRYPTO_ALG_INTERNAL,
|
||||
.cra_blocksize = AES_BLOCK_SIZE,
|
||||
.cra_ctxsize = sizeof(struct aesbs_xts_ctx),
|
||||
.cra_alignmask = 7,
|
||||
.cra_module = THIS_MODULE,
|
||||
},
|
||||
.min_keysize = 2 * AES_MIN_KEY_SIZE,
|
||||
.max_keysize = 2 * AES_MAX_KEY_SIZE,
|
||||
.ivsize = AES_BLOCK_SIZE,
|
||||
.setkey = aesbs_xts_set_key,
|
||||
.encrypt = aesbs_xts_encrypt,
|
||||
.decrypt = aesbs_xts_decrypt,
|
||||
} };
|
||||
|
||||
struct simd_skcipher_alg *aesbs_simd_algs[ARRAY_SIZE(aesbs_algs)];
|
||||
|
||||
static void aesbs_mod_exit(void)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(aesbs_simd_algs) && aesbs_simd_algs[i]; i++)
|
||||
simd_skcipher_free(aesbs_simd_algs[i]);
|
||||
|
||||
crypto_unregister_skciphers(aesbs_algs, ARRAY_SIZE(aesbs_algs));
|
||||
}
|
||||
|
||||
static int __init aesbs_mod_init(void)
|
||||
{
|
||||
struct simd_skcipher_alg *simd;
|
||||
const char *basename;
|
||||
const char *algname;
|
||||
const char *drvname;
|
||||
int err;
|
||||
int i;
|
||||
|
||||
if (!cpu_has_neon())
|
||||
return -ENODEV;
|
||||
|
||||
err = crypto_register_skciphers(aesbs_algs, ARRAY_SIZE(aesbs_algs));
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(aesbs_algs); i++) {
|
||||
algname = aesbs_algs[i].base.cra_name + 2;
|
||||
drvname = aesbs_algs[i].base.cra_driver_name + 2;
|
||||
basename = aesbs_algs[i].base.cra_driver_name;
|
||||
simd = simd_skcipher_create_compat(algname, drvname, basename);
|
||||
err = PTR_ERR(simd);
|
||||
if (IS_ERR(simd))
|
||||
goto unregister_simds;
|
||||
|
||||
aesbs_simd_algs[i] = simd;
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
||||
unregister_simds:
|
||||
aesbs_mod_exit();
|
||||
return err;
|
||||
}
|
||||
|
||||
module_init(aesbs_mod_init);
|
||||
module_exit(aesbs_mod_exit);
|
||||
|
||||
MODULE_DESCRIPTION("Bit sliced AES in CBC/CTR/XTS modes using NEON");
|
||||
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
|
||||
MODULE_LICENSE("GPL");
|
File diff suppressed because it is too large.
523 arch/arm/crypto/chacha20-neon-core.S (new file)
@ -0,0 +1,523 @@
|
||||
/*
|
||||
* ChaCha20 256-bit cipher algorithm, RFC7539, ARM NEON functions
|
||||
*
|
||||
* Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* Based on:
|
||||
* ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSE3 functions
|
||||
*
|
||||
* Copyright (C) 2015 Martin Willi
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*/
|
||||
|
||||
#include <linux/linkage.h>
|
||||
|
||||
.text
|
||||
.fpu neon
|
||||
.align 5
|
||||
|
||||
ENTRY(chacha20_block_xor_neon)
|
||||
// r0: Input state matrix, s
|
||||
// r1: 1 data block output, o
|
||||
// r2: 1 data block input, i
|
||||
|
||||
//
|
||||
// This function encrypts one ChaCha20 block by loading the state matrix
|
||||
// in four NEON registers. It performs matrix operation on four words in
|
||||
// parallel, but requires shuffling to rearrange the words after each
|
||||
// round.
|
||||
//
|
||||
|
||||
// x0..3 = s0..3
|
||||
add ip, r0, #0x20
|
||||
vld1.32 {q0-q1}, [r0]
|
||||
vld1.32 {q2-q3}, [ip]
|
||||
|
||||
vmov q8, q0
|
||||
vmov q9, q1
|
||||
vmov q10, q2
|
||||
vmov q11, q3
|
||||
|
||||
mov r3, #10
|
||||
|
||||
.Ldoubleround:
|
||||
// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
|
||||
vadd.i32 q0, q0, q1
|
||||
veor q4, q3, q0
|
||||
vshl.u32 q3, q4, #16
|
||||
vsri.u32 q3, q4, #16
|
||||
|
||||
// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
|
||||
vadd.i32 q2, q2, q3
|
||||
veor q4, q1, q2
|
||||
vshl.u32 q1, q4, #12
|
||||
vsri.u32 q1, q4, #20
|
||||
|
||||
// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
|
||||
vadd.i32 q0, q0, q1
|
||||
veor q4, q3, q0
|
||||
vshl.u32 q3, q4, #8
|
||||
vsri.u32 q3, q4, #24
|
||||
|
||||
// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
|
||||
vadd.i32 q2, q2, q3
|
||||
veor q4, q1, q2
|
||||
vshl.u32 q1, q4, #7
|
||||
vsri.u32 q1, q4, #25
|
||||
|
||||
// x1 = shuffle32(x1, MASK(0, 3, 2, 1))
|
||||
vext.8 q1, q1, q1, #4
|
||||
// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
|
||||
vext.8 q2, q2, q2, #8
|
||||
// x3 = shuffle32(x3, MASK(2, 1, 0, 3))
|
||||
vext.8 q3, q3, q3, #12
|
||||
|
||||
// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
|
||||
vadd.i32 q0, q0, q1
|
||||
veor q4, q3, q0
|
||||
vshl.u32 q3, q4, #16
|
||||
vsri.u32 q3, q4, #16
|
||||
|
||||
// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
|
||||
vadd.i32 q2, q2, q3
|
||||
veor q4, q1, q2
|
||||
vshl.u32 q1, q4, #12
|
||||
vsri.u32 q1, q4, #20
|
||||
|
||||
// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
|
||||
vadd.i32 q0, q0, q1
|
||||
veor q4, q3, q0
|
||||
vshl.u32 q3, q4, #8
|
||||
vsri.u32 q3, q4, #24
|
||||
|
||||
// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
|
||||
vadd.i32 q2, q2, q3
|
||||
veor q4, q1, q2
|
||||
vshl.u32 q1, q4, #7
|
||||
vsri.u32 q1, q4, #25
|
||||
|
||||
// x1 = shuffle32(x1, MASK(2, 1, 0, 3))
|
||||
vext.8 q1, q1, q1, #12
|
||||
// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
|
||||
vext.8 q2, q2, q2, #8
|
||||
// x3 = shuffle32(x3, MASK(0, 3, 2, 1))
|
||||
vext.8 q3, q3, q3, #4
|
||||
|
||||
subs r3, r3, #1
|
||||
bne .Ldoubleround
|
||||
|
||||
add ip, r2, #0x20
|
||||
vld1.8 {q4-q5}, [r2]
|
||||
vld1.8 {q6-q7}, [ip]
|
||||
|
||||
// o0 = i0 ^ (x0 + s0)
|
||||
vadd.i32 q0, q0, q8
|
||||
veor q0, q0, q4
|
||||
|
||||
// o1 = i1 ^ (x1 + s1)
|
||||
vadd.i32 q1, q1, q9
|
||||
veor q1, q1, q5
|
||||
|
||||
// o2 = i2 ^ (x2 + s2)
|
||||
vadd.i32 q2, q2, q10
|
||||
veor q2, q2, q6
|
||||
|
||||
// o3 = i3 ^ (x3 + s3)
|
||||
vadd.i32 q3, q3, q11
|
||||
veor q3, q3, q7
|
||||
|
||||
add ip, r1, #0x20
|
||||
vst1.8 {q0-q1}, [r1]
|
||||
vst1.8 {q2-q3}, [ip]
|
||||
|
||||
bx lr
|
||||
ENDPROC(chacha20_block_xor_neon)
|
||||
|
||||
.align 5
|
||||
ENTRY(chacha20_4block_xor_neon)
|
||||
push {r4-r6, lr}
|
||||
mov ip, sp // preserve the stack pointer
|
||||
sub r3, sp, #0x20 // allocate a 32 byte buffer
|
||||
bic r3, r3, #0x1f // aligned to 32 bytes
|
||||
mov sp, r3
|
||||
|
||||
// r0: Input state matrix, s
|
||||
// r1: 4 data blocks output, o
|
||||
// r2: 4 data blocks input, i
|
||||
|
||||
//
|
||||
// This function encrypts four consecutive ChaCha20 blocks by loading
|
||||
// the state matrix in NEON registers four times. The algorithm performs
|
||||
// each operation on the corresponding word of each state matrix, hence
|
||||
// requires no word shuffling. For final XORing step we transpose the
|
||||
// matrix by interleaving 32- and then 64-bit words, which allows us to
|
||||
// do XOR in NEON registers.
|
||||
//
|
||||
|
||||
// x0..15[0-3] = s0..3[0..3]
|
||||
add r3, r0, #0x20
|
||||
vld1.32 {q0-q1}, [r0]
|
||||
vld1.32 {q2-q3}, [r3]
|
||||
|
||||
adr r3, CTRINC
|
||||
vdup.32 q15, d7[1]
|
||||
vdup.32 q14, d7[0]
|
||||
vld1.32 {q11}, [r3, :128]
|
||||
vdup.32 q13, d6[1]
|
||||
vdup.32 q12, d6[0]
|
||||
vadd.i32 q12, q12, q11 // x12 += counter values 0-3
|
||||
vdup.32 q11, d5[1]
|
||||
vdup.32 q10, d5[0]
|
||||
vdup.32 q9, d4[1]
|
||||
vdup.32 q8, d4[0]
|
||||
vdup.32 q7, d3[1]
|
||||
vdup.32 q6, d3[0]
|
||||
vdup.32 q5, d2[1]
|
||||
vdup.32 q4, d2[0]
|
||||
vdup.32 q3, d1[1]
|
||||
vdup.32 q2, d1[0]
|
||||
vdup.32 q1, d0[1]
|
||||
vdup.32 q0, d0[0]
|
||||
|
||||
mov r3, #10
|
||||
|
||||
.Ldoubleround4:
|
||||
// x0 += x4, x12 = rotl32(x12 ^ x0, 16)
|
||||
// x1 += x5, x13 = rotl32(x13 ^ x1, 16)
|
||||
// x2 += x6, x14 = rotl32(x14 ^ x2, 16)
|
||||
// x3 += x7, x15 = rotl32(x15 ^ x3, 16)
|
||||
vadd.i32 q0, q0, q4
|
||||
vadd.i32 q1, q1, q5
|
||||
vadd.i32 q2, q2, q6
|
||||
vadd.i32 q3, q3, q7
|
||||
|
||||
veor q12, q12, q0
|
||||
veor q13, q13, q1
|
||||
veor q14, q14, q2
|
||||
veor q15, q15, q3
|
||||
|
||||
vrev32.16 q12, q12
|
||||
vrev32.16 q13, q13
|
||||
vrev32.16 q14, q14
|
||||
vrev32.16 q15, q15
|
||||
|
||||
// x8 += x12, x4 = rotl32(x4 ^ x8, 12)
|
||||
// x9 += x13, x5 = rotl32(x5 ^ x9, 12)
|
||||
// x10 += x14, x6 = rotl32(x6 ^ x10, 12)
|
||||
// x11 += x15, x7 = rotl32(x7 ^ x11, 12)
|
||||
vadd.i32 q8, q8, q12
|
||||
vadd.i32 q9, q9, q13
|
||||
vadd.i32 q10, q10, q14
|
||||
vadd.i32 q11, q11, q15
|
||||
|
||||
vst1.32 {q8-q9}, [sp, :256]
|
||||
|
||||
veor q8, q4, q8
|
||||
veor q9, q5, q9
|
||||
vshl.u32 q4, q8, #12
|
||||
vshl.u32 q5, q9, #12
|
||||
vsri.u32 q4, q8, #20
|
||||
vsri.u32 q5, q9, #20
|
||||
|
||||
veor q8, q6, q10
|
||||
veor q9, q7, q11
|
||||
vshl.u32 q6, q8, #12
|
||||
vshl.u32 q7, q9, #12
|
||||
vsri.u32 q6, q8, #20
|
||||
vsri.u32 q7, q9, #20
|
||||
|
||||
// x0 += x4, x12 = rotl32(x12 ^ x0, 8)
|
||||
// x1 += x5, x13 = rotl32(x13 ^ x1, 8)
|
||||
// x2 += x6, x14 = rotl32(x14 ^ x2, 8)
|
||||
// x3 += x7, x15 = rotl32(x15 ^ x3, 8)
|
||||
vadd.i32 q0, q0, q4
|
||||
vadd.i32 q1, q1, q5
|
||||
vadd.i32 q2, q2, q6
|
||||
vadd.i32 q3, q3, q7
|
||||
|
||||
veor q8, q12, q0
|
||||
veor q9, q13, q1
|
||||
vshl.u32 q12, q8, #8
|
||||
vshl.u32 q13, q9, #8
|
||||
vsri.u32 q12, q8, #24
|
||||
vsri.u32 q13, q9, #24
|
||||
|
||||
veor q8, q14, q2
|
||||
veor q9, q15, q3
|
||||
vshl.u32 q14, q8, #8
|
||||
vshl.u32 q15, q9, #8
|
||||
vsri.u32 q14, q8, #24
|
||||
vsri.u32 q15, q9, #24
|
||||
|
||||
vld1.32 {q8-q9}, [sp, :256]
|
||||
|
||||
// x8 += x12, x4 = rotl32(x4 ^ x8, 7)
|
||||
// x9 += x13, x5 = rotl32(x5 ^ x9, 7)
|
||||
// x10 += x14, x6 = rotl32(x6 ^ x10, 7)
|
||||
// x11 += x15, x7 = rotl32(x7 ^ x11, 7)
|
||||
vadd.i32 q8, q8, q12
|
||||
vadd.i32 q9, q9, q13
|
||||
vadd.i32 q10, q10, q14
|
||||
vadd.i32 q11, q11, q15
|
||||
|
||||
vst1.32 {q8-q9}, [sp, :256]
|
||||
|
||||
veor q8, q4, q8
|
||||
veor q9, q5, q9
|
||||
vshl.u32 q4, q8, #7
|
||||
vshl.u32 q5, q9, #7
|
||||
vsri.u32 q4, q8, #25
|
||||
vsri.u32 q5, q9, #25
|
||||
|
||||
veor q8, q6, q10
|
||||
veor q9, q7, q11
|
||||
vshl.u32 q6, q8, #7
|
||||
vshl.u32 q7, q9, #7
|
||||
vsri.u32 q6, q8, #25
|
||||
vsri.u32 q7, q9, #25
|
||||
|
||||
vld1.32 {q8-q9}, [sp, :256]
|
||||
|
||||
// x0 += x5, x15 = rotl32(x15 ^ x0, 16)
|
||||
// x1 += x6, x12 = rotl32(x12 ^ x1, 16)
|
||||
// x2 += x7, x13 = rotl32(x13 ^ x2, 16)
|
||||
// x3 += x4, x14 = rotl32(x14 ^ x3, 16)
|
||||
vadd.i32 q0, q0, q5
|
||||
vadd.i32 q1, q1, q6
|
||||
vadd.i32 q2, q2, q7
|
||||
vadd.i32 q3, q3, q4
|
||||
|
||||
veor q15, q15, q0
|
||||
veor q12, q12, q1
|
||||
veor q13, q13, q2
|
||||
veor q14, q14, q3
|
||||
|
||||
vrev32.16 q15, q15
|
||||
vrev32.16 q12, q12
|
||||
vrev32.16 q13, q13
|
||||
vrev32.16 q14, q14
|
||||
|
||||
// x10 += x15, x5 = rotl32(x5 ^ x10, 12)
|
||||
// x11 += x12, x6 = rotl32(x6 ^ x11, 12)
|
||||
// x8 += x13, x7 = rotl32(x7 ^ x8, 12)
|
||||
// x9 += x14, x4 = rotl32(x4 ^ x9, 12)
|
||||
vadd.i32 q10, q10, q15
|
||||
vadd.i32 q11, q11, q12
|
||||
vadd.i32 q8, q8, q13
|
||||
vadd.i32 q9, q9, q14
|
||||
|
||||
vst1.32 {q8-q9}, [sp, :256]
|
||||
|
||||
veor q8, q7, q8
|
||||
veor q9, q4, q9
|
||||
vshl.u32 q7, q8, #12
|
||||
vshl.u32 q4, q9, #12
|
||||
vsri.u32 q7, q8, #20
|
||||
vsri.u32 q4, q9, #20
|
||||
|
||||
veor q8, q5, q10
|
||||
veor q9, q6, q11
|
||||
vshl.u32 q5, q8, #12
|
||||
vshl.u32 q6, q9, #12
|
||||
vsri.u32 q5, q8, #20
|
||||
vsri.u32 q6, q9, #20
|
||||
|
||||
// x0 += x5, x15 = rotl32(x15 ^ x0, 8)
|
||||
// x1 += x6, x12 = rotl32(x12 ^ x1, 8)
|
||||
// x2 += x7, x13 = rotl32(x13 ^ x2, 8)
|
||||
// x3 += x4, x14 = rotl32(x14 ^ x3, 8)
|
||||
vadd.i32 q0, q0, q5
|
||||
vadd.i32 q1, q1, q6
|
||||
vadd.i32 q2, q2, q7
|
||||
vadd.i32 q3, q3, q4
|
||||
|
||||
veor q8, q15, q0
|
||||
veor q9, q12, q1
|
||||
vshl.u32 q15, q8, #8
|
||||
vshl.u32 q12, q9, #8
|
||||
vsri.u32 q15, q8, #24
|
||||
vsri.u32 q12, q9, #24
|
||||
|
||||
veor q8, q13, q2
|
||||
veor q9, q14, q3
|
||||
vshl.u32 q13, q8, #8
|
||||
vshl.u32 q14, q9, #8
|
||||
vsri.u32 q13, q8, #24
|
||||
vsri.u32 q14, q9, #24
|
||||
|
||||
vld1.32 {q8-q9}, [sp, :256]
|
||||
|
||||
// x10 += x15, x5 = rotl32(x5 ^ x10, 7)
|
||||
// x11 += x12, x6 = rotl32(x6 ^ x11, 7)
|
||||
// x8 += x13, x7 = rotl32(x7 ^ x8, 7)
|
||||
// x9 += x14, x4 = rotl32(x4 ^ x9, 7)
|
||||
vadd.i32 q10, q10, q15
|
||||
vadd.i32 q11, q11, q12
|
||||
vadd.i32 q8, q8, q13
|
||||
vadd.i32 q9, q9, q14
|
||||
|
||||
vst1.32 {q8-q9}, [sp, :256]
|
||||
|
||||
veor q8, q7, q8
|
||||
veor q9, q4, q9
|
||||
vshl.u32 q7, q8, #7
|
||||
vshl.u32 q4, q9, #7
|
||||
vsri.u32 q7, q8, #25
|
||||
vsri.u32 q4, q9, #25
|
||||
|
||||
veor q8, q5, q10
|
||||
veor q9, q6, q11
|
||||
vshl.u32 q5, q8, #7
|
||||
vshl.u32 q6, q9, #7
|
||||
vsri.u32 q5, q8, #25
|
||||
vsri.u32 q6, q9, #25
|
||||
|
||||
subs r3, r3, #1
|
||||
beq 0f
|
||||
|
||||
vld1.32 {q8-q9}, [sp, :256]
|
||||
b .Ldoubleround4
|
||||
|
||||
// x0[0-3] += s0[0]
|
||||
// x1[0-3] += s0[1]
|
||||
// x2[0-3] += s0[2]
|
||||
// x3[0-3] += s0[3]
|
||||
0: ldmia r0!, {r3-r6}
|
||||
vdup.32 q8, r3
|
||||
vdup.32 q9, r4
|
||||
vadd.i32 q0, q0, q8
|
||||
vadd.i32 q1, q1, q9
|
||||
vdup.32 q8, r5
|
||||
vdup.32 q9, r6
|
||||
vadd.i32 q2, q2, q8
|
||||
vadd.i32 q3, q3, q9
|
||||
|
||||
// x4[0-3] += s1[0]
|
||||
// x5[0-3] += s1[1]
|
||||
// x6[0-3] += s1[2]
|
||||
// x7[0-3] += s1[3]
|
||||
ldmia r0!, {r3-r6}
|
||||
vdup.32 q8, r3
|
||||
vdup.32 q9, r4
|
||||
vadd.i32 q4, q4, q8
|
||||
vadd.i32 q5, q5, q9
|
||||
vdup.32 q8, r5
|
||||
vdup.32 q9, r6
|
||||
vadd.i32 q6, q6, q8
|
||||
vadd.i32 q7, q7, q9
|
||||
|
||||
// interleave 32-bit words in state n, n+1
|
||||
vzip.32 q0, q1
|
||||
vzip.32 q2, q3
|
||||
vzip.32 q4, q5
|
||||
vzip.32 q6, q7
|
||||
|
||||
// interleave 64-bit words in state n, n+2
|
||||
vswp d1, d4
|
||||
vswp d3, d6
|
||||
vswp d9, d12
|
||||
vswp d11, d14
|
||||
|
||||
// xor with corresponding input, write to output
|
||||
vld1.8 {q8-q9}, [r2]!
|
||||
veor q8, q8, q0
|
||||
veor q9, q9, q4
|
||||
vst1.8 {q8-q9}, [r1]!
|
||||
|
||||
vld1.32 {q8-q9}, [sp, :256]
|
||||
|
||||
// x8[0-3] += s2[0]
|
||||
// x9[0-3] += s2[1]
|
||||
// x10[0-3] += s2[2]
|
||||
// x11[0-3] += s2[3]
|
||||
ldmia r0!, {r3-r6}
|
||||
vdup.32 q0, r3
|
||||
vdup.32 q4, r4
|
||||
vadd.i32 q8, q8, q0
|
||||
vadd.i32 q9, q9, q4
|
||||
vdup.32 q0, r5
|
||||
vdup.32 q4, r6
|
||||
vadd.i32 q10, q10, q0
|
||||
vadd.i32 q11, q11, q4
|
||||
|
||||
// x12[0-3] += s3[0]
|
||||
// x13[0-3] += s3[1]
|
||||
// x14[0-3] += s3[2]
|
||||
// x15[0-3] += s3[3]
|
||||
ldmia r0!, {r3-r6}
|
||||
vdup.32 q0, r3
|
||||
vdup.32 q4, r4
|
||||
adr r3, CTRINC
|
||||
vadd.i32 q12, q12, q0
|
||||
vld1.32 {q0}, [r3, :128]
|
||||
vadd.i32 q13, q13, q4
|
||||
vadd.i32 q12, q12, q0 // x12 += counter values 0-3
|
||||
|
||||
vdup.32 q0, r5
|
||||
vdup.32 q4, r6
|
||||
vadd.i32 q14, q14, q0
|
||||
vadd.i32 q15, q15, q4
|
||||
|
||||
// interleave 32-bit words in state n, n+1
|
||||
vzip.32 q8, q9
|
||||
vzip.32 q10, q11
|
||||
vzip.32 q12, q13
|
||||
vzip.32 q14, q15
|
||||
|
||||
// interleave 64-bit words in state n, n+2
|
||||
vswp d17, d20
|
||||
vswp d19, d22
|
||||
vswp d25, d28
|
||||
vswp d27, d30
|
||||
|
||||
vmov q4, q1
|
||||
|
||||
vld1.8 {q0-q1}, [r2]!
|
||||
veor q0, q0, q8
|
||||
veor q1, q1, q12
|
||||
vst1.8 {q0-q1}, [r1]!
|
||||
|
||||
vld1.8 {q0-q1}, [r2]!
|
||||
veor q0, q0, q2
|
||||
veor q1, q1, q6
|
||||
vst1.8 {q0-q1}, [r1]!
|
||||
|
||||
vld1.8 {q0-q1}, [r2]!
|
||||
veor q0, q0, q10
|
||||
veor q1, q1, q14
|
||||
vst1.8 {q0-q1}, [r1]!
|
||||
|
||||
vld1.8 {q0-q1}, [r2]!
|
||||
veor q0, q0, q4
|
||||
veor q1, q1, q5
|
||||
vst1.8 {q0-q1}, [r1]!
|
||||
|
||||
vld1.8 {q0-q1}, [r2]!
|
||||
veor q0, q0, q9
|
||||
veor q1, q1, q13
|
||||
vst1.8 {q0-q1}, [r1]!
|
||||
|
||||
vld1.8 {q0-q1}, [r2]!
|
||||
veor q0, q0, q3
|
||||
veor q1, q1, q7
|
||||
vst1.8 {q0-q1}, [r1]!
|
||||
|
||||
vld1.8 {q0-q1}, [r2]
|
||||
veor q0, q0, q11
|
||||
veor q1, q1, q15
|
||||
vst1.8 {q0-q1}, [r1]
|
||||
|
||||
mov sp, ip
|
||||
pop {r4-r6, pc}
|
||||
ENDPROC(chacha20_4block_xor_neon)
|
||||
|
||||
.align 4
|
||||
CTRINC: .word 0, 1, 2, 3
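
The block comment in chacha20_4block_xor_neon above describes transposing the four states by interleaving 32-bit and then 64-bit words before the final XOR. As a rough illustration only, here is a plain C model of that shuffle; vec128, zip32() and swap64_hi_lo() are invented names standing in for one q-register, vzip.32, and vswp on d-registers, and nothing below is kernel code:

#include <stdint.h>
#include <stdio.h>

typedef struct { uint32_t w[4]; } vec128;	/* one q-register: 4 x 32-bit lanes */

static void zip32(vec128 *a, vec128 *b)		/* models vzip.32 a, b */
{
	vec128 lo = { { a->w[0], b->w[0], a->w[1], b->w[1] } };
	vec128 hi = { { a->w[2], b->w[2], a->w[3], b->w[3] } };
	*a = lo;
	*b = hi;
}

static void swap64_hi_lo(vec128 *x, vec128 *y)	/* models vswp of x's high and y's low d-register */
{
	uint32_t t0 = x->w[2], t1 = x->w[3];
	x->w[2] = y->w[0];  x->w[3] = y->w[1];
	y->w[0] = t0;       y->w[1] = t1;
}

int main(void)
{
	/* rows: the same word of four consecutive ChaCha20 states */
	vec128 m[4] = {
		{ { 0, 1, 2, 3 } },    { { 10, 11, 12, 13 } },
		{ { 20, 21, 22, 23 } }, { { 30, 31, 32, 33 } },
	};

	zip32(&m[0], &m[1]);		/* interleave 32-bit words in rows n, n+1 */
	zip32(&m[2], &m[3]);
	swap64_hi_lo(&m[0], &m[2]);	/* interleave 64-bit words in rows n, n+2 */
	swap64_hi_lo(&m[1], &m[3]);

	/* m[0], m[2], m[1], m[3] now hold columns 0..3 of the original matrix */
	for (int i = 0; i < 4; i++)
		printf("%u %u %u %u\n", m[i].w[0], m[i].w[1], m[i].w[2], m[i].w[3]);
	return 0;
}
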
127	arch/arm/crypto/chacha20-neon-glue.c	Normal file
@ -0,0 +1,127 @@
/*
 * ChaCha20 256-bit cipher algorithm, RFC7539, ARM NEON functions
 *
 * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Based on:
 * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code
 *
 * Copyright (C) 2015 Martin Willi
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <crypto/algapi.h>
#include <crypto/chacha20.h>
#include <crypto/internal/skcipher.h>
#include <linux/kernel.h>
#include <linux/module.h>

#include <asm/hwcap.h>
#include <asm/neon.h>
#include <asm/simd.h>

asmlinkage void chacha20_block_xor_neon(u32 *state, u8 *dst, const u8 *src);
asmlinkage void chacha20_4block_xor_neon(u32 *state, u8 *dst, const u8 *src);

static void chacha20_doneon(u32 *state, u8 *dst, const u8 *src,
			    unsigned int bytes)
{
	u8 buf[CHACHA20_BLOCK_SIZE];

	while (bytes >= CHACHA20_BLOCK_SIZE * 4) {
		chacha20_4block_xor_neon(state, dst, src);
		bytes -= CHACHA20_BLOCK_SIZE * 4;
		src += CHACHA20_BLOCK_SIZE * 4;
		dst += CHACHA20_BLOCK_SIZE * 4;
		state[12] += 4;
	}
	while (bytes >= CHACHA20_BLOCK_SIZE) {
		chacha20_block_xor_neon(state, dst, src);
		bytes -= CHACHA20_BLOCK_SIZE;
		src += CHACHA20_BLOCK_SIZE;
		dst += CHACHA20_BLOCK_SIZE;
		state[12]++;
	}
	if (bytes) {
		memcpy(buf, src, bytes);
		chacha20_block_xor_neon(state, buf, buf);
		memcpy(dst, buf, bytes);
	}
}

static int chacha20_neon(struct skcipher_request *req)
{
	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
	struct chacha20_ctx *ctx = crypto_skcipher_ctx(tfm);
	struct skcipher_walk walk;
	u32 state[16];
	int err;

	if (req->cryptlen <= CHACHA20_BLOCK_SIZE || !may_use_simd())
		return crypto_chacha20_crypt(req);

	err = skcipher_walk_virt(&walk, req, true);

	crypto_chacha20_init(state, ctx, walk.iv);

	kernel_neon_begin();
	while (walk.nbytes > 0) {
		unsigned int nbytes = walk.nbytes;

		if (nbytes < walk.total)
			nbytes = round_down(nbytes, walk.stride);

		chacha20_doneon(state, walk.dst.virt.addr, walk.src.virt.addr,
				nbytes);
		err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
	}
	kernel_neon_end();

	return err;
}

static struct skcipher_alg alg = {
	.base.cra_name		= "chacha20",
	.base.cra_driver_name	= "chacha20-neon",
	.base.cra_priority	= 300,
	.base.cra_blocksize	= 1,
	.base.cra_ctxsize	= sizeof(struct chacha20_ctx),
	.base.cra_module	= THIS_MODULE,

	.min_keysize		= CHACHA20_KEY_SIZE,
	.max_keysize		= CHACHA20_KEY_SIZE,
	.ivsize			= CHACHA20_IV_SIZE,
	.chunksize		= CHACHA20_BLOCK_SIZE,
	.walksize		= 4 * CHACHA20_BLOCK_SIZE,
	.setkey			= crypto_chacha20_setkey,
	.encrypt		= chacha20_neon,
	.decrypt		= chacha20_neon,
};

static int __init chacha20_simd_mod_init(void)
{
	if (!(elf_hwcap & HWCAP_NEON))
		return -ENODEV;

	return crypto_register_skcipher(&alg);
}

static void __exit chacha20_simd_mod_fini(void)
{
	crypto_unregister_skcipher(&alg);
}

module_init(chacha20_simd_mod_init);
module_exit(chacha20_simd_mod_fini);

MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
MODULE_LICENSE("GPL v2");
MODULE_ALIAS_CRYPTO("chacha20");
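
For reference, the chunking strategy used by chacha20_doneon() above (a four-block fast path, then single blocks, then a bounce buffer for a partial tail) can be sketched in plain C roughly as follows. BLOCK_SIZE and the xor_blocks() stub are illustrative stand-ins, not kernel APIs:

#include <stdint.h>
#include <string.h>

#define BLOCK_SIZE 64

/* stand-in for the NEON routines: would generate n blocks of keystream,
 * XOR them over src into dst and advance the block counter in state[12] */
static void xor_blocks(uint32_t *state, uint8_t *dst, const uint8_t *src, int n)
{
	for (int i = 0; i < n * BLOCK_SIZE; i++)
		dst[i] = src[i];	/* placeholder: real code XORs keystream here */
	state[12] += n;
}

static void do_blocks(uint32_t *state, uint8_t *dst, const uint8_t *src,
		      unsigned int bytes)
{
	uint8_t buf[BLOCK_SIZE];

	while (bytes >= BLOCK_SIZE * 4) {	/* 4-block fast path */
		xor_blocks(state, dst, src, 4);
		bytes -= BLOCK_SIZE * 4;
		src += BLOCK_SIZE * 4;
		dst += BLOCK_SIZE * 4;
	}
	while (bytes >= BLOCK_SIZE) {		/* remaining full blocks */
		xor_blocks(state, dst, src, 1);
		bytes -= BLOCK_SIZE;
		src += BLOCK_SIZE;
		dst += BLOCK_SIZE;
	}
	if (bytes) {				/* partial tail via a bounce buffer */
		memcpy(buf, src, bytes);
		xor_blocks(state, buf, buf, 1);
		memcpy(dst, buf, bytes);
	}
}
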
@ -516,4 +516,3 @@ CONFIG_CRYPTO_GHASH_ARM64_CE=y
CONFIG_CRYPTO_AES_ARM64_CE_CCM=y
CONFIG_CRYPTO_AES_ARM64_CE_BLK=y
# CONFIG_CRYPTO_AES_ARM64_NEON_BLK is not set
CONFIG_CRYPTO_CRC32_ARM64=y
@ -37,10 +37,14 @@ config CRYPTO_CRCT10DIF_ARM64_CE
	select CRYPTO_HASH

config CRYPTO_CRC32_ARM64_CE
	tristate "CRC32 and CRC32C digest algorithms using PMULL instructions"
	depends on KERNEL_MODE_NEON && CRC32
	tristate "CRC32 and CRC32C digest algorithms using ARMv8 extensions"
	depends on CRC32
	select CRYPTO_HASH

config CRYPTO_AES_ARM64
	tristate "AES core cipher using scalar instructions"
	select CRYPTO_AES

config CRYPTO_AES_ARM64_CE
	tristate "AES core cipher using ARMv8 Crypto Extensions"
	depends on ARM64 && KERNEL_MODE_NEON
@ -67,9 +71,17 @@ config CRYPTO_AES_ARM64_NEON_BLK
	select CRYPTO_AES
	select CRYPTO_SIMD

config CRYPTO_CRC32_ARM64
	tristate "CRC32 and CRC32C using optional ARMv8 instructions"
	depends on ARM64
	select CRYPTO_HASH
config CRYPTO_CHACHA20_NEON
	tristate "NEON accelerated ChaCha20 symmetric cipher"
	depends on KERNEL_MODE_NEON
	select CRYPTO_BLKCIPHER
	select CRYPTO_CHACHA20

config CRYPTO_AES_ARM64_BS
	tristate "AES in ECB/CBC/CTR/XTS modes using bit-sliced NEON algorithm"
	depends on KERNEL_MODE_NEON
	select CRYPTO_BLKCIPHER
	select CRYPTO_AES_ARM64_NEON_BLK
	select CRYPTO_SIMD

endif
@ -41,15 +41,20 @@ sha256-arm64-y := sha256-glue.o sha256-core.o
obj-$(CONFIG_CRYPTO_SHA512_ARM64) += sha512-arm64.o
sha512-arm64-y := sha512-glue.o sha512-core.o

obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha20-neon.o
chacha20-neon-y := chacha20-neon-core.o chacha20-neon-glue.o

obj-$(CONFIG_CRYPTO_AES_ARM64) += aes-arm64.o
aes-arm64-y := aes-cipher-core.o aes-cipher-glue.o

obj-$(CONFIG_CRYPTO_AES_ARM64_BS) += aes-neon-bs.o
aes-neon-bs-y := aes-neonbs-core.o aes-neonbs-glue.o

AFLAGS_aes-ce.o := -DINTERLEAVE=4
AFLAGS_aes-neon.o := -DINTERLEAVE=4

CFLAGS_aes-glue-ce.o := -DUSE_V8_CRYPTO_EXTENSIONS

obj-$(CONFIG_CRYPTO_CRC32_ARM64) += crc32-arm64.o

CFLAGS_crc32-arm64.o := -mcpu=generic+crc

$(obj)/aes-glue-%.o: $(src)/aes-glue.c FORCE
	$(call if_changed_rule,cc_o_c)
@ -258,7 +258,6 @@ static struct aead_alg ccm_aes_alg = {
	.cra_priority		= 300,
	.cra_blocksize		= 1,
	.cra_ctxsize		= sizeof(struct crypto_aes_ctx),
	.cra_alignmask		= 7,
	.cra_module		= THIS_MODULE,
	},
	.ivsize	= AES_BLOCK_SIZE,
110	arch/arm64/crypto/aes-cipher-core.S	Normal file
@ -0,0 +1,110 @@
|
||||
/*
|
||||
* Scalar AES core transform
|
||||
*
|
||||
* Copyright (C) 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as
|
||||
* published by the Free Software Foundation.
|
||||
*/
|
||||
|
||||
#include <linux/linkage.h>
|
||||
#include <asm/assembler.h>
|
||||
|
||||
.text
|
||||
|
||||
rk .req x0
|
||||
out .req x1
|
||||
in .req x2
|
||||
rounds .req x3
|
||||
tt .req x4
|
||||
lt .req x2
|
||||
|
||||
.macro __pair, enc, reg0, reg1, in0, in1e, in1d, shift
|
||||
ubfx \reg0, \in0, #\shift, #8
|
||||
.if \enc
|
||||
ubfx \reg1, \in1e, #\shift, #8
|
||||
.else
|
||||
ubfx \reg1, \in1d, #\shift, #8
|
||||
.endif
|
||||
ldr \reg0, [tt, \reg0, uxtw #2]
|
||||
ldr \reg1, [tt, \reg1, uxtw #2]
|
||||
.endm
|
||||
|
||||
.macro __hround, out0, out1, in0, in1, in2, in3, t0, t1, enc
|
||||
ldp \out0, \out1, [rk], #8
|
||||
|
||||
__pair \enc, w13, w14, \in0, \in1, \in3, 0
|
||||
__pair \enc, w15, w16, \in1, \in2, \in0, 8
|
||||
__pair \enc, w17, w18, \in2, \in3, \in1, 16
|
||||
__pair \enc, \t0, \t1, \in3, \in0, \in2, 24
|
||||
|
||||
eor \out0, \out0, w13
|
||||
eor \out1, \out1, w14
|
||||
eor \out0, \out0, w15, ror #24
|
||||
eor \out1, \out1, w16, ror #24
|
||||
eor \out0, \out0, w17, ror #16
|
||||
eor \out1, \out1, w18, ror #16
|
||||
eor \out0, \out0, \t0, ror #8
|
||||
eor \out1, \out1, \t1, ror #8
|
||||
.endm
|
||||
|
||||
.macro fround, out0, out1, out2, out3, in0, in1, in2, in3
|
||||
__hround \out0, \out1, \in0, \in1, \in2, \in3, \out2, \out3, 1
|
||||
__hround \out2, \out3, \in2, \in3, \in0, \in1, \in1, \in2, 1
|
||||
.endm
|
||||
|
||||
.macro iround, out0, out1, out2, out3, in0, in1, in2, in3
|
||||
__hround \out0, \out1, \in0, \in3, \in2, \in1, \out2, \out3, 0
|
||||
__hround \out2, \out3, \in2, \in1, \in0, \in3, \in1, \in0, 0
|
||||
.endm
|
||||
|
||||
.macro do_crypt, round, ttab, ltab
|
||||
ldp w5, w6, [in]
|
||||
ldp w7, w8, [in, #8]
|
||||
ldp w9, w10, [rk], #16
|
||||
ldp w11, w12, [rk, #-8]
|
||||
|
||||
CPU_BE( rev w5, w5 )
|
||||
CPU_BE( rev w6, w6 )
|
||||
CPU_BE( rev w7, w7 )
|
||||
CPU_BE( rev w8, w8 )
|
||||
|
||||
eor w5, w5, w9
|
||||
eor w6, w6, w10
|
||||
eor w7, w7, w11
|
||||
eor w8, w8, w12
|
||||
|
||||
adr_l tt, \ttab
|
||||
adr_l lt, \ltab
|
||||
|
||||
tbnz rounds, #1, 1f
|
||||
|
||||
0: \round w9, w10, w11, w12, w5, w6, w7, w8
|
||||
\round w5, w6, w7, w8, w9, w10, w11, w12
|
||||
|
||||
1: subs rounds, rounds, #4
|
||||
\round w9, w10, w11, w12, w5, w6, w7, w8
|
||||
csel tt, tt, lt, hi
|
||||
\round w5, w6, w7, w8, w9, w10, w11, w12
|
||||
b.hi 0b
|
||||
|
||||
CPU_BE( rev w5, w5 )
|
||||
CPU_BE( rev w6, w6 )
|
||||
CPU_BE( rev w7, w7 )
|
||||
CPU_BE( rev w8, w8 )
|
||||
|
||||
stp w5, w6, [out]
|
||||
stp w7, w8, [out, #8]
|
||||
ret
|
||||
.endm
|
||||
|
||||
.align 5
|
||||
ENTRY(__aes_arm64_encrypt)
|
||||
do_crypt fround, crypto_ft_tab, crypto_fl_tab
|
||||
ENDPROC(__aes_arm64_encrypt)
|
||||
|
||||
.align 5
|
||||
ENTRY(__aes_arm64_decrypt)
|
||||
do_crypt iround, crypto_it_tab, crypto_il_tab
|
||||
ENDPROC(__aes_arm64_decrypt)
|
69	arch/arm64/crypto/aes-cipher-glue.c	Normal file
@ -0,0 +1,69 @@
/*
 * Scalar AES core transform
 *
 * Copyright (C) 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#include <crypto/aes.h>
#include <linux/crypto.h>
#include <linux/module.h>

asmlinkage void __aes_arm64_encrypt(u32 *rk, u8 *out, const u8 *in, int rounds);
EXPORT_SYMBOL(__aes_arm64_encrypt);

asmlinkage void __aes_arm64_decrypt(u32 *rk, u8 *out, const u8 *in, int rounds);
EXPORT_SYMBOL(__aes_arm64_decrypt);

static void aes_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
{
	struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
	int rounds = 6 + ctx->key_length / 4;

	__aes_arm64_encrypt(ctx->key_enc, out, in, rounds);
}

static void aes_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
{
	struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
	int rounds = 6 + ctx->key_length / 4;

	__aes_arm64_decrypt(ctx->key_dec, out, in, rounds);
}

static struct crypto_alg aes_alg = {
	.cra_name			= "aes",
	.cra_driver_name		= "aes-arm64",
	.cra_priority			= 200,
	.cra_flags			= CRYPTO_ALG_TYPE_CIPHER,
	.cra_blocksize			= AES_BLOCK_SIZE,
	.cra_ctxsize			= sizeof(struct crypto_aes_ctx),
	.cra_module			= THIS_MODULE,

	.cra_cipher.cia_min_keysize	= AES_MIN_KEY_SIZE,
	.cra_cipher.cia_max_keysize	= AES_MAX_KEY_SIZE,
	.cra_cipher.cia_setkey		= crypto_aes_set_key,
	.cra_cipher.cia_encrypt		= aes_encrypt,
	.cra_cipher.cia_decrypt		= aes_decrypt
};

static int __init aes_init(void)
{
	return crypto_register_alg(&aes_alg);
}

static void __exit aes_fini(void)
{
	crypto_unregister_alg(&aes_alg);
}

module_init(aes_init);
module_exit(aes_fini);

MODULE_DESCRIPTION("Scalar AES cipher for arm64");
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
MODULE_LICENSE("GPL v2");
MODULE_ALIAS_CRYPTO("aes");
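
The rounds value computed by the glue code above follows the standard AES schedule: 6 + key_length / 4 yields 10, 12 and 14 rounds for 16-, 24- and 32-byte keys. A trivial standalone check, written as an illustrative sketch rather than kernel code:

#include <assert.h>

static int aes_nrounds(unsigned int key_length)
{
	return 6 + key_length / 4;
}

int main(void)
{
	assert(aes_nrounds(16) == 10);	/* AES-128 */
	assert(aes_nrounds(24) == 12);	/* AES-192 */
	assert(aes_nrounds(32) == 14);	/* AES-256 */
	return 0;
}
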
@ -1,7 +1,7 @@
|
||||
/*
|
||||
* linux/arch/arm64/crypto/aes-glue.c - wrapper code for ARMv8 AES
|
||||
*
|
||||
* Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
|
||||
* Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as
|
||||
@ -11,6 +11,7 @@
|
||||
#include <asm/neon.h>
|
||||
#include <asm/hwcap.h>
|
||||
#include <crypto/aes.h>
|
||||
#include <crypto/internal/hash.h>
|
||||
#include <crypto/internal/simd.h>
|
||||
#include <crypto/internal/skcipher.h>
|
||||
#include <linux/module.h>
|
||||
@ -31,6 +32,7 @@
|
||||
#define aes_ctr_encrypt ce_aes_ctr_encrypt
|
||||
#define aes_xts_encrypt ce_aes_xts_encrypt
|
||||
#define aes_xts_decrypt ce_aes_xts_decrypt
|
||||
#define aes_mac_update ce_aes_mac_update
|
||||
MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 Crypto Extensions");
|
||||
#else
|
||||
#define MODE "neon"
|
||||
@ -44,11 +46,15 @@ MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 Crypto Extensions");
|
||||
#define aes_ctr_encrypt neon_aes_ctr_encrypt
|
||||
#define aes_xts_encrypt neon_aes_xts_encrypt
|
||||
#define aes_xts_decrypt neon_aes_xts_decrypt
|
||||
#define aes_mac_update neon_aes_mac_update
|
||||
MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 NEON");
|
||||
MODULE_ALIAS_CRYPTO("ecb(aes)");
|
||||
MODULE_ALIAS_CRYPTO("cbc(aes)");
|
||||
MODULE_ALIAS_CRYPTO("ctr(aes)");
|
||||
MODULE_ALIAS_CRYPTO("xts(aes)");
|
||||
MODULE_ALIAS_CRYPTO("cmac(aes)");
|
||||
MODULE_ALIAS_CRYPTO("xcbc(aes)");
|
||||
MODULE_ALIAS_CRYPTO("cbcmac(aes)");
|
||||
#endif
|
||||
|
||||
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
|
||||
@ -75,11 +81,25 @@ asmlinkage void aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[],
|
||||
int rounds, int blocks, u8 const rk2[], u8 iv[],
|
||||
int first);
|
||||
|
||||
asmlinkage void aes_mac_update(u8 const in[], u32 const rk[], int rounds,
|
||||
int blocks, u8 dg[], int enc_before,
|
||||
int enc_after);
|
||||
|
||||
struct crypto_aes_xts_ctx {
|
||||
struct crypto_aes_ctx key1;
|
||||
struct crypto_aes_ctx __aligned(8) key2;
|
||||
};
|
||||
|
||||
struct mac_tfm_ctx {
|
||||
struct crypto_aes_ctx key;
|
||||
u8 __aligned(8) consts[];
|
||||
};
|
||||
|
||||
struct mac_desc_ctx {
|
||||
unsigned int len;
|
||||
u8 dg[AES_BLOCK_SIZE];
|
||||
};
|
||||
|
||||
static int skcipher_aes_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
|
||||
unsigned int key_len)
|
||||
{
|
||||
@ -215,14 +235,15 @@ static int ctr_encrypt(struct skcipher_request *req)
|
||||
u8 *tsrc = walk.src.virt.addr;
|
||||
|
||||
/*
|
||||
* Minimum alignment is 8 bytes, so if nbytes is <= 8, we need
|
||||
* to tell aes_ctr_encrypt() to only read half a block.
|
||||
* Tell aes_ctr_encrypt() to process a tail block.
|
||||
*/
|
||||
blocks = (nbytes <= 8) ? -1 : 1;
|
||||
blocks = -1;
|
||||
|
||||
aes_ctr_encrypt(tail, tsrc, (u8 *)ctx->key_enc, rounds,
|
||||
aes_ctr_encrypt(tail, NULL, (u8 *)ctx->key_enc, rounds,
|
||||
blocks, walk.iv, first);
|
||||
memcpy(tdst, tail, nbytes);
|
||||
if (tdst != tsrc)
|
||||
memcpy(tdst, tsrc, nbytes);
|
||||
crypto_xor(tdst, tail, nbytes);
|
||||
err = skcipher_walk_done(&walk, 0);
|
||||
}
|
||||
kernel_neon_end();
|
||||
@ -282,7 +303,6 @@ static struct skcipher_alg aes_algs[] = { {
|
||||
.cra_flags = CRYPTO_ALG_INTERNAL,
|
||||
.cra_blocksize = AES_BLOCK_SIZE,
|
||||
.cra_ctxsize = sizeof(struct crypto_aes_ctx),
|
||||
.cra_alignmask = 7,
|
||||
.cra_module = THIS_MODULE,
|
||||
},
|
||||
.min_keysize = AES_MIN_KEY_SIZE,
|
||||
@ -298,7 +318,6 @@ static struct skcipher_alg aes_algs[] = { {
|
||||
.cra_flags = CRYPTO_ALG_INTERNAL,
|
||||
.cra_blocksize = AES_BLOCK_SIZE,
|
||||
.cra_ctxsize = sizeof(struct crypto_aes_ctx),
|
||||
.cra_alignmask = 7,
|
||||
.cra_module = THIS_MODULE,
|
||||
},
|
||||
.min_keysize = AES_MIN_KEY_SIZE,
|
||||
@ -315,7 +334,22 @@ static struct skcipher_alg aes_algs[] = { {
|
||||
.cra_flags = CRYPTO_ALG_INTERNAL,
|
||||
.cra_blocksize = 1,
|
||||
.cra_ctxsize = sizeof(struct crypto_aes_ctx),
|
||||
.cra_alignmask = 7,
|
||||
.cra_module = THIS_MODULE,
|
||||
},
|
||||
.min_keysize = AES_MIN_KEY_SIZE,
|
||||
.max_keysize = AES_MAX_KEY_SIZE,
|
||||
.ivsize = AES_BLOCK_SIZE,
|
||||
.chunksize = AES_BLOCK_SIZE,
|
||||
.setkey = skcipher_aes_setkey,
|
||||
.encrypt = ctr_encrypt,
|
||||
.decrypt = ctr_encrypt,
|
||||
}, {
|
||||
.base = {
|
||||
.cra_name = "ctr(aes)",
|
||||
.cra_driver_name = "ctr-aes-" MODE,
|
||||
.cra_priority = PRIO - 1,
|
||||
.cra_blocksize = 1,
|
||||
.cra_ctxsize = sizeof(struct crypto_aes_ctx),
|
||||
.cra_module = THIS_MODULE,
|
||||
},
|
||||
.min_keysize = AES_MIN_KEY_SIZE,
|
||||
@ -333,7 +367,6 @@ static struct skcipher_alg aes_algs[] = { {
|
||||
.cra_flags = CRYPTO_ALG_INTERNAL,
|
||||
.cra_blocksize = AES_BLOCK_SIZE,
|
||||
.cra_ctxsize = sizeof(struct crypto_aes_xts_ctx),
|
||||
.cra_alignmask = 7,
|
||||
.cra_module = THIS_MODULE,
|
||||
},
|
||||
.min_keysize = 2 * AES_MIN_KEY_SIZE,
|
||||
@ -344,15 +377,228 @@ static struct skcipher_alg aes_algs[] = { {
|
||||
.decrypt = xts_decrypt,
|
||||
} };
|
||||
|
||||
static int cbcmac_setkey(struct crypto_shash *tfm, const u8 *in_key,
|
||||
unsigned int key_len)
|
||||
{
|
||||
struct mac_tfm_ctx *ctx = crypto_shash_ctx(tfm);
|
||||
int err;
|
||||
|
||||
err = aes_expandkey(&ctx->key, in_key, key_len);
|
||||
if (err)
|
||||
crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
static void cmac_gf128_mul_by_x(be128 *y, const be128 *x)
|
||||
{
|
||||
u64 a = be64_to_cpu(x->a);
|
||||
u64 b = be64_to_cpu(x->b);
|
||||
|
||||
y->a = cpu_to_be64((a << 1) | (b >> 63));
|
||||
y->b = cpu_to_be64((b << 1) ^ ((a >> 63) ? 0x87 : 0));
|
||||
}
|
||||
|
||||
static int cmac_setkey(struct crypto_shash *tfm, const u8 *in_key,
|
||||
unsigned int key_len)
|
||||
{
|
||||
struct mac_tfm_ctx *ctx = crypto_shash_ctx(tfm);
|
||||
be128 *consts = (be128 *)ctx->consts;
|
||||
u8 *rk = (u8 *)ctx->key.key_enc;
|
||||
int rounds = 6 + key_len / 4;
|
||||
int err;
|
||||
|
||||
err = cbcmac_setkey(tfm, in_key, key_len);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
/* encrypt the zero vector */
|
||||
kernel_neon_begin();
|
||||
aes_ecb_encrypt(ctx->consts, (u8[AES_BLOCK_SIZE]){}, rk, rounds, 1, 1);
|
||||
kernel_neon_end();
|
||||
|
||||
cmac_gf128_mul_by_x(consts, consts);
|
||||
cmac_gf128_mul_by_x(consts + 1, consts);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int xcbc_setkey(struct crypto_shash *tfm, const u8 *in_key,
|
||||
unsigned int key_len)
|
||||
{
|
||||
static u8 const ks[3][AES_BLOCK_SIZE] = {
|
||||
{ [0 ... AES_BLOCK_SIZE - 1] = 0x1 },
|
||||
{ [0 ... AES_BLOCK_SIZE - 1] = 0x2 },
|
||||
{ [0 ... AES_BLOCK_SIZE - 1] = 0x3 },
|
||||
};
|
||||
|
||||
struct mac_tfm_ctx *ctx = crypto_shash_ctx(tfm);
|
||||
u8 *rk = (u8 *)ctx->key.key_enc;
|
||||
int rounds = 6 + key_len / 4;
|
||||
u8 key[AES_BLOCK_SIZE];
|
||||
int err;
|
||||
|
||||
err = cbcmac_setkey(tfm, in_key, key_len);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
kernel_neon_begin();
|
||||
aes_ecb_encrypt(key, ks[0], rk, rounds, 1, 1);
|
||||
aes_ecb_encrypt(ctx->consts, ks[1], rk, rounds, 2, 0);
|
||||
kernel_neon_end();
|
||||
|
||||
return cbcmac_setkey(tfm, key, sizeof(key));
|
||||
}
|
||||
|
||||
static int mac_init(struct shash_desc *desc)
|
||||
{
|
||||
struct mac_desc_ctx *ctx = shash_desc_ctx(desc);
|
||||
|
||||
memset(ctx->dg, 0, AES_BLOCK_SIZE);
|
||||
ctx->len = 0;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int mac_update(struct shash_desc *desc, const u8 *p, unsigned int len)
|
||||
{
|
||||
struct mac_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
|
||||
struct mac_desc_ctx *ctx = shash_desc_ctx(desc);
|
||||
int rounds = 6 + tctx->key.key_length / 4;
|
||||
|
||||
while (len > 0) {
|
||||
unsigned int l;
|
||||
|
||||
if ((ctx->len % AES_BLOCK_SIZE) == 0 &&
|
||||
(ctx->len + len) > AES_BLOCK_SIZE) {
|
||||
|
||||
int blocks = len / AES_BLOCK_SIZE;
|
||||
|
||||
len %= AES_BLOCK_SIZE;
|
||||
|
||||
kernel_neon_begin();
|
||||
aes_mac_update(p, tctx->key.key_enc, rounds, blocks,
|
||||
ctx->dg, (ctx->len != 0), (len != 0));
|
||||
kernel_neon_end();
|
||||
|
||||
p += blocks * AES_BLOCK_SIZE;
|
||||
|
||||
if (!len) {
|
||||
ctx->len = AES_BLOCK_SIZE;
|
||||
break;
|
||||
}
|
||||
ctx->len = 0;
|
||||
}
|
||||
|
||||
l = min(len, AES_BLOCK_SIZE - ctx->len);
|
||||
|
||||
if (l <= AES_BLOCK_SIZE) {
|
||||
crypto_xor(ctx->dg + ctx->len, p, l);
|
||||
ctx->len += l;
|
||||
len -= l;
|
||||
p += l;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int cbcmac_final(struct shash_desc *desc, u8 *out)
|
||||
{
|
||||
struct mac_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
|
||||
struct mac_desc_ctx *ctx = shash_desc_ctx(desc);
|
||||
int rounds = 6 + tctx->key.key_length / 4;
|
||||
|
||||
kernel_neon_begin();
|
||||
aes_mac_update(NULL, tctx->key.key_enc, rounds, 0, ctx->dg, 1, 0);
|
||||
kernel_neon_end();
|
||||
|
||||
memcpy(out, ctx->dg, AES_BLOCK_SIZE);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int cmac_final(struct shash_desc *desc, u8 *out)
|
||||
{
|
||||
struct mac_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
|
||||
struct mac_desc_ctx *ctx = shash_desc_ctx(desc);
|
||||
int rounds = 6 + tctx->key.key_length / 4;
|
||||
u8 *consts = tctx->consts;
|
||||
|
||||
if (ctx->len != AES_BLOCK_SIZE) {
|
||||
ctx->dg[ctx->len] ^= 0x80;
|
||||
consts += AES_BLOCK_SIZE;
|
||||
}
|
||||
|
||||
kernel_neon_begin();
|
||||
aes_mac_update(consts, tctx->key.key_enc, rounds, 1, ctx->dg, 0, 1);
|
||||
kernel_neon_end();
|
||||
|
||||
memcpy(out, ctx->dg, AES_BLOCK_SIZE);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct shash_alg mac_algs[] = { {
|
||||
.base.cra_name = "cmac(aes)",
|
||||
.base.cra_driver_name = "cmac-aes-" MODE,
|
||||
.base.cra_priority = PRIO,
|
||||
.base.cra_flags = CRYPTO_ALG_TYPE_SHASH,
|
||||
.base.cra_blocksize = AES_BLOCK_SIZE,
|
||||
.base.cra_ctxsize = sizeof(struct mac_tfm_ctx) +
|
||||
2 * AES_BLOCK_SIZE,
|
||||
.base.cra_module = THIS_MODULE,
|
||||
|
||||
.digestsize = AES_BLOCK_SIZE,
|
||||
.init = mac_init,
|
||||
.update = mac_update,
|
||||
.final = cmac_final,
|
||||
.setkey = cmac_setkey,
|
||||
.descsize = sizeof(struct mac_desc_ctx),
|
||||
}, {
|
||||
.base.cra_name = "xcbc(aes)",
|
||||
.base.cra_driver_name = "xcbc-aes-" MODE,
|
||||
.base.cra_priority = PRIO,
|
||||
.base.cra_flags = CRYPTO_ALG_TYPE_SHASH,
|
||||
.base.cra_blocksize = AES_BLOCK_SIZE,
|
||||
.base.cra_ctxsize = sizeof(struct mac_tfm_ctx) +
|
||||
2 * AES_BLOCK_SIZE,
|
||||
.base.cra_module = THIS_MODULE,
|
||||
|
||||
.digestsize = AES_BLOCK_SIZE,
|
||||
.init = mac_init,
|
||||
.update = mac_update,
|
||||
.final = cmac_final,
|
||||
.setkey = xcbc_setkey,
|
||||
.descsize = sizeof(struct mac_desc_ctx),
|
||||
}, {
|
||||
.base.cra_name = "cbcmac(aes)",
|
||||
.base.cra_driver_name = "cbcmac-aes-" MODE,
|
||||
.base.cra_priority = PRIO,
|
||||
.base.cra_flags = CRYPTO_ALG_TYPE_SHASH,
|
||||
.base.cra_blocksize = 1,
|
||||
.base.cra_ctxsize = sizeof(struct mac_tfm_ctx),
|
||||
.base.cra_module = THIS_MODULE,
|
||||
|
||||
.digestsize = AES_BLOCK_SIZE,
|
||||
.init = mac_init,
|
||||
.update = mac_update,
|
||||
.final = cbcmac_final,
|
||||
.setkey = cbcmac_setkey,
|
||||
.descsize = sizeof(struct mac_desc_ctx),
|
||||
} };
|
||||
|
||||
static struct simd_skcipher_alg *aes_simd_algs[ARRAY_SIZE(aes_algs)];
|
||||
|
||||
static void aes_exit(void)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(aes_simd_algs) && aes_simd_algs[i]; i++)
|
||||
simd_skcipher_free(aes_simd_algs[i]);
|
||||
for (i = 0; i < ARRAY_SIZE(aes_simd_algs); i++)
|
||||
if (aes_simd_algs[i])
|
||||
simd_skcipher_free(aes_simd_algs[i]);
|
||||
|
||||
crypto_unregister_shashes(mac_algs, ARRAY_SIZE(mac_algs));
|
||||
crypto_unregister_skciphers(aes_algs, ARRAY_SIZE(aes_algs));
|
||||
}
|
||||
|
||||
@ -369,7 +615,14 @@ static int __init aes_init(void)
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
err = crypto_register_shashes(mac_algs, ARRAY_SIZE(mac_algs));
|
||||
if (err)
|
||||
goto unregister_ciphers;
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(aes_algs); i++) {
|
||||
if (!(aes_algs[i].base.cra_flags & CRYPTO_ALG_INTERNAL))
|
||||
continue;
|
||||
|
||||
algname = aes_algs[i].base.cra_name + 2;
|
||||
drvname = aes_algs[i].base.cra_driver_name + 2;
|
||||
basename = aes_algs[i].base.cra_driver_name;
|
||||
@ -385,6 +638,8 @@ static int __init aes_init(void)
|
||||
|
||||
unregister_simds:
|
||||
aes_exit();
|
||||
unregister_ciphers:
|
||||
crypto_unregister_skciphers(aes_algs, ARRAY_SIZE(aes_algs));
|
||||
return err;
|
||||
}
|
||||
|
||||
@ -392,5 +647,7 @@ static int __init aes_init(void)
|
||||
module_cpu_feature_match(AES, aes_init);
|
||||
#else
|
||||
module_init(aes_init);
|
||||
EXPORT_SYMBOL(neon_aes_ecb_encrypt);
|
||||
EXPORT_SYMBOL(neon_aes_cbc_encrypt);
|
||||
#endif
|
||||
module_exit(aes_exit);
|
||||
|
@ -1,7 +1,7 @@
|
||||
/*
|
||||
* linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES
|
||||
*
|
||||
* Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
|
||||
* Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as
|
||||
@ -337,7 +337,7 @@ AES_ENTRY(aes_ctr_encrypt)
|
||||
|
||||
.Lctrcarrydone:
|
||||
subs w4, w4, #1
|
||||
bmi .Lctrhalfblock /* blocks < 0 means 1/2 block */
|
||||
bmi .Lctrtailblock /* blocks < 0 means tail block */
|
||||
ld1 {v3.16b}, [x1], #16
|
||||
eor v3.16b, v0.16b, v3.16b
|
||||
st1 {v3.16b}, [x0], #16
|
||||
@ -348,10 +348,8 @@ AES_ENTRY(aes_ctr_encrypt)
|
||||
FRAME_POP
|
||||
ret
|
||||
|
||||
.Lctrhalfblock:
|
||||
ld1 {v3.8b}, [x1]
|
||||
eor v3.8b, v0.8b, v3.8b
|
||||
st1 {v3.8b}, [x0]
|
||||
.Lctrtailblock:
|
||||
st1 {v0.16b}, [x0]
|
||||
FRAME_POP
|
||||
ret
|
||||
|
||||
@ -527,3 +525,30 @@ AES_ENTRY(aes_xts_decrypt)
|
||||
FRAME_POP
|
||||
ret
|
||||
AES_ENDPROC(aes_xts_decrypt)
|
||||
|
||||
/*
|
||||
* aes_mac_update(u8 const in[], u32 const rk[], int rounds,
|
||||
* int blocks, u8 dg[], int enc_before, int enc_after)
|
||||
*/
|
||||
AES_ENTRY(aes_mac_update)
|
||||
ld1 {v0.16b}, [x4] /* get dg */
|
||||
enc_prepare w2, x1, x7
|
||||
cbnz w5, .Lmacenc
|
||||
|
||||
.Lmacloop:
|
||||
cbz w3, .Lmacout
|
||||
ld1 {v1.16b}, [x0], #16 /* get next pt block */
|
||||
eor v0.16b, v0.16b, v1.16b /* ..and xor with dg */
|
||||
|
||||
subs w3, w3, #1
|
||||
csinv x5, x6, xzr, eq
|
||||
cbz w5, .Lmacout
|
||||
|
||||
.Lmacenc:
|
||||
encrypt_block v0, w2, x1, x7, w8
|
||||
b .Lmacloop
|
||||
|
||||
.Lmacout:
|
||||
st1 {v0.16b}, [x4] /* return dg */
|
||||
ret
|
||||
AES_ENDPROC(aes_mac_update)
|
||||
|
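
aes_mac_update() above chains a CBC-MAC digest over whole input blocks, with enc_before/enc_after selecting whether the digest is encrypted before the first block and after the last one. A rough plain C model of that control flow, using an invented encrypt_block_stub() in place of the real cipher (illustrative only, not kernel code):

#include <stdint.h>

#define AES_BLOCK_SIZE 16

static void encrypt_block_stub(uint8_t blk[AES_BLOCK_SIZE])
{
	/* stand-in for one AES encryption of blk in place */
	for (int i = 0; i < AES_BLOCK_SIZE; i++)
		blk[i] ^= 0xA5;
}

/* chain dg over 'blocks' full input blocks; enc_before/enc_after mirror
 * the flags taken by aes_mac_update() */
static void mac_update_model(const uint8_t *in, int blocks,
			     uint8_t dg[AES_BLOCK_SIZE],
			     int enc_before, int enc_after)
{
	if (enc_before)
		encrypt_block_stub(dg);

	while (blocks--) {
		for (int i = 0; i < AES_BLOCK_SIZE; i++)
			dg[i] ^= in[i];			/* dg ^= next block */
		in += AES_BLOCK_SIZE;
		if (blocks || enc_after)
			encrypt_block_stub(dg);		/* dg = E(dg) */
	}
}
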
@ -1,7 +1,7 @@
|
||||
/*
|
||||
* linux/arch/arm64/crypto/aes-neon.S - AES cipher for ARMv8 NEON
|
||||
*
|
||||
* Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
|
||||
* Copyright (C) 2013 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as
|
||||
@ -17,17 +17,25 @@
|
||||
/* multiply by polynomial 'x' in GF(2^8) */
|
||||
.macro mul_by_x, out, in, temp, const
|
||||
sshr \temp, \in, #7
|
||||
add \out, \in, \in
|
||||
shl \out, \in, #1
|
||||
and \temp, \temp, \const
|
||||
eor \out, \out, \temp
|
||||
.endm
|
||||
|
||||
/* multiply by polynomial 'x^2' in GF(2^8) */
|
||||
.macro mul_by_x2, out, in, temp, const
|
||||
ushr \temp, \in, #6
|
||||
shl \out, \in, #2
|
||||
pmul \temp, \temp, \const
|
||||
eor \out, \out, \temp
|
||||
.endm
|
||||
|
||||
/* preload the entire Sbox */
|
||||
.macro prepare, sbox, shiftrows, temp
|
||||
adr \temp, \sbox
|
||||
movi v12.16b, #0x40
|
||||
movi v12.16b, #0x1b
|
||||
ldr q13, \shiftrows
|
||||
movi v14.16b, #0x1b
|
||||
ldr q14, .Lror32by8
|
||||
ld1 {v16.16b-v19.16b}, [\temp], #64
|
||||
ld1 {v20.16b-v23.16b}, [\temp], #64
|
||||
ld1 {v24.16b-v27.16b}, [\temp], #64
|
||||
@ -50,37 +58,31 @@
|
||||
|
||||
/* apply SubBytes transformation using the preloaded Sbox */
|
||||
.macro sub_bytes, in
|
||||
sub v9.16b, \in\().16b, v12.16b
|
||||
sub v9.16b, \in\().16b, v15.16b
|
||||
tbl \in\().16b, {v16.16b-v19.16b}, \in\().16b
|
||||
sub v10.16b, v9.16b, v12.16b
|
||||
sub v10.16b, v9.16b, v15.16b
|
||||
tbx \in\().16b, {v20.16b-v23.16b}, v9.16b
|
||||
sub v11.16b, v10.16b, v12.16b
|
||||
sub v11.16b, v10.16b, v15.16b
|
||||
tbx \in\().16b, {v24.16b-v27.16b}, v10.16b
|
||||
tbx \in\().16b, {v28.16b-v31.16b}, v11.16b
|
||||
.endm
|
||||
|
||||
/* apply MixColumns transformation */
|
||||
.macro mix_columns, in
|
||||
mul_by_x v10.16b, \in\().16b, v9.16b, v14.16b
|
||||
rev32 v8.8h, \in\().8h
|
||||
eor \in\().16b, v10.16b, \in\().16b
|
||||
shl v9.4s, v8.4s, #24
|
||||
shl v11.4s, \in\().4s, #24
|
||||
sri v9.4s, v8.4s, #8
|
||||
sri v11.4s, \in\().4s, #8
|
||||
eor v9.16b, v9.16b, v8.16b
|
||||
eor v10.16b, v10.16b, v9.16b
|
||||
eor \in\().16b, v10.16b, v11.16b
|
||||
.endm
|
||||
|
||||
.macro mix_columns, in, enc
|
||||
.if \enc == 0
|
||||
/* Inverse MixColumns: pre-multiply by { 5, 0, 4, 0 } */
|
||||
.macro inv_mix_columns, in
|
||||
mul_by_x v11.16b, \in\().16b, v10.16b, v14.16b
|
||||
mul_by_x v11.16b, v11.16b, v10.16b, v14.16b
|
||||
eor \in\().16b, \in\().16b, v11.16b
|
||||
rev32 v11.8h, v11.8h
|
||||
eor \in\().16b, \in\().16b, v11.16b
|
||||
mix_columns \in
|
||||
mul_by_x2 v8.16b, \in\().16b, v9.16b, v12.16b
|
||||
eor \in\().16b, \in\().16b, v8.16b
|
||||
rev32 v8.8h, v8.8h
|
||||
eor \in\().16b, \in\().16b, v8.16b
|
||||
.endif
|
||||
|
||||
mul_by_x v9.16b, \in\().16b, v8.16b, v12.16b
|
||||
rev32 v8.8h, \in\().8h
|
||||
eor v8.16b, v8.16b, v9.16b
|
||||
eor \in\().16b, \in\().16b, v8.16b
|
||||
tbl \in\().16b, {\in\().16b}, v14.16b
|
||||
eor \in\().16b, \in\().16b, v8.16b
|
||||
.endm
|
||||
|
||||
.macro do_block, enc, in, rounds, rk, rkp, i
|
||||
@ -88,16 +90,13 @@
|
||||
add \rkp, \rk, #16
|
||||
mov \i, \rounds
|
||||
1111: eor \in\().16b, \in\().16b, v15.16b /* ^round key */
|
||||
movi v15.16b, #0x40
|
||||
tbl \in\().16b, {\in\().16b}, v13.16b /* ShiftRows */
|
||||
sub_bytes \in
|
||||
ld1 {v15.4s}, [\rkp], #16
|
||||
subs \i, \i, #1
|
||||
ld1 {v15.4s}, [\rkp], #16
|
||||
beq 2222f
|
||||
.if \enc == 1
|
||||
mix_columns \in
|
||||
.else
|
||||
inv_mix_columns \in
|
||||
.endif
|
||||
mix_columns \in, \enc
|
||||
b 1111b
|
||||
2222: eor \in\().16b, \in\().16b, v15.16b /* ^round key */
|
||||
.endm
|
||||
@ -116,139 +115,114 @@
|
||||
*/
|
||||
|
||||
.macro sub_bytes_2x, in0, in1
|
||||
sub v8.16b, \in0\().16b, v12.16b
|
||||
sub v9.16b, \in1\().16b, v12.16b
|
||||
sub v8.16b, \in0\().16b, v15.16b
|
||||
tbl \in0\().16b, {v16.16b-v19.16b}, \in0\().16b
|
||||
sub v9.16b, \in1\().16b, v15.16b
|
||||
tbl \in1\().16b, {v16.16b-v19.16b}, \in1\().16b
|
||||
sub v10.16b, v8.16b, v12.16b
|
||||
sub v11.16b, v9.16b, v12.16b
|
||||
sub v10.16b, v8.16b, v15.16b
|
||||
tbx \in0\().16b, {v20.16b-v23.16b}, v8.16b
|
||||
sub v11.16b, v9.16b, v15.16b
|
||||
tbx \in1\().16b, {v20.16b-v23.16b}, v9.16b
|
||||
sub v8.16b, v10.16b, v12.16b
|
||||
sub v9.16b, v11.16b, v12.16b
|
||||
sub v8.16b, v10.16b, v15.16b
|
||||
tbx \in0\().16b, {v24.16b-v27.16b}, v10.16b
|
||||
sub v9.16b, v11.16b, v15.16b
|
||||
tbx \in1\().16b, {v24.16b-v27.16b}, v11.16b
|
||||
tbx \in0\().16b, {v28.16b-v31.16b}, v8.16b
|
||||
tbx \in1\().16b, {v28.16b-v31.16b}, v9.16b
|
||||
.endm
|
||||
|
||||
.macro sub_bytes_4x, in0, in1, in2, in3
|
||||
sub v8.16b, \in0\().16b, v12.16b
|
||||
sub v8.16b, \in0\().16b, v15.16b
|
||||
tbl \in0\().16b, {v16.16b-v19.16b}, \in0\().16b
|
||||
sub v9.16b, \in1\().16b, v12.16b
|
||||
sub v9.16b, \in1\().16b, v15.16b
|
||||
tbl \in1\().16b, {v16.16b-v19.16b}, \in1\().16b
|
||||
sub v10.16b, \in2\().16b, v12.16b
|
||||
sub v10.16b, \in2\().16b, v15.16b
|
||||
tbl \in2\().16b, {v16.16b-v19.16b}, \in2\().16b
|
||||
sub v11.16b, \in3\().16b, v12.16b
|
||||
sub v11.16b, \in3\().16b, v15.16b
|
||||
tbl \in3\().16b, {v16.16b-v19.16b}, \in3\().16b
|
||||
tbx \in0\().16b, {v20.16b-v23.16b}, v8.16b
|
||||
tbx \in1\().16b, {v20.16b-v23.16b}, v9.16b
|
||||
sub v8.16b, v8.16b, v12.16b
|
||||
sub v8.16b, v8.16b, v15.16b
|
||||
tbx \in2\().16b, {v20.16b-v23.16b}, v10.16b
|
||||
sub v9.16b, v9.16b, v12.16b
|
||||
sub v9.16b, v9.16b, v15.16b
|
||||
tbx \in3\().16b, {v20.16b-v23.16b}, v11.16b
|
||||
sub v10.16b, v10.16b, v12.16b
|
||||
sub v10.16b, v10.16b, v15.16b
|
||||
tbx \in0\().16b, {v24.16b-v27.16b}, v8.16b
|
||||
sub v11.16b, v11.16b, v12.16b
|
||||
sub v11.16b, v11.16b, v15.16b
|
||||
tbx \in1\().16b, {v24.16b-v27.16b}, v9.16b
|
||||
sub v8.16b, v8.16b, v12.16b
|
||||
sub v8.16b, v8.16b, v15.16b
|
||||
tbx \in2\().16b, {v24.16b-v27.16b}, v10.16b
|
||||
sub v9.16b, v9.16b, v12.16b
|
||||
sub v9.16b, v9.16b, v15.16b
|
||||
tbx \in3\().16b, {v24.16b-v27.16b}, v11.16b
|
||||
sub v10.16b, v10.16b, v12.16b
|
||||
sub v10.16b, v10.16b, v15.16b
|
||||
tbx \in0\().16b, {v28.16b-v31.16b}, v8.16b
|
||||
sub v11.16b, v11.16b, v12.16b
|
||||
sub v11.16b, v11.16b, v15.16b
|
||||
tbx \in1\().16b, {v28.16b-v31.16b}, v9.16b
|
||||
tbx \in2\().16b, {v28.16b-v31.16b}, v10.16b
|
||||
tbx \in3\().16b, {v28.16b-v31.16b}, v11.16b
|
||||
.endm
|
||||
|
||||
.macro mul_by_x_2x, out0, out1, in0, in1, tmp0, tmp1, const
|
||||
sshr \tmp0\().16b, \in0\().16b, #7
|
||||
add \out0\().16b, \in0\().16b, \in0\().16b
|
||||
sshr \tmp1\().16b, \in1\().16b, #7
|
||||
sshr \tmp0\().16b, \in0\().16b, #7
|
||||
shl \out0\().16b, \in0\().16b, #1
|
||||
sshr \tmp1\().16b, \in1\().16b, #7
|
||||
and \tmp0\().16b, \tmp0\().16b, \const\().16b
|
||||
add \out1\().16b, \in1\().16b, \in1\().16b
|
||||
shl \out1\().16b, \in1\().16b, #1
|
||||
and \tmp1\().16b, \tmp1\().16b, \const\().16b
|
||||
eor \out0\().16b, \out0\().16b, \tmp0\().16b
|
||||
eor \out1\().16b, \out1\().16b, \tmp1\().16b
|
||||
.endm
|
||||
|
||||
.macro mix_columns_2x, in0, in1
|
||||
mul_by_x_2x v8, v9, \in0, \in1, v10, v11, v14
|
||||
.macro mul_by_x2_2x, out0, out1, in0, in1, tmp0, tmp1, const
|
||||
ushr \tmp0\().16b, \in0\().16b, #6
|
||||
shl \out0\().16b, \in0\().16b, #2
|
||||
ushr \tmp1\().16b, \in1\().16b, #6
|
||||
pmul \tmp0\().16b, \tmp0\().16b, \const\().16b
|
||||
shl \out1\().16b, \in1\().16b, #2
|
||||
pmul \tmp1\().16b, \tmp1\().16b, \const\().16b
|
||||
eor \out0\().16b, \out0\().16b, \tmp0\().16b
|
||||
eor \out1\().16b, \out1\().16b, \tmp1\().16b
|
||||
.endm
|
||||
|
||||
.macro mix_columns_2x, in0, in1, enc
|
||||
.if \enc == 0
|
||||
/* Inverse MixColumns: pre-multiply by { 5, 0, 4, 0 } */
|
||||
mul_by_x2_2x v8, v9, \in0, \in1, v10, v11, v12
|
||||
eor \in0\().16b, \in0\().16b, v8.16b
|
||||
rev32 v8.8h, v8.8h
|
||||
eor \in1\().16b, \in1\().16b, v9.16b
|
||||
rev32 v9.8h, v9.8h
|
||||
eor \in0\().16b, \in0\().16b, v8.16b
|
||||
eor \in1\().16b, \in1\().16b, v9.16b
|
||||
.endif
|
||||
|
||||
mul_by_x_2x v8, v9, \in0, \in1, v10, v11, v12
|
||||
rev32 v10.8h, \in0\().8h
|
||||
rev32 v11.8h, \in1\().8h
|
||||
eor \in0\().16b, v8.16b, \in0\().16b
|
||||
eor \in1\().16b, v9.16b, \in1\().16b
|
||||
shl v12.4s, v10.4s, #24
|
||||
shl v13.4s, v11.4s, #24
|
||||
eor v8.16b, v8.16b, v10.16b
|
||||
sri v12.4s, v10.4s, #8
|
||||
shl v10.4s, \in0\().4s, #24
|
||||
eor v9.16b, v9.16b, v11.16b
|
||||
sri v13.4s, v11.4s, #8
|
||||
shl v11.4s, \in1\().4s, #24
|
||||
sri v10.4s, \in0\().4s, #8
|
||||
eor \in0\().16b, v8.16b, v12.16b
|
||||
sri v11.4s, \in1\().4s, #8
|
||||
eor \in1\().16b, v9.16b, v13.16b
|
||||
eor \in0\().16b, v10.16b, \in0\().16b
|
||||
eor \in1\().16b, v11.16b, \in1\().16b
|
||||
eor v10.16b, v10.16b, v8.16b
|
||||
eor v11.16b, v11.16b, v9.16b
|
||||
eor \in0\().16b, \in0\().16b, v10.16b
|
||||
eor \in1\().16b, \in1\().16b, v11.16b
|
||||
tbl \in0\().16b, {\in0\().16b}, v14.16b
|
||||
tbl \in1\().16b, {\in1\().16b}, v14.16b
|
||||
eor \in0\().16b, \in0\().16b, v10.16b
|
||||
eor \in1\().16b, \in1\().16b, v11.16b
|
||||
.endm
|
||||
|
||||
.macro inv_mix_cols_2x, in0, in1
|
||||
mul_by_x_2x v8, v9, \in0, \in1, v10, v11, v14
|
||||
mul_by_x_2x v8, v9, v8, v9, v10, v11, v14
|
||||
eor \in0\().16b, \in0\().16b, v8.16b
|
||||
eor \in1\().16b, \in1\().16b, v9.16b
|
||||
rev32 v8.8h, v8.8h
|
||||
rev32 v9.8h, v9.8h
|
||||
eor \in0\().16b, \in0\().16b, v8.16b
|
||||
eor \in1\().16b, \in1\().16b, v9.16b
|
||||
mix_columns_2x \in0, \in1
|
||||
.endm
|
||||
|
||||
.macro inv_mix_cols_4x, in0, in1, in2, in3
|
||||
mul_by_x_2x v8, v9, \in0, \in1, v10, v11, v14
|
||||
mul_by_x_2x v10, v11, \in2, \in3, v12, v13, v14
|
||||
mul_by_x_2x v8, v9, v8, v9, v12, v13, v14
|
||||
mul_by_x_2x v10, v11, v10, v11, v12, v13, v14
|
||||
eor \in0\().16b, \in0\().16b, v8.16b
|
||||
eor \in1\().16b, \in1\().16b, v9.16b
|
||||
eor \in2\().16b, \in2\().16b, v10.16b
|
||||
eor \in3\().16b, \in3\().16b, v11.16b
|
||||
rev32 v8.8h, v8.8h
|
||||
rev32 v9.8h, v9.8h
|
||||
rev32 v10.8h, v10.8h
|
||||
rev32 v11.8h, v11.8h
|
||||
eor \in0\().16b, \in0\().16b, v8.16b
|
||||
eor \in1\().16b, \in1\().16b, v9.16b
|
||||
eor \in2\().16b, \in2\().16b, v10.16b
|
||||
eor \in3\().16b, \in3\().16b, v11.16b
|
||||
mix_columns_2x \in0, \in1
|
||||
mix_columns_2x \in2, \in3
|
||||
.endm
|
||||
|
||||
.macro do_block_2x, enc, in0, in1 rounds, rk, rkp, i
|
||||
.macro do_block_2x, enc, in0, in1, rounds, rk, rkp, i
|
||||
ld1 {v15.4s}, [\rk]
|
||||
add \rkp, \rk, #16
|
||||
mov \i, \rounds
|
||||
1111: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */
|
||||
eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */
|
||||
sub_bytes_2x \in0, \in1
|
||||
movi v15.16b, #0x40
|
||||
tbl \in0\().16b, {\in0\().16b}, v13.16b /* ShiftRows */
|
||||
tbl \in1\().16b, {\in1\().16b}, v13.16b /* ShiftRows */
|
||||
ld1 {v15.4s}, [\rkp], #16
|
||||
sub_bytes_2x \in0, \in1
|
||||
subs \i, \i, #1
|
||||
ld1 {v15.4s}, [\rkp], #16
|
||||
beq 2222f
|
||||
.if \enc == 1
|
||||
mix_columns_2x \in0, \in1
|
||||
ldr q13, .LForward_ShiftRows
|
||||
.else
|
||||
inv_mix_cols_2x \in0, \in1
|
||||
ldr q13, .LReverse_ShiftRows
|
||||
.endif
|
||||
movi v12.16b, #0x40
|
||||
mix_columns_2x \in0, \in1, \enc
|
||||
b 1111b
|
||||
2222: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */
|
||||
eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */
|
||||
@ -262,23 +236,17 @@
|
||||
eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */
|
||||
eor \in2\().16b, \in2\().16b, v15.16b /* ^round key */
|
||||
eor \in3\().16b, \in3\().16b, v15.16b /* ^round key */
|
||||
sub_bytes_4x \in0, \in1, \in2, \in3
|
||||
movi v15.16b, #0x40
|
||||
tbl \in0\().16b, {\in0\().16b}, v13.16b /* ShiftRows */
|
||||
tbl \in1\().16b, {\in1\().16b}, v13.16b /* ShiftRows */
|
||||
tbl \in2\().16b, {\in2\().16b}, v13.16b /* ShiftRows */
|
||||
tbl \in3\().16b, {\in3\().16b}, v13.16b /* ShiftRows */
|
||||
ld1 {v15.4s}, [\rkp], #16
|
||||
sub_bytes_4x \in0, \in1, \in2, \in3
|
||||
subs \i, \i, #1
|
||||
ld1 {v15.4s}, [\rkp], #16
|
||||
beq 2222f
|
||||
.if \enc == 1
|
||||
mix_columns_2x \in0, \in1
|
||||
mix_columns_2x \in2, \in3
|
||||
ldr q13, .LForward_ShiftRows
|
||||
.else
|
||||
inv_mix_cols_4x \in0, \in1, \in2, \in3
|
||||
ldr q13, .LReverse_ShiftRows
|
||||
.endif
|
||||
movi v12.16b, #0x40
|
||||
mix_columns_2x \in0, \in1, \enc
|
||||
mix_columns_2x \in2, \in3, \enc
|
||||
b 1111b
|
||||
2222: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */
|
||||
eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */
|
||||
@ -305,19 +273,7 @@
|
||||
#include "aes-modes.S"
|
||||
|
||||
.text
|
||||
.align 4
|
||||
.LForward_ShiftRows:
|
||||
CPU_LE( .byte 0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3 )
|
||||
CPU_LE( .byte 0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb )
|
||||
CPU_BE( .byte 0xb, 0x6, 0x1, 0xc, 0x7, 0x2, 0xd, 0x8 )
|
||||
CPU_BE( .byte 0x3, 0xe, 0x9, 0x4, 0xf, 0xa, 0x5, 0x0 )
|
||||
|
||||
.LReverse_ShiftRows:
|
||||
CPU_LE( .byte 0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb )
|
||||
CPU_LE( .byte 0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3 )
|
||||
CPU_BE( .byte 0x3, 0x6, 0x9, 0xc, 0xf, 0x2, 0x5, 0x8 )
|
||||
CPU_BE( .byte 0xb, 0xe, 0x1, 0x4, 0x7, 0xa, 0xd, 0x0 )
|
||||
|
||||
.align 6
|
||||
.LForward_Sbox:
|
||||
.byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
|
||||
.byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
|
||||
@ -385,3 +341,12 @@ CPU_BE( .byte 0xb, 0xe, 0x1, 0x4, 0x7, 0xa, 0xd, 0x0 )
|
||||
.byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
|
||||
.byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
|
||||
.byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
|
||||
|
||||
.LForward_ShiftRows:
|
||||
.octa 0x0b06010c07020d08030e09040f0a0500
|
||||
|
||||
.LReverse_ShiftRows:
|
||||
.octa 0x0306090c0f0205080b0e0104070a0d00
|
||||
|
||||
.Lror32by8:
|
||||
.octa 0x0c0f0e0d080b0a090407060500030201
|
||||
|
972	arch/arm64/crypto/aes-neonbs-core.S	Normal file
@ -0,0 +1,972 @@
|
||||
/*
|
||||
* Bit sliced AES using NEON instructions
|
||||
*
|
||||
* Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as
|
||||
* published by the Free Software Foundation.
|
||||
*/
|
||||
|
||||
/*
|
||||
* The algorithm implemented here is described in detail by the paper
|
||||
* 'Faster and Timing-Attack Resistant AES-GCM' by Emilia Kaesper and
|
||||
* Peter Schwabe (https://eprint.iacr.org/2009/129.pdf)
|
||||
*
|
||||
* This implementation is based primarily on the OpenSSL implementation
|
||||
* for 32-bit ARM written by Andy Polyakov <appro@openssl.org>
|
||||
*/
|
||||
|
||||
#include <linux/linkage.h>
|
||||
#include <asm/assembler.h>
|
||||
|
||||
.text
|
||||
|
||||
rounds .req x11
|
||||
bskey .req x12
|
||||
|
||||
.macro in_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
|
||||
eor \b2, \b2, \b1
|
||||
eor \b5, \b5, \b6
|
||||
eor \b3, \b3, \b0
|
||||
eor \b6, \b6, \b2
|
||||
eor \b5, \b5, \b0
|
||||
eor \b6, \b6, \b3
|
||||
eor \b3, \b3, \b7
|
||||
eor \b7, \b7, \b5
|
||||
eor \b3, \b3, \b4
|
||||
eor \b4, \b4, \b5
|
||||
eor \b2, \b2, \b7
|
||||
eor \b3, \b3, \b1
|
||||
eor \b1, \b1, \b5
|
||||
.endm
|
||||
|
||||
.macro out_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
|
||||
eor \b0, \b0, \b6
|
||||
eor \b1, \b1, \b4
|
||||
eor \b4, \b4, \b6
|
||||
eor \b2, \b2, \b0
|
||||
eor \b6, \b6, \b1
|
||||
eor \b1, \b1, \b5
|
||||
eor \b5, \b5, \b3
|
||||
eor \b3, \b3, \b7
|
||||
eor \b7, \b7, \b5
|
||||
eor \b2, \b2, \b5
|
||||
eor \b4, \b4, \b7
|
||||
.endm
|
||||
|
||||
.macro inv_in_bs_ch, b6, b1, b2, b4, b7, b0, b3, b5
|
||||
eor \b1, \b1, \b7
|
||||
eor \b4, \b4, \b7
|
||||
eor \b7, \b7, \b5
|
||||
eor \b1, \b1, \b3
|
||||
eor \b2, \b2, \b5
|
||||
eor \b3, \b3, \b7
|
||||
eor \b6, \b6, \b1
|
||||
eor \b2, \b2, \b0
|
||||
eor \b5, \b5, \b3
|
||||
eor \b4, \b4, \b6
|
||||
eor \b0, \b0, \b6
|
||||
eor \b1, \b1, \b4
|
||||
.endm
|
||||
|
||||
.macro inv_out_bs_ch, b6, b5, b0, b3, b7, b1, b4, b2
|
||||
eor \b1, \b1, \b5
|
||||
eor \b2, \b2, \b7
|
||||
eor \b3, \b3, \b1
|
||||
eor \b4, \b4, \b5
|
||||
eor \b7, \b7, \b5
|
||||
eor \b3, \b3, \b4
|
||||
eor \b5, \b5, \b0
|
||||
eor \b3, \b3, \b7
|
||||
eor \b6, \b6, \b2
|
||||
eor \b2, \b2, \b1
|
||||
eor \b6, \b6, \b3
|
||||
eor \b3, \b3, \b0
|
||||
eor \b5, \b5, \b6
|
||||
.endm
|
||||
|
||||
.macro mul_gf4, x0, x1, y0, y1, t0, t1
|
||||
eor \t0, \y0, \y1
|
||||
and \t0, \t0, \x0
|
||||
eor \x0, \x0, \x1
|
||||
and \t1, \x1, \y0
|
||||
and \x0, \x0, \y1
|
||||
eor \x1, \t1, \t0
|
||||
eor \x0, \x0, \t1
|
||||
.endm
|
||||
|
||||
.macro mul_gf4_n_gf4, x0, x1, y0, y1, t0, x2, x3, y2, y3, t1
|
||||
eor \t0, \y0, \y1
|
||||
eor \t1, \y2, \y3
|
||||
and \t0, \t0, \x0
|
||||
and \t1, \t1, \x2
|
||||
eor \x0, \x0, \x1
|
||||
eor \x2, \x2, \x3
|
||||
and \x1, \x1, \y0
|
||||
and \x3, \x3, \y2
|
||||
and \x0, \x0, \y1
|
||||
and \x2, \x2, \y3
|
||||
eor \x1, \x1, \x0
|
||||
eor \x2, \x2, \x3
|
||||
eor \x0, \x0, \t0
|
||||
eor \x3, \x3, \t1
|
||||
.endm
|
||||
|
||||
.macro mul_gf16_2, x0, x1, x2, x3, x4, x5, x6, x7, \
|
||||
y0, y1, y2, y3, t0, t1, t2, t3
|
||||
eor \t0, \x0, \x2
|
||||
eor \t1, \x1, \x3
|
||||
mul_gf4 \x0, \x1, \y0, \y1, \t2, \t3
|
||||
eor \y0, \y0, \y2
|
||||
eor \y1, \y1, \y3
|
||||
mul_gf4_n_gf4 \t0, \t1, \y0, \y1, \t3, \x2, \x3, \y2, \y3, \t2
|
||||
eor \x0, \x0, \t0
|
||||
eor \x2, \x2, \t0
|
||||
eor \x1, \x1, \t1
|
||||
eor \x3, \x3, \t1
|
||||
eor \t0, \x4, \x6
|
||||
eor \t1, \x5, \x7
|
||||
mul_gf4_n_gf4 \t0, \t1, \y0, \y1, \t3, \x6, \x7, \y2, \y3, \t2
|
||||
eor \y0, \y0, \y2
|
||||
eor \y1, \y1, \y3
|
||||
mul_gf4 \x4, \x5, \y0, \y1, \t2, \t3
|
||||
eor \x4, \x4, \t0
|
||||
eor \x6, \x6, \t0
|
||||
eor \x5, \x5, \t1
|
||||
eor \x7, \x7, \t1
|
||||
.endm
|
||||
|
||||
.macro inv_gf256, x0, x1, x2, x3, x4, x5, x6, x7, \
|
||||
t0, t1, t2, t3, s0, s1, s2, s3
|
||||
eor \t3, \x4, \x6
|
||||
eor \t0, \x5, \x7
|
||||
eor \t1, \x1, \x3
|
||||
eor \s1, \x7, \x6
|
||||
eor \s0, \x0, \x2
|
||||
eor \s3, \t3, \t0
|
||||
orr \t2, \t0, \t1
|
||||
and \s2, \t3, \s0
|
||||
orr \t3, \t3, \s0
|
||||
eor \s0, \s0, \t1
|
||||
and \t0, \t0, \t1
|
||||
eor \t1, \x3, \x2
|
||||
and \s3, \s3, \s0
|
||||
and \s1, \s1, \t1
|
||||
eor \t1, \x4, \x5
|
||||
eor \s0, \x1, \x0
|
||||
eor \t3, \t3, \s1
|
||||
eor \t2, \t2, \s1
|
||||
and \s1, \t1, \s0
|
||||
orr \t1, \t1, \s0
|
||||
eor \t3, \t3, \s3
|
||||
eor \t0, \t0, \s1
|
||||
eor \t2, \t2, \s2
|
||||
eor \t1, \t1, \s3
|
||||
eor \t0, \t0, \s2
|
||||
and \s0, \x7, \x3
|
||||
eor \t1, \t1, \s2
|
||||
and \s1, \x6, \x2
|
||||
and \s2, \x5, \x1
|
||||
orr \s3, \x4, \x0
|
||||
eor \t3, \t3, \s0
|
||||
eor \t1, \t1, \s2
|
||||
eor \s0, \t0, \s3
|
||||
eor \t2, \t2, \s1
|
||||
and \s2, \t3, \t1
|
||||
eor \s1, \t2, \s2
|
||||
eor \s3, \s0, \s2
|
||||
bsl \s1, \t1, \s0
|
||||
not \t0, \s0
|
||||
bsl \s0, \s1, \s3
|
||||
bsl \t0, \s1, \s3
|
||||
bsl \s3, \t3, \t2
|
||||
eor \t3, \t3, \t2
|
||||
and \s2, \s0, \s3
|
||||
eor \t1, \t1, \t0
|
||||
eor \s2, \s2, \t3
|
||||
mul_gf16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
|
||||
\s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
|
||||
.endm
|
||||
|
||||
.macro sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
|
||||
t0, t1, t2, t3, s0, s1, s2, s3
|
||||
in_bs_ch \b0\().16b, \b1\().16b, \b2\().16b, \b3\().16b, \
|
||||
\b4\().16b, \b5\().16b, \b6\().16b, \b7\().16b
|
||||
inv_gf256 \b6\().16b, \b5\().16b, \b0\().16b, \b3\().16b, \
|
||||
\b7\().16b, \b1\().16b, \b4\().16b, \b2\().16b, \
|
||||
\t0\().16b, \t1\().16b, \t2\().16b, \t3\().16b, \
|
||||
\s0\().16b, \s1\().16b, \s2\().16b, \s3\().16b
|
||||
out_bs_ch \b7\().16b, \b1\().16b, \b4\().16b, \b2\().16b, \
|
||||
\b6\().16b, \b5\().16b, \b0\().16b, \b3\().16b
|
||||
.endm
|
||||
|
||||
.macro inv_sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
|
||||
t0, t1, t2, t3, s0, s1, s2, s3
|
||||
inv_in_bs_ch \b0\().16b, \b1\().16b, \b2\().16b, \b3\().16b, \
|
||||
\b4\().16b, \b5\().16b, \b6\().16b, \b7\().16b
|
||||
inv_gf256 \b5\().16b, \b1\().16b, \b2\().16b, \b6\().16b, \
|
||||
\b3\().16b, \b7\().16b, \b0\().16b, \b4\().16b, \
|
||||
\t0\().16b, \t1\().16b, \t2\().16b, \t3\().16b, \
|
||||
\s0\().16b, \s1\().16b, \s2\().16b, \s3\().16b
|
||||
inv_out_bs_ch \b3\().16b, \b7\().16b, \b0\().16b, \b4\().16b, \
|
||||
\b5\().16b, \b1\().16b, \b2\().16b, \b6\().16b
|
||||
.endm
|
||||
|
||||
.macro enc_next_rk
|
||||
ldp q16, q17, [bskey], #128
|
||||
ldp q18, q19, [bskey, #-96]
|
||||
ldp q20, q21, [bskey, #-64]
|
||||
ldp q22, q23, [bskey, #-32]
|
||||
.endm
|
||||
|
||||
.macro dec_next_rk
|
||||
ldp q16, q17, [bskey, #-128]!
|
||||
ldp q18, q19, [bskey, #32]
|
||||
ldp q20, q21, [bskey, #64]
|
||||
ldp q22, q23, [bskey, #96]
|
||||
.endm
|
||||
|
||||
.macro add_round_key, x0, x1, x2, x3, x4, x5, x6, x7
|
||||
eor \x0\().16b, \x0\().16b, v16.16b
|
||||
eor \x1\().16b, \x1\().16b, v17.16b
|
||||
eor \x2\().16b, \x2\().16b, v18.16b
|
||||
eor \x3\().16b, \x3\().16b, v19.16b
|
||||
eor \x4\().16b, \x4\().16b, v20.16b
|
||||
eor \x5\().16b, \x5\().16b, v21.16b
|
||||
eor \x6\().16b, \x6\().16b, v22.16b
|
||||
eor \x7\().16b, \x7\().16b, v23.16b
|
||||
.endm
|
||||
|
||||
.macro shift_rows, x0, x1, x2, x3, x4, x5, x6, x7, mask
|
||||
tbl \x0\().16b, {\x0\().16b}, \mask\().16b
|
||||
tbl \x1\().16b, {\x1\().16b}, \mask\().16b
|
||||
tbl \x2\().16b, {\x2\().16b}, \mask\().16b
|
||||
tbl \x3\().16b, {\x3\().16b}, \mask\().16b
|
||||
tbl \x4\().16b, {\x4\().16b}, \mask\().16b
|
||||
tbl \x5\().16b, {\x5\().16b}, \mask\().16b
|
||||
tbl \x6\().16b, {\x6\().16b}, \mask\().16b
|
||||
tbl \x7\().16b, {\x7\().16b}, \mask\().16b
|
||||
.endm
|
||||
|
||||
.macro mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
|
||||
t0, t1, t2, t3, t4, t5, t6, t7, inv
|
||||
ext \t0\().16b, \x0\().16b, \x0\().16b, #12
|
||||
ext \t1\().16b, \x1\().16b, \x1\().16b, #12
|
||||
eor \x0\().16b, \x0\().16b, \t0\().16b
|
||||
ext \t2\().16b, \x2\().16b, \x2\().16b, #12
|
||||
eor \x1\().16b, \x1\().16b, \t1\().16b
|
||||
ext \t3\().16b, \x3\().16b, \x3\().16b, #12
|
||||
eor \x2\().16b, \x2\().16b, \t2\().16b
|
||||
ext \t4\().16b, \x4\().16b, \x4\().16b, #12
|
||||
eor \x3\().16b, \x3\().16b, \t3\().16b
|
||||
ext \t5\().16b, \x5\().16b, \x5\().16b, #12
|
||||
eor \x4\().16b, \x4\().16b, \t4\().16b
|
||||
ext \t6\().16b, \x6\().16b, \x6\().16b, #12
|
||||
eor \x5\().16b, \x5\().16b, \t5\().16b
|
||||
ext \t7\().16b, \x7\().16b, \x7\().16b, #12
|
||||
eor \x6\().16b, \x6\().16b, \t6\().16b
|
||||
eor \t1\().16b, \t1\().16b, \x0\().16b
|
||||
eor \x7\().16b, \x7\().16b, \t7\().16b
|
||||
ext \x0\().16b, \x0\().16b, \x0\().16b, #8
|
||||
eor \t2\().16b, \t2\().16b, \x1\().16b
|
||||
eor \t0\().16b, \t0\().16b, \x7\().16b
|
||||
eor \t1\().16b, \t1\().16b, \x7\().16b
|
||||
ext \x1\().16b, \x1\().16b, \x1\().16b, #8
|
||||
eor \t5\().16b, \t5\().16b, \x4\().16b
|
||||
eor \x0\().16b, \x0\().16b, \t0\().16b
|
||||
eor \t6\().16b, \t6\().16b, \x5\().16b
|
||||
eor \x1\().16b, \x1\().16b, \t1\().16b
|
||||
ext \t0\().16b, \x4\().16b, \x4\().16b, #8
|
||||
eor \t4\().16b, \t4\().16b, \x3\().16b
|
||||
ext \t1\().16b, \x5\().16b, \x5\().16b, #8
|
||||
eor \t7\().16b, \t7\().16b, \x6\().16b
|
||||
ext \x4\().16b, \x3\().16b, \x3\().16b, #8
|
||||
eor \t3\().16b, \t3\().16b, \x2\().16b
|
||||
ext \x5\().16b, \x7\().16b, \x7\().16b, #8
|
||||
eor \t4\().16b, \t4\().16b, \x7\().16b
|
||||
ext \x3\().16b, \x6\().16b, \x6\().16b, #8
|
||||
eor \t3\().16b, \t3\().16b, \x7\().16b
|
||||
ext \x6\().16b, \x2\().16b, \x2\().16b, #8
|
||||
eor \x7\().16b, \t1\().16b, \t5\().16b
|
||||
.ifb \inv
|
||||
eor \x2\().16b, \t0\().16b, \t4\().16b
|
||||
eor \x4\().16b, \x4\().16b, \t3\().16b
|
||||
eor \x5\().16b, \x5\().16b, \t7\().16b
|
||||
eor \x3\().16b, \x3\().16b, \t6\().16b
|
||||
eor \x6\().16b, \x6\().16b, \t2\().16b
|
||||
.else
|
||||
eor \t3\().16b, \t3\().16b, \x4\().16b
|
||||
eor \x5\().16b, \x5\().16b, \t7\().16b
|
||||
eor \x2\().16b, \x3\().16b, \t6\().16b
|
||||
eor \x3\().16b, \t0\().16b, \t4\().16b
|
||||
eor \x4\().16b, \x6\().16b, \t2\().16b
|
||||
mov \x6\().16b, \t3\().16b
|
||||
.endif
|
||||
.endm
|
||||
|
||||
.macro inv_mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
|
||||
t0, t1, t2, t3, t4, t5, t6, t7
|
||||
ext \t0\().16b, \x0\().16b, \x0\().16b, #8
|
||||
ext \t6\().16b, \x6\().16b, \x6\().16b, #8
|
||||
ext \t7\().16b, \x7\().16b, \x7\().16b, #8
|
||||
eor \t0\().16b, \t0\().16b, \x0\().16b
|
||||
ext \t1\().16b, \x1\().16b, \x1\().16b, #8
|
||||
eor \t6\().16b, \t6\().16b, \x6\().16b
|
||||
ext \t2\().16b, \x2\().16b, \x2\().16b, #8
|
||||
eor \t7\().16b, \t7\().16b, \x7\().16b
|
||||
ext \t3\().16b, \x3\().16b, \x3\().16b, #8
|
||||
eor \t1\().16b, \t1\().16b, \x1\().16b
|
||||
ext \t4\().16b, \x4\().16b, \x4\().16b, #8
|
||||
eor \t2\().16b, \t2\().16b, \x2\().16b
|
||||
ext \t5\().16b, \x5\().16b, \x5\().16b, #8
|
||||
eor \t3\().16b, \t3\().16b, \x3\().16b
|
||||
eor \t4\().16b, \t4\().16b, \x4\().16b
|
||||
eor \t5\().16b, \t5\().16b, \x5\().16b
|
||||
eor \x0\().16b, \x0\().16b, \t6\().16b
|
||||
eor \x1\().16b, \x1\().16b, \t6\().16b
|
||||
eor \x2\().16b, \x2\().16b, \t0\().16b
|
||||
eor \x4\().16b, \x4\().16b, \t2\().16b
|
||||
eor \x3\().16b, \x3\().16b, \t1\().16b
|
||||
eor \x1\().16b, \x1\().16b, \t7\().16b
|
||||
eor \x2\().16b, \x2\().16b, \t7\().16b
|
||||
eor \x4\().16b, \x4\().16b, \t6\().16b
|
||||
eor \x5\().16b, \x5\().16b, \t3\().16b
|
||||
eor \x3\().16b, \x3\().16b, \t6\().16b
|
||||
eor \x6\().16b, \x6\().16b, \t4\().16b
|
||||
eor \x4\().16b, \x4\().16b, \t7\().16b
|
||||
eor \x5\().16b, \x5\().16b, \t7\().16b
|
||||
eor \x7\().16b, \x7\().16b, \t5\().16b
|
||||
mix_cols \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
|
||||
\t0, \t1, \t2, \t3, \t4, \t5, \t6, \t7, 1
|
||||
.endm
|
||||
|
||||
.macro swapmove_2x, a0, b0, a1, b1, n, mask, t0, t1
|
||||
ushr \t0\().2d, \b0\().2d, #\n
|
||||
ushr \t1\().2d, \b1\().2d, #\n
|
||||
eor \t0\().16b, \t0\().16b, \a0\().16b
|
||||
eor \t1\().16b, \t1\().16b, \a1\().16b
|
||||
and \t0\().16b, \t0\().16b, \mask\().16b
|
||||
and \t1\().16b, \t1\().16b, \mask\().16b
|
||||
eor \a0\().16b, \a0\().16b, \t0\().16b
|
||||
shl \t0\().2d, \t0\().2d, #\n
|
||||
eor \a1\().16b, \a1\().16b, \t1\().16b
|
||||
shl \t1\().2d, \t1\().2d, #\n
|
||||
eor \b0\().16b, \b0\().16b, \t0\().16b
|
||||
eor \b1\().16b, \b1\().16b, \t1\().16b
|
||||
.endm
|
||||
|
||||
.macro bitslice, x7, x6, x5, x4, x3, x2, x1, x0, t0, t1, t2, t3
|
||||
movi \t0\().16b, #0x55
|
||||
movi \t1\().16b, #0x33
|
||||
swapmove_2x \x0, \x1, \x2, \x3, 1, \t0, \t2, \t3
|
||||
swapmove_2x \x4, \x5, \x6, \x7, 1, \t0, \t2, \t3
|
||||
movi \t0\().16b, #0x0f
|
||||
swapmove_2x \x0, \x2, \x1, \x3, 2, \t1, \t2, \t3
|
||||
swapmove_2x \x4, \x6, \x5, \x7, 2, \t1, \t2, \t3
|
||||
swapmove_2x \x0, \x4, \x1, \x5, 4, \t0, \t2, \t3
|
||||
swapmove_2x \x2, \x6, \x3, \x7, 4, \t0, \t2, \t3
|
||||
.endm
|
||||
|
||||
|
||||
.align 6
|
||||
M0: .octa 0x0004080c0105090d02060a0e03070b0f
|
||||
|
||||
M0SR: .octa 0x0004080c05090d010a0e02060f03070b
|
||||
SR: .octa 0x0f0e0d0c0a09080b0504070600030201
|
||||
SRM0: .octa 0x01060b0c0207080d0304090e00050a0f
|
||||
|
||||
M0ISR: .octa 0x0004080c0d0105090a0e0206070b0f03
|
||||
ISR: .octa 0x0f0e0d0c080b0a090504070602010003
|
||||
ISRM0: .octa 0x0306090c00070a0d01040b0e0205080f
|
||||
|
||||
/*
|
||||
* void aesbs_convert_key(u8 out[], u32 const rk[], int rounds)
|
||||
*/
|
||||
ENTRY(aesbs_convert_key)
|
||||
ld1 {v7.4s}, [x1], #16 // load round 0 key
|
||||
ld1 {v17.4s}, [x1], #16 // load round 1 key
|
||||
|
||||
movi v8.16b, #0x01 // bit masks
|
||||
movi v9.16b, #0x02
|
||||
movi v10.16b, #0x04
|
||||
movi v11.16b, #0x08
|
||||
movi v12.16b, #0x10
|
||||
movi v13.16b, #0x20
|
||||
movi v14.16b, #0x40
|
||||
movi v15.16b, #0x80
|
||||
ldr q16, M0
|
||||
|
||||
sub x2, x2, #1
|
||||
str q7, [x0], #16 // save round 0 key
|
||||
|
||||
.Lkey_loop:
|
||||
tbl v7.16b, {v17.16b}, v16.16b
|
||||
ld1 {v17.4s}, [x1], #16 // load next round key
|
||||
|
||||
cmtst v0.16b, v7.16b, v8.16b
|
||||
cmtst v1.16b, v7.16b, v9.16b
|
||||
cmtst v2.16b, v7.16b, v10.16b
|
||||
cmtst v3.16b, v7.16b, v11.16b
|
||||
cmtst v4.16b, v7.16b, v12.16b
|
||||
cmtst v5.16b, v7.16b, v13.16b
|
||||
cmtst v6.16b, v7.16b, v14.16b
|
||||
cmtst v7.16b, v7.16b, v15.16b
|
||||
not v0.16b, v0.16b
|
||||
not v1.16b, v1.16b
|
||||
not v5.16b, v5.16b
|
||||
not v6.16b, v6.16b
|
||||
|
||||
subs x2, x2, #1
|
||||
stp q0, q1, [x0], #128
|
||||
stp q2, q3, [x0, #-96]
|
||||
stp q4, q5, [x0, #-64]
|
||||
stp q6, q7, [x0, #-32]
|
||||
b.ne .Lkey_loop
|
||||
|
||||
movi v7.16b, #0x63 // compose .L63
|
||||
eor v17.16b, v17.16b, v7.16b
|
||||
str q17, [x0]
|
||||
ret
|
||||
ENDPROC(aesbs_convert_key)
|
||||
|
||||
.align 4
|
||||
aesbs_encrypt8:
|
||||
ldr q9, [bskey], #16 // round 0 key
|
||||
ldr q8, M0SR
|
||||
ldr q24, SR
|
||||
|
||||
eor v10.16b, v0.16b, v9.16b // xor with round0 key
|
||||
eor v11.16b, v1.16b, v9.16b
|
||||
tbl v0.16b, {v10.16b}, v8.16b
|
||||
eor v12.16b, v2.16b, v9.16b
|
||||
tbl v1.16b, {v11.16b}, v8.16b
|
||||
eor v13.16b, v3.16b, v9.16b
|
||||
tbl v2.16b, {v12.16b}, v8.16b
|
||||
eor v14.16b, v4.16b, v9.16b
|
||||
tbl v3.16b, {v13.16b}, v8.16b
|
||||
eor v15.16b, v5.16b, v9.16b
|
||||
tbl v4.16b, {v14.16b}, v8.16b
|
||||
eor v10.16b, v6.16b, v9.16b
|
||||
tbl v5.16b, {v15.16b}, v8.16b
|
||||
eor v11.16b, v7.16b, v9.16b
|
||||
tbl v6.16b, {v10.16b}, v8.16b
|
||||
tbl v7.16b, {v11.16b}, v8.16b
|
||||
|
||||
bitslice v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11
|
||||
|
||||
sub rounds, rounds, #1
|
||||
b .Lenc_sbox
|
||||
|
||||
.Lenc_loop:
|
||||
shift_rows v0, v1, v2, v3, v4, v5, v6, v7, v24
|
||||
.Lenc_sbox:
|
||||
sbox v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, \
|
||||
v13, v14, v15
|
||||
subs rounds, rounds, #1
|
||||
b.cc .Lenc_done
|
||||
|
||||
enc_next_rk
|
||||
|
||||
mix_cols v0, v1, v4, v6, v3, v7, v2, v5, v8, v9, v10, v11, v12, \
|
||||
v13, v14, v15
|
||||
|
||||
add_round_key v0, v1, v2, v3, v4, v5, v6, v7
|
||||
|
||||
b.ne .Lenc_loop
|
||||
ldr q24, SRM0
|
||||
b .Lenc_loop
|
||||
|
||||
.Lenc_done:
|
||||
ldr q12, [bskey] // last round key
|
||||
|
||||
bitslice v0, v1, v4, v6, v3, v7, v2, v5, v8, v9, v10, v11
|
||||
|
||||
eor v0.16b, v0.16b, v12.16b
|
||||
eor v1.16b, v1.16b, v12.16b
|
||||
eor v4.16b, v4.16b, v12.16b
|
||||
eor v6.16b, v6.16b, v12.16b
|
||||
eor v3.16b, v3.16b, v12.16b
|
||||
eor v7.16b, v7.16b, v12.16b
|
||||
eor v2.16b, v2.16b, v12.16b
|
||||
eor v5.16b, v5.16b, v12.16b
|
||||
ret
|
||||
ENDPROC(aesbs_encrypt8)
|
||||
|
||||
.align 4
|
||||
aesbs_decrypt8:
|
||||
lsl x9, rounds, #7
|
||||
add bskey, bskey, x9
|
||||
|
||||
ldr q9, [bskey, #-112]! // round 0 key
|
||||
ldr q8, M0ISR
|
||||
ldr q24, ISR
|
||||
|
||||
eor v10.16b, v0.16b, v9.16b // xor with round0 key
|
||||
eor v11.16b, v1.16b, v9.16b
|
||||
tbl v0.16b, {v10.16b}, v8.16b
|
||||
eor v12.16b, v2.16b, v9.16b
|
||||
tbl v1.16b, {v11.16b}, v8.16b
|
||||
eor v13.16b, v3.16b, v9.16b
|
||||
tbl v2.16b, {v12.16b}, v8.16b
|
||||
eor v14.16b, v4.16b, v9.16b
|
||||
tbl v3.16b, {v13.16b}, v8.16b
|
||||
eor v15.16b, v5.16b, v9.16b
|
||||
tbl v4.16b, {v14.16b}, v8.16b
|
||||
eor v10.16b, v6.16b, v9.16b
|
||||
tbl v5.16b, {v15.16b}, v8.16b
|
||||
eor v11.16b, v7.16b, v9.16b
|
||||
tbl v6.16b, {v10.16b}, v8.16b
|
||||
tbl v7.16b, {v11.16b}, v8.16b
|
||||
|
||||
bitslice v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11
|
||||
|
||||
sub rounds, rounds, #1
|
||||
b .Ldec_sbox
|
||||
|
||||
.Ldec_loop:
|
||||
shift_rows v0, v1, v2, v3, v4, v5, v6, v7, v24
|
||||
.Ldec_sbox:
|
||||
inv_sbox v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, \
|
||||
v13, v14, v15
|
||||
subs rounds, rounds, #1
|
||||
b.cc .Ldec_done
|
||||
|
||||
dec_next_rk
|
||||
|
||||
add_round_key v0, v1, v6, v4, v2, v7, v3, v5
|
||||
|
||||
inv_mix_cols v0, v1, v6, v4, v2, v7, v3, v5, v8, v9, v10, v11, v12, \
|
||||
v13, v14, v15
|
||||
|
||||
b.ne .Ldec_loop
|
||||
ldr q24, ISRM0
|
||||
b .Ldec_loop
|
||||
.Ldec_done:
|
||||
ldr q12, [bskey, #-16] // last round key
|
||||
|
||||
bitslice v0, v1, v6, v4, v2, v7, v3, v5, v8, v9, v10, v11
|
||||
|
||||
eor v0.16b, v0.16b, v12.16b
|
||||
eor v1.16b, v1.16b, v12.16b
|
||||
eor v6.16b, v6.16b, v12.16b
|
||||
eor v4.16b, v4.16b, v12.16b
|
||||
eor v2.16b, v2.16b, v12.16b
|
||||
eor v7.16b, v7.16b, v12.16b
|
||||
eor v3.16b, v3.16b, v12.16b
|
||||
eor v5.16b, v5.16b, v12.16b
|
||||
ret
|
||||
ENDPROC(aesbs_decrypt8)
|
||||
|
||||
/*
|
||||
* aesbs_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
|
||||
* int blocks)
|
||||
* aesbs_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
|
||||
* int blocks)
|
||||
*/
|
||||
.macro __ecb_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
|
||||
stp x29, x30, [sp, #-16]!
|
||||
mov x29, sp
|
||||
|
||||
99: mov x5, #1
|
||||
lsl x5, x5, x4
|
||||
subs w4, w4, #8
|
||||
csel x4, x4, xzr, pl
|
||||
csel x5, x5, xzr, mi
|
||||
|
||||
ld1 {v0.16b}, [x1], #16
|
||||
tbnz x5, #1, 0f
|
||||
ld1 {v1.16b}, [x1], #16
|
||||
tbnz x5, #2, 0f
|
||||
ld1 {v2.16b}, [x1], #16
|
||||
tbnz x5, #3, 0f
|
||||
ld1 {v3.16b}, [x1], #16
|
||||
tbnz x5, #4, 0f
|
||||
ld1 {v4.16b}, [x1], #16
|
||||
tbnz x5, #5, 0f
|
||||
ld1 {v5.16b}, [x1], #16
|
||||
tbnz x5, #6, 0f
|
||||
ld1 {v6.16b}, [x1], #16
|
||||
tbnz x5, #7, 0f
|
||||
ld1 {v7.16b}, [x1], #16
|
||||
|
||||
0: mov bskey, x2
|
||||
mov rounds, x3
|
||||
bl \do8
|
||||
|
||||
st1 {\o0\().16b}, [x0], #16
|
||||
tbnz x5, #1, 1f
|
||||
st1 {\o1\().16b}, [x0], #16
|
||||
tbnz x5, #2, 1f
|
||||
st1 {\o2\().16b}, [x0], #16
|
||||
tbnz x5, #3, 1f
|
||||
st1 {\o3\().16b}, [x0], #16
|
||||
tbnz x5, #4, 1f
|
||||
st1 {\o4\().16b}, [x0], #16
|
||||
tbnz x5, #5, 1f
|
||||
st1 {\o5\().16b}, [x0], #16
|
||||
tbnz x5, #6, 1f
|
||||
st1 {\o6\().16b}, [x0], #16
|
||||
tbnz x5, #7, 1f
|
||||
st1 {\o7\().16b}, [x0], #16
|
||||
|
||||
cbnz x4, 99b
|
||||
|
||||
1: ldp x29, x30, [sp], #16
|
||||
ret
|
||||
.endm
|
||||
|
||||
.align 4
|
||||
ENTRY(aesbs_ecb_encrypt)
|
||||
__ecb_crypt aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5
|
||||
ENDPROC(aesbs_ecb_encrypt)
|
||||
|
||||
.align 4
|
||||
ENTRY(aesbs_ecb_decrypt)
|
||||
__ecb_crypt aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5
|
||||
ENDPROC(aesbs_ecb_decrypt)
|
||||
|
||||
/*
|
||||
* aesbs_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
|
||||
* int blocks, u8 iv[])
|
||||
*/
|
||||
.align 4
|
||||
ENTRY(aesbs_cbc_decrypt)
|
||||
stp x29, x30, [sp, #-16]!
|
||||
mov x29, sp
|
||||
|
||||
99: mov x6, #1
|
||||
lsl x6, x6, x4
|
||||
subs w4, w4, #8
|
||||
csel x4, x4, xzr, pl
|
||||
csel x6, x6, xzr, mi
|
||||
|
||||
ld1 {v0.16b}, [x1], #16
|
||||
mov v25.16b, v0.16b
|
||||
tbnz x6, #1, 0f
|
||||
ld1 {v1.16b}, [x1], #16
|
||||
mov v26.16b, v1.16b
|
||||
tbnz x6, #2, 0f
|
||||
ld1 {v2.16b}, [x1], #16
|
||||
mov v27.16b, v2.16b
|
||||
tbnz x6, #3, 0f
|
||||
ld1 {v3.16b}, [x1], #16
|
||||
mov v28.16b, v3.16b
|
||||
tbnz x6, #4, 0f
|
||||
ld1 {v4.16b}, [x1], #16
|
||||
mov v29.16b, v4.16b
|
||||
tbnz x6, #5, 0f
|
||||
ld1 {v5.16b}, [x1], #16
|
||||
mov v30.16b, v5.16b
|
||||
tbnz x6, #6, 0f
|
||||
ld1 {v6.16b}, [x1], #16
|
||||
mov v31.16b, v6.16b
|
||||
tbnz x6, #7, 0f
|
||||
ld1 {v7.16b}, [x1]
|
||||
|
||||
0: mov bskey, x2
|
||||
mov rounds, x3
|
||||
bl aesbs_decrypt8
|
||||
|
||||
ld1 {v24.16b}, [x5] // load IV
|
||||
|
||||
eor v1.16b, v1.16b, v25.16b
|
||||
eor v6.16b, v6.16b, v26.16b
|
||||
eor v4.16b, v4.16b, v27.16b
|
||||
eor v2.16b, v2.16b, v28.16b
|
||||
eor v7.16b, v7.16b, v29.16b
|
||||
eor v0.16b, v0.16b, v24.16b
|
||||
eor v3.16b, v3.16b, v30.16b
|
||||
eor v5.16b, v5.16b, v31.16b
|
||||
|
||||
st1 {v0.16b}, [x0], #16
|
||||
mov v24.16b, v25.16b
|
||||
tbnz x6, #1, 1f
|
||||
st1 {v1.16b}, [x0], #16
|
||||
mov v24.16b, v26.16b
|
||||
tbnz x6, #2, 1f
|
||||
st1 {v6.16b}, [x0], #16
|
||||
mov v24.16b, v27.16b
|
||||
tbnz x6, #3, 1f
|
||||
st1 {v4.16b}, [x0], #16
|
||||
mov v24.16b, v28.16b
|
||||
tbnz x6, #4, 1f
|
||||
st1 {v2.16b}, [x0], #16
|
||||
mov v24.16b, v29.16b
|
||||
tbnz x6, #5, 1f
|
||||
st1 {v7.16b}, [x0], #16
|
||||
mov v24.16b, v30.16b
|
||||
tbnz x6, #6, 1f
|
||||
st1 {v3.16b}, [x0], #16
|
||||
mov v24.16b, v31.16b
|
||||
tbnz x6, #7, 1f
|
||||
ld1 {v24.16b}, [x1], #16
|
||||
st1 {v5.16b}, [x0], #16
|
||||
1: st1 {v24.16b}, [x5] // store IV
|
||||
|
||||
cbnz x4, 99b
|
||||
|
||||
ldp x29, x30, [sp], #16
|
||||
ret
|
||||
ENDPROC(aesbs_cbc_decrypt)
|
||||
|
||||
.macro next_tweak, out, in, const, tmp
|
||||
sshr \tmp\().2d, \in\().2d, #63
|
||||
and \tmp\().16b, \tmp\().16b, \const\().16b
|
||||
add \out\().2d, \in\().2d, \in\().2d
|
||||
ext \tmp\().16b, \tmp\().16b, \tmp\().16b, #8
|
||||
eor \out\().16b, \out\().16b, \tmp\().16b
|
||||
.endm
|
||||
|
||||
.align 4
|
||||
.Lxts_mul_x:
|
||||
CPU_LE( .quad 1, 0x87 )
|
||||
CPU_BE( .quad 0x87, 1 )
|
||||
|
||||
/*
|
||||
* aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
|
||||
* int blocks, u8 iv[])
|
||||
* aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
|
||||
* int blocks, u8 iv[])
|
||||
*/
|
||||
__xts_crypt8:
|
||||
mov x6, #1
|
||||
lsl x6, x6, x4
|
||||
subs w4, w4, #8
|
||||
csel x4, x4, xzr, pl
|
||||
csel x6, x6, xzr, mi
|
||||
|
||||
ld1 {v0.16b}, [x1], #16
|
||||
next_tweak v26, v25, v30, v31
|
||||
eor v0.16b, v0.16b, v25.16b
|
||||
tbnz x6, #1, 0f
|
||||
|
||||
ld1 {v1.16b}, [x1], #16
|
||||
next_tweak v27, v26, v30, v31
|
||||
eor v1.16b, v1.16b, v26.16b
|
||||
tbnz x6, #2, 0f
|
||||
|
||||
ld1 {v2.16b}, [x1], #16
|
||||
next_tweak v28, v27, v30, v31
|
||||
eor v2.16b, v2.16b, v27.16b
|
||||
tbnz x6, #3, 0f
|
||||
|
||||
ld1 {v3.16b}, [x1], #16
|
||||
next_tweak v29, v28, v30, v31
|
||||
eor v3.16b, v3.16b, v28.16b
|
||||
tbnz x6, #4, 0f
|
||||
|
||||
ld1 {v4.16b}, [x1], #16
|
||||
str q29, [sp, #16]
|
||||
eor v4.16b, v4.16b, v29.16b
|
||||
next_tweak v29, v29, v30, v31
|
||||
tbnz x6, #5, 0f
|
||||
|
||||
ld1 {v5.16b}, [x1], #16
|
||||
str q29, [sp, #32]
|
||||
eor v5.16b, v5.16b, v29.16b
|
||||
next_tweak v29, v29, v30, v31
|
||||
tbnz x6, #6, 0f
|
||||
|
||||
ld1 {v6.16b}, [x1], #16
|
||||
str q29, [sp, #48]
|
||||
eor v6.16b, v6.16b, v29.16b
|
||||
next_tweak v29, v29, v30, v31
|
||||
tbnz x6, #7, 0f
|
||||
|
||||
ld1 {v7.16b}, [x1], #16
|
||||
str q29, [sp, #64]
|
||||
eor v7.16b, v7.16b, v29.16b
|
||||
next_tweak v29, v29, v30, v31
|
||||
|
||||
0: mov bskey, x2
|
||||
mov rounds, x3
|
||||
br x7
|
||||
ENDPROC(__xts_crypt8)
|
||||
|
||||
.macro __xts_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
|
||||
stp x29, x30, [sp, #-80]!
|
||||
mov x29, sp
|
||||
|
||||
ldr q30, .Lxts_mul_x
|
||||
ld1 {v25.16b}, [x5]
|
||||
|
||||
99: adr x7, \do8
|
||||
bl __xts_crypt8
|
||||
|
||||
ldp q16, q17, [sp, #16]
|
||||
ldp q18, q19, [sp, #48]
|
||||
|
||||
eor \o0\().16b, \o0\().16b, v25.16b
|
||||
eor \o1\().16b, \o1\().16b, v26.16b
|
||||
eor \o2\().16b, \o2\().16b, v27.16b
|
||||
eor \o3\().16b, \o3\().16b, v28.16b
|
||||
|
||||
st1 {\o0\().16b}, [x0], #16
|
||||
mov v25.16b, v26.16b
|
||||
tbnz x6, #1, 1f
|
||||
st1 {\o1\().16b}, [x0], #16
|
||||
mov v25.16b, v27.16b
|
||||
tbnz x6, #2, 1f
|
||||
st1 {\o2\().16b}, [x0], #16
|
||||
mov v25.16b, v28.16b
|
||||
tbnz x6, #3, 1f
|
||||
st1 {\o3\().16b}, [x0], #16
|
||||
mov v25.16b, v29.16b
|
||||
tbnz x6, #4, 1f
|
||||
|
||||
eor \o4\().16b, \o4\().16b, v16.16b
|
||||
eor \o5\().16b, \o5\().16b, v17.16b
|
||||
eor \o6\().16b, \o6\().16b, v18.16b
|
||||
eor \o7\().16b, \o7\().16b, v19.16b
|
||||
|
||||
st1 {\o4\().16b}, [x0], #16
|
||||
tbnz x6, #5, 1f
|
||||
st1 {\o5\().16b}, [x0], #16
|
||||
tbnz x6, #6, 1f
|
||||
st1 {\o6\().16b}, [x0], #16
|
||||
tbnz x6, #7, 1f
|
||||
st1 {\o7\().16b}, [x0], #16
|
||||
|
||||
cbnz x4, 99b
|
||||
|
||||
1: st1 {v25.16b}, [x5]
|
||||
ldp x29, x30, [sp], #80
|
||||
ret
|
||||
.endm
|
||||
|
||||
ENTRY(aesbs_xts_encrypt)
|
||||
__xts_crypt aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5
|
||||
ENDPROC(aesbs_xts_encrypt)
|
||||
|
||||
ENTRY(aesbs_xts_decrypt)
|
||||
__xts_crypt aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5
|
||||
ENDPROC(aesbs_xts_decrypt)
|
||||
|
||||
.macro next_ctr, v
|
||||
mov \v\().d[1], x8
|
||||
adds x8, x8, #1
|
||||
mov \v\().d[0], x7
|
||||
adc x7, x7, xzr
|
||||
rev64 \v\().16b, \v\().16b
|
||||
.endm
|
||||
|
||||
/*
|
||||
* aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
|
||||
* int rounds, int blocks, u8 iv[], u8 final[])
|
||||
*/
|
||||
ENTRY(aesbs_ctr_encrypt)
|
||||
stp x29, x30, [sp, #-16]!
|
||||
mov x29, sp
|
||||
|
||||
cmp x6, #0
|
||||
cset x10, ne
|
||||
add x4, x4, x10 // do one extra block if final
|
||||
|
||||
ldp x7, x8, [x5]
|
||||
ld1 {v0.16b}, [x5]
|
||||
CPU_LE( rev x7, x7 )
|
||||
CPU_LE( rev x8, x8 )
|
||||
adds x8, x8, #1
|
||||
adc x7, x7, xzr
|
||||
|
||||
99: mov x9, #1
|
||||
lsl x9, x9, x4
|
||||
subs w4, w4, #8
|
||||
csel x4, x4, xzr, pl
|
||||
csel x9, x9, xzr, le
|
||||
|
||||
tbnz x9, #1, 0f
|
||||
next_ctr v1
|
||||
tbnz x9, #2, 0f
|
||||
next_ctr v2
|
||||
tbnz x9, #3, 0f
|
||||
next_ctr v3
|
||||
tbnz x9, #4, 0f
|
||||
next_ctr v4
|
||||
tbnz x9, #5, 0f
|
||||
next_ctr v5
|
||||
tbnz x9, #6, 0f
|
||||
next_ctr v6
|
||||
tbnz x9, #7, 0f
|
||||
next_ctr v7
|
||||
|
||||
0: mov bskey, x2
|
||||
mov rounds, x3
|
||||
bl aesbs_encrypt8
|
||||
|
||||
lsr x9, x9, x10 // disregard the extra block
|
||||
tbnz x9, #0, 0f
|
||||
|
||||
ld1 {v8.16b}, [x1], #16
|
||||
eor v0.16b, v0.16b, v8.16b
|
||||
st1 {v0.16b}, [x0], #16
|
||||
tbnz x9, #1, 1f
|
||||
|
||||
ld1 {v9.16b}, [x1], #16
|
||||
eor v1.16b, v1.16b, v9.16b
|
||||
st1 {v1.16b}, [x0], #16
|
||||
tbnz x9, #2, 2f
|
||||
|
||||
ld1 {v10.16b}, [x1], #16
|
||||
eor v4.16b, v4.16b, v10.16b
|
||||
st1 {v4.16b}, [x0], #16
|
||||
tbnz x9, #3, 3f
|
||||
|
||||
ld1 {v11.16b}, [x1], #16
|
||||
eor v6.16b, v6.16b, v11.16b
|
||||
st1 {v6.16b}, [x0], #16
|
||||
tbnz x9, #4, 4f
|
||||
|
||||
ld1 {v12.16b}, [x1], #16
|
||||
eor v3.16b, v3.16b, v12.16b
|
||||
st1 {v3.16b}, [x0], #16
|
||||
tbnz x9, #5, 5f
|
||||
|
||||
ld1 {v13.16b}, [x1], #16
|
||||
eor v7.16b, v7.16b, v13.16b
|
||||
st1 {v7.16b}, [x0], #16
|
||||
tbnz x9, #6, 6f
|
||||
|
||||
ld1 {v14.16b}, [x1], #16
|
||||
eor v2.16b, v2.16b, v14.16b
|
||||
st1 {v2.16b}, [x0], #16
|
||||
tbnz x9, #7, 7f
|
||||
|
||||
ld1 {v15.16b}, [x1], #16
|
||||
eor v5.16b, v5.16b, v15.16b
|
||||
st1 {v5.16b}, [x0], #16
|
||||
|
||||
8: next_ctr v0
|
||||
cbnz x4, 99b
|
||||
|
||||
0: st1 {v0.16b}, [x5]
|
||||
ldp x29, x30, [sp], #16
|
||||
ret
|
||||
|
||||
/*
|
||||
* If we are handling the tail of the input (x6 != NULL), return the
|
||||
* final keystream block back to the caller.
|
||||
*/
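/*
 * The C glue code (ctr_encrypt() in aes-neonbs-glue.c below) passes the
 * 'final' buffer only when the request length is not a multiple of
 * AES_BLOCK_SIZE, and xors the leftover bytes with this keystream block
 * itself.
 */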
|
||||
1: cbz x6, 8b
|
||||
st1 {v1.16b}, [x6]
|
||||
b 8b
|
||||
2: cbz x6, 8b
|
||||
st1 {v4.16b}, [x6]
|
||||
b 8b
|
||||
3: cbz x6, 8b
|
||||
st1 {v6.16b}, [x6]
|
||||
b 8b
|
||||
4: cbz x6, 8b
|
||||
st1 {v3.16b}, [x6]
|
||||
b 8b
|
||||
5: cbz x6, 8b
|
||||
st1 {v7.16b}, [x6]
|
||||
b 8b
|
||||
6: cbz x6, 8b
|
||||
st1 {v2.16b}, [x6]
|
||||
b 8b
|
||||
7: cbz x6, 8b
|
||||
st1 {v5.16b}, [x6]
|
||||
b 8b
|
||||
ENDPROC(aesbs_ctr_encrypt)
439
arch/arm64/crypto/aes-neonbs-glue.c
Normal file
@@ -0,0 +1,439 @@
/*
|
||||
* Bit sliced AES using NEON instructions
|
||||
*
|
||||
* Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as
|
||||
* published by the Free Software Foundation.
|
||||
*/
|
||||
|
||||
#include <asm/neon.h>
|
||||
#include <crypto/aes.h>
|
||||
#include <crypto/internal/simd.h>
|
||||
#include <crypto/internal/skcipher.h>
|
||||
#include <crypto/xts.h>
|
||||
#include <linux/module.h>
|
||||
|
||||
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
|
||||
MODULE_LICENSE("GPL v2");
|
||||
|
||||
MODULE_ALIAS_CRYPTO("ecb(aes)");
|
||||
MODULE_ALIAS_CRYPTO("cbc(aes)");
|
||||
MODULE_ALIAS_CRYPTO("ctr(aes)");
|
||||
MODULE_ALIAS_CRYPTO("xts(aes)");
|
||||
|
||||
asmlinkage void aesbs_convert_key(u8 out[], u32 const rk[], int rounds);
|
||||
|
||||
asmlinkage void aesbs_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[],
|
||||
int rounds, int blocks);
|
||||
asmlinkage void aesbs_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[],
|
||||
int rounds, int blocks);
|
||||
|
||||
asmlinkage void aesbs_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[],
|
||||
int rounds, int blocks, u8 iv[]);
|
||||
|
||||
asmlinkage void aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
|
||||
int rounds, int blocks, u8 iv[], u8 final[]);
|
||||
|
||||
asmlinkage void aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[],
|
||||
int rounds, int blocks, u8 iv[]);
|
||||
asmlinkage void aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[],
|
||||
int rounds, int blocks, u8 iv[]);
|
||||
|
||||
/* borrowed from aes-neon-blk.ko */
|
||||
asmlinkage void neon_aes_ecb_encrypt(u8 out[], u8 const in[], u32 const rk[],
|
||||
int rounds, int blocks, int first);
|
||||
asmlinkage void neon_aes_cbc_encrypt(u8 out[], u8 const in[], u32 const rk[],
|
||||
int rounds, int blocks, u8 iv[],
|
||||
int first);
|
||||
|
||||
struct aesbs_ctx {
|
||||
u8 rk[13 * (8 * AES_BLOCK_SIZE) + 32];
|
||||
int rounds;
|
||||
} __aligned(AES_BLOCK_SIZE);
|
||||
|
||||
struct aesbs_cbc_ctx {
|
||||
struct aesbs_ctx key;
|
||||
u32 enc[AES_MAX_KEYLENGTH_U32];
|
||||
};
|
||||
|
||||
struct aesbs_xts_ctx {
|
||||
struct aesbs_ctx key;
|
||||
u32 twkey[AES_MAX_KEYLENGTH_U32];
|
||||
};
|
||||
|
||||
static int aesbs_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
|
||||
unsigned int key_len)
|
||||
{
|
||||
struct aesbs_ctx *ctx = crypto_skcipher_ctx(tfm);
|
||||
struct crypto_aes_ctx rk;
|
||||
int err;
|
||||
|
||||
err = crypto_aes_expand_key(&rk, in_key, key_len);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
ctx->rounds = 6 + key_len / 4;
|
||||
|
||||
kernel_neon_begin();
|
||||
aesbs_convert_key(ctx->rk, rk.key_enc, ctx->rounds);
|
||||
kernel_neon_end();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int __ecb_crypt(struct skcipher_request *req,
|
||||
void (*fn)(u8 out[], u8 const in[], u8 const rk[],
|
||||
int rounds, int blocks))
|
||||
{
|
||||
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
|
||||
struct aesbs_ctx *ctx = crypto_skcipher_ctx(tfm);
|
||||
struct skcipher_walk walk;
|
||||
int err;
|
||||
|
||||
err = skcipher_walk_virt(&walk, req, true);
|
||||
|
||||
kernel_neon_begin();
|
||||
while (walk.nbytes >= AES_BLOCK_SIZE) {
|
||||
unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE;
|
||||
|
||||
if (walk.nbytes < walk.total)
|
||||
blocks = round_down(blocks,
|
||||
walk.stride / AES_BLOCK_SIZE);
|
||||
|
||||
fn(walk.dst.virt.addr, walk.src.virt.addr, ctx->rk,
|
||||
ctx->rounds, blocks);
|
||||
err = skcipher_walk_done(&walk,
|
||||
walk.nbytes - blocks * AES_BLOCK_SIZE);
|
||||
}
|
||||
kernel_neon_end();
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
static int ecb_encrypt(struct skcipher_request *req)
|
||||
{
|
||||
return __ecb_crypt(req, aesbs_ecb_encrypt);
|
||||
}
|
||||
|
||||
static int ecb_decrypt(struct skcipher_request *req)
|
||||
{
|
||||
return __ecb_crypt(req, aesbs_ecb_decrypt);
|
||||
}
|
||||
|
||||
static int aesbs_cbc_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
|
||||
unsigned int key_len)
|
||||
{
|
||||
struct aesbs_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
|
||||
struct crypto_aes_ctx rk;
|
||||
int err;
|
||||
|
||||
err = crypto_aes_expand_key(&rk, in_key, key_len);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
ctx->key.rounds = 6 + key_len / 4;
|
||||
|
||||
memcpy(ctx->enc, rk.key_enc, sizeof(ctx->enc));
|
||||
|
||||
kernel_neon_begin();
|
||||
aesbs_convert_key(ctx->key.rk, rk.key_enc, ctx->key.rounds);
|
||||
kernel_neon_end();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int cbc_encrypt(struct skcipher_request *req)
|
||||
{
|
||||
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
|
||||
struct aesbs_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
|
||||
struct skcipher_walk walk;
|
||||
int err, first = 1;
|
||||
|
||||
err = skcipher_walk_virt(&walk, req, true);
|
||||
|
||||
kernel_neon_begin();
|
||||
while (walk.nbytes >= AES_BLOCK_SIZE) {
|
||||
unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE;
|
||||
|
||||
/* CBC encryption is inherently serial, so fall back to the non-bitsliced NEON implementation */
|
||||
neon_aes_cbc_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
|
||||
ctx->enc, ctx->key.rounds, blocks, walk.iv,
|
||||
first);
|
||||
err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
|
||||
first = 0;
|
||||
}
|
||||
kernel_neon_end();
|
||||
return err;
|
||||
}
|
||||
|
||||
static int cbc_decrypt(struct skcipher_request *req)
|
||||
{
|
||||
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
|
||||
struct aesbs_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
|
||||
struct skcipher_walk walk;
|
||||
int err;
|
||||
|
||||
err = skcipher_walk_virt(&walk, req, true);
|
||||
|
||||
kernel_neon_begin();
|
||||
while (walk.nbytes >= AES_BLOCK_SIZE) {
|
||||
unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE;
|
||||
|
||||
if (walk.nbytes < walk.total)
|
||||
blocks = round_down(blocks,
|
||||
walk.stride / AES_BLOCK_SIZE);
|
||||
|
||||
aesbs_cbc_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
|
||||
ctx->key.rk, ctx->key.rounds, blocks,
|
||||
walk.iv);
|
||||
err = skcipher_walk_done(&walk,
|
||||
walk.nbytes - blocks * AES_BLOCK_SIZE);
|
||||
}
|
||||
kernel_neon_end();
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
static int ctr_encrypt(struct skcipher_request *req)
|
||||
{
|
||||
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
|
||||
struct aesbs_ctx *ctx = crypto_skcipher_ctx(tfm);
|
||||
struct skcipher_walk walk;
|
||||
u8 buf[AES_BLOCK_SIZE];
|
||||
int err;
|
||||
|
||||
err = skcipher_walk_virt(&walk, req, true);
|
||||
|
||||
kernel_neon_begin();
|
||||
while (walk.nbytes > 0) {
|
||||
unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE;
|
||||
u8 *final = (walk.total % AES_BLOCK_SIZE) ? buf : NULL;
|
||||
|
||||
if (walk.nbytes < walk.total) {
|
||||
blocks = round_down(blocks,
|
||||
walk.stride / AES_BLOCK_SIZE);
|
||||
final = NULL;
|
||||
}
|
||||
|
||||
aesbs_ctr_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
|
||||
ctx->rk, ctx->rounds, blocks, walk.iv, final);
|
||||
|
||||
if (final) {
|
||||
u8 *dst = walk.dst.virt.addr + blocks * AES_BLOCK_SIZE;
|
||||
u8 *src = walk.src.virt.addr + blocks * AES_BLOCK_SIZE;
|
||||
|
||||
if (dst != src)
|
||||
memcpy(dst, src, walk.total % AES_BLOCK_SIZE);
|
||||
crypto_xor(dst, final, walk.total % AES_BLOCK_SIZE);
|
||||
|
||||
err = skcipher_walk_done(&walk, 0);
|
||||
break;
|
||||
}
|
||||
err = skcipher_walk_done(&walk,
|
||||
walk.nbytes - blocks * AES_BLOCK_SIZE);
|
||||
}
|
||||
kernel_neon_end();
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
static int aesbs_xts_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
|
||||
unsigned int key_len)
|
||||
{
|
||||
struct aesbs_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
|
||||
struct crypto_aes_ctx rk;
|
||||
int err;
|
||||
|
||||
err = xts_verify_key(tfm, in_key, key_len);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
key_len /= 2;
|
||||
err = crypto_aes_expand_key(&rk, in_key + key_len, key_len);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
memcpy(ctx->twkey, rk.key_enc, sizeof(ctx->twkey));
|
||||
|
||||
return aesbs_setkey(tfm, in_key, key_len);
|
||||
}
|
||||
|
||||
static int __xts_crypt(struct skcipher_request *req,
|
||||
void (*fn)(u8 out[], u8 const in[], u8 const rk[],
|
||||
int rounds, int blocks, u8 iv[]))
|
||||
{
|
||||
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
|
||||
struct aesbs_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
|
||||
struct skcipher_walk walk;
|
||||
int err;
|
||||
|
||||
err = skcipher_walk_virt(&walk, req, true);
|
||||
|
||||
kernel_neon_begin();
|
||||
|
||||
neon_aes_ecb_encrypt(walk.iv, walk.iv, ctx->twkey,
|
||||
ctx->key.rounds, 1, 1);
|
||||
|
||||
while (walk.nbytes >= AES_BLOCK_SIZE) {
|
||||
unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE;
|
||||
|
||||
if (walk.nbytes < walk.total)
|
||||
blocks = round_down(blocks,
|
||||
walk.stride / AES_BLOCK_SIZE);
|
||||
|
||||
fn(walk.dst.virt.addr, walk.src.virt.addr, ctx->key.rk,
|
||||
ctx->key.rounds, blocks, walk.iv);
|
||||
err = skcipher_walk_done(&walk,
|
||||
walk.nbytes - blocks * AES_BLOCK_SIZE);
|
||||
}
|
||||
kernel_neon_end();
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
static int xts_encrypt(struct skcipher_request *req)
|
||||
{
|
||||
return __xts_crypt(req, aesbs_xts_encrypt);
|
||||
}
|
||||
|
||||
static int xts_decrypt(struct skcipher_request *req)
|
||||
{
|
||||
return __xts_crypt(req, aesbs_xts_decrypt);
|
||||
}
|
||||
|
||||
static struct skcipher_alg aes_algs[] = { {
|
||||
.base.cra_name = "__ecb(aes)",
|
||||
.base.cra_driver_name = "__ecb-aes-neonbs",
|
||||
.base.cra_priority = 250,
|
||||
.base.cra_blocksize = AES_BLOCK_SIZE,
|
||||
.base.cra_ctxsize = sizeof(struct aesbs_ctx),
|
||||
.base.cra_module = THIS_MODULE,
|
||||
.base.cra_flags = CRYPTO_ALG_INTERNAL,
|
||||
|
||||
.min_keysize = AES_MIN_KEY_SIZE,
|
||||
.max_keysize = AES_MAX_KEY_SIZE,
|
||||
.walksize = 8 * AES_BLOCK_SIZE,
|
||||
.setkey = aesbs_setkey,
|
||||
.encrypt = ecb_encrypt,
|
||||
.decrypt = ecb_decrypt,
|
||||
}, {
|
||||
.base.cra_name = "__cbc(aes)",
|
||||
.base.cra_driver_name = "__cbc-aes-neonbs",
|
||||
.base.cra_priority = 250,
|
||||
.base.cra_blocksize = AES_BLOCK_SIZE,
|
||||
.base.cra_ctxsize = sizeof(struct aesbs_cbc_ctx),
|
||||
.base.cra_module = THIS_MODULE,
|
||||
.base.cra_flags = CRYPTO_ALG_INTERNAL,
|
||||
|
||||
.min_keysize = AES_MIN_KEY_SIZE,
|
||||
.max_keysize = AES_MAX_KEY_SIZE,
|
||||
.walksize = 8 * AES_BLOCK_SIZE,
|
||||
.ivsize = AES_BLOCK_SIZE,
|
||||
.setkey = aesbs_cbc_setkey,
|
||||
.encrypt = cbc_encrypt,
|
||||
.decrypt = cbc_decrypt,
|
||||
}, {
|
||||
.base.cra_name = "__ctr(aes)",
|
||||
.base.cra_driver_name = "__ctr-aes-neonbs",
|
||||
.base.cra_priority = 250,
|
||||
.base.cra_blocksize = 1,
|
||||
.base.cra_ctxsize = sizeof(struct aesbs_ctx),
|
||||
.base.cra_module = THIS_MODULE,
|
||||
.base.cra_flags = CRYPTO_ALG_INTERNAL,
|
||||
|
||||
.min_keysize = AES_MIN_KEY_SIZE,
|
||||
.max_keysize = AES_MAX_KEY_SIZE,
|
||||
.chunksize = AES_BLOCK_SIZE,
|
||||
.walksize = 8 * AES_BLOCK_SIZE,
|
||||
.ivsize = AES_BLOCK_SIZE,
|
||||
.setkey = aesbs_setkey,
|
||||
.encrypt = ctr_encrypt,
|
||||
.decrypt = ctr_encrypt,
|
||||
}, {
|
||||
.base.cra_name = "ctr(aes)",
|
||||
.base.cra_driver_name = "ctr-aes-neonbs",
|
||||
.base.cra_priority = 250 - 1,
|
||||
.base.cra_blocksize = 1,
|
||||
.base.cra_ctxsize = sizeof(struct aesbs_ctx),
|
||||
.base.cra_module = THIS_MODULE,
|
||||
|
||||
.min_keysize = AES_MIN_KEY_SIZE,
|
||||
.max_keysize = AES_MAX_KEY_SIZE,
|
||||
.chunksize = AES_BLOCK_SIZE,
|
||||
.walksize = 8 * AES_BLOCK_SIZE,
|
||||
.ivsize = AES_BLOCK_SIZE,
|
||||
.setkey = aesbs_setkey,
|
||||
.encrypt = ctr_encrypt,
|
||||
.decrypt = ctr_encrypt,
|
||||
}, {
|
||||
.base.cra_name = "__xts(aes)",
|
||||
.base.cra_driver_name = "__xts-aes-neonbs",
|
||||
.base.cra_priority = 250,
|
||||
.base.cra_blocksize = AES_BLOCK_SIZE,
|
||||
.base.cra_ctxsize = sizeof(struct aesbs_xts_ctx),
|
||||
.base.cra_module = THIS_MODULE,
|
||||
.base.cra_flags = CRYPTO_ALG_INTERNAL,
|
||||
|
||||
.min_keysize = 2 * AES_MIN_KEY_SIZE,
|
||||
.max_keysize = 2 * AES_MAX_KEY_SIZE,
|
||||
.walksize = 8 * AES_BLOCK_SIZE,
|
||||
.ivsize = AES_BLOCK_SIZE,
|
||||
.setkey = aesbs_xts_setkey,
|
||||
.encrypt = xts_encrypt,
|
||||
.decrypt = xts_decrypt,
|
||||
} };
|
||||
|
||||
static struct simd_skcipher_alg *aes_simd_algs[ARRAY_SIZE(aes_algs)];
|
||||
|
||||
static void aes_exit(void)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(aes_simd_algs); i++)
|
||||
if (aes_simd_algs[i])
|
||||
simd_skcipher_free(aes_simd_algs[i]);
|
||||
|
||||
crypto_unregister_skciphers(aes_algs, ARRAY_SIZE(aes_algs));
|
||||
}
|
||||
|
||||
static int __init aes_init(void)
|
||||
{
|
||||
struct simd_skcipher_alg *simd;
|
||||
const char *basename;
|
||||
const char *algname;
|
||||
const char *drvname;
|
||||
int err;
|
||||
int i;
|
||||
|
||||
if (!(elf_hwcap & HWCAP_ASIMD))
|
||||
return -ENODEV;
|
||||
|
||||
err = crypto_register_skciphers(aes_algs, ARRAY_SIZE(aes_algs));
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(aes_algs); i++) {
|
||||
if (!(aes_algs[i].base.cra_flags & CRYPTO_ALG_INTERNAL))
|
||||
continue;
|
||||
|
||||
algname = aes_algs[i].base.cra_name + 2;
|
||||
drvname = aes_algs[i].base.cra_driver_name + 2;
|
||||
basename = aes_algs[i].base.cra_driver_name;
|
||||
simd = simd_skcipher_create_compat(algname, drvname, basename);
|
||||
err = PTR_ERR(simd);
|
||||
if (IS_ERR(simd))
|
||||
goto unregister_simds;
|
||||
|
||||
aes_simd_algs[i] = simd;
|
||||
}
|
||||
return 0;
|
||||
|
||||
unregister_simds:
|
||||
aes_exit();
|
||||
return err;
|
||||
}
|
||||
|
||||
module_init(aes_init);
|
||||
module_exit(aes_exit);
450
arch/arm64/crypto/chacha20-neon-core.S
Normal file
@@ -0,0 +1,450 @@
/*
|
||||
* ChaCha20 256-bit cipher algorithm, RFC7539, arm64 NEON functions
|
||||
*
|
||||
* Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* Based on:
|
||||
* ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
|
||||
*
|
||||
* Copyright (C) 2015 Martin Willi
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*/
|
||||
|
||||
#include <linux/linkage.h>
|
||||
|
||||
.text
|
||||
.align 6
|
||||
|
||||
ENTRY(chacha20_block_xor_neon)
|
||||
// x0: Input state matrix, s
|
||||
// x1: 1 data block output, o
|
||||
// x2: 1 data block input, i
|
||||
|
||||
//
|
||||
// This function encrypts one ChaCha20 block by loading the state matrix
|
||||
// in four NEON registers. It performs matrix operations on four words in
|
||||
// parallel, but requires shuffling to rearrange the words after each
|
||||
// round.
|
||||
//
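//
// For reference, the scalar quarter-round these vector instructions
// implement (per RFC 7539; the mapping of rows to registers is spelled
// out here as an aid and is not part of the original comment):
//
//   a += b; d ^= a; d = rol32(d, 16);
//   c += d; b ^= c; b = rol32(b, 12);
//   a += b; d ^= a; d = rol32(d, 8);
//   c += d; b ^= c; b = rol32(b, 7);
//
// v0..v3 hold rows a..d of the 4x4 state; the ext instructions rotate
// the rows between the column half and the diagonal half of each double
// round.
//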
|
||||
|
||||
// x0..3 = s0..3
|
||||
adr x3, ROT8
|
||||
ld1 {v0.4s-v3.4s}, [x0]
|
||||
ld1 {v8.4s-v11.4s}, [x0]
|
||||
ld1 {v12.4s}, [x3]
|
||||
|
||||
mov x3, #10
|
||||
|
||||
.Ldoubleround:
|
||||
// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
|
||||
add v0.4s, v0.4s, v1.4s
|
||||
eor v3.16b, v3.16b, v0.16b
|
||||
rev32 v3.8h, v3.8h
|
||||
|
||||
// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
|
||||
add v2.4s, v2.4s, v3.4s
|
||||
eor v4.16b, v1.16b, v2.16b
|
||||
shl v1.4s, v4.4s, #12
|
||||
sri v1.4s, v4.4s, #20
|
||||
|
||||
// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
|
||||
add v0.4s, v0.4s, v1.4s
|
||||
eor v3.16b, v3.16b, v0.16b
|
||||
tbl v3.16b, {v3.16b}, v12.16b
|
||||
|
||||
// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
|
||||
add v2.4s, v2.4s, v3.4s
|
||||
eor v4.16b, v1.16b, v2.16b
|
||||
shl v1.4s, v4.4s, #7
|
||||
sri v1.4s, v4.4s, #25
|
||||
|
||||
// x1 = shuffle32(x1, MASK(0, 3, 2, 1))
|
||||
ext v1.16b, v1.16b, v1.16b, #4
|
||||
// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
|
||||
ext v2.16b, v2.16b, v2.16b, #8
|
||||
// x3 = shuffle32(x3, MASK(2, 1, 0, 3))
|
||||
ext v3.16b, v3.16b, v3.16b, #12
|
||||
|
||||
// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
|
||||
add v0.4s, v0.4s, v1.4s
|
||||
eor v3.16b, v3.16b, v0.16b
|
||||
rev32 v3.8h, v3.8h
|
||||
|
||||
// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
|
||||
add v2.4s, v2.4s, v3.4s
|
||||
eor v4.16b, v1.16b, v2.16b
|
||||
shl v1.4s, v4.4s, #12
|
||||
sri v1.4s, v4.4s, #20
|
||||
|
||||
// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
|
||||
add v0.4s, v0.4s, v1.4s
|
||||
eor v3.16b, v3.16b, v0.16b
|
||||
tbl v3.16b, {v3.16b}, v12.16b
|
||||
|
||||
// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
|
||||
add v2.4s, v2.4s, v3.4s
|
||||
eor v4.16b, v1.16b, v2.16b
|
||||
shl v1.4s, v4.4s, #7
|
||||
sri v1.4s, v4.4s, #25
|
||||
|
||||
// x1 = shuffle32(x1, MASK(2, 1, 0, 3))
|
||||
ext v1.16b, v1.16b, v1.16b, #12
|
||||
// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
|
||||
ext v2.16b, v2.16b, v2.16b, #8
|
||||
// x3 = shuffle32(x3, MASK(0, 3, 2, 1))
|
||||
ext v3.16b, v3.16b, v3.16b, #4
|
||||
|
||||
subs x3, x3, #1
|
||||
b.ne .Ldoubleround
|
||||
|
||||
ld1 {v4.16b-v7.16b}, [x2]
|
||||
|
||||
// o0 = i0 ^ (x0 + s0)
|
||||
add v0.4s, v0.4s, v8.4s
|
||||
eor v0.16b, v0.16b, v4.16b
|
||||
|
||||
// o1 = i1 ^ (x1 + s1)
|
||||
add v1.4s, v1.4s, v9.4s
|
||||
eor v1.16b, v1.16b, v5.16b
|
||||
|
||||
// o2 = i2 ^ (x2 + s2)
|
||||
add v2.4s, v2.4s, v10.4s
|
||||
eor v2.16b, v2.16b, v6.16b
|
||||
|
||||
// o3 = i3 ^ (x3 + s3)
|
||||
add v3.4s, v3.4s, v11.4s
|
||||
eor v3.16b, v3.16b, v7.16b
|
||||
|
||||
st1 {v0.16b-v3.16b}, [x1]
|
||||
|
||||
ret
|
||||
ENDPROC(chacha20_block_xor_neon)
|
||||
|
||||
.align 6
|
||||
ENTRY(chacha20_4block_xor_neon)
|
||||
// x0: Input state matrix, s
|
||||
// x1: 4 data blocks output, o
|
||||
// x2: 4 data blocks input, i
|
||||
|
||||
//
|
||||
// This function encrypts four consecutive ChaCha20 blocks by loading
|
||||
// the state matrix in NEON registers four times. The algorithm performs
|
||||
// each operation on the corresponding word of each state matrix, hence
|
||||
// requires no word shuffling. For the final XORing step we transpose the
|
||||
// matrix by interleaving 32- and then 64-bit words, which allows us to
|
||||
// do XOR in NEON registers.
|
||||
//
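//
// Register layout, noted here as a clarifying assumption derived from
// the ld4r loads below: lane j of register vN holds word N of block j,
// so every add/eor/rotate applies the same scalar operation to the
// corresponding word of all four blocks; only the counter word (v12)
// differs per lane once CTRINC has been added. The zip1/zip2 sequences
// at the end transpose this layout back into four contiguous 64-byte
// keystream blocks before the final xor with the input.
//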
|
||||
adr x3, CTRINC // ... and ROT8
|
||||
ld1 {v30.4s-v31.4s}, [x3]
|
||||
|
||||
// x0..15[0-3] = s0..3[0..3]
|
||||
mov x4, x0
|
||||
ld4r { v0.4s- v3.4s}, [x4], #16
|
||||
ld4r { v4.4s- v7.4s}, [x4], #16
|
||||
ld4r { v8.4s-v11.4s}, [x4], #16
|
||||
ld4r {v12.4s-v15.4s}, [x4]
|
||||
|
||||
// x12 += counter values 0-3
|
||||
add v12.4s, v12.4s, v30.4s
|
||||
|
||||
mov x3, #10
|
||||
|
||||
.Ldoubleround4:
|
||||
// x0 += x4, x12 = rotl32(x12 ^ x0, 16)
|
||||
// x1 += x5, x13 = rotl32(x13 ^ x1, 16)
|
||||
// x2 += x6, x14 = rotl32(x14 ^ x2, 16)
|
||||
// x3 += x7, x15 = rotl32(x15 ^ x3, 16)
|
||||
add v0.4s, v0.4s, v4.4s
|
||||
add v1.4s, v1.4s, v5.4s
|
||||
add v2.4s, v2.4s, v6.4s
|
||||
add v3.4s, v3.4s, v7.4s
|
||||
|
||||
eor v12.16b, v12.16b, v0.16b
|
||||
eor v13.16b, v13.16b, v1.16b
|
||||
eor v14.16b, v14.16b, v2.16b
|
||||
eor v15.16b, v15.16b, v3.16b
|
||||
|
||||
rev32 v12.8h, v12.8h
|
||||
rev32 v13.8h, v13.8h
|
||||
rev32 v14.8h, v14.8h
|
||||
rev32 v15.8h, v15.8h
|
||||
|
||||
// x8 += x12, x4 = rotl32(x4 ^ x8, 12)
|
||||
// x9 += x13, x5 = rotl32(x5 ^ x9, 12)
|
||||
// x10 += x14, x6 = rotl32(x6 ^ x10, 12)
|
||||
// x11 += x15, x7 = rotl32(x7 ^ x11, 12)
|
||||
add v8.4s, v8.4s, v12.4s
|
||||
add v9.4s, v9.4s, v13.4s
|
||||
add v10.4s, v10.4s, v14.4s
|
||||
add v11.4s, v11.4s, v15.4s
|
||||
|
||||
eor v16.16b, v4.16b, v8.16b
|
||||
eor v17.16b, v5.16b, v9.16b
|
||||
eor v18.16b, v6.16b, v10.16b
|
||||
eor v19.16b, v7.16b, v11.16b
|
||||
|
||||
shl v4.4s, v16.4s, #12
|
||||
shl v5.4s, v17.4s, #12
|
||||
shl v6.4s, v18.4s, #12
|
||||
shl v7.4s, v19.4s, #12
|
||||
|
||||
sri v4.4s, v16.4s, #20
|
||||
sri v5.4s, v17.4s, #20
|
||||
sri v6.4s, v18.4s, #20
|
||||
sri v7.4s, v19.4s, #20
|
||||
|
||||
// x0 += x4, x12 = rotl32(x12 ^ x0, 8)
|
||||
// x1 += x5, x13 = rotl32(x13 ^ x1, 8)
|
||||
// x2 += x6, x14 = rotl32(x14 ^ x2, 8)
|
||||
// x3 += x7, x15 = rotl32(x15 ^ x3, 8)
|
||||
add v0.4s, v0.4s, v4.4s
|
||||
add v1.4s, v1.4s, v5.4s
|
||||
add v2.4s, v2.4s, v6.4s
|
||||
add v3.4s, v3.4s, v7.4s
|
||||
|
||||
eor v12.16b, v12.16b, v0.16b
|
||||
eor v13.16b, v13.16b, v1.16b
|
||||
eor v14.16b, v14.16b, v2.16b
|
||||
eor v15.16b, v15.16b, v3.16b
|
||||
|
||||
tbl v12.16b, {v12.16b}, v31.16b
|
||||
tbl v13.16b, {v13.16b}, v31.16b
|
||||
tbl v14.16b, {v14.16b}, v31.16b
|
||||
tbl v15.16b, {v15.16b}, v31.16b
|
||||
|
||||
// x8 += x12, x4 = rotl32(x4 ^ x8, 7)
|
||||
// x9 += x13, x5 = rotl32(x5 ^ x9, 7)
|
||||
// x10 += x14, x6 = rotl32(x6 ^ x10, 7)
|
||||
// x11 += x15, x7 = rotl32(x7 ^ x11, 7)
|
||||
add v8.4s, v8.4s, v12.4s
|
||||
add v9.4s, v9.4s, v13.4s
|
||||
add v10.4s, v10.4s, v14.4s
|
||||
add v11.4s, v11.4s, v15.4s
|
||||
|
||||
eor v16.16b, v4.16b, v8.16b
|
||||
eor v17.16b, v5.16b, v9.16b
|
||||
eor v18.16b, v6.16b, v10.16b
|
||||
eor v19.16b, v7.16b, v11.16b
|
||||
|
||||
shl v4.4s, v16.4s, #7
|
||||
shl v5.4s, v17.4s, #7
|
||||
shl v6.4s, v18.4s, #7
|
||||
shl v7.4s, v19.4s, #7
|
||||
|
||||
sri v4.4s, v16.4s, #25
|
||||
sri v5.4s, v17.4s, #25
|
||||
sri v6.4s, v18.4s, #25
|
||||
sri v7.4s, v19.4s, #25
|
||||
|
||||
// x0 += x5, x15 = rotl32(x15 ^ x0, 16)
|
||||
// x1 += x6, x12 = rotl32(x12 ^ x1, 16)
|
||||
// x2 += x7, x13 = rotl32(x13 ^ x2, 16)
|
||||
// x3 += x4, x14 = rotl32(x14 ^ x3, 16)
|
||||
add v0.4s, v0.4s, v5.4s
|
||||
add v1.4s, v1.4s, v6.4s
|
||||
add v2.4s, v2.4s, v7.4s
|
||||
add v3.4s, v3.4s, v4.4s
|
||||
|
||||
eor v15.16b, v15.16b, v0.16b
|
||||
eor v12.16b, v12.16b, v1.16b
|
||||
eor v13.16b, v13.16b, v2.16b
|
||||
eor v14.16b, v14.16b, v3.16b
|
||||
|
||||
rev32 v15.8h, v15.8h
|
||||
rev32 v12.8h, v12.8h
|
||||
rev32 v13.8h, v13.8h
|
||||
rev32 v14.8h, v14.8h
|
||||
|
||||
// x10 += x15, x5 = rotl32(x5 ^ x10, 12)
|
||||
// x11 += x12, x6 = rotl32(x6 ^ x11, 12)
|
||||
// x8 += x13, x7 = rotl32(x7 ^ x8, 12)
|
||||
// x9 += x14, x4 = rotl32(x4 ^ x9, 12)
|
||||
add v10.4s, v10.4s, v15.4s
|
||||
add v11.4s, v11.4s, v12.4s
|
||||
add v8.4s, v8.4s, v13.4s
|
||||
add v9.4s, v9.4s, v14.4s
|
||||
|
||||
eor v16.16b, v5.16b, v10.16b
|
||||
eor v17.16b, v6.16b, v11.16b
|
||||
eor v18.16b, v7.16b, v8.16b
|
||||
eor v19.16b, v4.16b, v9.16b
|
||||
|
||||
shl v5.4s, v16.4s, #12
|
||||
shl v6.4s, v17.4s, #12
|
||||
shl v7.4s, v18.4s, #12
|
||||
shl v4.4s, v19.4s, #12
|
||||
|
||||
sri v5.4s, v16.4s, #20
|
||||
sri v6.4s, v17.4s, #20
|
||||
sri v7.4s, v18.4s, #20
|
||||
sri v4.4s, v19.4s, #20
|
||||
|
||||
// x0 += x5, x15 = rotl32(x15 ^ x0, 8)
|
||||
// x1 += x6, x12 = rotl32(x12 ^ x1, 8)
|
||||
// x2 += x7, x13 = rotl32(x13 ^ x2, 8)
|
||||
// x3 += x4, x14 = rotl32(x14 ^ x3, 8)
|
||||
add v0.4s, v0.4s, v5.4s
|
||||
add v1.4s, v1.4s, v6.4s
|
||||
add v2.4s, v2.4s, v7.4s
|
||||
add v3.4s, v3.4s, v4.4s
|
||||
|
||||
eor v15.16b, v15.16b, v0.16b
|
||||
eor v12.16b, v12.16b, v1.16b
|
||||
eor v13.16b, v13.16b, v2.16b
|
||||
eor v14.16b, v14.16b, v3.16b
|
||||
|
||||
tbl v15.16b, {v15.16b}, v31.16b
|
||||
tbl v12.16b, {v12.16b}, v31.16b
|
||||
tbl v13.16b, {v13.16b}, v31.16b
|
||||
tbl v14.16b, {v14.16b}, v31.16b
|
||||
|
||||
// x10 += x15, x5 = rotl32(x5 ^ x10, 7)
|
||||
// x11 += x12, x6 = rotl32(x6 ^ x11, 7)
|
||||
// x8 += x13, x7 = rotl32(x7 ^ x8, 7)
|
||||
// x9 += x14, x4 = rotl32(x4 ^ x9, 7)
|
||||
add v10.4s, v10.4s, v15.4s
|
||||
add v11.4s, v11.4s, v12.4s
|
||||
add v8.4s, v8.4s, v13.4s
|
||||
add v9.4s, v9.4s, v14.4s
|
||||
|
||||
eor v16.16b, v5.16b, v10.16b
|
||||
eor v17.16b, v6.16b, v11.16b
|
||||
eor v18.16b, v7.16b, v8.16b
|
||||
eor v19.16b, v4.16b, v9.16b
|
||||
|
||||
shl v5.4s, v16.4s, #7
|
||||
shl v6.4s, v17.4s, #7
|
||||
shl v7.4s, v18.4s, #7
|
||||
shl v4.4s, v19.4s, #7
|
||||
|
||||
sri v5.4s, v16.4s, #25
|
||||
sri v6.4s, v17.4s, #25
|
||||
sri v7.4s, v18.4s, #25
|
||||
sri v4.4s, v19.4s, #25
|
||||
|
||||
subs x3, x3, #1
|
||||
b.ne .Ldoubleround4
|
||||
|
||||
ld4r {v16.4s-v19.4s}, [x0], #16
|
||||
ld4r {v20.4s-v23.4s}, [x0], #16
|
||||
|
||||
// x12 += counter values 0-3
|
||||
add v12.4s, v12.4s, v30.4s
|
||||
|
||||
// x0[0-3] += s0[0]
|
||||
// x1[0-3] += s0[1]
|
||||
// x2[0-3] += s0[2]
|
||||
// x3[0-3] += s0[3]
|
||||
add v0.4s, v0.4s, v16.4s
|
||||
add v1.4s, v1.4s, v17.4s
|
||||
add v2.4s, v2.4s, v18.4s
|
||||
add v3.4s, v3.4s, v19.4s
|
||||
|
||||
ld4r {v24.4s-v27.4s}, [x0], #16
|
||||
ld4r {v28.4s-v31.4s}, [x0]
|
||||
|
||||
// x4[0-3] += s1[0]
|
||||
// x5[0-3] += s1[1]
|
||||
// x6[0-3] += s1[2]
|
||||
// x7[0-3] += s1[3]
|
||||
add v4.4s, v4.4s, v20.4s
|
||||
add v5.4s, v5.4s, v21.4s
|
||||
add v6.4s, v6.4s, v22.4s
|
||||
add v7.4s, v7.4s, v23.4s
|
||||
|
||||
// x8[0-3] += s2[0]
|
||||
// x9[0-3] += s2[1]
|
||||
// x10[0-3] += s2[2]
|
||||
// x11[0-3] += s2[3]
|
||||
add v8.4s, v8.4s, v24.4s
|
||||
add v9.4s, v9.4s, v25.4s
|
||||
add v10.4s, v10.4s, v26.4s
|
||||
add v11.4s, v11.4s, v27.4s
|
||||
|
||||
// x12[0-3] += s3[0]
|
||||
// x13[0-3] += s3[1]
|
||||
// x14[0-3] += s3[2]
|
||||
// x15[0-3] += s3[3]
|
||||
add v12.4s, v12.4s, v28.4s
|
||||
add v13.4s, v13.4s, v29.4s
|
||||
add v14.4s, v14.4s, v30.4s
|
||||
add v15.4s, v15.4s, v31.4s
|
||||
|
||||
// interleave 32-bit words in state n, n+1
|
||||
zip1 v16.4s, v0.4s, v1.4s
|
||||
zip2 v17.4s, v0.4s, v1.4s
|
||||
zip1 v18.4s, v2.4s, v3.4s
|
||||
zip2 v19.4s, v2.4s, v3.4s
|
||||
zip1 v20.4s, v4.4s, v5.4s
|
||||
zip2 v21.4s, v4.4s, v5.4s
|
||||
zip1 v22.4s, v6.4s, v7.4s
|
||||
zip2 v23.4s, v6.4s, v7.4s
|
||||
zip1 v24.4s, v8.4s, v9.4s
|
||||
zip2 v25.4s, v8.4s, v9.4s
|
||||
zip1 v26.4s, v10.4s, v11.4s
|
||||
zip2 v27.4s, v10.4s, v11.4s
|
||||
zip1 v28.4s, v12.4s, v13.4s
|
||||
zip2 v29.4s, v12.4s, v13.4s
|
||||
zip1 v30.4s, v14.4s, v15.4s
|
||||
zip2 v31.4s, v14.4s, v15.4s
|
||||
|
||||
// interleave 64-bit words in state n, n+2
|
||||
zip1 v0.2d, v16.2d, v18.2d
|
||||
zip2 v4.2d, v16.2d, v18.2d
|
||||
zip1 v8.2d, v17.2d, v19.2d
|
||||
zip2 v12.2d, v17.2d, v19.2d
|
||||
ld1 {v16.16b-v19.16b}, [x2], #64
|
||||
|
||||
zip1 v1.2d, v20.2d, v22.2d
|
||||
zip2 v5.2d, v20.2d, v22.2d
|
||||
zip1 v9.2d, v21.2d, v23.2d
|
||||
zip2 v13.2d, v21.2d, v23.2d
|
||||
ld1 {v20.16b-v23.16b}, [x2], #64
|
||||
|
||||
zip1 v2.2d, v24.2d, v26.2d
|
||||
zip2 v6.2d, v24.2d, v26.2d
|
||||
zip1 v10.2d, v25.2d, v27.2d
|
||||
zip2 v14.2d, v25.2d, v27.2d
|
||||
ld1 {v24.16b-v27.16b}, [x2], #64
|
||||
|
||||
zip1 v3.2d, v28.2d, v30.2d
|
||||
zip2 v7.2d, v28.2d, v30.2d
|
||||
zip1 v11.2d, v29.2d, v31.2d
|
||||
zip2 v15.2d, v29.2d, v31.2d
|
||||
ld1 {v28.16b-v31.16b}, [x2]
|
||||
|
||||
// xor with corresponding input, write to output
|
||||
eor v16.16b, v16.16b, v0.16b
|
||||
eor v17.16b, v17.16b, v1.16b
|
||||
eor v18.16b, v18.16b, v2.16b
|
||||
eor v19.16b, v19.16b, v3.16b
|
||||
eor v20.16b, v20.16b, v4.16b
|
||||
eor v21.16b, v21.16b, v5.16b
|
||||
st1 {v16.16b-v19.16b}, [x1], #64
|
||||
eor v22.16b, v22.16b, v6.16b
|
||||
eor v23.16b, v23.16b, v7.16b
|
||||
eor v24.16b, v24.16b, v8.16b
|
||||
eor v25.16b, v25.16b, v9.16b
|
||||
st1 {v20.16b-v23.16b}, [x1], #64
|
||||
eor v26.16b, v26.16b, v10.16b
|
||||
eor v27.16b, v27.16b, v11.16b
|
||||
eor v28.16b, v28.16b, v12.16b
|
||||
st1 {v24.16b-v27.16b}, [x1], #64
|
||||
eor v29.16b, v29.16b, v13.16b
|
||||
eor v30.16b, v30.16b, v14.16b
|
||||
eor v31.16b, v31.16b, v15.16b
|
||||
st1 {v28.16b-v31.16b}, [x1]
|
||||
|
||||
ret
|
||||
ENDPROC(chacha20_4block_xor_neon)
|
||||
|
||||
CTRINC: .word 0, 1, 2, 3
|
||||
ROT8: .word 0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f
126
arch/arm64/crypto/chacha20-neon-glue.c
Normal file
@@ -0,0 +1,126 @@
/*
|
||||
* ChaCha20 256-bit cipher algorithm, RFC7539, arm64 NEON functions
|
||||
*
|
||||
* Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* Based on:
|
||||
* ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code
|
||||
*
|
||||
* Copyright (C) 2015 Martin Willi
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*/
|
||||
|
||||
#include <crypto/algapi.h>
|
||||
#include <crypto/chacha20.h>
|
||||
#include <crypto/internal/skcipher.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/module.h>
|
||||
|
||||
#include <asm/hwcap.h>
|
||||
#include <asm/neon.h>
|
||||
|
||||
asmlinkage void chacha20_block_xor_neon(u32 *state, u8 *dst, const u8 *src);
|
||||
asmlinkage void chacha20_4block_xor_neon(u32 *state, u8 *dst, const u8 *src);
|
||||
|
||||
static void chacha20_doneon(u32 *state, u8 *dst, const u8 *src,
|
||||
unsigned int bytes)
|
||||
{
|
||||
u8 buf[CHACHA20_BLOCK_SIZE];
|
||||
|
||||
while (bytes >= CHACHA20_BLOCK_SIZE * 4) {
|
||||
chacha20_4block_xor_neon(state, dst, src);
|
||||
bytes -= CHACHA20_BLOCK_SIZE * 4;
|
||||
src += CHACHA20_BLOCK_SIZE * 4;
|
||||
dst += CHACHA20_BLOCK_SIZE * 4;
|
||||
state[12] += 4;
|
||||
}
|
||||
while (bytes >= CHACHA20_BLOCK_SIZE) {
|
||||
chacha20_block_xor_neon(state, dst, src);
|
||||
bytes -= CHACHA20_BLOCK_SIZE;
|
||||
src += CHACHA20_BLOCK_SIZE;
|
||||
dst += CHACHA20_BLOCK_SIZE;
|
||||
state[12]++;
|
||||
}
|
||||
if (bytes) {
|
||||
memcpy(buf, src, bytes);
|
||||
chacha20_block_xor_neon(state, buf, buf);
|
||||
memcpy(dst, buf, bytes);
|
||||
}
|
||||
}
|
||||
|
||||
static int chacha20_neon(struct skcipher_request *req)
|
||||
{
|
||||
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
|
||||
struct chacha20_ctx *ctx = crypto_skcipher_ctx(tfm);
|
||||
struct skcipher_walk walk;
|
||||
u32 state[16];
|
||||
int err;
|
||||
|
||||
if (req->cryptlen <= CHACHA20_BLOCK_SIZE)
|
||||
return crypto_chacha20_crypt(req);
|
||||
|
||||
err = skcipher_walk_virt(&walk, req, true);
|
||||
|
||||
crypto_chacha20_init(state, ctx, walk.iv);
|
||||
|
||||
kernel_neon_begin();
|
||||
while (walk.nbytes > 0) {
|
||||
unsigned int nbytes = walk.nbytes;
|
||||
|
||||
if (nbytes < walk.total)
|
||||
nbytes = round_down(nbytes, walk.stride);
|
||||
|
||||
chacha20_doneon(state, walk.dst.virt.addr, walk.src.virt.addr,
|
||||
nbytes);
|
||||
err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
|
||||
}
|
||||
kernel_neon_end();
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
static struct skcipher_alg alg = {
|
||||
.base.cra_name = "chacha20",
|
||||
.base.cra_driver_name = "chacha20-neon",
|
||||
.base.cra_priority = 300,
|
||||
.base.cra_blocksize = 1,
|
||||
.base.cra_ctxsize = sizeof(struct chacha20_ctx),
|
||||
.base.cra_module = THIS_MODULE,
|
||||
|
||||
.min_keysize = CHACHA20_KEY_SIZE,
|
||||
.max_keysize = CHACHA20_KEY_SIZE,
|
||||
.ivsize = CHACHA20_IV_SIZE,
|
||||
.chunksize = CHACHA20_BLOCK_SIZE,
|
||||
.walksize = 4 * CHACHA20_BLOCK_SIZE,
|
||||
.setkey = crypto_chacha20_setkey,
|
||||
.encrypt = chacha20_neon,
|
||||
.decrypt = chacha20_neon,
|
||||
};
|
||||
|
||||
static int __init chacha20_simd_mod_init(void)
|
||||
{
|
||||
if (!(elf_hwcap & HWCAP_ASIMD))
|
||||
return -ENODEV;
|
||||
|
||||
return crypto_register_skcipher(&alg);
|
||||
}
|
||||
|
||||
static void __exit chacha20_simd_mod_fini(void)
|
||||
{
|
||||
crypto_unregister_skcipher(&alg);
|
||||
}
|
||||
|
||||
module_init(chacha20_simd_mod_init);
|
||||
module_exit(chacha20_simd_mod_fini);
|
||||
|
||||
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
|
||||
MODULE_LICENSE("GPL v2");
|
||||
MODULE_ALIAS_CRYPTO("chacha20");
|
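Illustrative aside (not part of this commit): the file above registers "chacha20" as an ordinary synchronous skcipher, so an in-kernel caller picks up the NEON implementation automatically whenever its priority wins. The sketch below only shows the general calling pattern; the function name, buffer size and the all-zero key/IV are placeholders.

/* Illustrative sketch only -- not part of this commit. */
#include <crypto/chacha20.h>
#include <crypto/skcipher.h>
#include <linux/err.h>
#include <linux/scatterlist.h>
#include <linux/slab.h>

static int chacha20_usage_sketch(void)
{
        u8 key[CHACHA20_KEY_SIZE] = {};         /* placeholder key */
        u8 iv[CHACHA20_IV_SIZE] = {};           /* placeholder counter + nonce */
        struct crypto_skcipher *tfm;
        struct skcipher_request *req = NULL;
        struct scatterlist sg;
        u8 *buf = NULL;
        int err;

        /* Ask for a synchronous implementation; chacha20-neon qualifies. */
        tfm = crypto_alloc_skcipher("chacha20", 0, CRYPTO_ALG_ASYNC);
        if (IS_ERR(tfm))
                return PTR_ERR(tfm);

        buf = kzalloc(256, GFP_KERNEL); /* scatterlists want lowmem, not stack memory */
        if (!buf) {
                err = -ENOMEM;
                goto out;
        }

        err = crypto_skcipher_setkey(tfm, key, sizeof(key));
        if (err)
                goto out;

        req = skcipher_request_alloc(tfm, GFP_KERNEL);
        if (!req) {
                err = -ENOMEM;
                goto out;
        }

        sg_init_one(&sg, buf, 256);
        skcipher_request_set_callback(req, 0, NULL, NULL);
        skcipher_request_set_crypt(req, &sg, &sg, 256, iv);

        err = crypto_skcipher_encrypt(req);     /* in-place encryption of buf */
out:
        skcipher_request_free(req);
        kfree(buf);
        crypto_free_skcipher(tfm);
        return err;
}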
@ -1,290 +0,0 @@
|
||||
/*
|
||||
* crc32-arm64.c - CRC32 and CRC32C using optional ARMv8 instructions
|
||||
*
|
||||
* Module based on crypto/crc32c_generic.c
|
||||
*
|
||||
* CRC32 loop taken from Ed Nevill's Hadoop CRC patch
|
||||
* http://mail-archives.apache.org/mod_mbox/hadoop-common-dev/201406.mbox/%3C1403687030.3355.19.camel%40localhost.localdomain%3E
|
||||
*
|
||||
* Using inline assembly instead of intrinsics in order to be backwards
|
||||
* compatible with older compilers.
|
||||
*
|
||||
* Copyright (C) 2014 Linaro Ltd <yazen.ghannam@linaro.org>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as
|
||||
* published by the Free Software Foundation.
|
||||
*/
|
||||
|
||||
#include <linux/unaligned/access_ok.h>
|
||||
#include <linux/cpufeature.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/string.h>
|
||||
|
||||
#include <crypto/internal/hash.h>
|
||||
|
||||
MODULE_AUTHOR("Yazen Ghannam <yazen.ghannam@linaro.org>");
|
||||
MODULE_DESCRIPTION("CRC32 and CRC32C using optional ARMv8 instructions");
|
||||
MODULE_LICENSE("GPL v2");
|
||||
|
||||
#define CRC32X(crc, value) __asm__("crc32x %w[c], %w[c], %x[v]":[c]"+r"(crc):[v]"r"(value))
|
||||
#define CRC32W(crc, value) __asm__("crc32w %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
|
||||
#define CRC32H(crc, value) __asm__("crc32h %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
|
||||
#define CRC32B(crc, value) __asm__("crc32b %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
|
||||
#define CRC32CX(crc, value) __asm__("crc32cx %w[c], %w[c], %x[v]":[c]"+r"(crc):[v]"r"(value))
|
||||
#define CRC32CW(crc, value) __asm__("crc32cw %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
|
||||
#define CRC32CH(crc, value) __asm__("crc32ch %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
|
||||
#define CRC32CB(crc, value) __asm__("crc32cb %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
|
||||
|
||||
static u32 crc32_arm64_le_hw(u32 crc, const u8 *p, unsigned int len)
|
||||
{
|
||||
s64 length = len;
|
||||
|
||||
while ((length -= sizeof(u64)) >= 0) {
|
||||
CRC32X(crc, get_unaligned_le64(p));
|
||||
p += sizeof(u64);
|
||||
}
|
||||
|
||||
/* The following is more efficient than the straight loop */
|
||||
if (length & sizeof(u32)) {
|
||||
CRC32W(crc, get_unaligned_le32(p));
|
||||
p += sizeof(u32);
|
||||
}
|
||||
if (length & sizeof(u16)) {
|
||||
CRC32H(crc, get_unaligned_le16(p));
|
||||
p += sizeof(u16);
|
||||
}
|
||||
if (length & sizeof(u8))
|
||||
CRC32B(crc, *p);
|
||||
|
||||
return crc;
|
||||
}
|
||||
|
||||
static u32 crc32c_arm64_le_hw(u32 crc, const u8 *p, unsigned int len)
|
||||
{
|
||||
s64 length = len;
|
||||
|
||||
while ((length -= sizeof(u64)) >= 0) {
|
||||
CRC32CX(crc, get_unaligned_le64(p));
|
||||
p += sizeof(u64);
|
||||
}
|
||||
|
||||
/* The following is more efficient than the straight loop */
|
||||
if (length & sizeof(u32)) {
|
||||
CRC32CW(crc, get_unaligned_le32(p));
|
||||
p += sizeof(u32);
|
||||
}
|
||||
if (length & sizeof(u16)) {
|
||||
CRC32CH(crc, get_unaligned_le16(p));
|
||||
p += sizeof(u16);
|
||||
}
|
||||
if (length & sizeof(u8))
|
||||
CRC32CB(crc, *p);
|
||||
|
||||
return crc;
|
||||
}
|
||||
|
||||
#define CHKSUM_BLOCK_SIZE 1
|
||||
#define CHKSUM_DIGEST_SIZE 4
|
||||
|
||||
struct chksum_ctx {
|
||||
u32 key;
|
||||
};
|
||||
|
||||
struct chksum_desc_ctx {
|
||||
u32 crc;
|
||||
};
|
||||
|
||||
static int chksum_init(struct shash_desc *desc)
|
||||
{
|
||||
struct chksum_ctx *mctx = crypto_shash_ctx(desc->tfm);
|
||||
struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
|
||||
|
||||
ctx->crc = mctx->key;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Setting the seed allows arbitrary accumulators and flexible XOR policy
|
||||
* If your algorithm starts with ~0, then XOR with ~0 before you set
|
||||
* the seed.
|
||||
*/
|
||||
static int chksum_setkey(struct crypto_shash *tfm, const u8 *key,
|
||||
unsigned int keylen)
|
||||
{
|
||||
struct chksum_ctx *mctx = crypto_shash_ctx(tfm);
|
||||
|
||||
if (keylen != sizeof(mctx->key)) {
|
||||
crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
|
||||
return -EINVAL;
|
||||
}
|
||||
mctx->key = get_unaligned_le32(key);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int chksum_update(struct shash_desc *desc, const u8 *data,
|
||||
unsigned int length)
|
||||
{
|
||||
struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
|
||||
|
||||
ctx->crc = crc32_arm64_le_hw(ctx->crc, data, length);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int chksumc_update(struct shash_desc *desc, const u8 *data,
|
||||
unsigned int length)
|
||||
{
|
||||
struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
|
||||
|
||||
ctx->crc = crc32c_arm64_le_hw(ctx->crc, data, length);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int chksum_final(struct shash_desc *desc, u8 *out)
|
||||
{
|
||||
struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
|
||||
|
||||
put_unaligned_le32(ctx->crc, out);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int chksumc_final(struct shash_desc *desc, u8 *out)
|
||||
{
|
||||
struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
|
||||
|
||||
put_unaligned_le32(~ctx->crc, out);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int __chksum_finup(u32 crc, const u8 *data, unsigned int len, u8 *out)
|
||||
{
|
||||
put_unaligned_le32(crc32_arm64_le_hw(crc, data, len), out);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int __chksumc_finup(u32 crc, const u8 *data, unsigned int len, u8 *out)
|
||||
{
|
||||
put_unaligned_le32(~crc32c_arm64_le_hw(crc, data, len), out);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int chksum_finup(struct shash_desc *desc, const u8 *data,
|
||||
unsigned int len, u8 *out)
|
||||
{
|
||||
struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
|
||||
|
||||
return __chksum_finup(ctx->crc, data, len, out);
|
||||
}
|
||||
|
||||
static int chksumc_finup(struct shash_desc *desc, const u8 *data,
|
||||
unsigned int len, u8 *out)
|
||||
{
|
||||
struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
|
||||
|
||||
return __chksumc_finup(ctx->crc, data, len, out);
|
||||
}
|
||||
|
||||
static int chksum_digest(struct shash_desc *desc, const u8 *data,
|
||||
unsigned int length, u8 *out)
|
||||
{
|
||||
struct chksum_ctx *mctx = crypto_shash_ctx(desc->tfm);
|
||||
|
||||
return __chksum_finup(mctx->key, data, length, out);
|
||||
}
|
||||
|
||||
static int chksumc_digest(struct shash_desc *desc, const u8 *data,
|
||||
unsigned int length, u8 *out)
|
||||
{
|
||||
struct chksum_ctx *mctx = crypto_shash_ctx(desc->tfm);
|
||||
|
||||
return __chksumc_finup(mctx->key, data, length, out);
|
||||
}
|
||||
|
||||
static int crc32_cra_init(struct crypto_tfm *tfm)
|
||||
{
|
||||
struct chksum_ctx *mctx = crypto_tfm_ctx(tfm);
|
||||
|
||||
mctx->key = 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int crc32c_cra_init(struct crypto_tfm *tfm)
|
||||
{
|
||||
struct chksum_ctx *mctx = crypto_tfm_ctx(tfm);
|
||||
|
||||
mctx->key = ~0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct shash_alg crc32_alg = {
|
||||
.digestsize = CHKSUM_DIGEST_SIZE,
|
||||
.setkey = chksum_setkey,
|
||||
.init = chksum_init,
|
||||
.update = chksum_update,
|
||||
.final = chksum_final,
|
||||
.finup = chksum_finup,
|
||||
.digest = chksum_digest,
|
||||
.descsize = sizeof(struct chksum_desc_ctx),
|
||||
.base = {
|
||||
.cra_name = "crc32",
|
||||
.cra_driver_name = "crc32-arm64-hw",
|
||||
.cra_priority = 300,
|
||||
.cra_blocksize = CHKSUM_BLOCK_SIZE,
|
||||
.cra_alignmask = 0,
|
||||
.cra_ctxsize = sizeof(struct chksum_ctx),
|
||||
.cra_module = THIS_MODULE,
|
||||
.cra_init = crc32_cra_init,
|
||||
}
|
||||
};
|
||||
|
||||
static struct shash_alg crc32c_alg = {
|
||||
.digestsize = CHKSUM_DIGEST_SIZE,
|
||||
.setkey = chksum_setkey,
|
||||
.init = chksum_init,
|
||||
.update = chksumc_update,
|
||||
.final = chksumc_final,
|
||||
.finup = chksumc_finup,
|
||||
.digest = chksumc_digest,
|
||||
.descsize = sizeof(struct chksum_desc_ctx),
|
||||
.base = {
|
||||
.cra_name = "crc32c",
|
||||
.cra_driver_name = "crc32c-arm64-hw",
|
||||
.cra_priority = 300,
|
||||
.cra_blocksize = CHKSUM_BLOCK_SIZE,
|
||||
.cra_alignmask = 0,
|
||||
.cra_ctxsize = sizeof(struct chksum_ctx),
|
||||
.cra_module = THIS_MODULE,
|
||||
.cra_init = crc32c_cra_init,
|
||||
}
|
||||
};
|
||||
|
||||
static int __init crc32_mod_init(void)
|
||||
{
|
||||
int err;
|
||||
|
||||
err = crypto_register_shash(&crc32_alg);
|
||||
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
err = crypto_register_shash(&crc32c_alg);
|
||||
|
||||
if (err) {
|
||||
crypto_unregister_shash(&crc32_alg);
|
||||
return err;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void __exit crc32_mod_exit(void)
|
||||
{
|
||||
crypto_unregister_shash(&crc32_alg);
|
||||
crypto_unregister_shash(&crc32c_alg);
|
||||
}
|
||||
|
||||
module_cpu_feature_match(CRC32, crc32_mod_init);
|
||||
module_exit(crc32_mod_exit);
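Illustrative aside (not part of this commit): the driver removed above, like the crc32/crc32c shash implementations that replace it, takes the CRC seed through setkey() as described in the comment preceding chksum_setkey(). A minimal caller-side sketch looks roughly like this; the function name and the choice of "crc32c" are only examples.

/* Illustrative sketch only -- not part of this commit. */
#include <crypto/hash.h>
#include <linux/err.h>

static int crc32c_with_seed(const u8 *data, unsigned int len, u32 seed,
                            u8 digest[4])
{
        struct crypto_shash *tfm;
        __le32 key = cpu_to_le32(seed); /* setkey() expects a 4-byte LE seed */
        int err;

        tfm = crypto_alloc_shash("crc32c", 0, 0);
        if (IS_ERR(tfm))
                return PTR_ERR(tfm);

        err = crypto_shash_setkey(tfm, (const u8 *)&key, sizeof(key));
        if (!err) {
                SHASH_DESC_ON_STACK(desc, tfm);

                desc->tfm = tfm;
                desc->flags = 0;
                err = crypto_shash_digest(desc, data, len, digest);
        }

        crypto_free_shash(tfm);
        return err;
}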
|
@@ -72,6 +72,24 @@ static int crc32_pmull_init(struct shash_desc *desc)
return 0;
}

static int crc32_update(struct shash_desc *desc, const u8 *data,
unsigned int length)
{
u32 *crc = shash_desc_ctx(desc);

*crc = crc32_armv8_le(*crc, data, length);
return 0;
}

static int crc32c_update(struct shash_desc *desc, const u8 *data,
unsigned int length)
{
u32 *crc = shash_desc_ctx(desc);

*crc = crc32c_armv8_le(*crc, data, length);
return 0;
}

static int crc32_pmull_update(struct shash_desc *desc, const u8 *data,
unsigned int length)
{
@@ -156,7 +174,7 @@ static int crc32c_pmull_final(struct shash_desc *desc, u8 *out)
static struct shash_alg crc32_pmull_algs[] = { {
.setkey = crc32_pmull_setkey,
.init = crc32_pmull_init,
.update = crc32_pmull_update,
.update = crc32_update,
.final = crc32_pmull_final,
.descsize = sizeof(u32),
.digestsize = sizeof(u32),
@@ -171,7 +189,7 @@ static struct shash_alg crc32_pmull_algs[] = { {
}, {
.setkey = crc32_pmull_setkey,
.init = crc32_pmull_init,
.update = crc32c_pmull_update,
.update = crc32c_update,
.final = crc32c_pmull_final,
.descsize = sizeof(u32),
.digestsize = sizeof(u32),
@@ -187,14 +205,20 @@ static struct shash_alg crc32_pmull_algs[] = { {

static int __init crc32_pmull_mod_init(void)
{
if (elf_hwcap & HWCAP_CRC32) {
fallback_crc32 = crc32_armv8_le;
fallback_crc32c = crc32c_armv8_le;
} else {
fallback_crc32 = crc32_le;
fallback_crc32c = __crc32c_le;
}
if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_PMULL)) {
crc32_pmull_algs[0].update = crc32_pmull_update;
crc32_pmull_algs[1].update = crc32c_pmull_update;

if (elf_hwcap & HWCAP_CRC32) {
fallback_crc32 = crc32_armv8_le;
fallback_crc32c = crc32c_armv8_le;
} else {
fallback_crc32 = crc32_le;
fallback_crc32c = __crc32c_le;
}
} else if (!(elf_hwcap & HWCAP_CRC32)) {
return -ENODEV;
}
return crypto_register_shashes(crc32_pmull_algs,
ARRAY_SIZE(crc32_pmull_algs));
}
@@ -205,7 +229,12 @@ static void __exit crc32_pmull_mod_exit(void)
ARRAY_SIZE(crc32_pmull_algs));
}

module_cpu_feature_match(PMULL, crc32_pmull_mod_init);
static const struct cpu_feature crc32_cpu_feature[] = {
{ cpu_feature(CRC32) }, { cpu_feature(PMULL) }, { }
};
MODULE_DEVICE_TABLE(cpu, crc32_cpu_feature);

module_init(crc32_pmull_mod_init);
module_exit(crc32_pmull_mod_exit);

MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
|
||||
|
@ -46,27 +46,48 @@
|
||||
|
||||
#ifdef __x86_64__
|
||||
|
||||
.data
|
||||
# constants in mergeable sections, linker can reorder and merge
|
||||
.section .rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
|
||||
.align 16
|
||||
.Lgf128mul_x_ble_mask:
|
||||
.octa 0x00000000000000010000000000000087
|
||||
.section .rodata.cst16.POLY, "aM", @progbits, 16
|
||||
.align 16
|
||||
POLY: .octa 0xC2000000000000000000000000000001
|
||||
.section .rodata.cst16.TWOONE, "aM", @progbits, 16
|
||||
.align 16
|
||||
TWOONE: .octa 0x00000001000000000000000000000001
|
||||
|
||||
.section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
|
||||
.align 16
|
||||
SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
|
||||
.section .rodata.cst16.MASK1, "aM", @progbits, 16
|
||||
.align 16
|
||||
MASK1: .octa 0x0000000000000000ffffffffffffffff
|
||||
.section .rodata.cst16.MASK2, "aM", @progbits, 16
|
||||
.align 16
|
||||
MASK2: .octa 0xffffffffffffffff0000000000000000
|
||||
.section .rodata.cst16.ONE, "aM", @progbits, 16
|
||||
.align 16
|
||||
ONE: .octa 0x00000000000000000000000000000001
|
||||
.section .rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
|
||||
.align 16
|
||||
F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
|
||||
.section .rodata.cst16.dec, "aM", @progbits, 16
|
||||
.align 16
|
||||
dec: .octa 0x1
|
||||
.section .rodata.cst16.enc, "aM", @progbits, 16
|
||||
.align 16
|
||||
enc: .octa 0x2
|
||||
|
||||
# order of these constants should not change.
|
||||
# more specifically, ALL_F should follow SHIFT_MASK,
|
||||
# and ZERO should follow ALL_F
|
||||
|
||||
SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
|
||||
MASK1: .octa 0x0000000000000000ffffffffffffffff
|
||||
MASK2: .octa 0xffffffffffffffff0000000000000000
|
||||
# and zero should follow ALL_F
|
||||
.section .rodata, "a", @progbits
|
||||
.align 16
|
||||
SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
|
||||
ALL_F: .octa 0xffffffffffffffffffffffffffffffff
|
||||
ZERO: .octa 0x00000000000000000000000000000000
|
||||
ONE: .octa 0x00000000000000000000000000000001
|
||||
F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
|
||||
dec: .octa 0x1
|
||||
enc: .octa 0x2
|
||||
.octa 0x00000000000000000000000000000000
|
||||
|
||||
|
||||
.text
|
||||
|
@ -122,22 +122,38 @@
|
||||
#include <linux/linkage.h>
|
||||
#include <asm/inst.h>
|
||||
|
||||
.data
|
||||
# constants in mergeable sections, linker can reorder and merge
|
||||
.section .rodata.cst16.POLY, "aM", @progbits, 16
|
||||
.align 16
|
||||
|
||||
POLY: .octa 0xC2000000000000000000000000000001
|
||||
|
||||
.section .rodata.cst16.POLY2, "aM", @progbits, 16
|
||||
.align 16
|
||||
POLY2: .octa 0xC20000000000000000000001C2000000
|
||||
|
||||
.section .rodata.cst16.TWOONE, "aM", @progbits, 16
|
||||
.align 16
|
||||
TWOONE: .octa 0x00000001000000000000000000000001
|
||||
|
||||
# order of these constants should not change.
|
||||
# more specifically, ALL_F should follow SHIFT_MASK, and ZERO should follow ALL_F
|
||||
|
||||
.section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
|
||||
.align 16
|
||||
SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
|
||||
|
||||
.section .rodata.cst16.ONE, "aM", @progbits, 16
|
||||
.align 16
|
||||
ONE: .octa 0x00000000000000000000000000000001
|
||||
|
||||
.section .rodata.cst16.ONEf, "aM", @progbits, 16
|
||||
.align 16
|
||||
ONEf: .octa 0x01000000000000000000000000000000
|
||||
|
||||
# order of these constants should not change.
|
||||
# more specifically, ALL_F should follow SHIFT_MASK, and zero should follow ALL_F
|
||||
.section .rodata, "a", @progbits
|
||||
.align 16
|
||||
SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
|
||||
ALL_F: .octa 0xffffffffffffffffffffffffffffffff
|
||||
ZERO: .octa 0x00000000000000000000000000000000
|
||||
ONE: .octa 0x00000000000000000000000000000001
|
||||
ONEf: .octa 0x01000000000000000000000000000000
|
||||
.octa 0x00000000000000000000000000000000
|
||||
|
||||
.text
|
||||
|
||||
|
@ -740,9 +740,11 @@ static int helper_rfc4106_encrypt(struct aead_request *req)
|
||||
*((__be32 *)(iv+12)) = counter;
|
||||
|
||||
if (sg_is_last(req->src) &&
|
||||
req->src->offset + req->src->length <= PAGE_SIZE &&
|
||||
(!PageHighMem(sg_page(req->src)) ||
|
||||
req->src->offset + req->src->length <= PAGE_SIZE) &&
|
||||
sg_is_last(req->dst) &&
|
||||
req->dst->offset + req->dst->length <= PAGE_SIZE) {
|
||||
(!PageHighMem(sg_page(req->dst)) ||
|
||||
req->dst->offset + req->dst->length <= PAGE_SIZE)) {
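/*
 * Editorial note, not part of the patch: the fast path below maps the single
 * src/dst scatterlist entry directly, so a highmem page only qualifies when
 * the whole buffer stays inside one page, while lowmem pages are always
 * reachable through the linear mapping -- hence the PageHighMem() checks
 * added to the condition above.
 */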
|
||||
one_entry_in_sg = 1;
|
||||
scatterwalk_start(&src_sg_walk, req->src);
|
||||
assoc = scatterwalk_map(&src_sg_walk);
|
||||
@ -822,9 +824,11 @@ static int helper_rfc4106_decrypt(struct aead_request *req)
|
||||
*((__be32 *)(iv+12)) = counter;
|
||||
|
||||
if (sg_is_last(req->src) &&
|
||||
req->src->offset + req->src->length <= PAGE_SIZE &&
|
||||
(!PageHighMem(sg_page(req->src)) ||
|
||||
req->src->offset + req->src->length <= PAGE_SIZE) &&
|
||||
sg_is_last(req->dst) &&
|
||||
req->dst->offset + req->dst->length <= PAGE_SIZE) {
|
||||
(!PageHighMem(sg_page(req->dst)) ||
|
||||
req->dst->offset + req->dst->length <= PAGE_SIZE)) {
|
||||
one_entry_in_sg = 1;
|
||||
scatterwalk_start(&src_sg_walk, req->src);
|
||||
assoc = scatterwalk_map(&src_sg_walk);
|
||||
|
@ -571,7 +571,9 @@ ENDPROC(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
|
||||
vmovdqu y6, 14 * 16(rio); \
|
||||
vmovdqu y7, 15 * 16(rio);
|
||||
|
||||
.data
|
||||
|
||||
/* NB: section is mergeable, all elements must be aligned 16-byte blocks */
|
||||
.section .rodata.cst16, "aM", @progbits, 16
|
||||
.align 16
|
||||
|
||||
#define SHUFB_BYTES(idx) \
|
||||
@ -711,6 +713,7 @@ ENDPROC(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
|
||||
.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
|
||||
|
||||
/* 4-bit mask */
|
||||
.section .rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
|
||||
.align 4
|
||||
.L0f0f0f0f:
|
||||
.long 0x0f0f0f0f
|
||||
|
@ -610,20 +610,25 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
|
||||
vmovdqu y6, 14 * 32(rio); \
|
||||
vmovdqu y7, 15 * 32(rio);
|
||||
|
||||
.data
|
||||
.align 32
|
||||
|
||||
.section .rodata.cst32.shufb_16x16b, "aM", @progbits, 32
|
||||
.align 32
|
||||
#define SHUFB_BYTES(idx) \
|
||||
0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
|
||||
|
||||
.Lshufb_16x16b:
|
||||
.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
|
||||
.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
|
||||
|
||||
.section .rodata.cst32.pack_bswap, "aM", @progbits, 32
|
||||
.align 32
|
||||
.Lpack_bswap:
|
||||
.long 0x00010203, 0x04050607, 0x80808080, 0x80808080
|
||||
.long 0x00010203, 0x04050607, 0x80808080, 0x80808080
|
||||
|
||||
/* NB: section is mergeable, all elements must be aligned 16-byte blocks */
|
||||
.section .rodata.cst16, "aM", @progbits, 16
|
||||
.align 16
|
||||
|
||||
/* For CTR-mode IV byteswap */
|
||||
.Lbswap128_mask:
|
||||
.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
|
||||
@ -750,6 +755,7 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
|
||||
.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
|
||||
.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
|
||||
|
||||
.section .rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
|
||||
.align 4
|
||||
/* 4-bit mask */
|
||||
.L0f0f0f0f:
|
||||
|
@ -195,19 +195,29 @@
|
||||
vpshufb rmask, x0, x0; \
|
||||
vpshufb rmask, x1, x1;
|
||||
|
||||
.data
|
||||
|
||||
.section .rodata.cst16.bswap_mask, "aM", @progbits, 16
|
||||
.align 16
|
||||
.Lbswap_mask:
|
||||
.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
|
||||
.section .rodata.cst16.bswap128_mask, "aM", @progbits, 16
|
||||
.align 16
|
||||
.Lbswap128_mask:
|
||||
.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
|
||||
.section .rodata.cst16.bswap_iv_mask, "aM", @progbits, 16
|
||||
.align 16
|
||||
.Lbswap_iv_mask:
|
||||
.byte 7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0
|
||||
|
||||
.section .rodata.cst4.16_mask, "aM", @progbits, 4
|
||||
.align 4
|
||||
.L16_mask:
|
||||
.byte 16, 16, 16, 16
|
||||
.section .rodata.cst4.32_mask, "aM", @progbits, 4
|
||||
.align 4
|
||||
.L32_mask:
|
||||
.byte 32, 0, 0, 0
|
||||
.section .rodata.cst4.first_mask, "aM", @progbits, 4
|
||||
.align 4
|
||||
.Lfirst_mask:
|
||||
.byte 0x1f, 0, 0, 0
|
||||
|
||||
|
@ -225,8 +225,7 @@
|
||||
vpshufb rmask, x2, x2; \
|
||||
vpshufb rmask, x3, x3;
|
||||
|
||||
.data
|
||||
|
||||
.section .rodata.cst16, "aM", @progbits, 16
|
||||
.align 16
|
||||
.Lxts_gf128mul_and_shl1_mask:
|
||||
.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
|
||||
@ -244,10 +243,19 @@
|
||||
.byte 12, 13, 14, 15, 8, 9, 10, 11, 7, 6, 5, 4, 3, 2, 1, 0
|
||||
.Lrkr_dec_QBAR_QBAR_QBAR_QBAR:
|
||||
.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
|
||||
|
||||
.section .rodata.cst4.L16_mask, "aM", @progbits, 4
|
||||
.align 4
|
||||
.L16_mask:
|
||||
.byte 16, 16, 16, 16
|
||||
|
||||
.section .rodata.cst4.L32_mask, "aM", @progbits, 4
|
||||
.align 4
|
||||
.L32_mask:
|
||||
.byte 32, 0, 0, 0
|
||||
|
||||
.section .rodata.cst4.first_mask, "aM", @progbits, 4
|
||||
.align 4
|
||||
.Lfirst_mask:
|
||||
.byte 0x1f, 0, 0, 0
|
||||
|
||||
|
@ -11,13 +11,18 @@
|
||||
|
||||
#include <linux/linkage.h>
|
||||
|
||||
.data
|
||||
.section .rodata.cst32.ROT8, "aM", @progbits, 32
|
||||
.align 32
|
||||
|
||||
ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003
|
||||
.octa 0x0e0d0c0f0a09080b0605040702010003
|
||||
|
||||
.section .rodata.cst32.ROT16, "aM", @progbits, 32
|
||||
.align 32
|
||||
ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302
|
||||
.octa 0x0d0c0f0e09080b0a0504070601000302
|
||||
|
||||
.section .rodata.cst32.CTRINC, "aM", @progbits, 32
|
||||
.align 32
|
||||
CTRINC: .octa 0x00000003000000020000000100000000
|
||||
.octa 0x00000007000000060000000500000004
|
||||
|
||||
|
@ -11,11 +11,14 @@
|
||||
|
||||
#include <linux/linkage.h>
|
||||
|
||||
.data
|
||||
.section .rodata.cst16.ROT8, "aM", @progbits, 16
|
||||
.align 16
|
||||
|
||||
ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003
|
||||
.section .rodata.cst16.ROT16, "aM", @progbits, 16
|
||||
.align 16
|
||||
ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302
|
||||
.section .rodata.cst16.CTRINC, "aM", @progbits, 16
|
||||
.align 16
|
||||
CTRINC: .octa 0x00000003000000020000000100000000
|
||||
|
||||
.text
|
||||
|
@ -11,7 +11,7 @@
|
||||
|
||||
#include <crypto/algapi.h>
|
||||
#include <crypto/chacha20.h>
|
||||
#include <linux/crypto.h>
|
||||
#include <crypto/internal/skcipher.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/module.h>
|
||||
#include <asm/fpu/api.h>
|
||||
@ -63,36 +63,37 @@ static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src,
|
||||
}
|
||||
}
|
||||
|
||||
static int chacha20_simd(struct blkcipher_desc *desc, struct scatterlist *dst,
|
||||
struct scatterlist *src, unsigned int nbytes)
|
||||
static int chacha20_simd(struct skcipher_request *req)
|
||||
{
|
||||
u32 *state, state_buf[16 + (CHACHA20_STATE_ALIGN / sizeof(u32)) - 1];
|
||||
struct blkcipher_walk walk;
|
||||
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
|
||||
struct chacha20_ctx *ctx = crypto_skcipher_ctx(tfm);
|
||||
u32 *state, state_buf[16 + 2] __aligned(8);
|
||||
struct skcipher_walk walk;
|
||||
int err;
|
||||
|
||||
if (nbytes <= CHACHA20_BLOCK_SIZE || !may_use_simd())
|
||||
return crypto_chacha20_crypt(desc, dst, src, nbytes);
|
||||
BUILD_BUG_ON(CHACHA20_STATE_ALIGN != 16);
|
||||
state = PTR_ALIGN(state_buf + 0, CHACHA20_STATE_ALIGN);
|
||||
|
||||
state = (u32 *)roundup((uintptr_t)state_buf, CHACHA20_STATE_ALIGN);
|
||||
if (req->cryptlen <= CHACHA20_BLOCK_SIZE || !may_use_simd())
|
||||
return crypto_chacha20_crypt(req);
|
||||
|
||||
blkcipher_walk_init(&walk, dst, src, nbytes);
|
||||
err = blkcipher_walk_virt_block(desc, &walk, CHACHA20_BLOCK_SIZE);
|
||||
err = skcipher_walk_virt(&walk, req, true);
|
||||
|
||||
crypto_chacha20_init(state, crypto_blkcipher_ctx(desc->tfm), walk.iv);
|
||||
crypto_chacha20_init(state, ctx, walk.iv);
|
||||
|
||||
kernel_fpu_begin();
|
||||
|
||||
while (walk.nbytes >= CHACHA20_BLOCK_SIZE) {
|
||||
chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
|
||||
rounddown(walk.nbytes, CHACHA20_BLOCK_SIZE));
|
||||
err = blkcipher_walk_done(desc, &walk,
|
||||
walk.nbytes % CHACHA20_BLOCK_SIZE);
|
||||
err = skcipher_walk_done(&walk,
|
||||
walk.nbytes % CHACHA20_BLOCK_SIZE);
|
||||
}
|
||||
|
||||
if (walk.nbytes) {
|
||||
chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
|
||||
walk.nbytes);
|
||||
err = blkcipher_walk_done(desc, &walk, 0);
|
||||
err = skcipher_walk_done(&walk, 0);
|
||||
}
|
||||
|
||||
kernel_fpu_end();
|
||||
@ -100,27 +101,22 @@ static int chacha20_simd(struct blkcipher_desc *desc, struct scatterlist *dst,
|
||||
return err;
|
||||
}
|
||||
|
||||
static struct crypto_alg alg = {
|
||||
.cra_name = "chacha20",
|
||||
.cra_driver_name = "chacha20-simd",
|
||||
.cra_priority = 300,
|
||||
.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
|
||||
.cra_blocksize = 1,
|
||||
.cra_type = &crypto_blkcipher_type,
|
||||
.cra_ctxsize = sizeof(struct chacha20_ctx),
|
||||
.cra_alignmask = sizeof(u32) - 1,
|
||||
.cra_module = THIS_MODULE,
|
||||
.cra_u = {
|
||||
.blkcipher = {
|
||||
.min_keysize = CHACHA20_KEY_SIZE,
|
||||
.max_keysize = CHACHA20_KEY_SIZE,
|
||||
.ivsize = CHACHA20_IV_SIZE,
|
||||
.geniv = "seqiv",
|
||||
.setkey = crypto_chacha20_setkey,
|
||||
.encrypt = chacha20_simd,
|
||||
.decrypt = chacha20_simd,
|
||||
},
|
||||
},
|
||||
static struct skcipher_alg alg = {
|
||||
.base.cra_name = "chacha20",
|
||||
.base.cra_driver_name = "chacha20-simd",
|
||||
.base.cra_priority = 300,
|
||||
.base.cra_blocksize = 1,
|
||||
.base.cra_ctxsize = sizeof(struct chacha20_ctx),
|
||||
.base.cra_alignmask = sizeof(u32) - 1,
|
||||
.base.cra_module = THIS_MODULE,
|
||||
|
||||
.min_keysize = CHACHA20_KEY_SIZE,
|
||||
.max_keysize = CHACHA20_KEY_SIZE,
|
||||
.ivsize = CHACHA20_IV_SIZE,
|
||||
.chunksize = CHACHA20_BLOCK_SIZE,
|
||||
.setkey = crypto_chacha20_setkey,
|
||||
.encrypt = chacha20_simd,
|
||||
.decrypt = chacha20_simd,
|
||||
};
|
||||
|
||||
static int __init chacha20_simd_mod_init(void)
|
||||
@ -133,12 +129,12 @@ static int __init chacha20_simd_mod_init(void)
|
||||
boot_cpu_has(X86_FEATURE_AVX2) &&
|
||||
cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
|
||||
#endif
|
||||
return crypto_register_alg(&alg);
|
||||
return crypto_register_skcipher(&alg);
|
||||
}
|
||||
|
||||
static void __exit chacha20_simd_mod_fini(void)
|
||||
{
|
||||
crypto_unregister_alg(&alg);
|
||||
crypto_unregister_skcipher(&alg);
|
||||
}
|
||||
|
||||
module_init(chacha20_simd_mod_init);
|
||||
|
@ -312,7 +312,7 @@ do_return:
|
||||
ret
|
||||
ENDPROC(crc_pcl)
|
||||
|
||||
.section .rodata, "a", %progbits
|
||||
.section .rodata, "a", @progbits
|
||||
################################################################
|
||||
## jump table Table is 129 entries x 2 bytes each
|
||||
################################################################
|
||||
|
@ -554,12 +554,11 @@ _only_less_than_2:
|
||||
|
||||
ENDPROC(crc_t10dif_pcl)
|
||||
|
||||
.data
|
||||
|
||||
.section .rodata, "a", @progbits
|
||||
.align 16
|
||||
# precomputed constants
|
||||
# these constants are precomputed from the poly:
|
||||
# 0x8bb70000 (0x8bb7 scaled to 32 bits)
|
||||
.align 16
|
||||
# Q = 0x18BB70000
|
||||
# rk1 = 2^(32*3) mod Q << 32
|
||||
# rk2 = 2^(32*5) mod Q << 32
|
||||
@ -613,14 +612,23 @@ rk20:
|
||||
|
||||
|
||||
|
||||
.section .rodata.cst16.mask1, "aM", @progbits, 16
|
||||
.align 16
|
||||
mask1:
|
||||
.octa 0x80808080808080808080808080808080
|
||||
|
||||
.section .rodata.cst16.mask2, "aM", @progbits, 16
|
||||
.align 16
|
||||
mask2:
|
||||
.octa 0x00000000FFFFFFFFFFFFFFFFFFFFFFFF
|
||||
|
||||
.section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
|
||||
.align 16
|
||||
SHUF_MASK:
|
||||
.octa 0x000102030405060708090A0B0C0D0E0F
|
||||
|
||||
.section .rodata.cst32.pshufb_shf_table, "aM", @progbits, 32
|
||||
.align 32
|
||||
pshufb_shf_table:
|
||||
# use these values for shift constants for the pshufb instruction
|
||||
# different alignments result in values as shown:
|
||||
|
@ -537,7 +537,7 @@ ENTRY(des3_ede_x86_64_crypt_blk_3way)
|
||||
ret;
|
||||
ENDPROC(des3_ede_x86_64_crypt_blk_3way)
|
||||
|
||||
.data
|
||||
.section .rodata, "a", @progbits
|
||||
.align 16
|
||||
.L_s1:
|
||||
.quad 0x0010100001010400, 0x0000000000000000
|
||||
|
@ -20,8 +20,7 @@
|
||||
#include <asm/inst.h>
|
||||
#include <asm/frame.h>
|
||||
|
||||
.data
|
||||
|
||||
.section .rodata.cst16.bswap_mask, "aM", @progbits, 16
|
||||
.align 16
|
||||
.Lbswap_mask:
|
||||
.octa 0x000102030405060708090a0b0c0d0e0f
|
||||
|
@ -11,11 +11,13 @@
|
||||
|
||||
#include <linux/linkage.h>
|
||||
|
||||
.data
|
||||
.section .rodata.cst32.ANMASK, "aM", @progbits, 32
|
||||
.align 32
|
||||
|
||||
ANMASK: .octa 0x0000000003ffffff0000000003ffffff
|
||||
.octa 0x0000000003ffffff0000000003ffffff
|
||||
|
||||
.section .rodata.cst32.ORMASK, "aM", @progbits, 32
|
||||
.align 32
|
||||
ORMASK: .octa 0x00000000010000000000000001000000
|
||||
.octa 0x00000000010000000000000001000000
|
||||
|
||||
|
@ -11,10 +11,12 @@
|
||||
|
||||
#include <linux/linkage.h>
|
||||
|
||||
.data
|
||||
.section .rodata.cst16.ANMASK, "aM", @progbits, 16
|
||||
.align 16
|
||||
|
||||
ANMASK: .octa 0x0000000003ffffff0000000003ffffff
|
||||
|
||||
.section .rodata.cst16.ORMASK, "aM", @progbits, 16
|
||||
.align 16
|
||||
ORMASK: .octa 0x00000000010000000000000001000000
|
||||
|
||||
.text
|
||||
|
@ -29,11 +29,12 @@
|
||||
|
||||
.file "serpent-avx-x86_64-asm_64.S"
|
||||
|
||||
.data
|
||||
.section .rodata.cst16.bswap128_mask, "aM", @progbits, 16
|
||||
.align 16
|
||||
|
||||
.Lbswap128_mask:
|
||||
.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
|
||||
.section .rodata.cst16.xts_gf128mul_and_shl1_mask, "aM", @progbits, 16
|
||||
.align 16
|
||||
.Lxts_gf128mul_and_shl1_mask:
|
||||
.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
|
||||
|
||||
|
@ -20,13 +20,18 @@
|
||||
|
||||
.file "serpent-avx2-asm_64.S"
|
||||
|
||||
.data
|
||||
.section .rodata.cst16.bswap128_mask, "aM", @progbits, 16
|
||||
.align 16
|
||||
|
||||
.Lbswap128_mask:
|
||||
.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
|
||||
|
||||
.section .rodata.cst16.xts_gf128mul_and_shl1_mask_0, "aM", @progbits, 16
|
||||
.align 16
|
||||
.Lxts_gf128mul_and_shl1_mask_0:
|
||||
.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
|
||||
|
||||
.section .rodata.cst16.xts_gf128mul_and_shl1_mask_1, "aM", @progbits, 16
|
||||
.align 16
|
||||
.Lxts_gf128mul_and_shl1_mask_1:
|
||||
.byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0
|
||||
|
||||
|
@ -281,11 +281,13 @@ ENTRY(sha1_mb_mgr_get_comp_job_avx2)
|
||||
ret
|
||||
ENDPROC(sha1_mb_mgr_get_comp_job_avx2)
|
||||
|
||||
.data
|
||||
|
||||
.section .rodata.cst16.clear_low_nibble, "aM", @progbits, 16
|
||||
.align 16
|
||||
clear_low_nibble:
|
||||
.octa 0x000000000000000000000000FFFFFFF0
|
||||
|
||||
.section .rodata.cst8, "aM", @progbits, 8
|
||||
.align 8
|
||||
one:
|
||||
.quad 1
|
||||
two:
|
||||
|
@ -203,8 +203,7 @@ return_null:
|
||||
|
||||
ENDPROC(sha1_mb_mgr_submit_avx2)
|
||||
|
||||
.data
|
||||
|
||||
.section .rodata.cst16.clear_low_nibble, "aM", @progbits, 16
|
||||
.align 16
|
||||
clear_low_nibble:
|
||||
.octa 0x000000000000000000000000FFFFFFF0
|
||||
|
@ -461,21 +461,32 @@ lloop:
|
||||
ENDPROC(sha1_x8_avx2)
|
||||
|
||||
|
||||
.data
|
||||
|
||||
.section .rodata.cst32.K00_19, "aM", @progbits, 32
|
||||
.align 32
|
||||
K00_19:
|
||||
.octa 0x5A8279995A8279995A8279995A827999
|
||||
.octa 0x5A8279995A8279995A8279995A827999
|
||||
|
||||
.section .rodata.cst32.K20_39, "aM", @progbits, 32
|
||||
.align 32
|
||||
K20_39:
|
||||
.octa 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1
|
||||
.octa 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1
|
||||
|
||||
.section .rodata.cst32.K40_59, "aM", @progbits, 32
|
||||
.align 32
|
||||
K40_59:
|
||||
.octa 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC
|
||||
.octa 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC
|
||||
|
||||
.section .rodata.cst32.K60_79, "aM", @progbits, 32
|
||||
.align 32
|
||||
K60_79:
|
||||
.octa 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6
|
||||
.octa 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6
|
||||
|
||||
.section .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
|
||||
.align 32
|
||||
PSHUFFLE_BYTE_FLIP_MASK:
|
||||
.octa 0x0c0d0e0f08090a0b0405060700010203
|
||||
.octa 0x0c0d0e0f08090a0b0405060700010203
|
||||
|
@ -293,10 +293,12 @@ ENTRY(sha1_ni_transform)
|
||||
ret
|
||||
ENDPROC(sha1_ni_transform)
|
||||
|
||||
.data
|
||||
|
||||
.align 64
|
||||
.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
|
||||
.align 16
|
||||
PSHUFFLE_BYTE_FLIP_MASK:
|
||||
.octa 0x000102030405060708090a0b0c0d0e0f
|
||||
|
||||
.section .rodata.cst16.UPPER_WORD_MASK, "aM", @progbits, 16
|
||||
.align 16
|
||||
UPPER_WORD_MASK:
|
||||
.octa 0xFFFFFFFF000000000000000000000000
|
||||
|
@ -463,7 +463,7 @@ done_hash:
|
||||
ret
|
||||
ENDPROC(sha256_transform_avx)
|
||||
|
||||
.data
|
||||
.section .rodata.cst256.K256, "aM", @progbits, 256
|
||||
.align 64
|
||||
K256:
|
||||
.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
|
||||
@ -483,14 +483,21 @@ K256:
|
||||
.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
|
||||
.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
|
||||
|
||||
.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
|
||||
.align 16
|
||||
PSHUFFLE_BYTE_FLIP_MASK:
|
||||
.octa 0x0c0d0e0f08090a0b0405060700010203
|
||||
|
||||
.section .rodata.cst16._SHUF_00BA, "aM", @progbits, 16
|
||||
.align 16
|
||||
# shuffle xBxA -> 00BA
|
||||
_SHUF_00BA:
|
||||
.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100
|
||||
|
||||
.section .rodata.cst16._SHUF_DC00, "aM", @progbits, 16
|
||||
.align 16
|
||||
# shuffle xDxC -> DC00
|
||||
_SHUF_DC00:
|
||||
.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF
|
||||
|
||||
#endif
|
||||
|
@ -723,7 +723,7 @@ done_hash:
|
||||
ret
|
||||
ENDPROC(sha256_transform_rorx)
|
||||
|
||||
.data
|
||||
.section .rodata.cst512.K256, "aM", @progbits, 512
|
||||
.align 64
|
||||
K256:
|
||||
.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
|
||||
@ -759,14 +759,21 @@ K256:
|
||||
.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
|
||||
.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
|
||||
|
||||
.section .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
|
||||
.align 32
|
||||
PSHUFFLE_BYTE_FLIP_MASK:
|
||||
.octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203
|
||||
|
||||
# shuffle xBxA -> 00BA
|
||||
.section .rodata.cst32._SHUF_00BA, "aM", @progbits, 32
|
||||
.align 32
|
||||
_SHUF_00BA:
|
||||
.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100
|
||||
|
||||
# shuffle xDxC -> DC00
|
||||
.section .rodata.cst32._SHUF_DC00, "aM", @progbits, 32
|
||||
.align 32
|
||||
_SHUF_DC00:
|
||||
.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF
|
||||
|
||||
#endif
|
||||
|
@ -284,11 +284,13 @@ ENTRY(sha256_mb_mgr_get_comp_job_avx2)
|
||||
ret
|
||||
ENDPROC(sha256_mb_mgr_get_comp_job_avx2)
|
||||
|
||||
.data
|
||||
|
||||
.section .rodata.cst16.clear_low_nibble, "aM", @progbits, 16
|
||||
.align 16
|
||||
clear_low_nibble:
|
||||
.octa 0x000000000000000000000000FFFFFFF0
|
||||
|
||||
.section .rodata.cst8, "aM", @progbits, 8
|
||||
.align 8
|
||||
one:
|
||||
.quad 1
|
||||
two:
|
||||
|
@ -208,8 +208,7 @@ return_null:
|
||||
|
||||
ENDPROC(sha256_mb_mgr_submit_avx2)
|
||||
|
||||
.data
|
||||
|
||||
.section .rodata.cst16.clear_low_nibble, "aM", @progbits, 16
|
||||
.align 16
|
||||
clear_low_nibble:
|
||||
.octa 0x000000000000000000000000FFFFFFF0
|
||||
|
@ -437,7 +437,8 @@ Lrounds_16_xx:
|
||||
|
||||
ret
|
||||
ENDPROC(sha256_x8_avx2)
|
||||
.data
|
||||
|
||||
.section .rodata.K256_8, "a", @progbits
|
||||
.align 64
|
||||
K256_8:
|
||||
.octa 0x428a2f98428a2f98428a2f98428a2f98
|
||||
@ -568,10 +569,14 @@ K256_8:
|
||||
.octa 0xbef9a3f7bef9a3f7bef9a3f7bef9a3f7
|
||||
.octa 0xc67178f2c67178f2c67178f2c67178f2
|
||||
.octa 0xc67178f2c67178f2c67178f2c67178f2
|
||||
|
||||
.section .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
|
||||
.align 32
|
||||
PSHUFFLE_BYTE_FLIP_MASK:
|
||||
.octa 0x0c0d0e0f08090a0b0405060700010203
|
||||
.octa 0x0c0d0e0f08090a0b0405060700010203
|
||||
|
||||
.section .rodata.cst256.K256, "aM", @progbits, 256
|
||||
.align 64
|
||||
.global K256
|
||||
K256:
|
||||
|
@ -474,7 +474,7 @@ done_hash:
|
||||
ret
|
||||
ENDPROC(sha256_transform_ssse3)
|
||||
|
||||
.data
|
||||
.section .rodata.cst256.K256, "aM", @progbits, 256
|
||||
.align 64
|
||||
K256:
|
||||
.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
|
||||
@ -494,13 +494,19 @@ K256:
|
||||
.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
|
||||
.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
|
||||
|
||||
.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
|
||||
.align 16
|
||||
PSHUFFLE_BYTE_FLIP_MASK:
|
||||
.octa 0x0c0d0e0f08090a0b0405060700010203
|
||||
|
||||
.section .rodata.cst16._SHUF_00BA, "aM", @progbits, 16
|
||||
.align 16
|
||||
# shuffle xBxA -> 00BA
|
||||
_SHUF_00BA:
|
||||
.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100
|
||||
|
||||
.section .rodata.cst16._SHUF_DC00, "aM", @progbits, 16
|
||||
.align 16
|
||||
# shuffle xDxC -> DC00
|
||||
_SHUF_DC00:
|
||||
.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF
|
||||
|
@ -329,7 +329,7 @@ ENTRY(sha256_ni_transform)
|
||||
ret
|
||||
ENDPROC(sha256_ni_transform)
|
||||
|
||||
.data
|
||||
.section .rodata.cst256.K256, "aM", @progbits, 256
|
||||
.align 64
|
||||
K256:
|
||||
.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
|
||||
@ -349,5 +349,7 @@ K256:
|
||||
.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
|
||||
.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
|
||||
|
||||
.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
|
||||
.align 16
|
||||
PSHUFFLE_BYTE_FLIP_MASK:
|
||||
.octa 0x0c0d0e0f08090a0b0405060700010203
|
||||
|
@ -370,14 +370,17 @@ ENDPROC(sha512_transform_avx)
|
||||
########################################################################
|
||||
### Binary Data
|
||||
|
||||
.data
|
||||
|
||||
.section .rodata.cst16.XMM_QWORD_BSWAP, "aM", @progbits, 16
|
||||
.align 16
|
||||
|
||||
# Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
|
||||
XMM_QWORD_BSWAP:
|
||||
.octa 0x08090a0b0c0d0e0f0001020304050607
|
||||
|
||||
# Mergeable 640-byte rodata section. This allows linker to merge the table
|
||||
# with other, exactly the same 640-byte fragment of another rodata section
|
||||
# (if such section exists).
|
||||
.section .rodata.cst640.K512, "aM", @progbits, 640
|
||||
.align 64
|
||||
# K[t] used in SHA512 hashing
|
||||
K512:
|
||||
.quad 0x428a2f98d728ae22,0x7137449123ef65cd
|
||||
|
@ -684,8 +684,11 @@ ENDPROC(sha512_transform_rorx)
|
||||
########################################################################
|
||||
### Binary Data
|
||||
|
||||
.data
|
||||
|
||||
# Mergeable 640-byte rodata section. This allows linker to merge the table
|
||||
# with other, exactly the same 640-byte fragment of another rodata section
|
||||
# (if such section exists).
|
||||
.section .rodata.cst640.K512, "aM", @progbits, 640
|
||||
.align 64
|
||||
# K[t] used in SHA512 hashing
|
||||
K512:
|
||||
@ -730,14 +733,17 @@ K512:
|
||||
.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
|
||||
.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
|
||||
|
||||
.section .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
|
||||
.align 32
|
||||
|
||||
# Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
|
||||
PSHUFFLE_BYTE_FLIP_MASK:
|
||||
.octa 0x08090a0b0c0d0e0f0001020304050607
|
||||
.octa 0x18191a1b1c1d1e1f1011121314151617
|
||||
|
||||
.section .rodata.cst32.MASK_YMM_LO, "aM", @progbits, 32
|
||||
.align 32
|
||||
MASK_YMM_LO:
|
||||
.octa 0x00000000000000000000000000000000
|
||||
.octa 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF
|
||||
|
||||
#endif
|
||||
|
@ -221,7 +221,7 @@ static struct sha512_hash_ctx *sha512_ctx_mgr_resubmit
|
||||
}
|
||||
|
||||
static struct sha512_hash_ctx
|
||||
*sha512_ctx_mgr_get_comp_ctx(struct sha512_ctx_mgr *mgr)
|
||||
*sha512_ctx_mgr_get_comp_ctx(struct mcryptd_alg_cstate *cstate)
|
||||
{
|
||||
/*
|
||||
* If get_comp_job returns NULL, there are no jobs complete.
|
||||
@ -233,11 +233,17 @@ static struct sha512_hash_ctx
|
||||
* Otherwise, all jobs currently being managed by the hash_ctx_mgr
|
||||
* still need processing.
|
||||
*/
|
||||
struct sha512_ctx_mgr *mgr;
|
||||
struct sha512_hash_ctx *ctx;
|
||||
unsigned long flags;
|
||||
|
||||
mgr = cstate->mgr;
|
||||
spin_lock_irqsave(&cstate->work_lock, flags);
|
||||
ctx = (struct sha512_hash_ctx *)
|
||||
sha512_job_mgr_get_comp_job(&mgr->mgr);
|
||||
return sha512_ctx_mgr_resubmit(mgr, ctx);
|
||||
ctx = sha512_ctx_mgr_resubmit(mgr, ctx);
|
||||
spin_unlock_irqrestore(&cstate->work_lock, flags);
|
||||
return ctx;
|
||||
}
|
||||
|
||||
static void sha512_ctx_mgr_init(struct sha512_ctx_mgr *mgr)
|
||||
@ -246,12 +252,17 @@ static void sha512_ctx_mgr_init(struct sha512_ctx_mgr *mgr)
|
||||
}
|
||||
|
||||
static struct sha512_hash_ctx
|
||||
*sha512_ctx_mgr_submit(struct sha512_ctx_mgr *mgr,
|
||||
*sha512_ctx_mgr_submit(struct mcryptd_alg_cstate *cstate,
|
||||
struct sha512_hash_ctx *ctx,
|
||||
const void *buffer,
|
||||
uint32_t len,
|
||||
int flags)
|
||||
{
|
||||
struct sha512_ctx_mgr *mgr;
|
||||
unsigned long irqflags;
|
||||
|
||||
mgr = cstate->mgr;
|
||||
spin_lock_irqsave(&cstate->work_lock, irqflags);
|
||||
if (flags & (~HASH_ENTIRE)) {
|
||||
/*
|
||||
* User should not pass anything other than FIRST, UPDATE, or
|
||||
@ -351,20 +362,26 @@ static struct sha512_hash_ctx
|
||||
}
|
||||
}
|
||||
|
||||
return sha512_ctx_mgr_resubmit(mgr, ctx);
|
||||
ctx = sha512_ctx_mgr_resubmit(mgr, ctx);
|
||||
spin_unlock_irqrestore(&cstate->work_lock, irqflags);
|
||||
return ctx;
|
||||
}
|
||||
|
||||
static struct sha512_hash_ctx *sha512_ctx_mgr_flush(struct sha512_ctx_mgr *mgr)
|
||||
static struct sha512_hash_ctx *sha512_ctx_mgr_flush(struct mcryptd_alg_cstate *cstate)
|
||||
{
|
||||
struct sha512_ctx_mgr *mgr;
|
||||
struct sha512_hash_ctx *ctx;
|
||||
unsigned long flags;
|
||||
|
||||
mgr = cstate->mgr;
|
||||
spin_lock_irqsave(&cstate->work_lock, flags);
|
||||
while (1) {
|
||||
ctx = (struct sha512_hash_ctx *)
|
||||
sha512_job_mgr_flush(&mgr->mgr);
|
||||
|
||||
/* If flush returned 0, there are no more jobs in flight. */
|
||||
if (!ctx)
|
||||
return NULL;
|
||||
break;
|
||||
|
||||
/*
|
||||
* If flush returned a job, resubmit the job to finish
|
||||
@ -378,8 +395,10 @@ static struct sha512_hash_ctx *sha512_ctx_mgr_flush(struct sha512_ctx_mgr *mgr)
|
||||
* the sha512_ctx_mgr still need processing. Loop.
|
||||
*/
|
||||
if (ctx)
|
||||
return ctx;
|
||||
break;
|
||||
}
|
||||
spin_unlock_irqrestore(&cstate->work_lock, flags);
|
||||
return ctx;
|
||||
}
|
||||
|
||||
static int sha512_mb_init(struct ahash_request *areq)
|
||||
@ -439,11 +458,11 @@ static int sha_finish_walk(struct mcryptd_hash_request_ctx **ret_rctx,
|
||||
sha_ctx = (struct sha512_hash_ctx *)
|
||||
ahash_request_ctx(&rctx->areq);
|
||||
kernel_fpu_begin();
|
||||
sha_ctx = sha512_ctx_mgr_submit(cstate->mgr, sha_ctx,
|
||||
sha_ctx = sha512_ctx_mgr_submit(cstate, sha_ctx,
|
||||
rctx->walk.data, nbytes, flag);
|
||||
if (!sha_ctx) {
|
||||
if (flush)
|
||||
sha_ctx = sha512_ctx_mgr_flush(cstate->mgr);
|
||||
sha_ctx = sha512_ctx_mgr_flush(cstate);
|
||||
}
|
||||
kernel_fpu_end();
|
||||
if (sha_ctx)
|
||||
@ -471,11 +490,12 @@ static int sha_complete_job(struct mcryptd_hash_request_ctx *rctx,
|
||||
struct sha512_hash_ctx *sha_ctx;
|
||||
struct mcryptd_hash_request_ctx *req_ctx;
|
||||
int ret;
|
||||
unsigned long flags;
|
||||
|
||||
/* remove from work list */
|
||||
spin_lock(&cstate->work_lock);
|
||||
spin_lock_irqsave(&cstate->work_lock, flags);
|
||||
list_del(&rctx->waiter);
|
||||
spin_unlock(&cstate->work_lock);
|
||||
spin_unlock_irqrestore(&cstate->work_lock, flags);
|
||||
|
||||
if (irqs_disabled())
|
||||
rctx->complete(&req->base, err);
|
||||
@ -486,14 +506,14 @@ static int sha_complete_job(struct mcryptd_hash_request_ctx *rctx,
|
||||
}
|
||||
|
||||
/* check to see if there are other jobs that are done */
|
||||
sha_ctx = sha512_ctx_mgr_get_comp_ctx(cstate->mgr);
|
||||
sha_ctx = sha512_ctx_mgr_get_comp_ctx(cstate);
|
||||
while (sha_ctx) {
|
||||
req_ctx = cast_hash_to_mcryptd_ctx(sha_ctx);
|
||||
ret = sha_finish_walk(&req_ctx, cstate, false);
|
||||
if (req_ctx) {
|
||||
spin_lock(&cstate->work_lock);
|
||||
spin_lock_irqsave(&cstate->work_lock, flags);
|
||||
list_del(&req_ctx->waiter);
|
||||
spin_unlock(&cstate->work_lock);
|
||||
spin_unlock_irqrestore(&cstate->work_lock, flags);
|
||||
|
||||
req = cast_mcryptd_ctx_to_req(req_ctx);
|
||||
if (irqs_disabled())
|
||||
@ -504,7 +524,7 @@ static int sha_complete_job(struct mcryptd_hash_request_ctx *rctx,
|
||||
local_bh_enable();
|
||||
}
|
||||
}
|
||||
sha_ctx = sha512_ctx_mgr_get_comp_ctx(cstate->mgr);
|
||||
sha_ctx = sha512_ctx_mgr_get_comp_ctx(cstate);
|
||||
}
|
||||
|
||||
return 0;
|
||||
@ -515,6 +535,7 @@ static void sha512_mb_add_list(struct mcryptd_hash_request_ctx *rctx,
|
||||
{
|
||||
unsigned long next_flush;
|
||||
unsigned long delay = usecs_to_jiffies(FLUSH_INTERVAL);
|
||||
unsigned long flags;
|
||||
|
||||
/* initialize tag */
|
||||
rctx->tag.arrival = jiffies; /* tag the arrival time */
|
||||
@ -522,9 +543,9 @@ static void sha512_mb_add_list(struct mcryptd_hash_request_ctx *rctx,
|
||||
next_flush = rctx->tag.arrival + delay;
|
||||
rctx->tag.expire = next_flush;
|
||||
|
||||
spin_lock(&cstate->work_lock);
|
||||
spin_lock_irqsave(&cstate->work_lock, flags);
|
||||
list_add_tail(&rctx->waiter, &cstate->work_list);
|
||||
spin_unlock(&cstate->work_lock);
|
||||
spin_unlock_irqrestore(&cstate->work_lock, flags);
|
||||
|
||||
mcryptd_arm_flusher(cstate, delay);
|
||||
}
|
||||
@ -565,7 +586,7 @@ static int sha512_mb_update(struct ahash_request *areq)
|
||||
sha_ctx = (struct sha512_hash_ctx *) ahash_request_ctx(areq);
|
||||
sha512_mb_add_list(rctx, cstate);
|
||||
kernel_fpu_begin();
|
||||
sha_ctx = sha512_ctx_mgr_submit(cstate->mgr, sha_ctx, rctx->walk.data,
|
||||
sha_ctx = sha512_ctx_mgr_submit(cstate, sha_ctx, rctx->walk.data,
|
||||
nbytes, HASH_UPDATE);
|
||||
kernel_fpu_end();
|
||||
|
||||
@ -628,7 +649,7 @@ static int sha512_mb_finup(struct ahash_request *areq)
|
||||
sha512_mb_add_list(rctx, cstate);
|
||||
|
||||
kernel_fpu_begin();
|
||||
sha_ctx = sha512_ctx_mgr_submit(cstate->mgr, sha_ctx, rctx->walk.data,
|
||||
sha_ctx = sha512_ctx_mgr_submit(cstate, sha_ctx, rctx->walk.data,
|
||||
nbytes, flag);
|
||||
kernel_fpu_end();
|
||||
|
||||
@ -677,8 +698,7 @@ static int sha512_mb_final(struct ahash_request *areq)
|
||||
/* flag HASH_FINAL and 0 data size */
|
||||
sha512_mb_add_list(rctx, cstate);
|
||||
kernel_fpu_begin();
|
||||
sha_ctx = sha512_ctx_mgr_submit(cstate->mgr, sha_ctx, &data, 0,
|
||||
HASH_LAST);
|
||||
sha_ctx = sha512_ctx_mgr_submit(cstate, sha_ctx, &data, 0, HASH_LAST);
|
||||
kernel_fpu_end();
|
||||
|
||||
/* check if anything is returned */
|
||||
@ -940,7 +960,7 @@ static unsigned long sha512_mb_flusher(struct mcryptd_alg_cstate *cstate)
|
||||
break;
|
||||
kernel_fpu_begin();
|
||||
sha_ctx = (struct sha512_hash_ctx *)
|
||||
sha512_ctx_mgr_flush(cstate->mgr);
|
||||
sha512_ctx_mgr_flush(cstate);
|
||||
kernel_fpu_end();
|
||||
if (!sha_ctx) {
|
||||
pr_err("sha512_mb error: nothing got flushed for"
|
||||
|
@ -280,12 +280,18 @@ ENTRY(sha512_mb_mgr_get_comp_job_avx2)
|
||||
pop %rbx
|
||||
ret
|
||||
ENDPROC(sha512_mb_mgr_get_comp_job_avx2)
|
||||
.data
|
||||
|
||||
.align 16
|
||||
.section .rodata.cst8.one, "aM", @progbits, 8
|
||||
.align 8
|
||||
one:
|
||||
.quad 1
|
||||
|
||||
.section .rodata.cst8.two, "aM", @progbits, 8
|
||||
.align 8
|
||||
two:
|
||||
.quad 2
|
||||
|
||||
.section .rodata.cst8.three, "aM", @progbits, 8
|
||||
.align 8
|
||||
three:
|
||||
.quad 3
|
||||
|
@ -209,8 +209,9 @@ return_null:
|
||||
xor job_rax, job_rax
|
||||
jmp return
|
||||
ENDPROC(sha512_mb_mgr_submit_avx2)
|
||||
.data
|
||||
|
||||
/* UNUSED?
|
||||
.section .rodata.cst16, "aM", @progbits, 16
|
||||
.align 16
|
||||
H0: .int 0x6a09e667
|
||||
H1: .int 0xbb67ae85
|
||||
@ -220,3 +221,4 @@ H4: .int 0x510e527f
|
||||
H5: .int 0x9b05688c
|
||||
H6: .int 0x1f83d9ab
|
||||
H7: .int 0x5be0cd19
|
||||
*/
|
||||
|
@ -361,7 +361,7 @@ Lrounds_16_xx:
|
||||
ret
|
||||
ENDPROC(sha512_x4_avx2)
|
||||
|
||||
.data
|
||||
.section .rodata.K512_4, "a", @progbits
|
||||
.align 64
|
||||
K512_4:
|
||||
.octa 0x428a2f98d728ae22428a2f98d728ae22,\
|
||||
@ -525,5 +525,7 @@ K512_4:
|
||||
.octa 0x6c44198c4a4758176c44198c4a475817,\
|
||||
0x6c44198c4a4758176c44198c4a475817
|
||||
|
||||
.section .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
|
||||
.align 32
|
||||
PSHUFFLE_BYTE_FLIP_MASK: .octa 0x08090a0b0c0d0e0f0001020304050607
|
||||
.octa 0x18191a1b1c1d1e1f1011121314151617
|
||||
|
@ -369,14 +369,17 @@ ENDPROC(sha512_transform_ssse3)
|
||||
########################################################################
|
||||
### Binary Data
|
||||
|
||||
.data
|
||||
|
||||
.section .rodata.cst16.XMM_QWORD_BSWAP, "aM", @progbits, 16
|
||||
.align 16
|
||||
|
||||
# Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
|
||||
XMM_QWORD_BSWAP:
|
||||
.octa 0x08090a0b0c0d0e0f0001020304050607
|
||||
|
||||
# Mergeable 640-byte rodata section. This allows linker to merge the table
|
||||
# with other, exactly the same 640-byte fragment of another rodata section
|
||||
# (if such section exists).
|
||||
.section .rodata.cst640.K512, "aM", @progbits, 640
|
||||
.align 64
|
||||
# K[t] used in SHA512 hashing
|
||||
K512:
|
||||
.quad 0x428a2f98d728ae22,0x7137449123ef65cd
|
||||
|
@ -29,11 +29,13 @@
|
||||
|
||||
.file "twofish-avx-x86_64-asm_64.S"
|
||||
|
||||
.data
|
||||
.section .rodata.cst16.bswap128_mask, "aM", @progbits, 16
|
||||
.align 16
|
||||
|
||||
.Lbswap128_mask:
|
||||
.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
|
||||
|
||||
.section .rodata.cst16.xts_gf128mul_and_shl1_mask, "aM", @progbits, 16
|
||||
.align 16
|
||||
.Lxts_gf128mul_and_shl1_mask:
|
||||
.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
|
||||
|
||||
|
@@ -263,6 +263,7 @@ comment "Authenticated Encryption with Associated Data"
config CRYPTO_CCM
        tristate "CCM support"
        select CRYPTO_CTR
        select CRYPTO_HASH
        select CRYPTO_AEAD
        help
          Support for Counter with CBC MAC. Required for IPsec.
@@ -374,6 +375,7 @@ config CRYPTO_XTS
        select CRYPTO_BLKCIPHER
        select CRYPTO_MANAGER
        select CRYPTO_GF128MUL
        select CRYPTO_ECB
        help
          XTS: IEEE1619/D16 narrow block cipher use with aes-xts-plain,
          key size 256, 384 or 512 bits. This implementation currently
@@ -895,6 +897,23 @@ config CRYPTO_AES

          See <http://csrc.nist.gov/CryptoToolkit/aes/> for more information.

config CRYPTO_AES_TI
        tristate "Fixed time AES cipher"
        select CRYPTO_ALGAPI
        help
          This is a generic implementation of AES that attempts to eliminate
          data dependent latencies as much as possible without affecting
          performance too much. It is intended for use by the generic CCM
          and GCM drivers, and other CTR or CMAC/XCBC based modes that rely
          solely on encryption (although decryption is supported as well, but
          with a more dramatic performance hit)

          Instead of using 16 lookup tables of 1 KB each, (8 for encryption and
          8 for decryption), this implementation only uses just two S-boxes of
          256 bytes each, and attempts to eliminate data dependent latencies by
          prefetching the entire table into the cache at the start of each
          block.

config CRYPTO_AES_586
        tristate "AES cipher algorithms (i586)"
|
||||
depends on (X86 || UML_X86) && !64BIT
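Illustrative aside (not part of this commit): the sketch below is not taken from aes_ti.c; it only illustrates, under assumed names, the prefetch idea described in the CRYPTO_AES_TI help text above -- touch the whole 256-byte S-box before the first data-dependent lookup so that later accesses hit the cache regardless of which byte values occur in the key or data.

/* Illustrative sketch only -- the real aes_ti.c may differ. */
#include <linux/cache.h>
#include <linux/types.h>

static u32 prefetch_sbox(const u8 sbox[256])
{
        u32 acc = 0;
        int i;

        /* One read per cache line covers the whole table. */
        for (i = 0; i < 256; i += L1_CACHE_BYTES)
                acc |= sbox[i];

        /*
         * The accumulated value is meant to be folded into later computation
         * (here it is simply returned) so the compiler cannot discard the loads.
         */
        return acc;
}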
|
||||
|
@@ -75,6 +75,7 @@ obj-$(CONFIG_CRYPTO_SHA256) += sha256_generic.o
obj-$(CONFIG_CRYPTO_SHA512) += sha512_generic.o
obj-$(CONFIG_CRYPTO_SHA3) += sha3_generic.o
obj-$(CONFIG_CRYPTO_WP512) += wp512.o
CFLAGS_wp512.o := $(call cc-option,-fno-schedule-insns) # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=79149
obj-$(CONFIG_CRYPTO_TGR192) += tgr192.o
obj-$(CONFIG_CRYPTO_GF128MUL) += gf128mul.o
obj-$(CONFIG_CRYPTO_ECB) += ecb.o
@@ -98,7 +99,9 @@ obj-$(CONFIG_CRYPTO_BLOWFISH_COMMON) += blowfish_common.o
obj-$(CONFIG_CRYPTO_TWOFISH) += twofish_generic.o
obj-$(CONFIG_CRYPTO_TWOFISH_COMMON) += twofish_common.o
obj-$(CONFIG_CRYPTO_SERPENT) += serpent_generic.o
CFLAGS_serpent_generic.o := $(call cc-option,-fsched-pressure) # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=79149
obj-$(CONFIG_CRYPTO_AES) += aes_generic.o
obj-$(CONFIG_CRYPTO_AES_TI) += aes_ti.o
obj-$(CONFIG_CRYPTO_CAMELLIA) += camellia_generic.o
obj-$(CONFIG_CRYPTO_CAST_COMMON) += cast_common.o
obj-$(CONFIG_CRYPTO_CAST5) += cast5_generic.o
|
||||
|
@ -19,6 +19,7 @@
|
||||
#include <linux/slab.h>
|
||||
#include <linux/seq_file.h>
|
||||
#include <linux/cryptouser.h>
|
||||
#include <linux/compiler.h>
|
||||
#include <net/netlink.h>
|
||||
|
||||
#include <crypto/scatterwalk.h>
|
||||
@ -394,7 +395,7 @@ static int crypto_ablkcipher_report(struct sk_buff *skb, struct crypto_alg *alg)
|
||||
#endif
|
||||
|
||||
static void crypto_ablkcipher_show(struct seq_file *m, struct crypto_alg *alg)
|
||||
__attribute__ ((unused));
|
||||
__maybe_unused;
|
||||
static void crypto_ablkcipher_show(struct seq_file *m, struct crypto_alg *alg)
|
||||
{
|
||||
struct ablkcipher_alg *ablkcipher = &alg->cra_ablkcipher;
|
||||
@ -468,7 +469,7 @@ static int crypto_givcipher_report(struct sk_buff *skb, struct crypto_alg *alg)
|
||||
#endif
|
||||
|
||||
static void crypto_givcipher_show(struct seq_file *m, struct crypto_alg *alg)
|
||||
__attribute__ ((unused));
|
||||
__maybe_unused;
|
||||
static void crypto_givcipher_show(struct seq_file *m, struct crypto_alg *alg)
|
||||
{
|
||||
struct ablkcipher_alg *ablkcipher = &alg->cra_ablkcipher;
|
||||
|
@ -20,6 +20,7 @@
|
||||
#include <linux/crypto.h>
|
||||
#include <crypto/algapi.h>
|
||||
#include <linux/cryptouser.h>
|
||||
#include <linux/compiler.h>
|
||||
#include <net/netlink.h>
|
||||
#include <crypto/internal/acompress.h>
|
||||
#include <crypto/internal/scompress.h>
|
||||
@ -50,7 +51,7 @@ static int crypto_acomp_report(struct sk_buff *skb, struct crypto_alg *alg)
|
||||
#endif
|
||||
|
||||
static void crypto_acomp_show(struct seq_file *m, struct crypto_alg *alg)
|
||||
__attribute__ ((unused));
|
||||
__maybe_unused;
|
||||
|
||||
static void crypto_acomp_show(struct seq_file *m, struct crypto_alg *alg)
|
||||
{
|
||||
|
@ -24,6 +24,7 @@
|
||||
#include <linux/slab.h>
|
||||
#include <linux/seq_file.h>
|
||||
#include <linux/cryptouser.h>
|
||||
#include <linux/compiler.h>
|
||||
#include <net/netlink.h>
|
||||
|
||||
#include "internal.h"
|
||||
@ -132,7 +133,7 @@ static int crypto_aead_report(struct sk_buff *skb, struct crypto_alg *alg)
|
||||
#endif
|
||||
|
||||
static void crypto_aead_show(struct seq_file *m, struct crypto_alg *alg)
|
||||
__attribute__ ((unused));
|
||||
__maybe_unused;
|
||||
static void crypto_aead_show(struct seq_file *m, struct crypto_alg *alg)
|
||||
{
|
||||
struct aead_alg *aead = container_of(alg, struct aead_alg, base);
|
||||
|
@ -54,6 +54,7 @@
|
||||
#include <linux/errno.h>
|
||||
#include <linux/crypto.h>
|
||||
#include <asm/byteorder.h>
|
||||
#include <asm/unaligned.h>
|
||||
|
||||
static inline u8 byte(const u32 x, const unsigned n)
|
||||
{
|
||||
@ -1216,7 +1217,6 @@ EXPORT_SYMBOL_GPL(crypto_il_tab);
|
||||
int crypto_aes_expand_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
|
||||
unsigned int key_len)
|
||||
{
|
||||
const __le32 *key = (const __le32 *)in_key;
|
||||
u32 i, t, u, v, w, j;
|
||||
|
||||
if (key_len != AES_KEYSIZE_128 && key_len != AES_KEYSIZE_192 &&
|
||||
@ -1225,10 +1225,15 @@ int crypto_aes_expand_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
|
||||
|
||||
ctx->key_length = key_len;
|
||||
|
||||
ctx->key_dec[key_len + 24] = ctx->key_enc[0] = le32_to_cpu(key[0]);
|
||||
ctx->key_dec[key_len + 25] = ctx->key_enc[1] = le32_to_cpu(key[1]);
|
||||
ctx->key_dec[key_len + 26] = ctx->key_enc[2] = le32_to_cpu(key[2]);
|
||||
ctx->key_dec[key_len + 27] = ctx->key_enc[3] = le32_to_cpu(key[3]);
|
||||
ctx->key_enc[0] = get_unaligned_le32(in_key);
|
||||
ctx->key_enc[1] = get_unaligned_le32(in_key + 4);
|
||||
ctx->key_enc[2] = get_unaligned_le32(in_key + 8);
|
||||
ctx->key_enc[3] = get_unaligned_le32(in_key + 12);
|
||||
|
||||
ctx->key_dec[key_len + 24] = ctx->key_enc[0];
|
||||
ctx->key_dec[key_len + 25] = ctx->key_enc[1];
|
||||
ctx->key_dec[key_len + 26] = ctx->key_enc[2];
|
||||
ctx->key_dec[key_len + 27] = ctx->key_enc[3];
|
||||
|
||||
switch (key_len) {
|
||||
case AES_KEYSIZE_128:
|
||||
@ -1238,17 +1243,17 @@ int crypto_aes_expand_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
|
||||
break;
|
||||
|
||||
case AES_KEYSIZE_192:
|
||||
ctx->key_enc[4] = le32_to_cpu(key[4]);
|
||||
t = ctx->key_enc[5] = le32_to_cpu(key[5]);
|
||||
ctx->key_enc[4] = get_unaligned_le32(in_key + 16);
|
||||
t = ctx->key_enc[5] = get_unaligned_le32(in_key + 20);
|
||||
for (i = 0; i < 8; ++i)
|
||||
loop6(i);
|
||||
break;
|
||||
|
||||
case AES_KEYSIZE_256:
|
||||
ctx->key_enc[4] = le32_to_cpu(key[4]);
|
||||
ctx->key_enc[5] = le32_to_cpu(key[5]);
|
||||
ctx->key_enc[6] = le32_to_cpu(key[6]);
|
||||
t = ctx->key_enc[7] = le32_to_cpu(key[7]);
|
||||
ctx->key_enc[4] = get_unaligned_le32(in_key + 16);
|
||||
ctx->key_enc[5] = get_unaligned_le32(in_key + 20);
|
||||
ctx->key_enc[6] = get_unaligned_le32(in_key + 24);
|
||||
t = ctx->key_enc[7] = get_unaligned_le32(in_key + 28);
|
||||
for (i = 0; i < 6; ++i)
|
||||
loop8(i);
|
||||
loop8tophalf(i);
|
||||
@ -1329,16 +1334,14 @@ EXPORT_SYMBOL_GPL(crypto_aes_set_key);
|
||||
static void aes_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
|
||||
{
|
||||
const struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
|
||||
const __le32 *src = (const __le32 *)in;
|
||||
__le32 *dst = (__le32 *)out;
|
||||
u32 b0[4], b1[4];
|
||||
const u32 *kp = ctx->key_enc + 4;
|
||||
const int key_len = ctx->key_length;
|
||||
|
||||
b0[0] = le32_to_cpu(src[0]) ^ ctx->key_enc[0];
|
||||
b0[1] = le32_to_cpu(src[1]) ^ ctx->key_enc[1];
|
||||
b0[2] = le32_to_cpu(src[2]) ^ ctx->key_enc[2];
|
||||
b0[3] = le32_to_cpu(src[3]) ^ ctx->key_enc[3];
|
||||
b0[0] = ctx->key_enc[0] ^ get_unaligned_le32(in);
|
||||
b0[1] = ctx->key_enc[1] ^ get_unaligned_le32(in + 4);
|
||||
b0[2] = ctx->key_enc[2] ^ get_unaligned_le32(in + 8);
|
||||
b0[3] = ctx->key_enc[3] ^ get_unaligned_le32(in + 12);
|
||||
|
||||
if (key_len > 24) {
|
||||
f_nround(b1, b0, kp);
|
||||
@ -1361,10 +1364,10 @@ static void aes_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
|
||||
f_nround(b1, b0, kp);
|
||||
f_lround(b0, b1, kp);
|
||||
|
||||
dst[0] = cpu_to_le32(b0[0]);
|
||||
dst[1] = cpu_to_le32(b0[1]);
|
||||
dst[2] = cpu_to_le32(b0[2]);
|
||||
dst[3] = cpu_to_le32(b0[3]);
|
||||
put_unaligned_le32(b0[0], out);
|
||||
put_unaligned_le32(b0[1], out + 4);
|
||||
put_unaligned_le32(b0[2], out + 8);
|
||||
put_unaligned_le32(b0[3], out + 12);
|
||||
}
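
The conversion above swaps the (const __le32 *) casts plus le32_to_cpu()/cpu_to_le32() for get_unaligned_le32()/put_unaligned_le32(), which is what lets aes_generic drop its .cra_alignmask and accept key and data buffers at any address. A rough user-space model of those two helpers (the real ones come from asm/unaligned.h; the memcpy-based bodies and the little-endian-host assumption below are mine, for illustration only):

#include <stdint.h>
#include <string.h>

/* Model of get_unaligned_le32()/put_unaligned_le32(): memcpy() sidesteps
 * undefined behaviour on misaligned pointers and compiles down to a plain
 * load/store where the CPU allows it.  Assumes a little-endian host; the
 * kernel versions byte-swap on big-endian machines. */
static uint32_t model_get_unaligned_le32(const void *p)
{
    uint32_t v;

    memcpy(&v, p, sizeof(v));
    return v;
}

static void model_put_unaligned_le32(uint32_t v, void *p)
{
    memcpy(p, &v, sizeof(v));
}

With helpers of this shape, the key schedule and the block routines can address the raw input byte by byte, exactly as the patched functions now do.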
|
||||
|
||||
/* decrypt a block of text */
|
||||
@ -1401,16 +1404,14 @@ static void aes_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
|
||||
static void aes_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
|
||||
{
|
||||
const struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
|
||||
const __le32 *src = (const __le32 *)in;
|
||||
__le32 *dst = (__le32 *)out;
|
||||
u32 b0[4], b1[4];
|
||||
const int key_len = ctx->key_length;
|
||||
const u32 *kp = ctx->key_dec + 4;
|
||||
|
||||
b0[0] = le32_to_cpu(src[0]) ^ ctx->key_dec[0];
|
||||
b0[1] = le32_to_cpu(src[1]) ^ ctx->key_dec[1];
|
||||
b0[2] = le32_to_cpu(src[2]) ^ ctx->key_dec[2];
|
||||
b0[3] = le32_to_cpu(src[3]) ^ ctx->key_dec[3];
|
||||
b0[0] = ctx->key_dec[0] ^ get_unaligned_le32(in);
|
||||
b0[1] = ctx->key_dec[1] ^ get_unaligned_le32(in + 4);
|
||||
b0[2] = ctx->key_dec[2] ^ get_unaligned_le32(in + 8);
|
||||
b0[3] = ctx->key_dec[3] ^ get_unaligned_le32(in + 12);
|
||||
|
||||
if (key_len > 24) {
|
||||
i_nround(b1, b0, kp);
|
||||
@ -1433,10 +1434,10 @@ static void aes_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
|
||||
i_nround(b1, b0, kp);
|
||||
i_lround(b0, b1, kp);
|
||||
|
||||
dst[0] = cpu_to_le32(b0[0]);
|
||||
dst[1] = cpu_to_le32(b0[1]);
|
||||
dst[2] = cpu_to_le32(b0[2]);
|
||||
dst[3] = cpu_to_le32(b0[3]);
|
||||
put_unaligned_le32(b0[0], out);
|
||||
put_unaligned_le32(b0[1], out + 4);
|
||||
put_unaligned_le32(b0[2], out + 8);
|
||||
put_unaligned_le32(b0[3], out + 12);
|
||||
}
|
||||
|
||||
static struct crypto_alg aes_alg = {
|
||||
@ -1446,7 +1447,6 @@ static struct crypto_alg aes_alg = {
|
||||
.cra_flags = CRYPTO_ALG_TYPE_CIPHER,
|
||||
.cra_blocksize = AES_BLOCK_SIZE,
|
||||
.cra_ctxsize = sizeof(struct crypto_aes_ctx),
|
||||
.cra_alignmask = 3,
|
||||
.cra_module = THIS_MODULE,
|
||||
.cra_u = {
|
||||
.cipher = {
|
||||
|
375	crypto/aes_ti.c (new file)
@@ -0,0 +1,375 @@
|
||||
/*
|
||||
* Scalar fixed time AES core transform
|
||||
*
|
||||
* Copyright (C) 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as
|
||||
* published by the Free Software Foundation.
|
||||
*/
|
||||
|
||||
#include <crypto/aes.h>
|
||||
#include <linux/crypto.h>
|
||||
#include <linux/module.h>
|
||||
#include <asm/unaligned.h>
|
||||
|
||||
/*
|
||||
* Emit the sbox as volatile const to prevent the compiler from doing
|
||||
* constant folding on sbox references involving fixed indexes.
|
||||
*/
|
||||
static volatile const u8 __cacheline_aligned __aesti_sbox[] = {
|
||||
0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5,
|
||||
0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
|
||||
0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0,
|
||||
0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
|
||||
0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc,
|
||||
0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
|
||||
0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a,
|
||||
0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
|
||||
0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0,
|
||||
0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
|
||||
0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b,
|
||||
0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
|
||||
0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85,
|
||||
0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
|
||||
0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5,
|
||||
0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
|
||||
0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17,
|
||||
0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
|
||||
0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88,
|
||||
0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
|
||||
0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c,
|
||||
0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
|
||||
0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9,
|
||||
0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
|
||||
0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6,
|
||||
0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
|
||||
0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e,
|
||||
0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
|
||||
0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94,
|
||||
0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
|
||||
0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68,
|
||||
0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16,
|
||||
};
|
||||
|
||||
static volatile const u8 __cacheline_aligned __aesti_inv_sbox[] = {
|
||||
0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38,
|
||||
0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb,
|
||||
0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87,
|
||||
0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb,
|
||||
0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d,
|
||||
0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e,
|
||||
0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2,
|
||||
0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25,
|
||||
0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16,
|
||||
0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92,
|
||||
0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda,
|
||||
0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84,
|
||||
0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a,
|
||||
0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06,
|
||||
0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02,
|
||||
0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b,
|
||||
0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea,
|
||||
0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73,
|
||||
0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85,
|
||||
0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e,
|
||||
0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89,
|
||||
0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b,
|
||||
0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20,
|
||||
0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4,
|
||||
0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31,
|
||||
0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f,
|
||||
0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d,
|
||||
0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef,
|
||||
0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0,
|
||||
0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61,
|
||||
0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26,
|
||||
0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d,
|
||||
};
|
||||
|
||||
static u32 mul_by_x(u32 w)
|
||||
{
|
||||
u32 x = w & 0x7f7f7f7f;
|
||||
u32 y = w & 0x80808080;
|
||||
|
||||
/* multiply by polynomial 'x' (0b10) in GF(2^8) */
|
||||
return (x << 1) ^ (y >> 7) * 0x1b;
|
||||
}
|
||||
|
||||
static u32 mul_by_x2(u32 w)
|
||||
{
|
||||
u32 x = w & 0x3f3f3f3f;
|
||||
u32 y = w & 0x80808080;
|
||||
u32 z = w & 0x40404040;
|
||||
|
||||
/* multiply by polynomial 'x^2' (0b100) in GF(2^8) */
|
||||
return (x << 2) ^ (y >> 7) * 0x36 ^ (z >> 6) * 0x1b;
|
||||
}
|
||||
|
||||
static u32 mix_columns(u32 x)
|
||||
{
|
||||
/*
|
||||
* Perform the following matrix multiplication in GF(2^8)
|
||||
*
|
||||
* | 0x2 0x3 0x1 0x1 | | x[0] |
|
||||
* | 0x1 0x2 0x3 0x1 | | x[1] |
|
||||
* | 0x1 0x1 0x2 0x3 | x | x[2] |
|
||||
* | 0x3 0x1 0x1 0x2 | | x[3] |
|
||||
*/
|
||||
u32 y = mul_by_x(x) ^ ror32(x, 16);
|
||||
|
||||
return y ^ ror32(x ^ y, 8);
|
||||
}
|
||||
|
||||
static u32 inv_mix_columns(u32 x)
|
||||
{
|
||||
/*
|
||||
* Perform the following matrix multiplication in GF(2^8)
|
||||
*
|
||||
* | 0xe 0xb 0xd 0x9 | | x[0] |
|
||||
* | 0x9 0xe 0xb 0xd | | x[1] |
|
||||
* | 0xd 0x9 0xe 0xb | x | x[2] |
|
||||
* | 0xb 0xd 0x9 0xe | | x[3] |
|
||||
*
|
||||
* which can conveniently be reduced to
|
||||
*
|
||||
* | 0x2 0x3 0x1 0x1 | | 0x5 0x0 0x4 0x0 | | x[0] |
|
||||
* | 0x1 0x2 0x3 0x1 | | 0x0 0x5 0x0 0x4 | | x[1] |
|
||||
* | 0x1 0x1 0x2 0x3 | x | 0x4 0x0 0x5 0x0 | x | x[2] |
|
||||
* | 0x3 0x1 0x1 0x2 | | 0x0 0x4 0x0 0x5 | | x[3] |
|
||||
*/
|
||||
u32 y = mul_by_x2(x);
|
||||
|
||||
return mix_columns(x ^ y ^ ror32(y, 16));
|
||||
}
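
mix_columns() and inv_mix_columns() are inverse linear maps over a packed four-byte column, so a quick stand-alone check that inv_mix_columns(mix_columns(w)) == w is a handy sanity test when touching this code. The sketch below copies the three helpers verbatim into user space and supplies its own ror32(), which normally comes from linux/bitops.h:

#include <assert.h>
#include <stdint.h>

static uint32_t ror32(uint32_t w, unsigned int s)
{
    return (w >> s) | (w << (32 - s));
}

static uint32_t mul_by_x(uint32_t w)
{
    uint32_t x = w & 0x7f7f7f7f, y = w & 0x80808080;

    return (x << 1) ^ (y >> 7) * 0x1b;          /* per-byte xtime in GF(2^8) */
}

static uint32_t mul_by_x2(uint32_t w)
{
    uint32_t x = w & 0x3f3f3f3f, y = w & 0x80808080, z = w & 0x40404040;

    return (x << 2) ^ (y >> 7) * 0x36 ^ (z >> 6) * 0x1b;
}

static uint32_t mix_columns(uint32_t x)
{
    uint32_t y = mul_by_x(x) ^ ror32(x, 16);

    return y ^ ror32(x ^ y, 8);
}

static uint32_t inv_mix_columns(uint32_t x)
{
    uint32_t y = mul_by_x2(x);

    return mix_columns(x ^ y ^ ror32(y, 16));
}

int main(void)
{
    uint32_t w = 0x01234567;

    for (int i = 0; i < 1000; i++, w = w * 1664525 + 1013904223)
        assert(inv_mix_columns(mix_columns(w)) == w);   /* round-trips for any column */
    return 0;
}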
|
||||
|
||||
static __always_inline u32 subshift(u32 in[], int pos)
|
||||
{
|
||||
return (__aesti_sbox[in[pos] & 0xff]) ^
|
||||
(__aesti_sbox[(in[(pos + 1) % 4] >> 8) & 0xff] << 8) ^
|
||||
(__aesti_sbox[(in[(pos + 2) % 4] >> 16) & 0xff] << 16) ^
|
||||
(__aesti_sbox[(in[(pos + 3) % 4] >> 24) & 0xff] << 24);
|
||||
}
|
||||
|
||||
static __always_inline u32 inv_subshift(u32 in[], int pos)
|
||||
{
|
||||
return (__aesti_inv_sbox[in[pos] & 0xff]) ^
|
||||
(__aesti_inv_sbox[(in[(pos + 3) % 4] >> 8) & 0xff] << 8) ^
|
||||
(__aesti_inv_sbox[(in[(pos + 2) % 4] >> 16) & 0xff] << 16) ^
|
||||
(__aesti_inv_sbox[(in[(pos + 1) % 4] >> 24) & 0xff] << 24);
|
||||
}
|
||||
|
||||
static u32 subw(u32 in)
|
||||
{
|
||||
return (__aesti_sbox[in & 0xff]) ^
|
||||
(__aesti_sbox[(in >> 8) & 0xff] << 8) ^
|
||||
(__aesti_sbox[(in >> 16) & 0xff] << 16) ^
|
||||
(__aesti_sbox[(in >> 24) & 0xff] << 24);
|
||||
}
|
||||
|
||||
static int aesti_expand_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
|
||||
unsigned int key_len)
|
||||
{
|
||||
u32 kwords = key_len / sizeof(u32);
|
||||
u32 rc, i, j;
|
||||
|
||||
if (key_len != AES_KEYSIZE_128 &&
|
||||
key_len != AES_KEYSIZE_192 &&
|
||||
key_len != AES_KEYSIZE_256)
|
||||
return -EINVAL;
|
||||
|
||||
ctx->key_length = key_len;
|
||||
|
||||
for (i = 0; i < kwords; i++)
|
||||
ctx->key_enc[i] = get_unaligned_le32(in_key + i * sizeof(u32));
|
||||
|
||||
for (i = 0, rc = 1; i < 10; i++, rc = mul_by_x(rc)) {
|
||||
u32 *rki = ctx->key_enc + (i * kwords);
|
||||
u32 *rko = rki + kwords;
|
||||
|
||||
rko[0] = ror32(subw(rki[kwords - 1]), 8) ^ rc ^ rki[0];
|
||||
rko[1] = rko[0] ^ rki[1];
|
||||
rko[2] = rko[1] ^ rki[2];
|
||||
rko[3] = rko[2] ^ rki[3];
|
||||
|
||||
if (key_len == 24) {
|
||||
if (i >= 7)
|
||||
break;
|
||||
rko[4] = rko[3] ^ rki[4];
|
||||
rko[5] = rko[4] ^ rki[5];
|
||||
} else if (key_len == 32) {
|
||||
if (i >= 6)
|
||||
break;
|
||||
rko[4] = subw(rko[3]) ^ rki[4];
|
||||
rko[5] = rko[4] ^ rki[5];
|
||||
rko[6] = rko[5] ^ rki[6];
|
||||
rko[7] = rko[6] ^ rki[7];
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Generate the decryption keys for the Equivalent Inverse Cipher.
|
||||
* This involves reversing the order of the round keys, and applying
|
||||
* the Inverse Mix Columns transformation to all but the first and
|
||||
* the last one.
|
||||
*/
|
||||
ctx->key_dec[0] = ctx->key_enc[key_len + 24];
|
||||
ctx->key_dec[1] = ctx->key_enc[key_len + 25];
|
||||
ctx->key_dec[2] = ctx->key_enc[key_len + 26];
|
||||
ctx->key_dec[3] = ctx->key_enc[key_len + 27];
|
||||
|
||||
for (i = 4, j = key_len + 20; j > 0; i += 4, j -= 4) {
|
||||
ctx->key_dec[i] = inv_mix_columns(ctx->key_enc[j]);
|
||||
ctx->key_dec[i + 1] = inv_mix_columns(ctx->key_enc[j + 1]);
|
||||
ctx->key_dec[i + 2] = inv_mix_columns(ctx->key_enc[j + 2]);
|
||||
ctx->key_dec[i + 3] = inv_mix_columns(ctx->key_enc[j + 3]);
|
||||
}
|
||||
|
||||
ctx->key_dec[i] = ctx->key_enc[0];
|
||||
ctx->key_dec[i + 1] = ctx->key_enc[1];
|
||||
ctx->key_dec[i + 2] = ctx->key_enc[2];
|
||||
ctx->key_dec[i + 3] = ctx->key_enc[3];
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int aesti_set_key(struct crypto_tfm *tfm, const u8 *in_key,
|
||||
unsigned int key_len)
|
||||
{
|
||||
struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
|
||||
int err;
|
||||
|
||||
err = aesti_expand_key(ctx, in_key, key_len);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
/*
|
||||
* In order to force the compiler to emit data independent Sbox lookups
|
||||
* at the start of each block, xor the first round key with values at
|
||||
* fixed indexes in the Sbox. This will need to be repeated each time
|
||||
* the key is used, which will pull the entire Sbox into the D-cache
|
||||
* before any data dependent Sbox lookups are performed.
|
||||
*/
|
||||
ctx->key_enc[0] ^= __aesti_sbox[ 0] ^ __aesti_sbox[128];
|
||||
ctx->key_enc[1] ^= __aesti_sbox[32] ^ __aesti_sbox[160];
|
||||
ctx->key_enc[2] ^= __aesti_sbox[64] ^ __aesti_sbox[192];
|
||||
ctx->key_enc[3] ^= __aesti_sbox[96] ^ __aesti_sbox[224];
|
||||
|
||||
ctx->key_dec[0] ^= __aesti_inv_sbox[ 0] ^ __aesti_inv_sbox[128];
|
||||
ctx->key_dec[1] ^= __aesti_inv_sbox[32] ^ __aesti_inv_sbox[160];
|
||||
ctx->key_dec[2] ^= __aesti_inv_sbox[64] ^ __aesti_inv_sbox[192];
|
||||
ctx->key_dec[3] ^= __aesti_inv_sbox[96] ^ __aesti_inv_sbox[224];
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void aesti_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
|
||||
{
|
||||
const struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
|
||||
const u32 *rkp = ctx->key_enc + 4;
|
||||
int rounds = 6 + ctx->key_length / 4;
|
||||
u32 st0[4], st1[4];
|
||||
int round;
|
||||
|
||||
st0[0] = ctx->key_enc[0] ^ get_unaligned_le32(in);
|
||||
st0[1] = ctx->key_enc[1] ^ get_unaligned_le32(in + 4);
|
||||
st0[2] = ctx->key_enc[2] ^ get_unaligned_le32(in + 8);
|
||||
st0[3] = ctx->key_enc[3] ^ get_unaligned_le32(in + 12);
|
||||
|
||||
st0[0] ^= __aesti_sbox[ 0] ^ __aesti_sbox[128];
|
||||
st0[1] ^= __aesti_sbox[32] ^ __aesti_sbox[160];
|
||||
st0[2] ^= __aesti_sbox[64] ^ __aesti_sbox[192];
|
||||
st0[3] ^= __aesti_sbox[96] ^ __aesti_sbox[224];
|
||||
|
||||
for (round = 0;; round += 2, rkp += 8) {
|
||||
st1[0] = mix_columns(subshift(st0, 0)) ^ rkp[0];
|
||||
st1[1] = mix_columns(subshift(st0, 1)) ^ rkp[1];
|
||||
st1[2] = mix_columns(subshift(st0, 2)) ^ rkp[2];
|
||||
st1[3] = mix_columns(subshift(st0, 3)) ^ rkp[3];
|
||||
|
||||
if (round == rounds - 2)
|
||||
break;
|
||||
|
||||
st0[0] = mix_columns(subshift(st1, 0)) ^ rkp[4];
|
||||
st0[1] = mix_columns(subshift(st1, 1)) ^ rkp[5];
|
||||
st0[2] = mix_columns(subshift(st1, 2)) ^ rkp[6];
|
||||
st0[3] = mix_columns(subshift(st1, 3)) ^ rkp[7];
|
||||
}
|
||||
|
||||
put_unaligned_le32(subshift(st1, 0) ^ rkp[4], out);
|
||||
put_unaligned_le32(subshift(st1, 1) ^ rkp[5], out + 4);
|
||||
put_unaligned_le32(subshift(st1, 2) ^ rkp[6], out + 8);
|
||||
put_unaligned_le32(subshift(st1, 3) ^ rkp[7], out + 12);
|
||||
}
|
||||
|
||||
static void aesti_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
|
||||
{
|
||||
const struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
|
||||
const u32 *rkp = ctx->key_dec + 4;
|
||||
int rounds = 6 + ctx->key_length / 4;
|
||||
u32 st0[4], st1[4];
|
||||
int round;
|
||||
|
||||
st0[0] = ctx->key_dec[0] ^ get_unaligned_le32(in);
|
||||
st0[1] = ctx->key_dec[1] ^ get_unaligned_le32(in + 4);
|
||||
st0[2] = ctx->key_dec[2] ^ get_unaligned_le32(in + 8);
|
||||
st0[3] = ctx->key_dec[3] ^ get_unaligned_le32(in + 12);
|
||||
|
||||
st0[0] ^= __aesti_inv_sbox[ 0] ^ __aesti_inv_sbox[128];
|
||||
st0[1] ^= __aesti_inv_sbox[32] ^ __aesti_inv_sbox[160];
|
||||
st0[2] ^= __aesti_inv_sbox[64] ^ __aesti_inv_sbox[192];
|
||||
st0[3] ^= __aesti_inv_sbox[96] ^ __aesti_inv_sbox[224];
|
||||
|
||||
for (round = 0;; round += 2, rkp += 8) {
|
||||
st1[0] = inv_mix_columns(inv_subshift(st0, 0)) ^ rkp[0];
|
||||
st1[1] = inv_mix_columns(inv_subshift(st0, 1)) ^ rkp[1];
|
||||
st1[2] = inv_mix_columns(inv_subshift(st0, 2)) ^ rkp[2];
|
||||
st1[3] = inv_mix_columns(inv_subshift(st0, 3)) ^ rkp[3];
|
||||
|
||||
if (round == rounds - 2)
|
||||
break;
|
||||
|
||||
st0[0] = inv_mix_columns(inv_subshift(st1, 0)) ^ rkp[4];
|
||||
st0[1] = inv_mix_columns(inv_subshift(st1, 1)) ^ rkp[5];
|
||||
st0[2] = inv_mix_columns(inv_subshift(st1, 2)) ^ rkp[6];
|
||||
st0[3] = inv_mix_columns(inv_subshift(st1, 3)) ^ rkp[7];
|
||||
}
|
||||
|
||||
put_unaligned_le32(inv_subshift(st1, 0) ^ rkp[4], out);
|
||||
put_unaligned_le32(inv_subshift(st1, 1) ^ rkp[5], out + 4);
|
||||
put_unaligned_le32(inv_subshift(st1, 2) ^ rkp[6], out + 8);
|
||||
put_unaligned_le32(inv_subshift(st1, 3) ^ rkp[7], out + 12);
|
||||
}
|
||||
|
||||
static struct crypto_alg aes_alg = {
|
||||
.cra_name = "aes",
|
||||
.cra_driver_name = "aes-fixed-time",
|
||||
.cra_priority = 100 + 1,
|
||||
.cra_flags = CRYPTO_ALG_TYPE_CIPHER,
|
||||
.cra_blocksize = AES_BLOCK_SIZE,
|
||||
.cra_ctxsize = sizeof(struct crypto_aes_ctx),
|
||||
.cra_module = THIS_MODULE,
|
||||
|
||||
.cra_cipher.cia_min_keysize = AES_MIN_KEY_SIZE,
|
||||
.cra_cipher.cia_max_keysize = AES_MAX_KEY_SIZE,
|
||||
.cra_cipher.cia_setkey = aesti_set_key,
|
||||
.cra_cipher.cia_encrypt = aesti_encrypt,
|
||||
.cra_cipher.cia_decrypt = aesti_decrypt
|
||||
};
|
||||
|
||||
static int __init aes_init(void)
|
||||
{
|
||||
return crypto_register_alg(&aes_alg);
|
||||
}
|
||||
|
||||
static void __exit aes_fini(void)
|
||||
{
|
||||
crypto_unregister_alg(&aes_alg);
|
||||
}
|
||||
|
||||
module_init(aes_init);
|
||||
module_exit(aes_fini);
|
||||
|
||||
MODULE_DESCRIPTION("Generic fixed time AES");
|
||||
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
|
||||
MODULE_LICENSE("GPL v2");
|
@ -23,6 +23,7 @@
|
||||
#include <linux/slab.h>
|
||||
#include <linux/seq_file.h>
|
||||
#include <linux/cryptouser.h>
|
||||
#include <linux/compiler.h>
|
||||
#include <net/netlink.h>
|
||||
|
||||
#include "internal.h"
|
||||
@ -493,7 +494,7 @@ static int crypto_ahash_report(struct sk_buff *skb, struct crypto_alg *alg)
|
||||
#endif
|
||||
|
||||
static void crypto_ahash_show(struct seq_file *m, struct crypto_alg *alg)
|
||||
__attribute__ ((unused));
|
||||
__maybe_unused;
|
||||
static void crypto_ahash_show(struct seq_file *m, struct crypto_alg *alg)
|
||||
{
|
||||
seq_printf(m, "type : ahash\n");
|
||||
|
@ -17,6 +17,7 @@
|
||||
#include <linux/slab.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/crypto.h>
|
||||
#include <linux/compiler.h>
|
||||
#include <crypto/algapi.h>
|
||||
#include <linux/cryptouser.h>
|
||||
#include <net/netlink.h>
|
||||
@ -47,7 +48,7 @@ static int crypto_akcipher_report(struct sk_buff *skb, struct crypto_alg *alg)
|
||||
#endif
|
||||
|
||||
static void crypto_akcipher_show(struct seq_file *m, struct crypto_alg *alg)
|
||||
__attribute__ ((unused));
|
||||
__maybe_unused;
|
||||
|
||||
static void crypto_akcipher_show(struct seq_file *m, struct crypto_alg *alg)
|
||||
{
|
||||
|
@ -962,34 +962,66 @@ void crypto_inc(u8 *a, unsigned int size)
|
||||
__be32 *b = (__be32 *)(a + size);
|
||||
u32 c;
|
||||
|
||||
for (; size >= 4; size -= 4) {
|
||||
c = be32_to_cpu(*--b) + 1;
|
||||
*b = cpu_to_be32(c);
|
||||
if (c)
|
||||
return;
|
||||
}
|
||||
if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ||
|
||||
!((unsigned long)b & (__alignof__(*b) - 1)))
|
||||
for (; size >= 4; size -= 4) {
|
||||
c = be32_to_cpu(*--b) + 1;
|
||||
*b = cpu_to_be32(c);
|
||||
if (c)
|
||||
return;
|
||||
}
|
||||
|
||||
crypto_inc_byte(a, size);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(crypto_inc);
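
crypto_inc() now only takes the 32-bit fast path when the counter is naturally aligned or the architecture handles unaligned accesses efficiently, and falls back to crypto_inc_byte() otherwise. A rough user-space model of the underlying operation, a big-endian counter increment with carry propagation (stand-alone, without the kernel's alignment check), behaves like this:

#include <stdint.h>
#include <stdio.h>

/* Model of crypto_inc(): treat 'a' as a size-byte big-endian counter and
 * add one, propagating the carry from the last byte upwards. */
static void model_crypto_inc(uint8_t *a, unsigned int size)
{
    while (size--)
        if (++a[size])          /* stop as soon as a byte does not wrap */
            break;
}

int main(void)
{
    uint8_t ctr[16] = { [14] = 0xff, [15] = 0xff };

    model_crypto_inc(ctr, sizeof(ctr));     /* ...00 ff ff -> ...01 00 00 */
    printf("%02x %02x %02x\n", ctr[13], ctr[14], ctr[15]);
    return 0;
}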
|
||||
|
||||
static inline void crypto_xor_byte(u8 *a, const u8 *b, unsigned int size)
|
||||
void __crypto_xor(u8 *dst, const u8 *src, unsigned int len)
|
||||
{
|
||||
for (; size; size--)
|
||||
*a++ ^= *b++;
|
||||
int relalign = 0;
|
||||
|
||||
if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) {
|
||||
int size = sizeof(unsigned long);
|
||||
int d = ((unsigned long)dst ^ (unsigned long)src) & (size - 1);
|
||||
|
||||
relalign = d ? 1 << __ffs(d) : size;
|
||||
|
||||
/*
|
||||
* If we care about alignment, process as many bytes as
|
||||
* needed to advance dst and src to values whose alignments
|
||||
* equal their relative alignment. This will allow us to
|
||||
* process the remainder of the input using optimal strides.
|
||||
*/
|
||||
while (((unsigned long)dst & (relalign - 1)) && len > 0) {
|
||||
*dst++ ^= *src++;
|
||||
len--;
|
||||
}
|
||||
}
|
||||
|
||||
while (IS_ENABLED(CONFIG_64BIT) && len >= 8 && !(relalign & 7)) {
|
||||
*(u64 *)dst ^= *(u64 *)src;
|
||||
dst += 8;
|
||||
src += 8;
|
||||
len -= 8;
|
||||
}
|
||||
|
||||
while (len >= 4 && !(relalign & 3)) {
|
||||
*(u32 *)dst ^= *(u32 *)src;
|
||||
dst += 4;
|
||||
src += 4;
|
||||
len -= 4;
|
||||
}
|
||||
|
||||
while (len >= 2 && !(relalign & 1)) {
|
||||
*(u16 *)dst ^= *(u16 *)src;
|
||||
dst += 2;
|
||||
src += 2;
|
||||
len -= 2;
|
||||
}
|
||||
|
||||
while (len--)
|
||||
*dst++ ^= *src++;
|
||||
}
|
||||
|
||||
void crypto_xor(u8 *dst, const u8 *src, unsigned int size)
|
||||
{
|
||||
u32 *a = (u32 *)dst;
|
||||
u32 *b = (u32 *)src;
|
||||
|
||||
for (; size >= 4; size -= 4)
|
||||
*a++ ^= *b++;
|
||||
|
||||
crypto_xor_byte((u8 *)a, (u8 *)b, size);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(crypto_xor);
|
||||
EXPORT_SYMBOL_GPL(__crypto_xor);
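
The new __crypto_xor() only needs dst and src to share an alignment residue: the relative alignment of the two pointers decides the widest stride that can be used once a short byte-wise prologue has lined dst up. A small user-space sketch of that computation, with __builtin_ctzl() standing in for the kernel's __ffs():

#include <stdint.h>
#include <stdio.h>

/* Sketch of the relative-alignment computation in __crypto_xor(): relalign
 * is the largest power-of-two stride at which dst and src can be brought to
 * the same alignment, capped at sizeof(long). */
static unsigned int relative_alignment(const void *dst, const void *src)
{
    unsigned long size = sizeof(unsigned long);
    unsigned long d = ((unsigned long)dst ^ (unsigned long)src) & (size - 1);

    return d ? 1u << __builtin_ctzl(d) : size;
}

int main(void)
{
    uint8_t buf[32];

    /* dst % 8 == 3, src % 8 == 7: 4-byte strides are still possible */
    printf("relalign = %u\n", relative_alignment(buf + 3, buf + 7));
    return 0;
}

For example, dst % 8 == 3 and src % 8 == 7 give relalign == 4, so after one leading byte the loop can proceed in 4-byte words even though neither buffer is 8-byte aligned.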
|
||||
|
||||
unsigned int crypto_alg_extsize(struct crypto_alg *alg)
|
||||
{
|
||||
|
@ -245,7 +245,7 @@ static int hash_accept(struct socket *sock, struct socket *newsock, int flags)
|
||||
struct alg_sock *ask = alg_sk(sk);
|
||||
struct hash_ctx *ctx = ask->private;
|
||||
struct ahash_request *req = &ctx->req;
|
||||
char state[crypto_ahash_statesize(crypto_ahash_reqtfm(req))];
|
||||
char state[crypto_ahash_statesize(crypto_ahash_reqtfm(req)) ? : 1];
|
||||
struct sock *sk2;
|
||||
struct alg_sock *ask2;
|
||||
struct hash_ctx *ctx2;
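
The hash_accept() change guards against a zero-sized on-stack state buffer: an ahash that does not support export can report a statesize of 0, and a zero-length variable-length array is undefined. The GNU "a ? : b" shorthand evaluates to a when a is non-zero and to b otherwise, so the VLA is always at least one byte. A minimal illustration of the idiom (a GCC/Clang extension, not standard C):

#include <stdio.h>

/* GNU "elvis" operator: a ?: b  ==  a ? a : b, with a evaluated only once. */
static void make_buffer(unsigned int statesize)
{
    char state[statesize ? : 1];        /* never a zero-length VLA */

    printf("statesize=%u -> buffer of %zu bytes\n", statesize, sizeof(state));
}

int main(void)
{
    make_buffer(0);     /* -> 1 byte   */
    make_buffer(64);    /* -> 64 bytes */
    return 0;
}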
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
* Block chaining cipher operations.
|
||||
*
|
||||
*
|
||||
* Generic encrypt/decrypt wrapper for ciphers, handles operations across
|
||||
* multiple page boundaries by using temporary blocks. In user context,
|
||||
* the kernel is given a chance to schedule us once per page.
|
||||
@ -9,7 +9,7 @@
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License as published by the Free
|
||||
* Software Foundation; either version 2 of the License, or (at your option)
|
||||
* Software Foundation; either version 2 of the License, or (at your option)
|
||||
* any later version.
|
||||
*
|
||||
*/
|
||||
@ -25,6 +25,7 @@
|
||||
#include <linux/slab.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/cryptouser.h>
|
||||
#include <linux/compiler.h>
|
||||
#include <net/netlink.h>
|
||||
|
||||
#include "internal.h"
|
||||
@ -534,7 +535,7 @@ static int crypto_blkcipher_report(struct sk_buff *skb, struct crypto_alg *alg)
|
||||
#endif
|
||||
|
||||
static void crypto_blkcipher_show(struct seq_file *m, struct crypto_alg *alg)
|
||||
__attribute__ ((unused));
|
||||
__maybe_unused;
|
||||
static void crypto_blkcipher_show(struct seq_file *m, struct crypto_alg *alg)
|
||||
{
|
||||
seq_printf(m, "type : blkcipher\n");
|
||||
|
@ -145,9 +145,6 @@ static int crypto_cbc_create(struct crypto_template *tmpl, struct rtattr **tb)
|
||||
inst->alg.base.cra_blocksize = alg->cra_blocksize;
|
||||
inst->alg.base.cra_alignmask = alg->cra_alignmask;
|
||||
|
||||
/* We access the data as u32s when xoring. */
|
||||
inst->alg.base.cra_alignmask |= __alignof__(u32) - 1;
|
||||
|
||||
inst->alg.ivsize = alg->cra_blocksize;
|
||||
inst->alg.min_keysize = alg->cra_cipher.cia_min_keysize;
|
||||
inst->alg.max_keysize = alg->cra_cipher.cia_max_keysize;
|
||||
|
386	crypto/ccm.c
@@ -11,6 +11,7 @@
|
||||
*/
|
||||
|
||||
#include <crypto/internal/aead.h>
|
||||
#include <crypto/internal/hash.h>
|
||||
#include <crypto/internal/skcipher.h>
|
||||
#include <crypto/scatterwalk.h>
|
||||
#include <linux/err.h>
|
||||
@ -23,11 +24,11 @@
|
||||
|
||||
struct ccm_instance_ctx {
|
||||
struct crypto_skcipher_spawn ctr;
|
||||
struct crypto_spawn cipher;
|
||||
struct crypto_ahash_spawn mac;
|
||||
};
|
||||
|
||||
struct crypto_ccm_ctx {
|
||||
struct crypto_cipher *cipher;
|
||||
struct crypto_ahash *mac;
|
||||
struct crypto_skcipher *ctr;
|
||||
};
|
||||
|
||||
@ -44,15 +45,21 @@ struct crypto_rfc4309_req_ctx {
|
||||
|
||||
struct crypto_ccm_req_priv_ctx {
|
||||
u8 odata[16];
|
||||
u8 idata[16];
|
||||
u8 auth_tag[16];
|
||||
u32 ilen;
|
||||
u32 flags;
|
||||
struct scatterlist src[3];
|
||||
struct scatterlist dst[3];
|
||||
struct skcipher_request skreq;
|
||||
};
|
||||
|
||||
struct cbcmac_tfm_ctx {
|
||||
struct crypto_cipher *child;
|
||||
};
|
||||
|
||||
struct cbcmac_desc_ctx {
|
||||
unsigned int len;
|
||||
};
|
||||
|
||||
static inline struct crypto_ccm_req_priv_ctx *crypto_ccm_reqctx(
|
||||
struct aead_request *req)
|
||||
{
|
||||
@ -84,7 +91,7 @@ static int crypto_ccm_setkey(struct crypto_aead *aead, const u8 *key,
|
||||
{
|
||||
struct crypto_ccm_ctx *ctx = crypto_aead_ctx(aead);
|
||||
struct crypto_skcipher *ctr = ctx->ctr;
|
||||
struct crypto_cipher *tfm = ctx->cipher;
|
||||
struct crypto_ahash *mac = ctx->mac;
|
||||
int err = 0;
|
||||
|
||||
crypto_skcipher_clear_flags(ctr, CRYPTO_TFM_REQ_MASK);
|
||||
@ -96,11 +103,11 @@ static int crypto_ccm_setkey(struct crypto_aead *aead, const u8 *key,
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
crypto_cipher_clear_flags(tfm, CRYPTO_TFM_REQ_MASK);
|
||||
crypto_cipher_set_flags(tfm, crypto_aead_get_flags(aead) &
|
||||
crypto_ahash_clear_flags(mac, CRYPTO_TFM_REQ_MASK);
|
||||
crypto_ahash_set_flags(mac, crypto_aead_get_flags(aead) &
|
||||
CRYPTO_TFM_REQ_MASK);
|
||||
err = crypto_cipher_setkey(tfm, key, keylen);
|
||||
crypto_aead_set_flags(aead, crypto_cipher_get_flags(tfm) &
|
||||
err = crypto_ahash_setkey(mac, key, keylen);
|
||||
crypto_aead_set_flags(aead, crypto_ahash_get_flags(mac) &
|
||||
CRYPTO_TFM_RES_MASK);
|
||||
|
||||
out:
|
||||
@ -167,119 +174,61 @@ static int format_adata(u8 *adata, unsigned int a)
|
||||
return len;
|
||||
}
|
||||
|
||||
static void compute_mac(struct crypto_cipher *tfm, u8 *data, int n,
|
||||
struct crypto_ccm_req_priv_ctx *pctx)
|
||||
{
|
||||
unsigned int bs = 16;
|
||||
u8 *odata = pctx->odata;
|
||||
u8 *idata = pctx->idata;
|
||||
int datalen, getlen;
|
||||
|
||||
datalen = n;
|
||||
|
||||
/* first time in here, block may be partially filled. */
|
||||
getlen = bs - pctx->ilen;
|
||||
if (datalen >= getlen) {
|
||||
memcpy(idata + pctx->ilen, data, getlen);
|
||||
crypto_xor(odata, idata, bs);
|
||||
crypto_cipher_encrypt_one(tfm, odata, odata);
|
||||
datalen -= getlen;
|
||||
data += getlen;
|
||||
pctx->ilen = 0;
|
||||
}
|
||||
|
||||
/* now encrypt rest of data */
|
||||
while (datalen >= bs) {
|
||||
crypto_xor(odata, data, bs);
|
||||
crypto_cipher_encrypt_one(tfm, odata, odata);
|
||||
|
||||
datalen -= bs;
|
||||
data += bs;
|
||||
}
|
||||
|
||||
/* check and see if there's leftover data that wasn't
|
||||
* enough to fill a block.
|
||||
*/
|
||||
if (datalen) {
|
||||
memcpy(idata + pctx->ilen, data, datalen);
|
||||
pctx->ilen += datalen;
|
||||
}
|
||||
}
|
||||
|
||||
static void get_data_to_compute(struct crypto_cipher *tfm,
|
||||
struct crypto_ccm_req_priv_ctx *pctx,
|
||||
struct scatterlist *sg, unsigned int len)
|
||||
{
|
||||
struct scatter_walk walk;
|
||||
u8 *data_src;
|
||||
int n;
|
||||
|
||||
scatterwalk_start(&walk, sg);
|
||||
|
||||
while (len) {
|
||||
n = scatterwalk_clamp(&walk, len);
|
||||
if (!n) {
|
||||
scatterwalk_start(&walk, sg_next(walk.sg));
|
||||
n = scatterwalk_clamp(&walk, len);
|
||||
}
|
||||
data_src = scatterwalk_map(&walk);
|
||||
|
||||
compute_mac(tfm, data_src, n, pctx);
|
||||
len -= n;
|
||||
|
||||
scatterwalk_unmap(data_src);
|
||||
scatterwalk_advance(&walk, n);
|
||||
scatterwalk_done(&walk, 0, len);
|
||||
if (len)
|
||||
crypto_yield(pctx->flags);
|
||||
}
|
||||
|
||||
/* any leftover needs padding and then encrypted */
|
||||
if (pctx->ilen) {
|
||||
int padlen;
|
||||
u8 *odata = pctx->odata;
|
||||
u8 *idata = pctx->idata;
|
||||
|
||||
padlen = 16 - pctx->ilen;
|
||||
memset(idata + pctx->ilen, 0, padlen);
|
||||
crypto_xor(odata, idata, 16);
|
||||
crypto_cipher_encrypt_one(tfm, odata, odata);
|
||||
pctx->ilen = 0;
|
||||
}
|
||||
}
|
||||
|
||||
static int crypto_ccm_auth(struct aead_request *req, struct scatterlist *plain,
|
||||
unsigned int cryptlen)
|
||||
{
|
||||
struct crypto_ccm_req_priv_ctx *pctx = crypto_ccm_reqctx(req);
|
||||
struct crypto_aead *aead = crypto_aead_reqtfm(req);
|
||||
struct crypto_ccm_ctx *ctx = crypto_aead_ctx(aead);
|
||||
struct crypto_ccm_req_priv_ctx *pctx = crypto_ccm_reqctx(req);
|
||||
struct crypto_cipher *cipher = ctx->cipher;
|
||||
AHASH_REQUEST_ON_STACK(ahreq, ctx->mac);
|
||||
unsigned int assoclen = req->assoclen;
|
||||
u8 *odata = pctx->odata;
|
||||
u8 *idata = pctx->idata;
|
||||
int err;
|
||||
struct scatterlist sg[3];
|
||||
u8 odata[16];
|
||||
u8 idata[16];
|
||||
int ilen, err;
|
||||
|
||||
/* format control data for input */
|
||||
err = format_input(odata, req, cryptlen);
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
/* encrypt first block to use as start in computing mac */
|
||||
crypto_cipher_encrypt_one(cipher, odata, odata);
|
||||
sg_init_table(sg, 3);
|
||||
sg_set_buf(&sg[0], odata, 16);
|
||||
|
||||
/* format associated data and compute into mac */
|
||||
if (assoclen) {
|
||||
pctx->ilen = format_adata(idata, assoclen);
|
||||
get_data_to_compute(cipher, pctx, req->src, req->assoclen);
|
||||
ilen = format_adata(idata, assoclen);
|
||||
sg_set_buf(&sg[1], idata, ilen);
|
||||
sg_chain(sg, 3, req->src);
|
||||
} else {
|
||||
pctx->ilen = 0;
|
||||
ilen = 0;
|
||||
sg_chain(sg, 2, req->src);
|
||||
}
|
||||
|
||||
/* compute plaintext into mac */
|
||||
if (cryptlen)
|
||||
get_data_to_compute(cipher, pctx, plain, cryptlen);
|
||||
ahash_request_set_tfm(ahreq, ctx->mac);
|
||||
ahash_request_set_callback(ahreq, pctx->flags, NULL, NULL);
|
||||
ahash_request_set_crypt(ahreq, sg, NULL, assoclen + ilen + 16);
|
||||
err = crypto_ahash_init(ahreq);
|
||||
if (err)
|
||||
goto out;
|
||||
err = crypto_ahash_update(ahreq);
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
/* we need to pad the MAC input to a round multiple of the block size */
|
||||
ilen = 16 - (assoclen + ilen) % 16;
|
||||
if (ilen < 16) {
|
||||
memset(idata, 0, ilen);
|
||||
sg_init_table(sg, 2);
|
||||
sg_set_buf(&sg[0], idata, ilen);
|
||||
if (plain)
|
||||
sg_chain(sg, 2, plain);
|
||||
plain = sg;
|
||||
cryptlen += ilen;
|
||||
}
|
||||
|
||||
ahash_request_set_crypt(ahreq, plain, pctx->odata, cryptlen);
|
||||
err = crypto_ahash_finup(ahreq);
|
||||
out:
|
||||
return err;
|
||||
}
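
The rewritten crypto_ccm_auth() feeds the B_0 block, the length-encoded associated data and the plaintext to the cbcmac ahash as one scatterlist chain, and only inserts an explicit zero pad when the AAD portion does not already end on a 16-byte boundary. The padding arithmetic is worth spelling out; a tiny stand-alone check of the "16 - (assoclen + ilen) % 16" expression (names reused from the function above) shows that a result of exactly 16 means "already aligned, add nothing":

#include <stdio.h>

/* Pad needed so that the AAD section (B_0 is already 16 bytes, so only
 * assoclen plus the adata length header matter) becomes a multiple of the
 * AES block size; a result of 16 means the if (ilen < 16) branch is skipped
 * and no padding is inserted. */
static unsigned int ccm_aad_padlen(unsigned int assoclen, unsigned int ilen)
{
    return 16 - (assoclen + ilen) % 16;
}

int main(void)
{
    printf("%u\n", ccm_aad_padlen(13, 2));  /* 15 bytes used -> pad 1            */
    printf("%u\n", ccm_aad_padlen(14, 2));  /* 16 bytes used -> pad 16 (skipped) */
    printf("%u\n", ccm_aad_padlen(20, 2));  /* 22 bytes used -> pad 10           */
    return 0;
}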
|
||||
@ -453,21 +402,21 @@ static int crypto_ccm_init_tfm(struct crypto_aead *tfm)
|
||||
struct aead_instance *inst = aead_alg_instance(tfm);
|
||||
struct ccm_instance_ctx *ictx = aead_instance_ctx(inst);
|
||||
struct crypto_ccm_ctx *ctx = crypto_aead_ctx(tfm);
|
||||
struct crypto_cipher *cipher;
|
||||
struct crypto_ahash *mac;
|
||||
struct crypto_skcipher *ctr;
|
||||
unsigned long align;
|
||||
int err;
|
||||
|
||||
cipher = crypto_spawn_cipher(&ictx->cipher);
|
||||
if (IS_ERR(cipher))
|
||||
return PTR_ERR(cipher);
|
||||
mac = crypto_spawn_ahash(&ictx->mac);
|
||||
if (IS_ERR(mac))
|
||||
return PTR_ERR(mac);
|
||||
|
||||
ctr = crypto_spawn_skcipher(&ictx->ctr);
|
||||
err = PTR_ERR(ctr);
|
||||
if (IS_ERR(ctr))
|
||||
goto err_free_cipher;
|
||||
goto err_free_mac;
|
||||
|
||||
ctx->cipher = cipher;
|
||||
ctx->mac = mac;
|
||||
ctx->ctr = ctr;
|
||||
|
||||
align = crypto_aead_alignmask(tfm);
|
||||
@ -479,8 +428,8 @@ static int crypto_ccm_init_tfm(struct crypto_aead *tfm)
|
||||
|
||||
return 0;
|
||||
|
||||
err_free_cipher:
|
||||
crypto_free_cipher(cipher);
|
||||
err_free_mac:
|
||||
crypto_free_ahash(mac);
|
||||
return err;
|
||||
}
|
||||
|
||||
@ -488,7 +437,7 @@ static void crypto_ccm_exit_tfm(struct crypto_aead *tfm)
|
||||
{
|
||||
struct crypto_ccm_ctx *ctx = crypto_aead_ctx(tfm);
|
||||
|
||||
crypto_free_cipher(ctx->cipher);
|
||||
crypto_free_ahash(ctx->mac);
|
||||
crypto_free_skcipher(ctx->ctr);
|
||||
}
|
||||
|
||||
@ -496,7 +445,7 @@ static void crypto_ccm_free(struct aead_instance *inst)
|
||||
{
|
||||
struct ccm_instance_ctx *ctx = aead_instance_ctx(inst);
|
||||
|
||||
crypto_drop_spawn(&ctx->cipher);
|
||||
crypto_drop_ahash(&ctx->mac);
|
||||
crypto_drop_skcipher(&ctx->ctr);
|
||||
kfree(inst);
|
||||
}
|
||||
@ -505,12 +454,13 @@ static int crypto_ccm_create_common(struct crypto_template *tmpl,
|
||||
struct rtattr **tb,
|
||||
const char *full_name,
|
||||
const char *ctr_name,
|
||||
const char *cipher_name)
|
||||
const char *mac_name)
|
||||
{
|
||||
struct crypto_attr_type *algt;
|
||||
struct aead_instance *inst;
|
||||
struct skcipher_alg *ctr;
|
||||
struct crypto_alg *cipher;
|
||||
struct crypto_alg *mac_alg;
|
||||
struct hash_alg_common *mac;
|
||||
struct ccm_instance_ctx *ictx;
|
||||
int err;
|
||||
|
||||
@ -521,25 +471,26 @@ static int crypto_ccm_create_common(struct crypto_template *tmpl,
|
||||
if ((algt->type ^ CRYPTO_ALG_TYPE_AEAD) & algt->mask)
|
||||
return -EINVAL;
|
||||
|
||||
cipher = crypto_alg_mod_lookup(cipher_name, CRYPTO_ALG_TYPE_CIPHER,
|
||||
CRYPTO_ALG_TYPE_MASK);
|
||||
if (IS_ERR(cipher))
|
||||
return PTR_ERR(cipher);
|
||||
mac_alg = crypto_find_alg(mac_name, &crypto_ahash_type,
|
||||
CRYPTO_ALG_TYPE_HASH,
|
||||
CRYPTO_ALG_TYPE_AHASH_MASK |
|
||||
CRYPTO_ALG_ASYNC);
|
||||
if (IS_ERR(mac_alg))
|
||||
return PTR_ERR(mac_alg);
|
||||
|
||||
mac = __crypto_hash_alg_common(mac_alg);
|
||||
err = -EINVAL;
|
||||
if (cipher->cra_blocksize != 16)
|
||||
goto out_put_cipher;
|
||||
if (mac->digestsize != 16)
|
||||
goto out_put_mac;
|
||||
|
||||
inst = kzalloc(sizeof(*inst) + sizeof(*ictx), GFP_KERNEL);
|
||||
err = -ENOMEM;
|
||||
if (!inst)
|
||||
goto out_put_cipher;
|
||||
goto out_put_mac;
|
||||
|
||||
ictx = aead_instance_ctx(inst);
|
||||
|
||||
err = crypto_init_spawn(&ictx->cipher, cipher,
|
||||
aead_crypto_instance(inst),
|
||||
CRYPTO_ALG_TYPE_MASK);
|
||||
err = crypto_init_ahash_spawn(&ictx->mac, mac,
|
||||
aead_crypto_instance(inst));
|
||||
if (err)
|
||||
goto err_free_inst;
|
||||
|
||||
@ -548,7 +499,7 @@ static int crypto_ccm_create_common(struct crypto_template *tmpl,
|
||||
crypto_requires_sync(algt->type,
|
||||
algt->mask));
|
||||
if (err)
|
||||
goto err_drop_cipher;
|
||||
goto err_drop_mac;
|
||||
|
||||
ctr = crypto_spawn_skcipher_alg(&ictx->ctr);
|
||||
|
||||
@ -564,18 +515,17 @@ static int crypto_ccm_create_common(struct crypto_template *tmpl,
|
||||
err = -ENAMETOOLONG;
|
||||
if (snprintf(inst->alg.base.cra_driver_name, CRYPTO_MAX_ALG_NAME,
|
||||
"ccm_base(%s,%s)", ctr->base.cra_driver_name,
|
||||
cipher->cra_driver_name) >= CRYPTO_MAX_ALG_NAME)
|
||||
mac->base.cra_driver_name) >= CRYPTO_MAX_ALG_NAME)
|
||||
goto err_drop_ctr;
|
||||
|
||||
memcpy(inst->alg.base.cra_name, full_name, CRYPTO_MAX_ALG_NAME);
|
||||
|
||||
inst->alg.base.cra_flags = ctr->base.cra_flags & CRYPTO_ALG_ASYNC;
|
||||
inst->alg.base.cra_priority = (cipher->cra_priority +
|
||||
inst->alg.base.cra_priority = (mac->base.cra_priority +
|
||||
ctr->base.cra_priority) / 2;
|
||||
inst->alg.base.cra_blocksize = 1;
|
||||
inst->alg.base.cra_alignmask = cipher->cra_alignmask |
|
||||
ctr->base.cra_alignmask |
|
||||
(__alignof__(u32) - 1);
|
||||
inst->alg.base.cra_alignmask = mac->base.cra_alignmask |
|
||||
ctr->base.cra_alignmask;
|
||||
inst->alg.ivsize = 16;
|
||||
inst->alg.chunksize = crypto_skcipher_alg_chunksize(ctr);
|
||||
inst->alg.maxauthsize = 16;
|
||||
@ -593,23 +543,24 @@ static int crypto_ccm_create_common(struct crypto_template *tmpl,
|
||||
if (err)
|
||||
goto err_drop_ctr;
|
||||
|
||||
out_put_cipher:
|
||||
crypto_mod_put(cipher);
|
||||
out_put_mac:
|
||||
crypto_mod_put(mac_alg);
|
||||
return err;
|
||||
|
||||
err_drop_ctr:
|
||||
crypto_drop_skcipher(&ictx->ctr);
|
||||
err_drop_cipher:
|
||||
crypto_drop_spawn(&ictx->cipher);
|
||||
err_drop_mac:
|
||||
crypto_drop_ahash(&ictx->mac);
|
||||
err_free_inst:
|
||||
kfree(inst);
|
||||
goto out_put_cipher;
|
||||
goto out_put_mac;
|
||||
}
|
||||
|
||||
static int crypto_ccm_create(struct crypto_template *tmpl, struct rtattr **tb)
|
||||
{
|
||||
const char *cipher_name;
|
||||
char ctr_name[CRYPTO_MAX_ALG_NAME];
|
||||
char mac_name[CRYPTO_MAX_ALG_NAME];
|
||||
char full_name[CRYPTO_MAX_ALG_NAME];
|
||||
|
||||
cipher_name = crypto_attr_alg_name(tb[1]);
|
||||
@ -620,12 +571,16 @@ static int crypto_ccm_create(struct crypto_template *tmpl, struct rtattr **tb)
|
||||
cipher_name) >= CRYPTO_MAX_ALG_NAME)
|
||||
return -ENAMETOOLONG;
|
||||
|
||||
if (snprintf(mac_name, CRYPTO_MAX_ALG_NAME, "cbcmac(%s)",
|
||||
cipher_name) >= CRYPTO_MAX_ALG_NAME)
|
||||
return -ENAMETOOLONG;
|
||||
|
||||
if (snprintf(full_name, CRYPTO_MAX_ALG_NAME, "ccm(%s)", cipher_name) >=
|
||||
CRYPTO_MAX_ALG_NAME)
|
||||
return -ENAMETOOLONG;
|
||||
|
||||
return crypto_ccm_create_common(tmpl, tb, full_name, ctr_name,
|
||||
cipher_name);
|
||||
mac_name);
|
||||
}
|
||||
|
||||
static struct crypto_template crypto_ccm_tmpl = {
|
||||
@ -899,14 +854,164 @@ static struct crypto_template crypto_rfc4309_tmpl = {
|
||||
.module = THIS_MODULE,
|
||||
};
|
||||
|
||||
static int crypto_cbcmac_digest_setkey(struct crypto_shash *parent,
|
||||
const u8 *inkey, unsigned int keylen)
|
||||
{
|
||||
struct cbcmac_tfm_ctx *ctx = crypto_shash_ctx(parent);
|
||||
|
||||
return crypto_cipher_setkey(ctx->child, inkey, keylen);
|
||||
}
|
||||
|
||||
static int crypto_cbcmac_digest_init(struct shash_desc *pdesc)
|
||||
{
|
||||
struct cbcmac_desc_ctx *ctx = shash_desc_ctx(pdesc);
|
||||
int bs = crypto_shash_digestsize(pdesc->tfm);
|
||||
u8 *dg = (u8 *)ctx + crypto_shash_descsize(pdesc->tfm) - bs;
|
||||
|
||||
ctx->len = 0;
|
||||
memset(dg, 0, bs);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int crypto_cbcmac_digest_update(struct shash_desc *pdesc, const u8 *p,
|
||||
unsigned int len)
|
||||
{
|
||||
struct crypto_shash *parent = pdesc->tfm;
|
||||
struct cbcmac_tfm_ctx *tctx = crypto_shash_ctx(parent);
|
||||
struct cbcmac_desc_ctx *ctx = shash_desc_ctx(pdesc);
|
||||
struct crypto_cipher *tfm = tctx->child;
|
||||
int bs = crypto_shash_digestsize(parent);
|
||||
u8 *dg = (u8 *)ctx + crypto_shash_descsize(parent) - bs;
|
||||
|
||||
while (len > 0) {
|
||||
unsigned int l = min(len, bs - ctx->len);
|
||||
|
||||
crypto_xor(dg + ctx->len, p, l);
|
||||
ctx->len += l;
|
||||
len -= l;
|
||||
p += l;
|
||||
|
||||
if (ctx->len == bs) {
|
||||
crypto_cipher_encrypt_one(tfm, dg, dg);
|
||||
ctx->len = 0;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int crypto_cbcmac_digest_final(struct shash_desc *pdesc, u8 *out)
|
||||
{
|
||||
struct crypto_shash *parent = pdesc->tfm;
|
||||
struct cbcmac_tfm_ctx *tctx = crypto_shash_ctx(parent);
|
||||
struct cbcmac_desc_ctx *ctx = shash_desc_ctx(pdesc);
|
||||
struct crypto_cipher *tfm = tctx->child;
|
||||
int bs = crypto_shash_digestsize(parent);
|
||||
u8 *dg = (u8 *)ctx + crypto_shash_descsize(parent) - bs;
|
||||
|
||||
if (ctx->len)
|
||||
crypto_cipher_encrypt_one(tfm, dg, dg);
|
||||
|
||||
memcpy(out, dg, bs);
|
||||
return 0;
|
||||
}
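
The cbcmac shash keeps one block-sized digest buffer plus a fill counter in the descriptor context, XORs input into it and runs the block cipher whenever the buffer fills; final() encrypts one last time only if a partial block is pending. A user-space model of that state machine, with a placeholder block "cipher" standing in for crypto_cipher_encrypt_one() (the real transform is of course AES; the stand-in is not a cipher at all), captures the update/final flow:

#include <stdint.h>
#include <string.h>

#define BS 16

struct cbcmac_state {
    uint8_t dg[BS];         /* running CBC-MAC value */
    unsigned int len;       /* bytes of the current block already absorbed */
};

/* Placeholder for crypto_cipher_encrypt_one(); any 16-byte permutation would
 * do for demonstrating the chaining. */
static void blockcipher_encrypt(uint8_t block[BS])
{
    for (int i = 0; i < BS; i++)
        block[i] = (uint8_t)(block[i] * 5 + i + 1);
}

static void cbcmac_init(struct cbcmac_state *s)
{
    memset(s, 0, sizeof(*s));
}

static void cbcmac_update(struct cbcmac_state *s, const uint8_t *p, size_t len)
{
    while (len > 0) {
        size_t l = BS - s->len;

        if (l > len)
            l = len;
        for (size_t i = 0; i < l; i++)      /* crypto_xor(dg + len, p, l) */
            s->dg[s->len + i] ^= p[i];
        s->len += l;
        p += l;
        len -= l;

        if (s->len == BS) {                 /* full block: run the cipher */
            blockcipher_encrypt(s->dg);
            s->len = 0;
        }
    }
}

static void cbcmac_final(struct cbcmac_state *s, uint8_t out[BS])
{
    if (s->len)                             /* pending partial block */
        blockcipher_encrypt(s->dg);
    memcpy(out, s->dg, BS);
}

Splitting a message across several cbcmac_update() calls gives the same MAC as one large call, which is the property the scatterlist-driven CCM code above relies on.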
|
||||
|
||||
static int cbcmac_init_tfm(struct crypto_tfm *tfm)
|
||||
{
|
||||
struct crypto_cipher *cipher;
|
||||
struct crypto_instance *inst = (void *)tfm->__crt_alg;
|
||||
struct crypto_spawn *spawn = crypto_instance_ctx(inst);
|
||||
struct cbcmac_tfm_ctx *ctx = crypto_tfm_ctx(tfm);
|
||||
|
||||
cipher = crypto_spawn_cipher(spawn);
|
||||
if (IS_ERR(cipher))
|
||||
return PTR_ERR(cipher);
|
||||
|
||||
ctx->child = cipher;
|
||||
|
||||
return 0;
|
||||
};
|
||||
|
||||
static void cbcmac_exit_tfm(struct crypto_tfm *tfm)
|
||||
{
|
||||
struct cbcmac_tfm_ctx *ctx = crypto_tfm_ctx(tfm);
|
||||
crypto_free_cipher(ctx->child);
|
||||
}
|
||||
|
||||
static int cbcmac_create(struct crypto_template *tmpl, struct rtattr **tb)
|
||||
{
|
||||
struct shash_instance *inst;
|
||||
struct crypto_alg *alg;
|
||||
int err;
|
||||
|
||||
err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_SHASH);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
alg = crypto_get_attr_alg(tb, CRYPTO_ALG_TYPE_CIPHER,
|
||||
CRYPTO_ALG_TYPE_MASK);
|
||||
if (IS_ERR(alg))
|
||||
return PTR_ERR(alg);
|
||||
|
||||
inst = shash_alloc_instance("cbcmac", alg);
|
||||
err = PTR_ERR(inst);
|
||||
if (IS_ERR(inst))
|
||||
goto out_put_alg;
|
||||
|
||||
err = crypto_init_spawn(shash_instance_ctx(inst), alg,
|
||||
shash_crypto_instance(inst),
|
||||
CRYPTO_ALG_TYPE_MASK);
|
||||
if (err)
|
||||
goto out_free_inst;
|
||||
|
||||
inst->alg.base.cra_priority = alg->cra_priority;
|
||||
inst->alg.base.cra_blocksize = 1;
|
||||
|
||||
inst->alg.digestsize = alg->cra_blocksize;
|
||||
inst->alg.descsize = ALIGN(sizeof(struct cbcmac_desc_ctx),
|
||||
alg->cra_alignmask + 1) +
|
||||
alg->cra_blocksize;
|
||||
|
||||
inst->alg.base.cra_ctxsize = sizeof(struct cbcmac_tfm_ctx);
|
||||
inst->alg.base.cra_init = cbcmac_init_tfm;
|
||||
inst->alg.base.cra_exit = cbcmac_exit_tfm;
|
||||
|
||||
inst->alg.init = crypto_cbcmac_digest_init;
|
||||
inst->alg.update = crypto_cbcmac_digest_update;
|
||||
inst->alg.final = crypto_cbcmac_digest_final;
|
||||
inst->alg.setkey = crypto_cbcmac_digest_setkey;
|
||||
|
||||
err = shash_register_instance(tmpl, inst);
|
||||
|
||||
out_free_inst:
|
||||
if (err)
|
||||
shash_free_instance(shash_crypto_instance(inst));
|
||||
|
||||
out_put_alg:
|
||||
crypto_mod_put(alg);
|
||||
return err;
|
||||
}
|
||||
|
||||
static struct crypto_template crypto_cbcmac_tmpl = {
|
||||
.name = "cbcmac",
|
||||
.create = cbcmac_create,
|
||||
.free = shash_free_instance,
|
||||
.module = THIS_MODULE,
|
||||
};
|
||||
|
||||
static int __init crypto_ccm_module_init(void)
|
||||
{
|
||||
int err;
|
||||
|
||||
err = crypto_register_template(&crypto_ccm_base_tmpl);
|
||||
err = crypto_register_template(&crypto_cbcmac_tmpl);
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
err = crypto_register_template(&crypto_ccm_base_tmpl);
|
||||
if (err)
|
||||
goto out_undo_cbcmac;
|
||||
|
||||
err = crypto_register_template(&crypto_ccm_tmpl);
|
||||
if (err)
|
||||
goto out_undo_base;
|
||||
@ -922,6 +1027,8 @@ static int __init crypto_ccm_module_init(void)
|
||||
crypto_unregister_template(&crypto_ccm_tmpl);
|
||||
out_undo_base:
|
||||
crypto_unregister_template(&crypto_ccm_base_tmpl);
|
||||
out_undo_cbcmac:
|
||||
crypto_unregister_template(&crypto_cbcmac_tmpl);
|
||||
goto out;
|
||||
}
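
One detail worth calling out in the registration sequence above: each unwind label must undo (unregister) exactly what the corresponding step registered, in reverse order. A generic sketch of the pattern, with hypothetical register_a/b/c() stubs that are placeholders rather than kernel APIs:

#include <stdio.h>

/* Hypothetical subsystems; each register_*() returns 0 on success. */
static int register_a(void) { return 0; }
static int register_b(void) { return 0; }
static int register_c(void) { return -1; }  /* simulate a failure */
static void unregister_a(void) { puts("undo a"); }
static void unregister_b(void) { puts("undo b"); }

static int example_init(void)
{
    int err;

    err = register_a();
    if (err)
        goto out;

    err = register_b();
    if (err)
        goto out_undo_a;

    err = register_c();
    if (err)
        goto out_undo_b;

out:
    return err;

out_undo_b:
    unregister_b();     /* LIFO: roll back the most recent step first */
out_undo_a:
    unregister_a();
    goto out;
}

int main(void)
{
    return example_init() ? 1 : 0;
}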
|
||||
|
||||
@ -930,6 +1037,7 @@ static void __exit crypto_ccm_module_exit(void)
|
||||
crypto_unregister_template(&crypto_rfc4309_tmpl);
|
||||
crypto_unregister_template(&crypto_ccm_tmpl);
|
||||
crypto_unregister_template(&crypto_ccm_base_tmpl);
|
||||
crypto_unregister_template(&crypto_cbcmac_tmpl);
|
||||
}
|
||||
|
||||
module_init(crypto_ccm_module_init);
|
||||
|
@ -10,10 +10,9 @@
|
||||
*/
|
||||
|
||||
#include <crypto/algapi.h>
|
||||
#include <linux/crypto.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/module.h>
|
||||
#include <crypto/chacha20.h>
|
||||
#include <crypto/internal/skcipher.h>
|
||||
#include <linux/module.h>
|
||||
|
||||
static inline u32 le32_to_cpuvp(const void *p)
|
||||
{
|
||||
@ -63,10 +62,10 @@ void crypto_chacha20_init(u32 *state, struct chacha20_ctx *ctx, u8 *iv)
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(crypto_chacha20_init);
|
||||
|
||||
int crypto_chacha20_setkey(struct crypto_tfm *tfm, const u8 *key,
|
||||
int crypto_chacha20_setkey(struct crypto_skcipher *tfm, const u8 *key,
|
||||
unsigned int keysize)
|
||||
{
|
||||
struct chacha20_ctx *ctx = crypto_tfm_ctx(tfm);
|
||||
struct chacha20_ctx *ctx = crypto_skcipher_ctx(tfm);
|
||||
int i;
|
||||
|
||||
if (keysize != CHACHA20_KEY_SIZE)
|
||||
@ -79,66 +78,54 @@ int crypto_chacha20_setkey(struct crypto_tfm *tfm, const u8 *key,
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(crypto_chacha20_setkey);
|
||||
|
||||
int crypto_chacha20_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
|
||||
struct scatterlist *src, unsigned int nbytes)
|
||||
int crypto_chacha20_crypt(struct skcipher_request *req)
|
||||
{
|
||||
struct blkcipher_walk walk;
|
||||
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
|
||||
struct chacha20_ctx *ctx = crypto_skcipher_ctx(tfm);
|
||||
struct skcipher_walk walk;
|
||||
u32 state[16];
|
||||
int err;
|
||||
|
||||
blkcipher_walk_init(&walk, dst, src, nbytes);
|
||||
err = blkcipher_walk_virt_block(desc, &walk, CHACHA20_BLOCK_SIZE);
|
||||
err = skcipher_walk_virt(&walk, req, true);
|
||||
|
||||
crypto_chacha20_init(state, crypto_blkcipher_ctx(desc->tfm), walk.iv);
|
||||
crypto_chacha20_init(state, ctx, walk.iv);
|
||||
|
||||
while (walk.nbytes >= CHACHA20_BLOCK_SIZE) {
|
||||
chacha20_docrypt(state, walk.dst.virt.addr, walk.src.virt.addr,
|
||||
rounddown(walk.nbytes, CHACHA20_BLOCK_SIZE));
|
||||
err = blkcipher_walk_done(desc, &walk,
|
||||
walk.nbytes % CHACHA20_BLOCK_SIZE);
|
||||
}
|
||||
|
||||
if (walk.nbytes) {
|
||||
while (walk.nbytes > 0) {
|
||||
chacha20_docrypt(state, walk.dst.virt.addr, walk.src.virt.addr,
|
||||
walk.nbytes);
|
||||
err = blkcipher_walk_done(desc, &walk, 0);
|
||||
err = skcipher_walk_done(&walk, 0);
|
||||
}
|
||||
|
||||
return err;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(crypto_chacha20_crypt);
|
||||
|
||||
static struct crypto_alg alg = {
|
||||
.cra_name = "chacha20",
|
||||
.cra_driver_name = "chacha20-generic",
|
||||
.cra_priority = 100,
|
||||
.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
|
||||
.cra_blocksize = 1,
|
||||
.cra_type = &crypto_blkcipher_type,
|
||||
.cra_ctxsize = sizeof(struct chacha20_ctx),
|
||||
.cra_alignmask = sizeof(u32) - 1,
|
||||
.cra_module = THIS_MODULE,
|
||||
.cra_u = {
|
||||
.blkcipher = {
|
||||
.min_keysize = CHACHA20_KEY_SIZE,
|
||||
.max_keysize = CHACHA20_KEY_SIZE,
|
||||
.ivsize = CHACHA20_IV_SIZE,
|
||||
.geniv = "seqiv",
|
||||
.setkey = crypto_chacha20_setkey,
|
||||
.encrypt = crypto_chacha20_crypt,
|
||||
.decrypt = crypto_chacha20_crypt,
|
||||
},
|
||||
},
|
||||
static struct skcipher_alg alg = {
|
||||
.base.cra_name = "chacha20",
|
||||
.base.cra_driver_name = "chacha20-generic",
|
||||
.base.cra_priority = 100,
|
||||
.base.cra_blocksize = 1,
|
||||
.base.cra_ctxsize = sizeof(struct chacha20_ctx),
|
||||
.base.cra_alignmask = sizeof(u32) - 1,
|
||||
.base.cra_module = THIS_MODULE,
|
||||
|
||||
.min_keysize = CHACHA20_KEY_SIZE,
|
||||
.max_keysize = CHACHA20_KEY_SIZE,
|
||||
.ivsize = CHACHA20_IV_SIZE,
|
||||
.chunksize = CHACHA20_BLOCK_SIZE,
|
||||
.setkey = crypto_chacha20_setkey,
|
||||
.encrypt = crypto_chacha20_crypt,
|
||||
.decrypt = crypto_chacha20_crypt,
|
||||
};
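
With chacha20 now registered through the skcipher interface, in-kernel callers reach it via the skcipher API rather than the old blkcipher one. The sketch below is not part of this patch; it shows what a caller might look like using the standard crypto_alloc_skcipher()/skcipher_request API of this kernel generation, with error handling kept minimal and the buffer assumed to live in ordinary linear kernel memory (sg_init_one() cannot map a stack buffer):

#include <crypto/chacha20.h>
#include <crypto/skcipher.h>
#include <linux/err.h>
#include <linux/scatterlist.h>

static int chacha20_encrypt_buf(u8 *buf, unsigned int len,
                                const u8 *key, u8 *iv)
{
    struct crypto_skcipher *tfm;
    struct skcipher_request *req;
    struct scatterlist sg;
    int err;

    tfm = crypto_alloc_skcipher("chacha20", 0, 0);
    if (IS_ERR(tfm))
        return PTR_ERR(tfm);

    err = crypto_skcipher_setkey(tfm, key, CHACHA20_KEY_SIZE);
    if (err)
        goto out_free_tfm;

    req = skcipher_request_alloc(tfm, GFP_KERNEL);
    if (!req) {
        err = -ENOMEM;
        goto out_free_tfm;
    }

    sg_init_one(&sg, buf, len);                 /* in-place encryption */
    skcipher_request_set_callback(req, 0, NULL, NULL);
    skcipher_request_set_crypt(req, &sg, &sg, len, iv);

    err = crypto_skcipher_encrypt(req);         /* async drivers may return -EINPROGRESS */

    skcipher_request_free(req);
out_free_tfm:
    crypto_free_skcipher(tfm);
    return err;
}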
|
||||
|
||||
static int __init chacha20_generic_mod_init(void)
|
||||
{
|
||||
return crypto_register_alg(&alg);
|
||||
return crypto_register_skcipher(&alg);
|
||||
}
|
||||
|
||||
static void __exit chacha20_generic_mod_fini(void)
|
||||
{
|
||||
crypto_unregister_alg(&alg);
|
||||
crypto_unregister_skcipher(&alg);
|
||||
}
|
||||
|
||||
module_init(chacha20_generic_mod_init);
|
||||
|
@ -260,8 +260,7 @@ static int cmac_create(struct crypto_template *tmpl, struct rtattr **tb)
|
||||
if (err)
|
||||
goto out_free_inst;
|
||||
|
||||
/* We access the data as u32s when xoring. */
|
||||
alignmask = alg->cra_alignmask | (__alignof__(u32) - 1);
|
||||
alignmask = alg->cra_alignmask;
|
||||
inst->alg.base.cra_alignmask = alignmask;
|
||||
inst->alg.base.cra_priority = alg->cra_priority;
|
||||
inst->alg.base.cra_blocksize = alg->cra_blocksize;
|
||||
|
@ -209,7 +209,7 @@ static struct crypto_instance *crypto_ctr_alloc(struct rtattr **tb)
|
||||
inst->alg.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER;
|
||||
inst->alg.cra_priority = alg->cra_priority;
|
||||
inst->alg.cra_blocksize = 1;
|
||||
inst->alg.cra_alignmask = alg->cra_alignmask | (__alignof__(u32) - 1);
|
||||
inst->alg.cra_alignmask = alg->cra_alignmask;
|
||||
inst->alg.cra_type = &crypto_blkcipher_type;
|
||||
|
||||
inst->alg.cra_blkcipher.ivsize = alg->cra_blocksize;
|
||||
|
@ -49,6 +49,7 @@
|
||||
#include <linux/scatterlist.h>
|
||||
#include <crypto/scatterwalk.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/compiler.h>
|
||||
|
||||
struct crypto_cts_ctx {
|
||||
struct crypto_skcipher *child;
|
||||
@ -103,7 +104,7 @@ static int cts_cbc_encrypt(struct skcipher_request *req)
|
||||
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
|
||||
struct skcipher_request *subreq = &rctx->subreq;
|
||||
int bsize = crypto_skcipher_blocksize(tfm);
|
||||
u8 d[bsize * 2] __attribute__ ((aligned(__alignof__(u32))));
|
||||
u8 d[bsize * 2] __aligned(__alignof__(u32));
|
||||
struct scatterlist *sg;
|
||||
unsigned int offset;
|
||||
int lastn;
|
||||
@ -183,7 +184,7 @@ static int cts_cbc_decrypt(struct skcipher_request *req)
|
||||
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
|
||||
struct skcipher_request *subreq = &rctx->subreq;
|
||||
int bsize = crypto_skcipher_blocksize(tfm);
|
||||
u8 d[bsize * 2] __attribute__ ((aligned(__alignof__(u32))));
|
||||
u8 d[bsize * 2] __aligned(__alignof__(u32));
|
||||
struct scatterlist *sg;
|
||||
unsigned int offset;
|
||||
u8 *space;
|
||||
@ -373,9 +374,6 @@ static int crypto_cts_create(struct crypto_template *tmpl, struct rtattr **tb)
|
||||
inst->alg.base.cra_blocksize = alg->base.cra_blocksize;
|
||||
inst->alg.base.cra_alignmask = alg->base.cra_alignmask;
|
||||
|
||||
/* We access the data as u32s when xoring. */
|
||||
inst->alg.base.cra_alignmask |= __alignof__(u32) - 1;
|
||||
|
||||
inst->alg.ivsize = alg->base.cra_blocksize;
|
||||
inst->alg.chunksize = crypto_skcipher_alg_chunksize(alg);
|
||||
inst->alg.min_keysize = crypto_skcipher_alg_min_keysize(alg);
|
||||
|
@ -19,6 +19,7 @@
|
||||
#include <linux/crypto.h>
|
||||
#include <crypto/algapi.h>
|
||||
#include <linux/cryptouser.h>
|
||||
#include <linux/compiler.h>
|
||||
#include <net/netlink.h>
|
||||
#include <crypto/kpp.h>
|
||||
#include <crypto/internal/kpp.h>
|
||||
@ -47,7 +48,7 @@ static int crypto_kpp_report(struct sk_buff *skb, struct crypto_alg *alg)
|
||||
#endif
|
||||
|
||||
static void crypto_kpp_show(struct seq_file *m, struct crypto_alg *alg)
|
||||
__attribute__ ((unused));
|
||||
__maybe_unused;
|
||||
|
||||
static void crypto_kpp_show(struct seq_file *m, struct crypto_alg *alg)
|
||||
{
|
||||
|
@ -20,6 +20,7 @@
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/compiler.h>
|
||||
|
||||
struct crypto_pcbc_ctx {
|
||||
struct crypto_cipher *child;
|
||||
@ -146,7 +147,7 @@ static int crypto_pcbc_decrypt_inplace(struct skcipher_request *req,
|
||||
unsigned int nbytes = walk->nbytes;
|
||||
u8 *src = walk->src.virt.addr;
|
||||
u8 *iv = walk->iv;
|
||||
u8 tmpbuf[bsize] __attribute__ ((aligned(__alignof__(u32))));
|
||||
u8 tmpbuf[bsize] __aligned(__alignof__(u32));
|
||||
|
||||
do {
|
||||
memcpy(tmpbuf, src, bsize);
|
||||
@ -259,9 +260,6 @@ static int crypto_pcbc_create(struct crypto_template *tmpl, struct rtattr **tb)
|
||||
inst->alg.base.cra_blocksize = alg->cra_blocksize;
|
||||
inst->alg.base.cra_alignmask = alg->cra_alignmask;
|
||||
|
||||
/* We access the data as u32s when xoring. */
|
||||
inst->alg.base.cra_alignmask |= __alignof__(u32) - 1;
|
||||
|
||||
inst->alg.ivsize = alg->cra_blocksize;
|
||||
inst->alg.min_keysize = alg->cra_cipher.cia_min_keysize;
|
||||
inst->alg.max_keysize = alg->cra_cipher.cia_max_keysize;
|
||||
|
@ -23,6 +23,7 @@
|
||||
#include <linux/slab.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/cryptouser.h>
|
||||
#include <linux/compiler.h>
|
||||
#include <net/netlink.h>
|
||||
|
||||
#include "internal.h"
|
||||
@ -95,7 +96,7 @@ static int crypto_rng_report(struct sk_buff *skb, struct crypto_alg *alg)
|
||||
#endif
|
||||
|
||||
static void crypto_rng_show(struct seq_file *m, struct crypto_alg *alg)
|
||||
__attribute__ ((unused));
|
||||
__maybe_unused;
|
||||
static void crypto_rng_show(struct seq_file *m, struct crypto_alg *alg)
|
||||
{
|
||||
seq_printf(m, "type : rng\n");
|
||||
|
@ -18,6 +18,7 @@
|
||||
#include <linux/slab.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/crypto.h>
|
||||
#include <linux/compiler.h>
|
||||
#include <linux/vmalloc.h>
|
||||
#include <crypto/algapi.h>
|
||||
#include <linux/cryptouser.h>
|
||||
@ -57,7 +58,7 @@ static int crypto_scomp_report(struct sk_buff *skb, struct crypto_alg *alg)
|
||||
#endif
|
||||
|
||||
static void crypto_scomp_show(struct seq_file *m, struct crypto_alg *alg)
|
||||
__attribute__ ((unused));
|
||||
__maybe_unused;
|
||||
|
||||
static void crypto_scomp_show(struct seq_file *m, struct crypto_alg *alg)
|
||||
{
|
||||
|
@ -153,8 +153,6 @@ static int seqiv_aead_create(struct crypto_template *tmpl, struct rtattr **tb)
|
||||
if (IS_ERR(inst))
|
||||
return PTR_ERR(inst);
|
||||
|
||||
inst->alg.base.cra_alignmask |= __alignof__(u32) - 1;
|
||||
|
||||
spawn = aead_instance_ctx(inst);
|
||||
alg = crypto_spawn_aead_alg(spawn);
|
||||
|
||||
|
@ -19,6 +19,7 @@
|
||||
#include <linux/seq_file.h>
|
||||
#include <linux/cryptouser.h>
|
||||
#include <net/netlink.h>
|
||||
#include <linux/compiler.h>
|
||||
|
||||
#include "internal.h"
|
||||
|
||||
@ -67,7 +68,7 @@ EXPORT_SYMBOL_GPL(crypto_shash_setkey);
|
||||
static inline unsigned int shash_align_buffer_size(unsigned len,
|
||||
unsigned long mask)
|
||||
{
|
||||
typedef u8 __attribute__ ((aligned)) u8_aligned;
|
||||
typedef u8 __aligned_largest u8_aligned;
|
||||
return len + (mask & ~(__alignof__(u8_aligned) - 1));
|
||||
}
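
shash_align_buffer_size() over-allocates an on-stack buffer so that PTR_ALIGN() can carve an (alignmask + 1)-aligned window out of it; __aligned_largest merely gives the VLA the largest natural alignment so the subtraction in the helper stays valid. A user-space sketch of the same over-allocate-then-align trick (the simplified version below pads by the full mask rather than the kernel's tighter bound):

#include <stdint.h>
#include <stdio.h>

/* Model of PTR_ALIGN(): round a pointer up to the next multiple of 'a',
 * where 'a' must be a power of two. */
static void *ptr_align(void *p, uintptr_t a)
{
    return (void *)(((uintptr_t)p + a - 1) & ~(a - 1));
}

int main(void)
{
    unsigned long alignmask = 63;       /* e.g. a transform wanting 64-byte alignment */
    unsigned int len = 32;

    /* over-allocate by the mask so an aligned window of 'len' bytes fits */
    unsigned char raw[len + alignmask];
    unsigned char *buf = ptr_align(raw, alignmask + 1);

    printf("raw=%p buf=%p (aligned to %lu)\n",
           (void *)raw, (void *)buf, alignmask + 1);
    return 0;
}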
|
||||
|
||||
@@ -80,7 +81,7 @@ static int shash_update_unaligned(struct shash_desc *desc, const u8 *data,
 	unsigned int unaligned_len = alignmask + 1 -
 				     ((unsigned long)data & alignmask);
 	u8 ubuf[shash_align_buffer_size(unaligned_len, alignmask)]
-		__attribute__ ((aligned));
+		__aligned_largest;
 	u8 *buf = PTR_ALIGN(&ubuf[0], alignmask + 1);
 	int err;
 
@@ -116,7 +117,7 @@ static int shash_final_unaligned(struct shash_desc *desc, u8 *out)
 	struct shash_alg *shash = crypto_shash_alg(tfm);
 	unsigned int ds = crypto_shash_digestsize(tfm);
 	u8 ubuf[shash_align_buffer_size(ds, alignmask)]
-		__attribute__ ((aligned));
+		__aligned_largest;
 	u8 *buf = PTR_ALIGN(&ubuf[0], alignmask + 1);
 	int err;
 
@@ -403,7 +404,7 @@ static int crypto_shash_report(struct sk_buff *skb, struct crypto_alg *alg)
 #endif
 
 static void crypto_shash_show(struct seq_file *m, struct crypto_alg *alg)
-	__attribute__ ((unused));
+	__maybe_unused;
 static void crypto_shash_show(struct seq_file *m, struct crypto_alg *alg)
 {
 	struct shash_alg *salg = __crypto_shash_alg(alg);
@@ -19,6 +19,7 @@
 #include <crypto/scatterwalk.h>
 #include <linux/bug.h>
 #include <linux/cryptouser.h>
+#include <linux/compiler.h>
 #include <linux/list.h>
 #include <linux/module.h>
 #include <linux/rtnetlink.h>
@@ -185,12 +186,12 @@ void skcipher_walk_complete(struct skcipher_walk *walk, int err)
 		data = p->data;
 		if (!data) {
 			data = PTR_ALIGN(&p->buffer[0], walk->alignmask + 1);
-			data = skcipher_get_spot(data, walk->chunksize);
+			data = skcipher_get_spot(data, walk->stride);
 		}
 
 		scatterwalk_copychunks(data, &p->dst, p->len, 1);
 
-		if (offset_in_page(p->data) + p->len + walk->chunksize >
+		if (offset_in_page(p->data) + p->len + walk->stride >
 		    PAGE_SIZE)
 			free_page((unsigned long)p->data);
 
@@ -299,7 +300,7 @@ static int skcipher_next_copy(struct skcipher_walk *walk)
 	p->len = walk->nbytes;
 	skcipher_queue_write(walk, p);
 
-	if (offset_in_page(walk->page) + walk->nbytes + walk->chunksize >
+	if (offset_in_page(walk->page) + walk->nbytes + walk->stride >
 	    PAGE_SIZE)
 		walk->page = NULL;
 	else
@@ -344,7 +345,7 @@ static int skcipher_walk_next(struct skcipher_walk *walk)
 			     SKCIPHER_WALK_DIFF);
 
 	n = walk->total;
-	bsize = min(walk->chunksize, max(n, walk->blocksize));
+	bsize = min(walk->stride, max(n, walk->blocksize));
 	n = scatterwalk_clamp(&walk->in, n);
 	n = scatterwalk_clamp(&walk->out, n);
 
@@ -393,7 +394,7 @@ static int skcipher_copy_iv(struct skcipher_walk *walk)
 	unsigned a = crypto_tfm_ctx_alignment() - 1;
 	unsigned alignmask = walk->alignmask;
 	unsigned ivsize = walk->ivsize;
-	unsigned bs = walk->chunksize;
+	unsigned bs = walk->stride;
 	unsigned aligned_bs;
 	unsigned size;
 	u8 *iv;
@@ -463,7 +464,7 @@ static int skcipher_walk_skcipher(struct skcipher_walk *walk,
 		       SKCIPHER_WALK_SLEEP : 0;
 
 	walk->blocksize = crypto_skcipher_blocksize(tfm);
-	walk->chunksize = crypto_skcipher_chunksize(tfm);
+	walk->stride = crypto_skcipher_walksize(tfm);
 	walk->ivsize = crypto_skcipher_ivsize(tfm);
 	walk->alignmask = crypto_skcipher_alignmask(tfm);
 
@@ -525,7 +526,7 @@ static int skcipher_walk_aead_common(struct skcipher_walk *walk,
 		walk->flags &= ~SKCIPHER_WALK_SLEEP;
 
 	walk->blocksize = crypto_aead_blocksize(tfm);
-	walk->chunksize = crypto_aead_chunksize(tfm);
+	walk->stride = crypto_aead_chunksize(tfm);
 	walk->ivsize = crypto_aead_ivsize(tfm);
 	walk->alignmask = crypto_aead_alignmask(tfm);
 
@@ -807,7 +808,7 @@ static void crypto_skcipher_free_instance(struct crypto_instance *inst)
 }
 
 static void crypto_skcipher_show(struct seq_file *m, struct crypto_alg *alg)
-	__attribute__ ((unused));
+	__maybe_unused;
 static void crypto_skcipher_show(struct seq_file *m, struct crypto_alg *alg)
 {
 	struct skcipher_alg *skcipher = container_of(alg, struct skcipher_alg,
@@ -821,6 +822,7 @@ static void crypto_skcipher_show(struct seq_file *m, struct crypto_alg *alg)
 	seq_printf(m, "max keysize  : %u\n", skcipher->max_keysize);
 	seq_printf(m, "ivsize       : %u\n", skcipher->ivsize);
 	seq_printf(m, "chunksize    : %u\n", skcipher->chunksize);
+	seq_printf(m, "walksize     : %u\n", skcipher->walksize);
 }
 
 #ifdef CONFIG_NET
@@ -893,11 +895,14 @@ static int skcipher_prepare_alg(struct skcipher_alg *alg)
 {
 	struct crypto_alg *base = &alg->base;
 
-	if (alg->ivsize > PAGE_SIZE / 8 || alg->chunksize > PAGE_SIZE / 8)
+	if (alg->ivsize > PAGE_SIZE / 8 || alg->chunksize > PAGE_SIZE / 8 ||
+	    alg->walksize > PAGE_SIZE / 8)
 		return -EINVAL;
 
 	if (!alg->chunksize)
 		alg->chunksize = base->cra_blocksize;
+	if (!alg->walksize)
+		alg->walksize = alg->chunksize;
 
 	base->cra_type = &crypto_skcipher_type2;
 	base->cra_flags &= ~CRYPTO_ALG_TYPE_MASK;
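The new walksize attribute validated above defaults in two steps: chunksize falls back to the block size, and walksize falls back to chunksize, so algorithms that set neither keep their old behaviour. A standalone sketch of that defaulting and sanity check; the struct and constant names here are illustrative, not the kernel's:

	#include <stdio.h>

	#define PAGE_SIZE 4096u

	struct alg_params {
		unsigned int blocksize;
		unsigned int ivsize;
		unsigned int chunksize;	/* 0 means "not set" */
		unsigned int walksize;	/* 0 means "not set" */
	};

	/* Mirrors the shape of skcipher_prepare_alg(): reject absurd sizes,
	 * then fill in the defaults. Returns 0 on success, -1 on error. */
	static int prepare_alg(struct alg_params *p)
	{
		if (p->ivsize > PAGE_SIZE / 8 || p->chunksize > PAGE_SIZE / 8 ||
		    p->walksize > PAGE_SIZE / 8)
			return -1;

		if (!p->chunksize)
			p->chunksize = p->blocksize;
		if (!p->walksize)
			p->walksize = p->chunksize;

		return 0;
	}

	int main(void)
	{
		struct alg_params p = { .blocksize = 16 };	/* e.g. an AES mode */

		if (prepare_alg(&p) == 0)
			printf("chunksize=%u walksize=%u\n", p.chunksize, p.walksize);
		return 0;
	}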
@@ -22,6 +22,8 @@
  *
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <crypto/aead.h>
 #include <crypto/hash.h>
 #include <crypto/skcipher.h>
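Defining pr_fmt() before any include makes every pr_debug()/pr_err() in the file carry a module-name prefix, which is why the pr_debug() calls added below need no hand-written "tcrypt:" prefix. A userspace-style illustration of the prefixing idea only; the kernel's pr_* macros are more involved, and the literal prefix stands in for KBUILD_MODNAME:

	#include <stdio.h>

	/* In the kernel this would be KBUILD_MODNAME ": "; hard-coded here. */
	#define pr_fmt(fmt) "tcrypt: " fmt

	/* Simplified stand-ins for the kernel's pr_debug()/pr_err(). */
	#define pr_debug(fmt, ...) printf(pr_fmt(fmt), ##__VA_ARGS__)
	#define pr_err(fmt, ...)   fprintf(stderr, pr_fmt(fmt), ##__VA_ARGS__)

	int main(void)
	{
		pr_debug("testing %s\n", "sha256");	/* "tcrypt: testing sha256" */
		pr_err("one or more tests failed!\n");	/* "tcrypt: one or more ..." */
		return 0;
	}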
@@ -1010,6 +1012,8 @@ static inline int tcrypt_test(const char *alg)
 {
 	int ret;
 
+	pr_debug("testing %s\n", alg);
+
 	ret = alg_test(alg, alg, 0, 0);
 	/* non-fips algs return -EINVAL in fips mode */
 	if (fips_enabled && ret == -EINVAL)
@@ -2059,6 +2063,8 @@ static int __init tcrypt_mod_init(void)
 	if (err) {
 		printk(KERN_ERR "tcrypt: one or more tests failed!\n");
 		goto err_free_tv;
+	} else {
+		pr_debug("all tests passed\n");
 	}
 
 	/* We intentionaly return -EAGAIN to prevent keeping the module,
 crypto/testmgr.c: 1055 changes (diff suppressed because it is too large)

Some files were not shown because too many files have changed in this diff.