author    Jason A. Donenfeld <Jason@zx2c4.com>    2021-02-19 14:29:04 +0100
committer David Bauer <mail@david-bauer.net>     2021-02-26 20:41:01 +0100
commit    3888fa78802354ab7bbd19b7d061fd80a16ce06b (patch)
tree      2225a6313cb6482f0cb9c09df662a0d44197350e /target/linux/generic/backport-5.4/080-wireguard-0008-crypto-arm-chacha-remove-dependency-on-generic-ChaCh.patch
parent    7d4143234c4dfdd050ebc64ec8231f9d81ea65af (diff)
kernel: 5.4: import wireguard backport
Rather than using the clunky, old, slower wireguard-linux-compat out of tree module, this commit does a patch-by-patch backport of upstream's wireguard to 5.4. This specific backport is in widespread use, being part of SUSE's enterprise kernel, Oracle's enterprise kernel, Google's Android kernel, Gentoo's distro kernel, and probably more I've forgotten about. It's definitely the "more proper" way of adding wireguard to a kernel than the ugly compat.h hell of the wireguard-linux-compat repo. And most importantly for OpenWrt, it allows using the same module configuration code for 5.10 as for 5.4, with no need for bifurcation.

These patches are from the backport tree which is maintained in the open here: https://git.zx2c4.com/wireguard-linux/log/?h=backport-5.4.y

I'll be sending PRs to update this as needed.

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Diffstat (limited to 'target/linux/generic/backport-5.4/080-wireguard-0008-crypto-arm-chacha-remove-dependency-on-generic-ChaCh.patch')
-rw-r--r-- target/linux/generic/backport-5.4/080-wireguard-0008-crypto-arm-chacha-remove-dependency-on-generic-ChaCh.patch | 691
1 file changed, 691 insertions(+), 0 deletions(-)
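For orientation before the patch body: the buffer walk used by chacha_doneon() in both the removed chacha-neon-glue.c and the new chacha-glue.c consumes four ChaCha blocks per call while it can, then single blocks, then bounces a partial tail through a stack buffer so the block routine never touches memory past the caller's buffers. A minimal standalone userspace sketch of that walk follows; block_xor()/block_xor_x4() are trivial stand-ins for chacha_block_xor_neon()/chacha_4block_xor_neon(), not the real cipher.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define CHACHA_BLOCK_SIZE 64

/* Stand-in for chacha_block_xor_neon(): XOR one block with a byte
 * derived from the block counter, just so the walk can be exercised. */
static void block_xor(uint32_t *state, uint8_t *dst, const uint8_t *src)
{
	for (int i = 0; i < CHACHA_BLOCK_SIZE; i++)
		dst[i] = src[i] ^ (uint8_t)state[12];
}

/* Stand-in for chacha_4block_xor_neon(): four blocks per call, each
 * with its own counter; the caller advances state[12] by 4 afterwards. */
static void block_xor_x4(uint32_t *state, uint8_t *dst, const uint8_t *src)
{
	uint32_t ctr = state[12];

	for (int b = 0; b < 4; b++) {
		state[12] = ctr + b;
		block_xor(state, dst + b * CHACHA_BLOCK_SIZE,
			  src + b * CHACHA_BLOCK_SIZE);
	}
	state[12] = ctr;
}

/* Mirrors chacha_doneon(): wide path, single blocks, buffered tail. */
static void chacha_walk(uint32_t *state, uint8_t *dst, const uint8_t *src,
			unsigned int bytes)
{
	uint8_t buf[CHACHA_BLOCK_SIZE];

	while (bytes >= CHACHA_BLOCK_SIZE * 4) {
		block_xor_x4(state, dst, src);
		bytes -= CHACHA_BLOCK_SIZE * 4;
		src += CHACHA_BLOCK_SIZE * 4;
		dst += CHACHA_BLOCK_SIZE * 4;
		state[12] += 4;
	}
	while (bytes >= CHACHA_BLOCK_SIZE) {
		block_xor(state, dst, src);
		bytes -= CHACHA_BLOCK_SIZE;
		src += CHACHA_BLOCK_SIZE;
		dst += CHACHA_BLOCK_SIZE;
		state[12]++;
	}
	if (bytes) {
		/* Partial tail: pad to a full block on the stack so the
		 * block routine never reads or writes past the buffers. */
		memcpy(buf, src, bytes);
		block_xor(state, buf, buf);
		memcpy(dst, buf, bytes);
	}
}

int main(void)
{
	uint32_t state[16] = { 0 };	/* word 12 is the block counter */
	uint8_t src[300] = { 0 }, dst[300];

	chacha_walk(state, dst, src, sizeof(src));
	/* 256 bytes via the 4-block path, then a 44-byte tail at counter 4 */
	printf("dst[299] = %02x\n", dst[299]);
	return 0;
}

The same walk appears unchanged in the new file; what the patch adds in front of it is the scalar chacha_doarm() path, taken whenever NEON is unavailable or unusable.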
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0008-crypto-arm-chacha-remove-dependency-on-generic-ChaCh.patch b/target/linux/generic/backport-5.4/080-wireguard-0008-crypto-arm-chacha-remove-dependency-on-generic-ChaCh.patch
new file mode 100644
index 0000000000..7f907f2364
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0008-crypto-arm-chacha-remove-dependency-on-generic-ChaCh.patch
@@ -0,0 +1,691 @@
+From a92bd97c758d32511f0deeef84f25c3a1d5e7879 Mon Sep 17 00:00:00 2001
+From: Ard Biesheuvel <ardb@kernel.org>
+Date: Fri, 8 Nov 2019 13:22:14 +0100
+Subject: [PATCH 008/124] crypto: arm/chacha - remove dependency on generic
+ ChaCha driver
+
+commit b36d8c09e710c71f6a9690b6586fea2d1c9e1e27 upstream.
+
+Instead of falling back to the generic ChaCha skcipher driver for
+non-SIMD cases, use a fast scalar implementation for ARM authored
+by Eric Biggers. This removes the module dependency on chacha-generic
+altogether, which also simplifies things when we expose the ChaCha
+library interface from this module.
+
+Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ arch/arm/crypto/Kconfig | 4 +-
+ arch/arm/crypto/Makefile | 3 +-
+ arch/arm/crypto/chacha-glue.c | 304 +++++++++++++++++++++++++++
+ arch/arm/crypto/chacha-neon-glue.c | 202 ------------------
+ arch/arm/crypto/chacha-scalar-core.S | 65 +++---
+ arch/arm64/crypto/chacha-neon-glue.c | 2 +-
+ 6 files changed, 340 insertions(+), 240 deletions(-)
+ create mode 100644 arch/arm/crypto/chacha-glue.c
+ delete mode 100644 arch/arm/crypto/chacha-neon-glue.c
+
+--- a/arch/arm/crypto/Kconfig
++++ b/arch/arm/crypto/Kconfig
+@@ -127,10 +127,8 @@ config CRYPTO_CRC32_ARM_CE
+ select CRYPTO_HASH
+
+ config CRYPTO_CHACHA20_NEON
+- tristate "NEON accelerated ChaCha stream cipher algorithms"
+- depends on KERNEL_MODE_NEON
++ tristate "NEON and scalar accelerated ChaCha stream cipher algorithms"
+ select CRYPTO_BLKCIPHER
+- select CRYPTO_CHACHA20
+
+ config CRYPTO_NHPOLY1305_NEON
+ tristate "NEON accelerated NHPoly1305 hash function (for Adiantum)"
+--- a/arch/arm/crypto/Makefile
++++ b/arch/arm/crypto/Makefile
+@@ -53,7 +53,8 @@ aes-arm-ce-y := aes-ce-core.o aes-ce-glu
+ ghash-arm-ce-y := ghash-ce-core.o ghash-ce-glue.o
+ crct10dif-arm-ce-y := crct10dif-ce-core.o crct10dif-ce-glue.o
+ crc32-arm-ce-y:= crc32-ce-core.o crc32-ce-glue.o
+-chacha-neon-y := chacha-neon-core.o chacha-neon-glue.o
++chacha-neon-y := chacha-scalar-core.o chacha-glue.o
++chacha-neon-$(CONFIG_KERNEL_MODE_NEON) += chacha-neon-core.o
+ nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o
+
+ ifdef REGENERATE_ARM_CRYPTO
+--- /dev/null
++++ b/arch/arm/crypto/chacha-glue.c
+@@ -0,0 +1,304 @@
++// SPDX-License-Identifier: GPL-2.0
++/*
++ * ARM NEON accelerated ChaCha and XChaCha stream ciphers,
++ * including ChaCha20 (RFC7539)
++ *
++ * Copyright (C) 2016-2019 Linaro, Ltd. <ard.biesheuvel@linaro.org>
++ * Copyright (C) 2015 Martin Willi
++ */
++
++#include <crypto/algapi.h>
++#include <crypto/internal/chacha.h>
++#include <crypto/internal/simd.h>
++#include <crypto/internal/skcipher.h>
++#include <linux/kernel.h>
++#include <linux/module.h>
++
++#include <asm/cputype.h>
++#include <asm/hwcap.h>
++#include <asm/neon.h>
++#include <asm/simd.h>
++
++asmlinkage void chacha_block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
++ int nrounds);
++asmlinkage void chacha_4block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
++ int nrounds);
++asmlinkage void hchacha_block_arm(const u32 *state, u32 *out, int nrounds);
++asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds);
++
++asmlinkage void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes,
++ const u32 *state, int nrounds);
++
++static inline bool neon_usable(void)
++{
++ return crypto_simd_usable();
++}
++
++static void chacha_doneon(u32 *state, u8 *dst, const u8 *src,
++ unsigned int bytes, int nrounds)
++{
++ u8 buf[CHACHA_BLOCK_SIZE];
++
++ while (bytes >= CHACHA_BLOCK_SIZE * 4) {
++ chacha_4block_xor_neon(state, dst, src, nrounds);
++ bytes -= CHACHA_BLOCK_SIZE * 4;
++ src += CHACHA_BLOCK_SIZE * 4;
++ dst += CHACHA_BLOCK_SIZE * 4;
++ state[12] += 4;
++ }
++ while (bytes >= CHACHA_BLOCK_SIZE) {
++ chacha_block_xor_neon(state, dst, src, nrounds);
++ bytes -= CHACHA_BLOCK_SIZE;
++ src += CHACHA_BLOCK_SIZE;
++ dst += CHACHA_BLOCK_SIZE;
++ state[12]++;
++ }
++ if (bytes) {
++ memcpy(buf, src, bytes);
++ chacha_block_xor_neon(state, buf, buf, nrounds);
++ memcpy(dst, buf, bytes);
++ }
++}
++
++static int chacha_stream_xor(struct skcipher_request *req,
++ const struct chacha_ctx *ctx, const u8 *iv,
++ bool neon)
++{
++ struct skcipher_walk walk;
++ u32 state[16];
++ int err;
++
++ err = skcipher_walk_virt(&walk, req, false);
++
++ chacha_init_generic(state, ctx->key, iv);
++
++ while (walk.nbytes > 0) {
++ unsigned int nbytes = walk.nbytes;
++
++ if (nbytes < walk.total)
++ nbytes = round_down(nbytes, walk.stride);
++
++ if (!neon) {
++ chacha_doarm(walk.dst.virt.addr, walk.src.virt.addr,
++ nbytes, state, ctx->nrounds);
++ state[12] += DIV_ROUND_UP(nbytes, CHACHA_BLOCK_SIZE);
++ } else {
++ kernel_neon_begin();
++ chacha_doneon(state, walk.dst.virt.addr,
++ walk.src.virt.addr, nbytes, ctx->nrounds);
++ kernel_neon_end();
++ }
++ err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
++ }
++
++ return err;
++}
++
++static int do_chacha(struct skcipher_request *req, bool neon)
++{
++ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
++ struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
++
++ return chacha_stream_xor(req, ctx, req->iv, neon);
++}
++
++static int chacha_arm(struct skcipher_request *req)
++{
++ return do_chacha(req, false);
++}
++
++static int chacha_neon(struct skcipher_request *req)
++{
++ return do_chacha(req, neon_usable());
++}
++
++static int do_xchacha(struct skcipher_request *req, bool neon)
++{
++ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
++ struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
++ struct chacha_ctx subctx;
++ u32 state[16];
++ u8 real_iv[16];
++
++ chacha_init_generic(state, ctx->key, req->iv);
++
++ if (!neon) {
++ hchacha_block_arm(state, subctx.key, ctx->nrounds);
++ } else {
++ kernel_neon_begin();
++ hchacha_block_neon(state, subctx.key, ctx->nrounds);
++ kernel_neon_end();
++ }
++ subctx.nrounds = ctx->nrounds;
++
++ memcpy(&real_iv[0], req->iv + 24, 8);
++ memcpy(&real_iv[8], req->iv + 16, 8);
++ return chacha_stream_xor(req, &subctx, real_iv, neon);
++}
++
++static int xchacha_arm(struct skcipher_request *req)
++{
++ return do_xchacha(req, false);
++}
++
++static int xchacha_neon(struct skcipher_request *req)
++{
++ return do_xchacha(req, neon_usable());
++}
++
++static struct skcipher_alg arm_algs[] = {
++ {
++ .base.cra_name = "chacha20",
++ .base.cra_driver_name = "chacha20-arm",
++ .base.cra_priority = 200,
++ .base.cra_blocksize = 1,
++ .base.cra_ctxsize = sizeof(struct chacha_ctx),
++ .base.cra_module = THIS_MODULE,
++
++ .min_keysize = CHACHA_KEY_SIZE,
++ .max_keysize = CHACHA_KEY_SIZE,
++ .ivsize = CHACHA_IV_SIZE,
++ .chunksize = CHACHA_BLOCK_SIZE,
++ .setkey = chacha20_setkey,
++ .encrypt = chacha_arm,
++ .decrypt = chacha_arm,
++ }, {
++ .base.cra_name = "xchacha20",
++ .base.cra_driver_name = "xchacha20-arm",
++ .base.cra_priority = 200,
++ .base.cra_blocksize = 1,
++ .base.cra_ctxsize = sizeof(struct chacha_ctx),
++ .base.cra_module = THIS_MODULE,
++
++ .min_keysize = CHACHA_KEY_SIZE,
++ .max_keysize = CHACHA_KEY_SIZE,
++ .ivsize = XCHACHA_IV_SIZE,
++ .chunksize = CHACHA_BLOCK_SIZE,
++ .setkey = chacha20_setkey,
++ .encrypt = xchacha_arm,
++ .decrypt = xchacha_arm,
++ }, {
++ .base.cra_name = "xchacha12",
++ .base.cra_driver_name = "xchacha12-arm",
++ .base.cra_priority = 200,
++ .base.cra_blocksize = 1,
++ .base.cra_ctxsize = sizeof(struct chacha_ctx),
++ .base.cra_module = THIS_MODULE,
++
++ .min_keysize = CHACHA_KEY_SIZE,
++ .max_keysize = CHACHA_KEY_SIZE,
++ .ivsize = XCHACHA_IV_SIZE,
++ .chunksize = CHACHA_BLOCK_SIZE,
++ .setkey = chacha12_setkey,
++ .encrypt = xchacha_arm,
++ .decrypt = xchacha_arm,
++ },
++};
++
++static struct skcipher_alg neon_algs[] = {
++ {
++ .base.cra_name = "chacha20",
++ .base.cra_driver_name = "chacha20-neon",
++ .base.cra_priority = 300,
++ .base.cra_blocksize = 1,
++ .base.cra_ctxsize = sizeof(struct chacha_ctx),
++ .base.cra_module = THIS_MODULE,
++
++ .min_keysize = CHACHA_KEY_SIZE,
++ .max_keysize = CHACHA_KEY_SIZE,
++ .ivsize = CHACHA_IV_SIZE,
++ .chunksize = CHACHA_BLOCK_SIZE,
++ .walksize = 4 * CHACHA_BLOCK_SIZE,
++ .setkey = chacha20_setkey,
++ .encrypt = chacha_neon,
++ .decrypt = chacha_neon,
++ }, {
++ .base.cra_name = "xchacha20",
++ .base.cra_driver_name = "xchacha20-neon",
++ .base.cra_priority = 300,
++ .base.cra_blocksize = 1,
++ .base.cra_ctxsize = sizeof(struct chacha_ctx),
++ .base.cra_module = THIS_MODULE,
++
++ .min_keysize = CHACHA_KEY_SIZE,
++ .max_keysize = CHACHA_KEY_SIZE,
++ .ivsize = XCHACHA_IV_SIZE,
++ .chunksize = CHACHA_BLOCK_SIZE,
++ .walksize = 4 * CHACHA_BLOCK_SIZE,
++ .setkey = chacha20_setkey,
++ .encrypt = xchacha_neon,
++ .decrypt = xchacha_neon,
++ }, {
++ .base.cra_name = "xchacha12",
++ .base.cra_driver_name = "xchacha12-neon",
++ .base.cra_priority = 300,
++ .base.cra_blocksize = 1,
++ .base.cra_ctxsize = sizeof(struct chacha_ctx),
++ .base.cra_module = THIS_MODULE,
++
++ .min_keysize = CHACHA_KEY_SIZE,
++ .max_keysize = CHACHA_KEY_SIZE,
++ .ivsize = XCHACHA_IV_SIZE,
++ .chunksize = CHACHA_BLOCK_SIZE,
++ .walksize = 4 * CHACHA_BLOCK_SIZE,
++ .setkey = chacha12_setkey,
++ .encrypt = xchacha_neon,
++ .decrypt = xchacha_neon,
++ }
++};
++
++static int __init chacha_simd_mod_init(void)
++{
++ int err;
++
++ err = crypto_register_skciphers(arm_algs, ARRAY_SIZE(arm_algs));
++ if (err)
++ return err;
++
++ if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_NEON)) {
++ int i;
++
++ switch (read_cpuid_part()) {
++ case ARM_CPU_PART_CORTEX_A7:
++ case ARM_CPU_PART_CORTEX_A5:
++ /*
++ * The Cortex-A7 and Cortex-A5 do not perform well with
++ * the NEON implementation but do incredibly with the
++ * scalar one and use less power.
++ */
++ for (i = 0; i < ARRAY_SIZE(neon_algs); i++)
++ neon_algs[i].base.cra_priority = 0;
++ break;
++ }
++
++ err = crypto_register_skciphers(neon_algs, ARRAY_SIZE(neon_algs));
++ if (err)
++ crypto_unregister_skciphers(arm_algs, ARRAY_SIZE(arm_algs));
++ }
++ return err;
++}
++
++static void __exit chacha_simd_mod_fini(void)
++{
++ crypto_unregister_skciphers(arm_algs, ARRAY_SIZE(arm_algs));
++ if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_NEON))
++ crypto_unregister_skciphers(neon_algs, ARRAY_SIZE(neon_algs));
++}
++
++module_init(chacha_simd_mod_init);
++module_exit(chacha_simd_mod_fini);
++
++MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (scalar and NEON accelerated)");
++MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
++MODULE_LICENSE("GPL v2");
++MODULE_ALIAS_CRYPTO("chacha20");
++MODULE_ALIAS_CRYPTO("chacha20-arm");
++MODULE_ALIAS_CRYPTO("xchacha20");
++MODULE_ALIAS_CRYPTO("xchacha20-arm");
++MODULE_ALIAS_CRYPTO("xchacha12");
++MODULE_ALIAS_CRYPTO("xchacha12-arm");
++#ifdef CONFIG_KERNEL_MODE_NEON
++MODULE_ALIAS_CRYPTO("chacha20-neon");
++MODULE_ALIAS_CRYPTO("xchacha20-neon");
++MODULE_ALIAS_CRYPTO("xchacha12-neon");
++#endif
+--- a/arch/arm/crypto/chacha-neon-glue.c
++++ /dev/null
+@@ -1,202 +0,0 @@
+-/*
+- * ARM NEON accelerated ChaCha and XChaCha stream ciphers,
+- * including ChaCha20 (RFC7539)
+- *
+- * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
+- *
+- * This program is free software; you can redistribute it and/or modify
+- * it under the terms of the GNU General Public License version 2 as
+- * published by the Free Software Foundation.
+- *
+- * Based on:
+- * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code
+- *
+- * Copyright (C) 2015 Martin Willi
+- *
+- * This program is free software; you can redistribute it and/or modify
+- * it under the terms of the GNU General Public License as published by
+- * the Free Software Foundation; either version 2 of the License, or
+- * (at your option) any later version.
+- */
+-
+-#include <crypto/algapi.h>
+-#include <crypto/internal/chacha.h>
+-#include <crypto/internal/simd.h>
+-#include <crypto/internal/skcipher.h>
+-#include <linux/kernel.h>
+-#include <linux/module.h>
+-
+-#include <asm/hwcap.h>
+-#include <asm/neon.h>
+-#include <asm/simd.h>
+-
+-asmlinkage void chacha_block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
+- int nrounds);
+-asmlinkage void chacha_4block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
+- int nrounds);
+-asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds);
+-
+-static void chacha_doneon(u32 *state, u8 *dst, const u8 *src,
+- unsigned int bytes, int nrounds)
+-{
+- u8 buf[CHACHA_BLOCK_SIZE];
+-
+- while (bytes >= CHACHA_BLOCK_SIZE * 4) {
+- chacha_4block_xor_neon(state, dst, src, nrounds);
+- bytes -= CHACHA_BLOCK_SIZE * 4;
+- src += CHACHA_BLOCK_SIZE * 4;
+- dst += CHACHA_BLOCK_SIZE * 4;
+- state[12] += 4;
+- }
+- while (bytes >= CHACHA_BLOCK_SIZE) {
+- chacha_block_xor_neon(state, dst, src, nrounds);
+- bytes -= CHACHA_BLOCK_SIZE;
+- src += CHACHA_BLOCK_SIZE;
+- dst += CHACHA_BLOCK_SIZE;
+- state[12]++;
+- }
+- if (bytes) {
+- memcpy(buf, src, bytes);
+- chacha_block_xor_neon(state, buf, buf, nrounds);
+- memcpy(dst, buf, bytes);
+- }
+-}
+-
+-static int chacha_neon_stream_xor(struct skcipher_request *req,
+- const struct chacha_ctx *ctx, const u8 *iv)
+-{
+- struct skcipher_walk walk;
+- u32 state[16];
+- int err;
+-
+- err = skcipher_walk_virt(&walk, req, false);
+-
+- crypto_chacha_init(state, ctx, iv);
+-
+- while (walk.nbytes > 0) {
+- unsigned int nbytes = walk.nbytes;
+-
+- if (nbytes < walk.total)
+- nbytes = round_down(nbytes, walk.stride);
+-
+- kernel_neon_begin();
+- chacha_doneon(state, walk.dst.virt.addr, walk.src.virt.addr,
+- nbytes, ctx->nrounds);
+- kernel_neon_end();
+- err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
+- }
+-
+- return err;
+-}
+-
+-static int chacha_neon(struct skcipher_request *req)
+-{
+- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+- struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
+-
+- if (req->cryptlen <= CHACHA_BLOCK_SIZE || !crypto_simd_usable())
+- return crypto_chacha_crypt(req);
+-
+- return chacha_neon_stream_xor(req, ctx, req->iv);
+-}
+-
+-static int xchacha_neon(struct skcipher_request *req)
+-{
+- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+- struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
+- struct chacha_ctx subctx;
+- u32 state[16];
+- u8 real_iv[16];
+-
+- if (req->cryptlen <= CHACHA_BLOCK_SIZE || !crypto_simd_usable())
+- return crypto_xchacha_crypt(req);
+-
+- crypto_chacha_init(state, ctx, req->iv);
+-
+- kernel_neon_begin();
+- hchacha_block_neon(state, subctx.key, ctx->nrounds);
+- kernel_neon_end();
+- subctx.nrounds = ctx->nrounds;
+-
+- memcpy(&real_iv[0], req->iv + 24, 8);
+- memcpy(&real_iv[8], req->iv + 16, 8);
+- return chacha_neon_stream_xor(req, &subctx, real_iv);
+-}
+-
+-static struct skcipher_alg algs[] = {
+- {
+- .base.cra_name = "chacha20",
+- .base.cra_driver_name = "chacha20-neon",
+- .base.cra_priority = 300,
+- .base.cra_blocksize = 1,
+- .base.cra_ctxsize = sizeof(struct chacha_ctx),
+- .base.cra_module = THIS_MODULE,
+-
+- .min_keysize = CHACHA_KEY_SIZE,
+- .max_keysize = CHACHA_KEY_SIZE,
+- .ivsize = CHACHA_IV_SIZE,
+- .chunksize = CHACHA_BLOCK_SIZE,
+- .walksize = 4 * CHACHA_BLOCK_SIZE,
+- .setkey = crypto_chacha20_setkey,
+- .encrypt = chacha_neon,
+- .decrypt = chacha_neon,
+- }, {
+- .base.cra_name = "xchacha20",
+- .base.cra_driver_name = "xchacha20-neon",
+- .base.cra_priority = 300,
+- .base.cra_blocksize = 1,
+- .base.cra_ctxsize = sizeof(struct chacha_ctx),
+- .base.cra_module = THIS_MODULE,
+-
+- .min_keysize = CHACHA_KEY_SIZE,
+- .max_keysize = CHACHA_KEY_SIZE,
+- .ivsize = XCHACHA_IV_SIZE,
+- .chunksize = CHACHA_BLOCK_SIZE,
+- .walksize = 4 * CHACHA_BLOCK_SIZE,
+- .setkey = crypto_chacha20_setkey,
+- .encrypt = xchacha_neon,
+- .decrypt = xchacha_neon,
+- }, {
+- .base.cra_name = "xchacha12",
+- .base.cra_driver_name = "xchacha12-neon",
+- .base.cra_priority = 300,
+- .base.cra_blocksize = 1,
+- .base.cra_ctxsize = sizeof(struct chacha_ctx),
+- .base.cra_module = THIS_MODULE,
+-
+- .min_keysize = CHACHA_KEY_SIZE,
+- .max_keysize = CHACHA_KEY_SIZE,
+- .ivsize = XCHACHA_IV_SIZE,
+- .chunksize = CHACHA_BLOCK_SIZE,
+- .walksize = 4 * CHACHA_BLOCK_SIZE,
+- .setkey = crypto_chacha12_setkey,
+- .encrypt = xchacha_neon,
+- .decrypt = xchacha_neon,
+- }
+-};
+-
+-static int __init chacha_simd_mod_init(void)
+-{
+- if (!(elf_hwcap & HWCAP_NEON))
+- return -ENODEV;
+-
+- return crypto_register_skciphers(algs, ARRAY_SIZE(algs));
+-}
+-
+-static void __exit chacha_simd_mod_fini(void)
+-{
+- crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
+-}
+-
+-module_init(chacha_simd_mod_init);
+-module_exit(chacha_simd_mod_fini);
+-
+-MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (NEON accelerated)");
+-MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+-MODULE_LICENSE("GPL v2");
+-MODULE_ALIAS_CRYPTO("chacha20");
+-MODULE_ALIAS_CRYPTO("chacha20-neon");
+-MODULE_ALIAS_CRYPTO("xchacha20");
+-MODULE_ALIAS_CRYPTO("xchacha20-neon");
+-MODULE_ALIAS_CRYPTO("xchacha12");
+-MODULE_ALIAS_CRYPTO("xchacha12-neon");
+--- a/arch/arm/crypto/chacha-scalar-core.S
++++ b/arch/arm/crypto/chacha-scalar-core.S
+@@ -41,14 +41,6 @@
+ X14 .req r12
+ X15 .req r14
+
+-.Lexpand_32byte_k:
+- // "expand 32-byte k"
+- .word 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
+-
+-#ifdef __thumb2__
+-# define adrl adr
+-#endif
+-
+ .macro __rev out, in, t0, t1, t2
+ .if __LINUX_ARM_ARCH__ >= 6
+ rev \out, \in
+@@ -391,61 +383,65 @@
+ .endm // _chacha
+
+ /*
+- * void chacha20_arm(u8 *out, const u8 *in, size_t len, const u32 key[8],
+- * const u32 iv[4]);
++ * void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes,
++ * const u32 *state, int nrounds);
+ */
+-ENTRY(chacha20_arm)
++ENTRY(chacha_doarm)
+ cmp r2, #0 // len == 0?
+ reteq lr
+
++ ldr ip, [sp]
++ cmp ip, #12
++
+ push {r0-r2,r4-r11,lr}
+
+ // Push state x0-x15 onto stack.
+ // Also store an extra copy of x10-x11 just before the state.
+
+- ldr r4, [sp, #48] // iv
+- mov r0, sp
+- sub sp, #80
+-
+- // iv: x12-x15
+- ldm r4, {X12,X13,X14,X15}
+- stmdb r0!, {X12,X13,X14,X15}
++ add X12, r3, #48
++ ldm X12, {X12,X13,X14,X15}
++ push {X12,X13,X14,X15}
++ sub sp, sp, #64
+
+- // key: x4-x11
+- __ldrd X8_X10, X9_X11, r3, 24
++ __ldrd X8_X10, X9_X11, r3, 40
+ __strd X8_X10, X9_X11, sp, 8
+- stmdb r0!, {X8_X10, X9_X11}
+- ldm r3, {X4-X9_X11}
+- stmdb r0!, {X4-X9_X11}
+-
+- // constants: x0-x3
+- adrl X3, .Lexpand_32byte_k
+- ldm X3, {X0-X3}
++ __strd X8_X10, X9_X11, sp, 56
++ ldm r3, {X0-X9_X11}
+ __strd X0, X1, sp, 16
+ __strd X2, X3, sp, 24
++ __strd X4, X5, sp, 32
++ __strd X6, X7, sp, 40
++ __strd X8_X10, X9_X11, sp, 48
+
++ beq 1f
+ _chacha 20
+
+- add sp, #76
++0: add sp, #76
+ pop {r4-r11, pc}
+-ENDPROC(chacha20_arm)
++
++1: _chacha 12
++ b 0b
++ENDPROC(chacha_doarm)
+
+ /*
+- * void hchacha20_arm(const u32 state[16], u32 out[8]);
++ * void hchacha_block_arm(const u32 state[16], u32 out[8], int nrounds);
+ */
+-ENTRY(hchacha20_arm)
++ENTRY(hchacha_block_arm)
+ push {r1,r4-r11,lr}
+
++ cmp r2, #12 // ChaCha12 ?
++
+ mov r14, r0
+ ldmia r14!, {r0-r11} // load x0-x11
+ push {r10-r11} // store x10-x11 to stack
+ ldm r14, {r10-r12,r14} // load x12-x15
+ sub sp, #8
+
++ beq 1f
+ _chacha_permute 20
+
+ // Skip over (unused0-unused1, x10-x11)
+- add sp, #16
++0: add sp, #16
+
+ // Fix up rotations of x12-x15
+ ror X12, X12, #drot
+@@ -458,4 +454,7 @@ ENTRY(hchacha20_arm)
+ stm r4, {X0,X1,X2,X3,X12,X13,X14,X15}
+
+ pop {r4-r11,pc}
+-ENDPROC(hchacha20_arm)
++
++1: _chacha_permute 12
++ b 0b
++ENDPROC(hchacha_block_arm)
+--- a/arch/arm64/crypto/chacha-neon-glue.c
++++ b/arch/arm64/crypto/chacha-neon-glue.c
+@@ -1,5 +1,5 @@
+ /*
+- * ARM NEON accelerated ChaCha and XChaCha stream ciphers,
++ * ARM NEON and scalar accelerated ChaCha and XChaCha stream ciphers,
+ * including ChaCha20 (RFC7539)
+ *
+ * Copyright (C) 2016 - 2017 Linaro, Ltd. <ard.biesheuvel@linaro.org>
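The XChaCha handling in do_xchacha() above is worth unpacking: HChaCha over the key and the first 16 IV bytes yields the inner 256-bit key, and the remaining 16 IV bytes are shuffled (last eight first) into the 16-byte IV for plain ChaCha, where the leading four bytes land in state[12], the block counter. Below is a self-contained sketch of that data flow; hchacha_stub() is a placeholder for the real hchacha_block_arm()/hchacha_block_neon() permutation, and only the surrounding key/IV plumbing mirrors the patch.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Placeholder, NOT the real HChaCha permutation. */
static void hchacha_stub(const uint32_t state[16], uint32_t out[8])
{
	for (int i = 0; i < 8; i++)
		out[i] = state[i] ^ state[i + 8];
}

/* Mirrors do_xchacha(): derive the inner key with HChaCha, then build
 * the 16-byte IV for plain ChaCha from the tail of the 32-byte IV. */
static void xchacha_derive(const uint32_t key[8], const uint8_t iv[32],
			   uint32_t subkey[8], uint8_t real_iv[16])
{
	uint32_t state[16] = { 0 };	/* chacha_init_generic() would also
					 * place the "expand 32-byte k"
					 * constants in state[0..3]; omitted
					 * in this stub */

	memcpy(&state[4], key, 32);	/* key -> state[4..11] */
	memcpy(&state[12], iv, 16);	/* first 16 IV bytes -> state[12..15] */

	hchacha_stub(state, subkey);	/* inner 256-bit key */

	/* Same shuffle as the patch: the last 8 IV bytes come first,
	 * then the middle 8; real_iv[0..3] becomes the block counter
	 * when the state is re-initialized for the inner ChaCha pass. */
	memcpy(&real_iv[0], iv + 24, 8);
	memcpy(&real_iv[8], iv + 16, 8);
}

int main(void)
{
	uint32_t key[8] = { 0 }, subkey[8];
	uint8_t iv[32], real_iv[16];

	for (int i = 0; i < 32; i++)
		iv[i] = (uint8_t)i;

	xchacha_derive(key, iv, subkey, real_iv);
	printf("subkey[0] = %u, real_iv[0] = %u\n", subkey[0], real_iv[0]);
	return 0;	/* prints: subkey[0] = 0, real_iv[0] = 24 */
}

Because both the scalar and NEON paths feed the same chacha_stream_xor() after this derivation, the patch can route an XChaCha request through either backend with a single boolean, which is what lets it drop the generic driver as a fallback.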