Diffstat (limited to 'target/linux/generic/backport-5.4/080-wireguard-0059-crypto-x86-chacha-sse3-use-unaligned-loads-for-state.patch')
-rw-r--r--  target/linux/generic/backport-5.4/080-wireguard-0059-crypto-x86-chacha-sse3-use-unaligned-loads-for-state.patch | 148
1 file changed, 148 insertions, 0 deletions
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0059-crypto-x86-chacha-sse3-use-unaligned-loads-for-state.patch b/target/linux/generic/backport-5.4/080-wireguard-0059-crypto-x86-chacha-sse3-use-unaligned-loads-for-state.patch
new file mode 100644
index 0000000000..652439393b
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0059-crypto-x86-chacha-sse3-use-unaligned-loads-for-state.patch
@@ -0,0 +1,148 @@
+From 833ca409e17c10f4affb5879e22a03fdf1933439 Mon Sep 17 00:00:00 2001
+From: Ard Biesheuvel <ardb@kernel.org>
+Date: Wed, 8 Jul 2020 12:11:18 +0300
+Subject: [PATCH 059/124] crypto: x86/chacha-sse3 - use unaligned loads for
+ state array
+
+commit e79a31715193686e92dadb4caedfbb1f5de3659c upstream.
+
+Due to the fact that the x86 port does not support allocating objects
+on the stack with an alignment that exceeds 8 bytes, we have a rather
+ugly hack in the x86 code for ChaCha to ensure that the state array is
+aligned to 16 bytes, allowing the SSE3 implementation of the algorithm
+to use aligned loads.
+
+Given that the performance benefit of using aligned loads appears to
+be limited (~0.25% for 1k blocks using tcrypt on a Corei7-8650U), and
+the fact that this hack has leaked into generic ChaCha code, let's just
+remove it.
+
+Cc: Martin Willi <martin@strongswan.org>
+Cc: Herbert Xu <herbert@gondor.apana.org.au>
+Cc: Eric Biggers <ebiggers@kernel.org>
+Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
+Reviewed-by: Martin Willi <martin@strongswan.org>
+Reviewed-by: Eric Biggers <ebiggers@google.com>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ arch/x86/crypto/chacha-ssse3-x86_64.S | 16 ++++++++--------
+ arch/x86/crypto/chacha_glue.c | 17 ++---------------
+ include/crypto/chacha.h | 4 ----
+ 3 files changed, 10 insertions(+), 27 deletions(-)
+
+--- a/arch/x86/crypto/chacha-ssse3-x86_64.S
++++ b/arch/x86/crypto/chacha-ssse3-x86_64.S
+@@ -120,10 +120,10 @@ ENTRY(chacha_block_xor_ssse3)
+ FRAME_BEGIN
+
+ # x0..3 = s0..3
+- movdqa 0x00(%rdi),%xmm0
+- movdqa 0x10(%rdi),%xmm1
+- movdqa 0x20(%rdi),%xmm2
+- movdqa 0x30(%rdi),%xmm3
++ movdqu 0x00(%rdi),%xmm0
++ movdqu 0x10(%rdi),%xmm1
++ movdqu 0x20(%rdi),%xmm2
++ movdqu 0x30(%rdi),%xmm3
+ movdqa %xmm0,%xmm8
+ movdqa %xmm1,%xmm9
+ movdqa %xmm2,%xmm10
+@@ -205,10 +205,10 @@ ENTRY(hchacha_block_ssse3)
+ # %edx: nrounds
+ FRAME_BEGIN
+
+- movdqa 0x00(%rdi),%xmm0
+- movdqa 0x10(%rdi),%xmm1
+- movdqa 0x20(%rdi),%xmm2
+- movdqa 0x30(%rdi),%xmm3
++ movdqu 0x00(%rdi),%xmm0
++ movdqu 0x10(%rdi),%xmm1
++ movdqu 0x20(%rdi),%xmm2
++ movdqu 0x30(%rdi),%xmm3
+
+ mov %edx,%r8d
+ call chacha_permute
+--- a/arch/x86/crypto/chacha_glue.c
++++ b/arch/x86/crypto/chacha_glue.c
+@@ -14,8 +14,6 @@
+ #include <linux/module.h>
+ #include <asm/simd.h>
+
+-#define CHACHA_STATE_ALIGN 16
+-
+ asmlinkage void chacha_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
+ unsigned int len, int nrounds);
+ asmlinkage void chacha_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
+@@ -125,8 +123,6 @@ static void chacha_dosimd(u32 *state, u8
+
+ void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds)
+ {
+- state = PTR_ALIGN(state, CHACHA_STATE_ALIGN);
+-
+ if (!static_branch_likely(&chacha_use_simd) || !crypto_simd_usable()) {
+ hchacha_block_generic(state, stream, nrounds);
+ } else {
+@@ -139,8 +135,6 @@ EXPORT_SYMBOL(hchacha_block_arch);
+
+ void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv)
+ {
+- state = PTR_ALIGN(state, CHACHA_STATE_ALIGN);
+-
+ chacha_init_generic(state, key, iv);
+ }
+ EXPORT_SYMBOL(chacha_init_arch);
+@@ -148,8 +142,6 @@ EXPORT_SYMBOL(chacha_init_arch);
+ void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes,
+ int nrounds)
+ {
+- state = PTR_ALIGN(state, CHACHA_STATE_ALIGN);
+-
+ if (!static_branch_likely(&chacha_use_simd) || !crypto_simd_usable() ||
+ bytes <= CHACHA_BLOCK_SIZE)
+ return chacha_crypt_generic(state, dst, src, bytes, nrounds);
+@@ -171,15 +163,12 @@ EXPORT_SYMBOL(chacha_crypt_arch);
+ static int chacha_simd_stream_xor(struct skcipher_request *req,
+ const struct chacha_ctx *ctx, const u8 *iv)
+ {
+- u32 *state, state_buf[16 + 2] __aligned(8);
++ u32 state[CHACHA_STATE_WORDS] __aligned(8);
+ struct skcipher_walk walk;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+- BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16);
+- state = PTR_ALIGN(state_buf + 0, CHACHA_STATE_ALIGN);
+-
+ chacha_init_generic(state, ctx->key, iv);
+
+ while (walk.nbytes > 0) {
+@@ -218,12 +207,10 @@ static int xchacha_simd(struct skcipher_
+ {
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
+- u32 *state, state_buf[16 + 2] __aligned(8);
++ u32 state[CHACHA_STATE_WORDS] __aligned(8);
+ struct chacha_ctx subctx;
+ u8 real_iv[16];
+
+- BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16);
+- state = PTR_ALIGN(state_buf + 0, CHACHA_STATE_ALIGN);
+ chacha_init_generic(state, ctx->key, req->iv);
+
+ if (req->cryptlen > CHACHA_BLOCK_SIZE && crypto_simd_usable()) {
+--- a/include/crypto/chacha.h
++++ b/include/crypto/chacha.h
+@@ -25,11 +25,7 @@
+ #define CHACHA_BLOCK_SIZE 64
+ #define CHACHAPOLY_IV_SIZE 12
+
+-#ifdef CONFIG_X86_64
+-#define CHACHA_STATE_WORDS ((CHACHA_BLOCK_SIZE + 12) / sizeof(u32))
+-#else
+ #define CHACHA_STATE_WORDS (CHACHA_BLOCK_SIZE / sizeof(u32))
+-#endif
+
+ /* 192-bit nonce, then 64-bit stream position */
+ #define XCHACHA_IV_SIZE 32
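
For reference, the workaround this patch deletes is the usual over-allocate-and-align idiom: the x86 glue code declared the state as u32 state_buf[16 + 2] __aligned(8) and rounded the pointer up with PTR_ALIGN(state_buf, 16) so the SSSE3 assembly could use aligned movdqa loads; with movdqu, a plain u32 state[CHACHA_STATE_WORDS] __aligned(8) array is enough. A minimal standalone C sketch of that idiom (userspace, illustrative names, not the kernel code itself):

#include <stdint.h>
#include <stdio.h>

#define STATE_WORDS 16          /* ChaCha state is 16 x u32 */
#define STATE_ALIGN 16          /* alignment wanted by aligned SSE loads */

/* Round p up to the next 'align' boundary (align must be a power of two);
 * this mirrors what the kernel's PTR_ALIGN() does for this sketch. */
static inline void *ptr_align(void *p, uintptr_t align)
{
        return (void *)(((uintptr_t)p + align - 1) & ~(align - 1));
}

int main(void)
{
        /* The stack only guarantees 8-byte alignment here, so over-allocate
         * by 2 words (8 bytes) and round the pointer up to 16 bytes. */
        _Alignas(8) uint32_t state_buf[STATE_WORDS + 2];
        uint32_t *state = ptr_align(state_buf, STATE_ALIGN);

        printf("buffer %p -> aligned state %p\n",
               (void *)state_buf, (void *)state);
        return 0;
}

After the change above, the rounding step disappears entirely and the assembly tolerates any alignment of the state array.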