Diffstat (limited to 'target/linux/generic/backport-5.4/080-wireguard-0010-crypto-mips-chacha-import-32r2-ChaCha-code-from-Zinc.patch')
-rw-r--r-- target/linux/generic/backport-5.4/080-wireguard-0010-crypto-mips-chacha-import-32r2-ChaCha-code-from-Zinc.patch | 452
1 file changed, 452 insertions, 0 deletions
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0010-crypto-mips-chacha-import-32r2-ChaCha-code-from-Zinc.patch b/target/linux/generic/backport-5.4/080-wireguard-0010-crypto-mips-chacha-import-32r2-ChaCha-code-from-Zinc.patch
new file mode 100644
index 0000000000..e6fb4d9dc9
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0010-crypto-mips-chacha-import-32r2-ChaCha-code-from-Zinc.patch
@@ -0,0 +1,452 @@
+From f9b4c68865fdb7f3327f7d82fbc82c76c8773d53 Mon Sep 17 00:00:00 2001
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Fri, 8 Nov 2019 13:22:16 +0100
+Subject: [PATCH 010/124] crypto: mips/chacha - import 32r2 ChaCha code from
+ Zinc
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+commit 49aa7c00eddf8d8f462b0256bd82e81762d7b0c6 upstream.
+
+This imports the accelerated MIPS 32r2 ChaCha20 implementation from the
+Zinc patch set.
+
+Co-developed-by: René van Dorst <opensource@vdorst.com>
+Signed-off-by: René van Dorst <opensource@vdorst.com>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ arch/mips/crypto/chacha-core.S | 424 +++++++++++++++++++++++++++++++++
+ 1 file changed, 424 insertions(+)
+ create mode 100644 arch/mips/crypto/chacha-core.S
+
+--- /dev/null
++++ b/arch/mips/crypto/chacha-core.S
+@@ -0,0 +1,424 @@
++/* SPDX-License-Identifier: GPL-2.0 OR MIT */
++/*
++ * Copyright (C) 2016-2018 René van Dorst <opensource@vdorst.com>. All Rights Reserved.
++ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
++ */
++
++#define MASK_U32 0x3c
++#define CHACHA20_BLOCK_SIZE 64
++#define STACK_SIZE 32
++
++#define X0 $t0
++#define X1 $t1
++#define X2 $t2
++#define X3 $t3
++#define X4 $t4
++#define X5 $t5
++#define X6 $t6
++#define X7 $t7
++#define X8 $t8
++#define X9 $t9
++#define X10 $v1
++#define X11 $s6
++#define X12 $s5
++#define X13 $s4
++#define X14 $s3
++#define X15 $s2
++/* Use regs that are overwritten on exit for the Tx temporaries so we don't leak clear data. */
++#define T0 $s1
++#define T1 $s0
++#define T(n) T ## n
++#define X(n) X ## n
++
++/* Input arguments */
++#define STATE $a0
++#define OUT $a1
++#define IN $a2
++#define BYTES $a3
++
++/* Output argument */
++/* NONCE[0] (state word 12) is kept in a register, not in memory, so the
++ * original value in memory is never modified.
++ * It must be incremented on every loop iteration.
++ */
++#define NONCE_0 $v0
++
++/* SAVED_X and SAVED_CA are set in the jump table.
++ * Use regs which are overwritten on exit, or else we leak clear data.
++ * They are used to handle the last bytes, which are not a multiple of 4.
++ */
++#define SAVED_X X15
++#define SAVED_CA $s7
++
++#define IS_UNALIGNED $s7
++
++#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
++#define MSB 0
++#define LSB 3
++#define ROTx rotl
++#define ROTR(n) rotr n, 24
++#define CPU_TO_LE32(n) \
++ wsbh n; \
++ rotr n, 16;
++#else
++#define MSB 3
++#define LSB 0
++#define ROTx rotr
++#define CPU_TO_LE32(n)
++#define ROTR(n)
++#endif
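++
++/* Worked example of CPU_TO_LE32 on big-endian MIPS32r2, with n = 0xAABBCCDD:
++ *	wsbh n		# swap bytes within each halfword -> 0xBBAADDCC
++ *	rotr n, 16	# swap the two halfwords          -> 0xDDCCBBAA
++ * i.e. a full 32-bit byte swap. On little-endian CPUs the state words are
++ * already little-endian, so CPU_TO_LE32 expands to nothing.
++ */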
++
++#define FOR_EACH_WORD(x) \
++ x( 0); \
++ x( 1); \
++ x( 2); \
++ x( 3); \
++ x( 4); \
++ x( 5); \
++ x( 6); \
++ x( 7); \
++ x( 8); \
++ x( 9); \
++ x(10); \
++ x(11); \
++ x(12); \
++ x(13); \
++ x(14); \
++ x(15);
++
++#define FOR_EACH_WORD_REV(x) \
++ x(15); \
++ x(14); \
++ x(13); \
++ x(12); \
++ x(11); \
++ x(10); \
++ x( 9); \
++ x( 8); \
++ x( 7); \
++ x( 6); \
++ x( 5); \
++ x( 4); \
++ x( 3); \
++ x( 2); \
++ x( 1); \
++ x( 0);
++
++#define PLUS_ONE_0 1
++#define PLUS_ONE_1 2
++#define PLUS_ONE_2 3
++#define PLUS_ONE_3 4
++#define PLUS_ONE_4 5
++#define PLUS_ONE_5 6
++#define PLUS_ONE_6 7
++#define PLUS_ONE_7 8
++#define PLUS_ONE_8 9
++#define PLUS_ONE_9 10
++#define PLUS_ONE_10 11
++#define PLUS_ONE_11 12
++#define PLUS_ONE_12 13
++#define PLUS_ONE_13 14
++#define PLUS_ONE_14 15
++#define PLUS_ONE_15 16
++#define PLUS_ONE(x) PLUS_ONE_ ## x
++#define _CONCAT3(a,b,c) a ## b ## c
++#define CONCAT3(a,b,c) _CONCAT3(a,b,c)
++
++#define STORE_UNALIGNED(x) \
++CONCAT3(.Lchacha20_mips_xor_unaligned_, PLUS_ONE(x), _b: ;) \
++ .if (x != 12); \
++ lw T0, (x*4)(STATE); \
++ .endif; \
++ lwl T1, (x*4)+MSB ## (IN); \
++ lwr T1, (x*4)+LSB ## (IN); \
++ .if (x == 12); \
++ addu X ## x, NONCE_0; \
++ .else; \
++ addu X ## x, T0; \
++ .endif; \
++ CPU_TO_LE32(X ## x); \
++ xor X ## x, T1; \
++ swl X ## x, (x*4)+MSB ## (OUT); \
++ swr X ## x, (x*4)+LSB ## (OUT);
++
++#define STORE_ALIGNED(x) \
++CONCAT3(.Lchacha20_mips_xor_aligned_, PLUS_ONE(x), _b: ;) \
++ .if (x != 12); \
++ lw T0, (x*4)(STATE); \
++ .endif; \
++ lw T1, (x*4) ## (IN); \
++ .if (x == 12); \
++ addu X ## x, NONCE_0; \
++ .else; \
++ addu X ## x, T0; \
++ .endif; \
++ CPU_TO_LE32(X ## x); \
++ xor X ## x, T1; \
++ sw X ## x, (x*4) ## (OUT);
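++
++/* Both store variants above compute, for each word n:
++ *	out[n] = in[n] ^ cpu_to_le32(X[n] + state[n])
++ * with NONCE_0 standing in for state[12]; the unaligned variant simply
++ * uses lwl/lwr and swl/swr pairs instead of plain lw/sw.
++ */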
++
++/* Jump table macro.
++ * Used for setup and for handling the last bytes, which are not a
++ * multiple of 4. X15 is free to store Xn.
++ * Every jump table entry must be equal in size.
++ */
++#define JMPTBL_ALIGNED(x) \
++.Lchacha20_mips_jmptbl_aligned_ ## x: ; \
++ .set noreorder; \
++ b .Lchacha20_mips_xor_aligned_ ## x ## _b; \
++ .if (x == 12); \
++ addu SAVED_X, X ## x, NONCE_0; \
++ .else; \
++ addu SAVED_X, X ## x, SAVED_CA; \
++ .endif; \
++ .set reorder
++
++#define JMPTBL_UNALIGNED(x) \
++.Lchacha20_mips_jmptbl_unaligned_ ## x: ; \
++ .set noreorder; \
++ b .Lchacha20_mips_xor_unaligned_ ## x ## _b; \
++ .if (x == 12); \
++ addu SAVED_X, X ## x, NONCE_0; \
++ .else; \
++ addu SAVED_X, X ## x, SAVED_CA; \
++ .endif; \
++ .set reorder
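++
++/* Dispatch note: every jump table entry is exactly two instructions (a
++ * branch plus the addu in its delay slot), i.e. 8 bytes, so entry N lives
++ * at .Lchacha20_mips_jmptbl_*_0 + N * 8. Entry N branches into the
++ * reversed store sequence so that exactly N full words are processed,
++ * while its delay slot precomputes SAVED_X, the keystream word covering
++ * the trailing bytes.
++ */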
++
++#define AXR(A, B, C, D, K, L, M, N, V, W, Y, Z, S) \
++ addu X(A), X(K); \
++ addu X(B), X(L); \
++ addu X(C), X(M); \
++ addu X(D), X(N); \
++ xor X(V), X(A); \
++ xor X(W), X(B); \
++ xor X(Y), X(C); \
++ xor X(Z), X(D); \
++ rotl X(V), S; \
++ rotl X(W), S; \
++ rotl X(Y), S; \
++ rotl X(Z), S;
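++
++/* For reference, each AXR performs one add/xor/rotate step of the ChaCha
++ * quarter round on four independent word groups at once. In C, a full
++ * quarter round on (a, b, c, d) is:
++ *
++ *	a += b; d ^= a; d = rol32(d, 16);
++ *	c += d; b ^= c; b = rol32(b, 12);
++ *	a += b; d ^= a; d = rol32(d, 8);
++ *	c += d; b ^= c; b = rol32(b, 7);
++ *
++ * so four AXR invocations with shifts 16, 12, 8 and 7 compute four
++ * quarter rounds (one full column or diagonal round) in parallel.
++ */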
++
++.text
++.set reorder
++.set noat
++.globl chacha20_mips
++.ent chacha20_mips
++chacha20_mips:
++ .frame $sp, STACK_SIZE, $ra
++
++ addiu $sp, -STACK_SIZE
++
++ /* Return if BYTES == 0. */
++ beqz BYTES, .Lchacha20_mips_end
++
++ lw NONCE_0, 48(STATE)
++
++ /* Save s0-s7 */
++ sw $s0, 0($sp)
++ sw $s1, 4($sp)
++ sw $s2, 8($sp)
++ sw $s3, 12($sp)
++ sw $s4, 16($sp)
++ sw $s5, 20($sp)
++ sw $s6, 24($sp)
++ sw $s7, 28($sp)
++
++ /* Test whether IN or OUT is unaligned.
++ * IS_UNALIGNED = ( IN | OUT ) & 0x00000003
++ */
++ or IS_UNALIGNED, IN, OUT
++ andi IS_UNALIGNED, 0x3
++
++ /* Set number of rounds */
++ li $at, 20
++
++ b .Lchacha20_rounds_start
++
++.align 4
++.Loop_chacha20_rounds:
++ addiu IN, CHACHA20_BLOCK_SIZE
++ addiu OUT, CHACHA20_BLOCK_SIZE
++ addiu NONCE_0, 1
++
++.Lchacha20_rounds_start:
++ lw X0, 0(STATE)
++ lw X1, 4(STATE)
++ lw X2, 8(STATE)
++ lw X3, 12(STATE)
++
++ lw X4, 16(STATE)
++ lw X5, 20(STATE)
++ lw X6, 24(STATE)
++ lw X7, 28(STATE)
++ lw X8, 32(STATE)
++ lw X9, 36(STATE)
++ lw X10, 40(STATE)
++ lw X11, 44(STATE)
++
++ move X12, NONCE_0
++ lw X13, 52(STATE)
++ lw X14, 56(STATE)
++ lw X15, 60(STATE)
++
++.Loop_chacha20_xor_rounds:
++ addiu $at, -2
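++ /* One pass here is two ChaCha rounds: the first four AXRs form the
++ * column round, the last four the diagonal round, hence $at -= 2 per
++ * pass for ChaCha20's 20 rounds.
++ */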
++ AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16);
++ AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12);
++ AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 8);
++ AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 7);
++ AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 16);
++ AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12);
++ AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8);
++ AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7);
++ bnez $at, .Loop_chacha20_xor_rounds
++
++ addiu BYTES, -(CHACHA20_BLOCK_SIZE)
++
++ /* If the src/dst data is unaligned, jump to the unaligned path. */
++ bnez IS_UNALIGNED, .Loop_chacha20_unaligned
++
++ /* Set the number of rounds here to fill the delay slot. */
++ li $at, 20
++
++ /* If BYTES < 0, there is no full block; handle the tail. */
++ bltz BYTES, .Lchacha20_mips_no_full_block_aligned
++
++ FOR_EACH_WORD_REV(STORE_ALIGNED)
++
++ /* BYTES > 0? Loop again. */
++ bgtz BYTES, .Loop_chacha20_rounds
++
++ /* Place this here to fill delay slot */
++ addiu NONCE_0, 1
++
++ /* BYTES < 0? Handle last bytes */
++ bltz BYTES, .Lchacha20_mips_xor_bytes
++
++.Lchacha20_mips_xor_done:
++ /* Restore used registers */
++ lw $s0, 0($sp)
++ lw $s1, 4($sp)
++ lw $s2, 8($sp)
++ lw $s3, 12($sp)
++ lw $s4, 16($sp)
++ lw $s5, 20($sp)
++ lw $s6, 24($sp)
++ lw $s7, 28($sp)
++
++ /* Write NONCE_0 back to right location in state */
++ sw NONCE_0, 48(STATE)
++
++.Lchacha20_mips_end:
++ addiu $sp, STACK_SIZE
++ jr $ra
++
++.Lchacha20_mips_no_full_block_aligned:
++ /* Add the block size back onto BYTES */
++ addiu BYTES, CHACHA20_BLOCK_SIZE
++
++ /* Get the byte count of the full words: $at = BYTES & MASK_U32 */
++ andi $at, BYTES, MASK_U32
++
++ /* Load upper half of jump table addr */
++ lui T0, %hi(.Lchacha20_mips_jmptbl_aligned_0)
++
++ /* Select the entry: merge ($at << 1) into T0, since each entry is 8 bytes and $at is 4 * full words */
++ ins T0, $at, 1, 6
++
++ /* Add offset to STATE */
++ addu T1, STATE, $at
++
++ /* Add lower half jump table addr */
++ addiu T0, %lo(.Lchacha20_mips_jmptbl_aligned_0)
++
++ /* Read value from STATE */
++ lw SAVED_CA, 0(T1)
++
++ /* Store the remaining byte count as a negative value */
++ subu BYTES, $at, BYTES
++
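++ /* Worked example: BYTES = 7 gives $at = 7 & 0x3c = 4, so T0 selects
++ * jump table entry 1; one full word is stored, SAVED_CA holds state
++ * word 1, and BYTES becomes 4 - 7 = -3, leaving three trailing bytes
++ * for the byte handler below.
++ */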
++ jr T0
++
++ /* Jump table */
++ FOR_EACH_WORD(JMPTBL_ALIGNED)
++
++
++.Loop_chacha20_unaligned:
++ /* Set the number of rounds here to fill the delay slot. */
++ li $at, 20
++
++ /* If BYTES < 0, there is no full block; handle the tail. */
++ bltz BYTES, .Lchacha20_mips_no_full_block_unaligned
++
++ FOR_EACH_WORD_REV(STORE_UNALIGNED)
++
++ /* BYTES > 0? Loop again. */
++ bgtz BYTES, .Loop_chacha20_rounds
++
++ /* Write NONCE_0 back to right location in state */
++ sw NONCE_0, 48(STATE)
++
++ .set noreorder
++ /* Fall through to byte handling */
++ bgez BYTES, .Lchacha20_mips_xor_done
++.Lchacha20_mips_xor_unaligned_0_b:
++.Lchacha20_mips_xor_aligned_0_b:
++ /* Place this here to fill delay slot */
++ addiu NONCE_0, 1
++ .set reorder
++
++.Lchacha20_mips_xor_bytes:
++ addu IN, $at
++ addu OUT, $at
++ /* First byte */
++ lbu T1, 0(IN)
++ addiu $at, BYTES, 1
++ CPU_TO_LE32(SAVED_X)
++ ROTR(SAVED_X)
++ xor T1, SAVED_X
++ sb T1, 0(OUT)
++ beqz $at, .Lchacha20_mips_xor_done
++ /* Second byte */
++ lbu T1, 1(IN)
++ addiu $at, BYTES, 2
++ ROTx SAVED_X, 8
++ xor T1, SAVED_X
++ sb T1, 1(OUT)
++ beqz $at, .Lchacha20_mips_xor_done
++ /* Third byte */
++ lbu T1, 2(IN)
++ ROTx SAVED_X, 8
++ xor T1, SAVED_X
++ sb T1, 2(OUT)
++ b .Lchacha20_mips_xor_done
++
++.Lchacha20_mips_no_full_block_unaligned:
++ /* Add the block size back onto BYTES */
++ addiu BYTES, CHACHA20_BLOCK_SIZE
++
++ /* Get the byte count of the full words: $at = BYTES & MASK_U32 */
++ andi $at, BYTES, MASK_U32
++
++ /* Load upper half of jump table addr */
++ lui T0, %hi(.Lchacha20_mips_jmptbl_unaligned_0)
++
++ /* Select the entry: merge ($at << 1) into T0, since each entry is 8 bytes and $at is 4 * full words */
++ ins T0, $at, 1, 6
++
++ /* Add offset to STATE */
++ addu T1, STATE, $at
++
++ /* Add lower half jump table addr */
++ addiu T0, %lo(.Lchacha20_mips_jmptbl_unaligned_0)
++
++ /* Read value from STATE */
++ lw SAVED_CA, 0(T1)
++
++ /* Store the remaining byte count as a negative value */
++ subu BYTES, $at, BYTES
++
++ jr T0
++
++ /* Jump table */
++ FOR_EACH_WORD(JMPTBL_UNALIGNED)
++.end chacha20_mips
++.set at