kernel: 5.4: import wireguard backport

Rather than using the clunky, old, slower wireguard-linux-compat out of tree module, this commit does a patch-by-patch backport of upstream's wireguard to 5.4. This specific backport is in widespread use, being part of SUSE's enterprise kernel, Oracle's enterprise kernel, Google's Android kernel, Gentoo's distro kernel, and probably more I've forgotten about. It's definately the "more proper" way of adding wireguard to a kernel than the ugly compat.h hell of the wireguard-linux-compat repo. And most importantly for OpenWRT, it allows using the same module configuration code for 5.10 as for 5.4, with no need for bifurcation. These patches are from the backport tree which is maintained in the open here: https://git.zx2c4.com/wireguard-linux/log/?h=backport-5.4.y I'll be sending PRs to update this as needed. Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
author: Jason A. Donenfeld <Jason@zx2c4.com> 2021-02-19 14:29:04 +0100
committer: David Bauer <mail@david-bauer.net> 2021-02-26 20:41:01 +0100
commit: 3888fa78802354ab7bbd19b7d061fd80a16ce06b (patch)
tree: 2225a6313cb6482f0cb9c09df662a0d44197350e /target/linux/generic/backport-5.4/080-wireguard-0020-crypto-mips-poly1305-incorporate-OpenSSL-CRYPTOGAMS-.patch
parent: 7d4143234c4dfdd050ebc64ec8231f9d81ea65af (diff)
download: upstream-3888fa78802354ab7bbd19b7d061fd80a16ce06b.tar.gz
upstream-3888fa78802354ab7bbd19b7d061fd80a16ce06b.tar.bz2
upstream-3888fa78802354ab7bbd19b7d061fd80a16ce06b.zip
1 files changed, 1563 insertions, 0 deletions
diff --git a/target/linux/generic/backport-5.4/080-wireguard-0020-crypto-mips-poly1305-incorporate-OpenSSL-CRYPTOGAMS-.patch b/target/linux/generic/backport-5.4/080-wireguard-0020-crypto-mips-poly1305-incorporate-OpenSSL-CRYPTOGAMS-.patch
new file mode 100644
index 0000000000..68cac9cc57
--- /dev/null
+++ b/target/linux/generic/backport-5.4/080-wireguard-0020-crypto-mips-poly1305-incorporate-OpenSSL-CRYPTOGAMS-.patch
@@ -0,0 +1,1563 @@
+From a338793df36990e97ab0b824fad6fbf6ef171f94 Mon Sep 17 00:00:00 2001
+From: Ard Biesheuvel <ardb@kernel.org>
+Date: Fri, 8 Nov 2019 13:22:26 +0100
+Subject: [PATCH 020/124] crypto: mips/poly1305 - incorporate
+ OpenSSL/CRYPTOGAMS optimized implementation
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+commit a11d055e7a64ac34a5e99b6fe731299449cbcd58 upstream.
+
+This is a straight import of the OpenSSL/CRYPTOGAMS Poly1305 implementation for
+MIPS authored by Andy Polyakov, a prior 64-bit only version of which has been
+contributed by him to the OpenSSL project. The file 'poly1305-mips.pl' is taken
+straight from this upstream GitHub repository [0] at commit
+d22ade312a7af958ec955620b0d241cf42c37feb, and already contains all the changes
+required to build it as part of a Linux kernel module.
+
+[0] https://github.com/dot-asm/cryptogams
+
+Co-developed-by: Andy Polyakov <appro@cryptogams.org>
+Signed-off-by: Andy Polyakov <appro@cryptogams.org>
+Co-developed-by: René van Dorst <opensource@vdorst.com>
+Signed-off-by: René van Dorst <opensource@vdorst.com>
+Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+---
+ arch/mips/crypto/Makefile         |   14 +
+ arch/mips/crypto/poly1305-glue.c  |  203 +++++
+ arch/mips/crypto/poly1305-mips.pl | 1273 +++++++++++++++++++++++++++++
+ crypto/Kconfig                    |    5 +
+ lib/crypto/Kconfig                |    1 +
+ 5 files changed, 1496 insertions(+)
+ create mode 100644 arch/mips/crypto/poly1305-glue.c
+ create mode 100644 arch/mips/crypto/poly1305-mips.pl
+
+--- a/arch/mips/crypto/Makefile
++++ b/arch/mips/crypto/Makefile
+@@ -8,3 +8,17 @@ obj-$(CONFIG_CRYPTO_CRC32_MIPS) += crc32
+ obj-$(CONFIG_CRYPTO_CHACHA_MIPS) += chacha-mips.o
+ chacha-mips-y := chacha-core.o chacha-glue.o
+ AFLAGS_chacha-core.o += -O2 # needed to fill branch delay slots
++
++obj-$(CONFIG_CRYPTO_POLY1305_MIPS) += poly1305-mips.o
++poly1305-mips-y := poly1305-core.o poly1305-glue.o
++
++perlasm-flavour-$(CONFIG_CPU_MIPS32) := o32
++perlasm-flavour-$(CONFIG_CPU_MIPS64) := 64
++
++quiet_cmd_perlasm = PERLASM $@
++      cmd_perlasm = $(PERL) $(<) $(perlasm-flavour-y) $(@)
++
++$(obj)/poly1305-core.S: $(src)/poly1305-mips.pl FORCE
++	$(call if_changed,perlasm)
++
++targets += poly1305-core.S
+--- /dev/null
++++ b/arch/mips/crypto/poly1305-glue.c
+@@ -0,0 +1,203 @@
++// SPDX-License-Identifier: GPL-2.0
++/*
++ * OpenSSL/Cryptogams accelerated Poly1305 transform for MIPS
++ *
++ * Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org>
++ */
++
++#include <asm/unaligned.h>
++#include <crypto/algapi.h>
++#include <crypto/internal/hash.h>
++#include <crypto/internal/poly1305.h>
++#include <linux/cpufeature.h>
++#include <linux/crypto.h>
++#include <linux/module.h>
++
++asmlinkage void poly1305_init_mips(void *state, const u8 *key);
++asmlinkage void poly1305_blocks_mips(void *state, const u8 *src, u32 len, u32 hibit);
++asmlinkage void poly1305_emit_mips(void *state, __le32 *digest, const u32 *nonce);
++
++void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 *key)
++{
++	poly1305_init_mips(&dctx->h, key);
++	dctx->s[0] = get_unaligned_le32(key + 16);
++	dctx->s[1] = get_unaligned_le32(key + 20);
++	dctx->s[2] = get_unaligned_le32(key + 24);
++	dctx->s[3] = get_unaligned_le32(key + 28);
++	dctx->buflen = 0;
++}
++EXPORT_SYMBOL(poly1305_init_arch);
++
++static int mips_poly1305_init(struct shash_desc *desc)
++{
++	struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
++
++	dctx->buflen = 0;
++	dctx->rset = 0;
++	dctx->sset = false;
++
++	return 0;
++}
++
++static void mips_poly1305_blocks(struct poly1305_desc_ctx *dctx, const u8 *src,
++				 u32 len, u32 hibit)
++{
++	if (unlikely(!dctx->sset)) {
++		if (!dctx->rset) {
++			poly1305_init_mips(&dctx->h, src);
++			src += POLY1305_BLOCK_SIZE;
++			len -= POLY1305_BLOCK_SIZE;
++			dctx->rset = 1;
++		}
++		if (len >= POLY1305_BLOCK_SIZE) {
++			dctx->s[0] = get_unaligned_le32(src +  0);
++			dctx->s[1] = get_unaligned_le32(src +  4);
++			dctx->s[2] = get_unaligned_le32(src +  8);
++			dctx->s[3] = get_unaligned_le32(src + 12);
++			src += POLY1305_BLOCK_SIZE;
++			len -= POLY1305_BLOCK_SIZE;
++			dctx->sset = true;
++		}
++		if (len < POLY1305_BLOCK_SIZE)
++			return;
++	}
++
++	len &= ~(POLY1305_BLOCK_SIZE - 1);
++
++	poly1305_blocks_mips(&dctx->h, src, len, hibit);
++}
++
++static int mips_poly1305_update(struct shash_desc *desc, const u8 *src,
++				unsigned int len)
++{
++	struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
++
++	if (unlikely(dctx->buflen)) {
++		u32 bytes = min(len, POLY1305_BLOCK_SIZE - dctx->buflen);
++
++		memcpy(dctx->buf + dctx->buflen, src, bytes);
++		src += bytes;
++		len -= bytes;
++		dctx->buflen += bytes;
++
++		if (dctx->buflen == POLY1305_BLOCK_SIZE) {
++			mips_poly1305_blocks(dctx, dctx->buf, POLY1305_BLOCK_SIZE, 1);
++			dctx->buflen = 0;
++		}
++	}
++
++	if (likely(len >= POLY1305_BLOCK_SIZE)) {
++		mips_poly1305_blocks(dctx, src, len, 1);
++		src += round_down(len, POLY1305_BLOCK_SIZE);
++		len %= POLY1305_BLOCK_SIZE;
++	}
++
++	if (unlikely(len)) {
++		dctx->buflen = len;
++		memcpy(dctx->buf, src, len);
++	}
++	return 0;
++}
++
++void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src,
++			  unsigned int nbytes)
++{
++	if (unlikely(dctx->buflen)) {
++		u32 bytes = min(nbytes, POLY1305_BLOCK_SIZE - dctx->buflen);
++
++		memcpy(dctx->buf + dctx->buflen, src, bytes);
++		src += bytes;
++		nbytes -= bytes;
++		dctx->buflen += bytes;
++
++		if (dctx->buflen == POLY1305_BLOCK_SIZE) {
++			poly1305_blocks_mips(&dctx->h, dctx->buf,
++					     POLY1305_BLOCK_SIZE, 1);
++			dctx->buflen = 0;
++		}
++	}
++
++	if (likely(nbytes >= POLY1305_BLOCK_SIZE)) {
++		unsigned int len = round_down(nbytes, POLY1305_BLOCK_SIZE);
++
++		poly1305_blocks_mips(&dctx->h, src, len, 1);
++		src += len;
++		nbytes %= POLY1305_BLOCK_SIZE;
++	}
++
++	if (unlikely(nbytes)) {
++		dctx->buflen = nbytes;
++		memcpy(dctx->buf, src, nbytes);
++	}
++}
++EXPORT_SYMBOL(poly1305_update_arch);
++
++void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst)
++{
++	__le32 digest[4];
++	u64 f = 0;
++
++	if (unlikely(dctx->buflen)) {
++		dctx->buf[dctx->buflen++] = 1;
++		memset(dctx->buf + dctx->buflen, 0,
++		       POLY1305_BLOCK_SIZE - dctx->buflen);
++		poly1305_blocks_mips(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0);
++	}
++
++	poly1305_emit_mips(&dctx->h, digest, dctx->s);
++
++	/* mac = (h + s) % (2^128) */
++	f = (f >> 32) + le32_to_cpu(digest[0]);
++	put_unaligned_le32(f, dst);
++	f = (f >> 32) + le32_to_cpu(digest[1]);
++	put_unaligned_le32(f, dst + 4);
++	f = (f >> 32) + le32_to_cpu(digest[2]);
++	put_unaligned_le32(f, dst + 8);
++	f = (f >> 32) + le32_to_cpu(digest[3]);
++	put_unaligned_le32(f, dst + 12);
++
++	*dctx = (struct poly1305_desc_ctx){};
++}
++EXPORT_SYMBOL(poly1305_final_arch);
++
++static int mips_poly1305_final(struct shash_desc *desc, u8 *dst)
++{
++	struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
++
++	if (unlikely(!dctx->sset))
++		return -ENOKEY;
++
++	poly1305_final_arch(dctx, dst);
++	return 0;
++}
++
++static struct shash_alg mips_poly1305_alg = {
++	.init			= mips_poly1305_init,
++	.update			= mips_poly1305_update,
++	.final			= mips_poly1305_final,
++	.digestsize		= POLY1305_DIGEST_SIZE,
++	.descsize		= sizeof(struct poly1305_desc_ctx),
++
++	.base.cra_name		= "poly1305",
++	.base.cra_driver_name	= "poly1305-mips",
++	.base.cra_priority	= 200,
++	.base.cra_blocksize	= POLY1305_BLOCK_SIZE,
++	.base.cra_module	= THIS_MODULE,
++};
++
++static int __init mips_poly1305_mod_init(void)
++{
++	return crypto_register_shash(&mips_poly1305_alg);
++}
++
++static void __exit mips_poly1305_mod_exit(void)
++{
++	crypto_unregister_shash(&mips_poly1305_alg);
++}
++
++module_init(mips_poly1305_mod_init);
++module_exit(mips_poly1305_mod_exit);
++
++MODULE_LICENSE("GPL v2");
++MODULE_ALIAS_CRYPTO("poly1305");
++MODULE_ALIAS_CRYPTO("poly1305-mips");
+--- /dev/null
++++ b/arch/mips/crypto/poly1305-mips.pl
+@@ -0,0 +1,1273 @@
++#!/usr/bin/env perl
++# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
++#
++# ====================================================================
++# Written by Andy Polyakov, @dot-asm, originally for the OpenSSL
++# project.
++# ====================================================================
++
++# Poly1305 hash for MIPS.
++#
++# May 2016
++#
++# Numbers are cycles per processed byte with poly1305_blocks alone.
++#
++#		IALU/gcc
++# R1x000	~5.5/+130%	(big-endian)
++# Octeon II	2.50/+70%	(little-endian)
++#
++# March 2019
++#
++# Add 32-bit code path.
++#
++# October 2019
++#
++# Modulo-scheduling reduction allows to omit dependency chain at the
++# end of inner loop and improve performance. Also optimize MIPS32R2
++# code path for MIPS 1004K core. Per René von Dorst's suggestions.
++#
++#		IALU/gcc
++# R1x000	~9.8/?		(big-endian)
++# Octeon II	3.65/+140%	(little-endian)
++# MT7621/1004K	4.75/?		(little-endian)
++#
++######################################################################
++# There is a number of MIPS ABI in use, O32 and N32/64 are most
++# widely used. Then there is a new contender: NUBI. It appears that if
++# one picks the latter, it's possible to arrange code in ABI neutral
++# manner. Therefore let's stick to NUBI register layout:
++#
++($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
++($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
++($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
++($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
++#
++# The return value is placed in $a0. Following coding rules facilitate
++# interoperability:
++#
++# - never ever touch $tp, "thread pointer", former $gp [o32 can be
++#   excluded from the rule, because it's specified volatile];
++# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
++#   old code];
++# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
++#
++# For reference here is register layout for N32/64 MIPS ABIs:
++#
++# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
++# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
++# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
++# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
++# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
++#
++# <appro@openssl.org>
++#
++######################################################################
++
++$flavour = shift || "64"; # supported flavours are o32,n32,64,nubi32,nubi64
++
++$v0 = ($flavour =~ /nubi/i) ? $a0 : $t0;
++
++if ($flavour =~ /64|n32/i) {{{
++######################################################################
++# 64-bit code path
++#
++
++my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
++my ($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$at,$t0,$t1);
++
++$code.=<<___;
++#if (defined(_MIPS_ARCH_MIPS64R3) || defined(_MIPS_ARCH_MIPS64R5) || \\
++     defined(_MIPS_ARCH_MIPS64R6)) \\
++     && !defined(_MIPS_ARCH_MIPS64R2)
++# define _MIPS_ARCH_MIPS64R2
++#endif
++
++#if defined(_MIPS_ARCH_MIPS64R6)
++# define dmultu(rs,rt)
++# define mflo(rd,rs,rt)	dmulu	rd,rs,rt
++# define mfhi(rd,rs,rt)	dmuhu	rd,rs,rt
++#else
++# define dmultu(rs,rt)		dmultu	rs,rt
++# define mflo(rd,rs,rt)	mflo	rd
++# define mfhi(rd,rs,rt)	mfhi	rd
++#endif
++
++#ifdef	__KERNEL__
++# define poly1305_init   poly1305_init_mips
++# define poly1305_blocks poly1305_blocks_mips
++# define poly1305_emit   poly1305_emit_mips
++#endif
++
++#if defined(__MIPSEB__) && !defined(MIPSEB)
++# define MIPSEB
++#endif
++
++#ifdef MIPSEB
++# define MSB 0
++# define LSB 7
++#else
++# define MSB 7
++# define LSB 0
++#endif
++
++.text
++.set	noat
++.set	noreorder
++
++.align	5
++.globl	poly1305_init
++.ent	poly1305_init
++poly1305_init:
++	.frame	$sp,0,$ra
++	.set	reorder
++
++	sd	$zero,0($ctx)
++	sd	$zero,8($ctx)
++	sd	$zero,16($ctx)
++
++	beqz	$inp,.Lno_key
++
++#if defined(_MIPS_ARCH_MIPS64R6)
++	andi	$tmp0,$inp,7		# $inp % 8
++	dsubu	$inp,$inp,$tmp0		# align $inp
++	sll	$tmp0,$tmp0,3		# byte to bit offset
++	ld	$in0,0($inp)
++	ld	$in1,8($inp)
++	beqz	$tmp0,.Laligned_key
++	ld	$tmp2,16($inp)
++
++	subu	$tmp1,$zero,$tmp0
++# ifdef	MIPSEB
++	dsllv	$in0,$in0,$tmp0
++	dsrlv	$tmp3,$in1,$tmp1
++	dsllv	$in1,$in1,$tmp0
++	dsrlv	$tmp2,$tmp2,$tmp1
++# else
++	dsrlv	$in0,$in0,$tmp0
++	dsllv	$tmp3,$in1,$tmp1
++	dsrlv	$in1,$in1,$tmp0
++	dsllv	$tmp2,$tmp2,$tmp1
++# endif
++	or	$in0,$in0,$tmp3
++	or	$in1,$in1,$tmp2
++.Laligned_key:
++#else
++	ldl	$in0,0+MSB($inp)
++	ldl	$in1,8+MSB($inp)
++	ldr	$in0,0+LSB($inp)
++	ldr	$in1,8+LSB($inp)
++#endif
++#ifdef	MIPSEB
++# if defined(_MIPS_ARCH_MIPS64R2)
++	dsbh	$in0,$in0		# byte swap
++	 dsbh	$in1,$in1
++	dshd	$in0,$in0
++	 dshd	$in1,$in1
++# else
++	ori	$tmp0,$zero,0xFF
++	dsll	$tmp2,$tmp0,32
++	or	$tmp0,$tmp2		# 0x000000FF000000FF
++
++	and	$tmp1,$in0,$tmp0	# byte swap
++	 and	$tmp3,$in1,$tmp0
++	dsrl	$tmp2,$in0,24
++	 dsrl	$tmp4,$in1,24
++	dsll	$tmp1,24
++	 dsll	$tmp3,24
++	and	$tmp2,$tmp0
++	 and	$tmp4,$tmp0
++	dsll	$tmp0,8			# 0x0000FF000000FF00
++	or	$tmp1,$tmp2
++	 or	$tmp3,$tmp4
++	and	$tmp2,$in0,$tmp0
++	 and	$tmp4,$in1,$tmp0
++	dsrl	$in0,8
++	 dsrl	$in1,8
++	dsll	$tmp2,8
++	 dsll	$tmp4,8
++	and	$in0,$tmp0
++	 and	$in1,$tmp0
++	or	$tmp1,$tmp2
++	 or	$tmp3,$tmp4
++	or	$in0,$tmp1
++	 or	$in1,$tmp3
++	dsrl	$tmp1,$in0,32
++	 dsrl	$tmp3,$in1,32
++	dsll	$in0,32
++	 dsll	$in1,32
++	or	$in0,$tmp1
++	 or	$in1,$tmp3
++# endif
++#endif
++	li	$tmp0,1
++	dsll	$tmp0,32		# 0x0000000100000000
++	daddiu	$tmp0,-63		# 0x00000000ffffffc1
++	dsll	$tmp0,28		# 0x0ffffffc10000000
++	daddiu	$tmp0,-1		# 0x0ffffffc0fffffff
++
++	and	$in0,$tmp0
++	daddiu	$tmp0,-3		# 0x0ffffffc0ffffffc
++	and	$in1,$tmp0
++
++	sd	$in0,24($ctx)
++	dsrl	$tmp0,$in1,2
++	sd	$in1,32($ctx)
++	daddu	$tmp0,$in1		# s1 = r1 + (r1 >> 2)
++	sd	$tmp0,40($ctx)
++
++.Lno_key:
++	li	$v0,0			# return 0
++	jr	$ra
++.end	poly1305_init
++___
++{
++my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x0003f000" : "0x00030000";
++
++my ($h0,$h1,$h2,$r0,$r1,$rs1,$d0,$d1,$d2) =
++   ($s0,$s1,$s2,$s3,$s4,$s5,$in0,$in1,$t2);
++my ($shr,$shl) = ($s6,$s7);		# used on R6
++
++$code.=<<___;
++.align	5
++.globl	poly1305_blocks
++.ent	poly1305_blocks
++poly1305_blocks:
++	.set	noreorder
++	dsrl	$len,4			# number of complete blocks
++	bnez	$len,poly1305_blocks_internal
++	nop
++	jr	$ra
++	nop
++.end	poly1305_blocks
++
++.align	5
++.ent	poly1305_blocks_internal
++poly1305_blocks_internal:
++	.set	noreorder
++#if defined(_MIPS_ARCH_MIPS64R6)
++	.frame	$sp,8*8,$ra
++	.mask	$SAVED_REGS_MASK|0x000c0000,-8
++	dsubu	$sp,8*8
++	sd	$s7,56($sp)
++	sd	$s6,48($sp)
++#else
++	.frame	$sp,6*8,$ra
++	.mask	$SAVED_REGS_MASK,-8
++	dsubu	$sp,6*8
++#endif
++	sd	$s5,40($sp)
++	sd	$s4,32($sp)
++___
++$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi prologue
++	sd	$s3,24($sp)
++	sd	$s2,16($sp)
++	sd	$s1,8($sp)
++	sd	$s0,0($sp)
++___
++$code.=<<___;
++	.set	reorder
++
++#if defined(_MIPS_ARCH_MIPS64R6)
++	andi	$shr,$inp,7
++	dsubu	$inp,$inp,$shr		# align $inp
++	sll	$shr,$shr,3		# byte to bit offset
++	subu	$shl,$zero,$shr
++#endif
++
++	ld	$h0,0($ctx)		# load hash value
++	ld	$h1,8($ctx)
++	ld	$h2,16($ctx)
++
++	ld	$r0,24($ctx)		# load key
++	ld	$r1,32($ctx)
++	ld	$rs1,40($ctx)
++
++	dsll	$len,4
++	daddu	$len,$inp		# end of buffer
++	b	.Loop
++
++.align	4
++.Loop:
++#if defined(_MIPS_ARCH_MIPS64R6)
++	ld	$in0,0($inp)		# load input
++	ld	$in1,8($inp)
++	beqz	$shr,.Laligned_inp
++
++	ld	$tmp2,16($inp)
++# ifdef	MIPSEB
++	dsllv	$in0,$in0,$shr
++	dsrlv	$tmp3,$in1,$shl
++	dsllv	$in1,$in1,$shr
++	dsrlv	$tmp2,$tmp2,$shl
++# else
++	dsrlv	$in0,$in0,$shr
++	dsllv	$tmp3,$in1,$shl
++	dsrlv	$in1,$in1,$shr
++	dsllv	$tmp2,$tmp2,$shl
++# endif
++	or	$in0,$in0,$tmp3
++	or	$in1,$in1,$tmp2
++.Laligned_inp:
++#else
++	ldl	$in0,0+MSB($inp)	# load input
++	ldl	$in1,8+MSB($inp)
++	ldr	$in0,0+LSB($inp)
++	ldr	$in1,8+LSB($inp)
++#endif
++	daddiu	$inp,16
++#ifdef	MIPSEB
++# if defined(_MIPS_ARCH_MIPS64R2)
++	dsbh	$in0,$in0		# byte swap
++	 dsbh	$in1,$in1
++	dshd	$in0,$in0
++	 dshd	$in1,$in1
++# else
++	ori	$tmp0,$zero,0xFF
++	dsll	$tmp2,$tmp0,32
++	or	$tmp0,$tmp2		# 0x000000FF000000FF
++
++	and	$tmp1,$in0,$tmp0	# byte swap
++	 and	$tmp3,$in1,$tmp0
++	dsrl	$tmp2,$in0,24
++	 dsrl	$tmp4,$in1,24
++	dsll	$tmp1,24
++	 dsll	$tmp3,24
++	and	$tmp2,$tmp0
++	 and	$tmp4,$tmp0
++	dsll	$tmp0,8			# 0x0000FF000000FF00
++	or	$tmp1,$tmp2
++	 or	$tmp3,$tmp4
++	and	$tmp2,$in0,$tmp0
++	 and	$tmp4,$in1,$tmp0
++	dsrl	$in0,8
++	 dsrl	$in1,8
++	dsll	$tmp2,8
++	 dsll	$tmp4,8
++	and	$in0,$tmp0
++	 and	$in1,$tmp0
++	or	$tmp1,$tmp2
++	 or	$tmp3,$tmp4
++	or	$in0,$tmp1
++	 or	$in1,$tmp3
++	dsrl	$tmp1,$in0,32
++	 dsrl	$tmp3,$in1,32
++	dsll	$in0,32
++	 dsll	$in1,32
++	or	$in0,$tmp1
++	 or	$in1,$tmp3
++# endif
++#endif
++	dsrl	$tmp1,$h2,2		# modulo-scheduled reduction
++	andi	$h2,$h2,3
++	dsll	$tmp0,$tmp1,2
++
++	daddu	$d0,$h0,$in0		# accumulate input
++	 daddu	$tmp1,$tmp0
++	sltu	$tmp0,$d0,$h0
++	daddu	$d0,$d0,$tmp1		# ... and residue
++	sltu	$tmp1,$d0,$tmp1
++	daddu	$d1,$h1,$in1
++	daddu	$tmp0,$tmp1
++	sltu	$tmp1,$d1,$h1
++	daddu	$d1,$tmp0
++
++	dmultu	($r0,$d0)		# h0*r0
++	 daddu	$d2,$h2,$padbit
++	 sltu	$tmp0,$d1,$tmp0
++	mflo	($h0,$r0,$d0)
++	mfhi	($h1,$r0,$d0)
++
++	dmultu	($rs1,$d1)		# h1*5*r1
++	 daddu	$d2,$tmp1
++	 daddu	$d2,$tmp0
++	mflo	($tmp0,$rs1,$d1)
++	mfhi	($tmp1,$rs1,$d1)
++
++	dmultu	($r1,$d0)		# h0*r1
++	mflo	($tmp2,$r1,$d0)
++	mfhi	($h2,$r1,$d0)
++	 daddu	$h0,$tmp0
++	 daddu	$h1,$tmp1
++	 sltu	$tmp0,$h0,$tmp0
++
++	dmultu	($r0,$d1)		# h1*r0
++	 daddu	$h1,$tmp0
++	 daddu	$h1,$tmp2
++	mflo	($tmp0,$r0,$d1)
++	mfhi	($tmp1,$r0,$d1)
++
++	dmultu	($rs1,$d2)		# h2*5*r1
++	 sltu	$tmp2,$h1,$tmp2
++	 daddu	$h2,$tmp2
++	mflo	($tmp2,$rs1,$d2)
++
++	dmultu	($r0,$d2)		# h2*r0
++	 daddu	$h1,$tmp0
++	 daddu	$h2,$tmp1
++	mflo	($tmp3,$r0,$d2)
++	 sltu	$tmp0,$h1,$tmp0
++	 daddu	$h2,$tmp0
++
++	daddu	$h1,$tmp2
++	sltu	$tmp2,$h1,$tmp2
++	daddu	$h2,$tmp2
++	daddu	$h2,$tmp3
++
++	bne	$inp,$len,.Loop
++
++	sd	$h0,0($ctx)		# store hash value
++	sd	$h1,8($ctx)
++	sd	$h2,16($ctx)
++
++	.set	noreorder
++#if defined(_MIPS_ARCH_MIPS64R6)
++	ld	$s7,56($sp)
++	ld	$s6,48($sp)
++#endif
++	ld	$s5,40($sp)		# epilogue
++	ld	$s4,32($sp)
++___
++$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi epilogue
++	ld	$s3,24($sp)
++	ld	$s2,16($sp)
++	ld	$s1,8($sp)
++	ld	$s0,0($sp)
++___
++$code.=<<___;
++	jr	$ra
++#if defined(_MIPS_ARCH_MIPS64R6)
++	daddu	$sp,8*8
++#else
++	daddu	$sp,6*8
++#endif
++.end	poly1305_blocks_internal
++___
++}
++{
++my ($ctx,$mac,$nonce) = ($a0,$a1,$a2);
++
++$code.=<<___;
++.align	5
++.globl	poly1305_emit
++.ent	poly1305_emit
++poly1305_emit:
++	.frame	$sp,0,$ra
++	.set	reorder
++
++	ld	$tmp2,16($ctx)
++	ld	$tmp0,0($ctx)
++	ld	$tmp1,8($ctx)
++
++	li	$in0,-4			# final reduction
++	dsrl	$in1,$tmp2,2
++	and	$in0,$tmp2
++	andi	$tmp2,$tmp2,3
++	daddu	$in0,$in1
++
++	daddu	$tmp0,$tmp0,$in0
++	sltu	$in1,$tmp0,$in0
++	 daddiu	$in0,$tmp0,5		# compare to modulus
++	daddu	$tmp1,$tmp1,$in1
++	 sltiu	$tmp3,$in0,5
++	sltu	$tmp4,$tmp1,$in1
++	 daddu	$in1,$tmp1,$tmp3
++	daddu	$tmp2,$tmp2,$tmp4
++	 sltu	$tmp3,$in1,$tmp3
++	 daddu	$tmp2,$tmp2,$tmp3
++
++	dsrl	$tmp2,2			# see if it carried/borrowed
++	dsubu	$tmp2,$zero,$tmp2
++
++	xor	$in0,$tmp0
++	xor	$in1,$tmp1
++	and	$in0,$tmp2
++	and	$in1,$tmp2
++	xor	$in0,$tmp0
++	xor	$in1,$tmp1
++
++	lwu	$tmp0,0($nonce)		# load nonce
++	lwu	$tmp1,4($nonce)
++	lwu	$tmp2,8($nonce)
++	lwu	$tmp3,12($nonce)
++	dsll	$tmp1,32
++	dsll	$tmp3,32
++	or	$tmp0,$tmp1
++	or	$tmp2,$tmp3
++
++	daddu	$in0,$tmp0		# accumulate nonce
++	daddu	$in1,$tmp2
++	sltu	$tmp0,$in0,$tmp0
++	daddu	$in1,$tmp0
++
++	dsrl	$tmp0,$in0,8		# write mac value
++	dsrl	$tmp1,$in0,16
++	dsrl	$tmp2,$in0,24
++	sb	$in0,0($mac)
++	dsrl	$tmp3,$in0,32
++	sb	$tmp0,1($mac)
++	dsrl	$tmp0,$in0,40
++	sb	$tmp1,2($mac)
++	dsrl	$tmp1,$in0,48
++	sb	$tmp2,3($mac)
++	dsrl	$tmp2,$in0,56
++	sb	$tmp3,4($mac)
++	dsrl	$tmp3,$in1,8
++	sb	$tmp0,5($mac)
++	dsrl	$tmp0,$in1,16
++	sb	$tmp1,6($mac)
++	dsrl	$tmp1,$in1,24
++	sb	$tmp2,7($mac)
++
++	sb	$in1,8($mac)
++	dsrl	$tmp2,$in1,32
++	sb	$tmp3,9($mac)
++	dsrl	$tmp3,$in1,40
++	sb	$tmp0,10($mac)
++	dsrl	$tmp0,$in1,48
++	sb	$tmp1,11($mac)
++	dsrl	$tmp1,$in1,56
++	sb	$tmp2,12($mac)
++	sb	$tmp3,13($mac)
++	sb	$tmp0,14($mac)
++	sb	$tmp1,15($mac)
++
++	jr	$ra
++.end	poly1305_emit
++.rdata
++.asciiz	"Poly1305 for MIPS64, CRYPTOGAMS by \@dot-asm"
++.align	2
++___
++}
++}}} else {{{
++######################################################################
++# 32-bit code path
++#
++
++my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
++my ($in0,$in1,$in2,$in3,$tmp0,$tmp1,$tmp2,$tmp3) =
++   ($a4,$a5,$a6,$a7,$at,$t0,$t1,$t2);
++
++$code.=<<___;
++#if (defined(_MIPS_ARCH_MIPS32R3) || defined(_MIPS_ARCH_MIPS32R5) || \\
++     defined(_MIPS_ARCH_MIPS32R6)) \\
++     && !defined(_MIPS_ARCH_MIPS32R2)
++# define _MIPS_ARCH_MIPS32R2
++#endif
++
++#if defined(_MIPS_ARCH_MIPS32R6)
++# define multu(rs,rt)
++# define mflo(rd,rs,rt)	mulu	rd,rs,rt
++# define mfhi(rd,rs,rt)	muhu	rd,rs,rt
++#else
++# define multu(rs,rt)	multu	rs,rt
++# define mflo(rd,rs,rt)	mflo	rd
++# define mfhi(rd,rs,rt)	mfhi	rd
++#endif
++
++#ifdef	__KERNEL__
++# define poly1305_init   poly1305_init_mips
++# define poly1305_blocks poly1305_blocks_mips
++# define poly1305_emit   poly1305_emit_mips
++#endif
++
++#if defined(__MIPSEB__) && !defined(MIPSEB)
++# define MIPSEB
++#endif
++
++#ifdef MIPSEB
++# define MSB 0
++# define LSB 3
++#else
++# define MSB 3
++# define LSB 0
++#endif
++
++.text
++.set	noat
++.set	noreorder
++
++.align	5
++.globl	poly1305_init
++.ent	poly1305_init
++poly1305_init:
++	.frame	$sp,0,$ra
++	.set	reorder
++
++	sw	$zero,0($ctx)
++	sw	$zero,4($ctx)
++	sw	$zero,8($ctx)
++	sw	$zero,12($ctx)
++	sw	$zero,16($ctx)
++
++	beqz	$inp,.Lno_key
++
++#if defined(_MIPS_ARCH_MIPS32R6)
++	andi	$tmp0,$inp,3		# $inp % 4
++	subu	$inp,$inp,$tmp0		# align $inp
++	sll	$tmp0,$tmp0,3		# byte to bit offset
++	lw	$in0,0($inp)
++	lw	$in1,4($inp)
++	lw	$in2,8($inp)
++	lw	$in3,12($inp)
++	beqz	$tmp0,.Laligned_key
++
++	lw	$tmp2,16($inp)
++	subu	$tmp1,$zero,$tmp0
++# ifdef	MIPSEB
++	sllv	$in0,$in0,$tmp0
++	srlv	$tmp3,$in1,$tmp1
++	sllv	$in1,$in1,$tmp0
++	or	$in0,$in0,$tmp3
++	srlv	$tmp3,$in2,$tmp1
++	sllv	$in2,$in2,$tmp0
++	or	$in1,$in1,$tmp3
++	srlv	$tmp3,$in3,$tmp1
++	sllv	$in3,$in3,$tmp0
++	or	$in2,$in2,$tmp3
++	srlv	$tmp2,$tmp2,$tmp1
++	or	$in3,$in3,$tmp2
++# else
++	srlv	$in0,$in0,$tmp0
++	sllv	$tmp3,$in1,$tmp1
++	srlv	$in1,$in1,$tmp0
++	or	$in0,$in0,$tmp3
++	sllv	$tmp3,$in2,$tmp1
++	srlv	$in2,$in2,$tmp0
++	or	$in1,$in1,$tmp3
++	sllv	$tmp3,$in3,$tmp1
++	srlv	$in3,$in3,$tmp0
++	or	$in2,$in2,$tmp3
++	sllv	$tmp2,$tmp2,$tmp1
++	or	$in3,$in3,$tmp2
++# endif
++.Laligned_key:
++#else
++	lwl	$in0,0+MSB($inp)
++	lwl	$in1,4+MSB($inp)
++	lwl	$in2,8+MSB($inp)
++	lwl	$in3,12+MSB($inp)
++	lwr	$in0,0+LSB($inp)
++	lwr	$in1,4+LSB($inp)
++	lwr	$in2,8+LSB($inp)
++	lwr	$in3,12+LSB($inp)
++#endif
++#ifdef	MIPSEB
++# if defined(_MIPS_ARCH_MIPS32R2)
++	wsbh	$in0,$in0		# byte swap
++	wsbh	$in1,$in1
++	wsbh	$in2,$in2
++	wsbh	$in3,$in3
++	rotr	$in0,$in0,16
++	rotr	$in1,$in1,16
++	rotr	$in2,$in2,16
++	rotr	$in3,$in3,16
++# else
++	srl	$tmp0,$in0,24		# byte swap
++	srl	$tmp1,$in0,8
++	andi	$tmp2,$in0,0xFF00
++	sll	$in0,$in0,24
++	andi	$tmp1,0xFF00
++	sll	$tmp2,$tmp2,8
++	or	$in0,$tmp0
++	 srl	$tmp0,$in1,24
++	or	$tmp1,$tmp2
++	 srl	$tmp2,$in1,8
++	or	$in0,$tmp1
++	 andi	$tmp1,$in1,0xFF00
++	 sll	$in1,$in1,24
++	 andi	$tmp2,0xFF00
++	 sll	$tmp1,$tmp1,8
++	 or	$in1,$tmp0
++	srl	$tmp0,$in2,24
++	 or	$tmp2,$tmp1
++	srl	$tmp1,$in2,8
++	 or	$in1,$tmp2
++	andi	$tmp2,$in2,0xFF00
++	sll	$in2,$in2,24
++	andi	$tmp1,0xFF00
++	sll	$tmp2,$tmp2,8
++	or	$in2,$tmp0
++	 srl	$tmp0,$in3,24
++	or	$tmp1,$tmp2
++	 srl	$tmp2,$in3,8
++	or	$in2,$tmp1
++	 andi	$tmp1,$in3,0xFF00
++	 sll	$in3,$in3,24
++	 andi	$tmp2,0xFF00
++	 sll	$tmp1,$tmp1,8
++	 or	$in3,$tmp0
++	 or	$tmp2,$tmp1
++	 or	$in3,$tmp2
++# endif
++#endif
++	lui	$tmp0,0x0fff
++	ori	$tmp0,0xffff		# 0x0fffffff
++	and	$in0,$in0,$tmp0
++	subu	$tmp0,3			# 0x0ffffffc
++	and	$in1,$in1,$tmp0
++	and	$in2,$in2,$tmp0
++	and	$in3,$in3,$tmp0
++
++	sw	$in0,20($ctx)
++	sw	$in1,24($ctx)
++	sw	$in2,28($ctx)
++	sw	$in3,32($ctx)
++
++	srl	$tmp1,$in1,2
++	srl	$tmp2,$in2,2
++	srl	$tmp3,$in3,2
++	addu	$in1,$in1,$tmp1		# s1 = r1 + (r1 >> 2)
++	addu	$in2,$in2,$tmp2
++	addu	$in3,$in3,$tmp3
++	sw	$in1,36($ctx)
++	sw	$in2,40($ctx)
++	sw	$in3,44($ctx)
++.Lno_key:
++	li	$v0,0
++	jr	$ra
++.end	poly1305_init
++___
++{
++my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x00fff000" : "0x00ff0000";
++
++my ($h0,$h1,$h2,$h3,$h4, $r0,$r1,$r2,$r3, $rs1,$rs2,$rs3) =
++   ($s0,$s1,$s2,$s3,$s4, $s5,$s6,$s7,$s8, $s9,$s10,$s11);
++my ($d0,$d1,$d2,$d3) =
++   ($a4,$a5,$a6,$a7);
++my $shr = $t2;		# used on R6
++my $one = $t2;		# used on R2
++
++$code.=<<___;
++.globl	poly1305_blocks
++.align	5
++.ent	poly1305_blocks
++poly1305_blocks:
++	.frame	$sp,16*4,$ra
++	.mask	$SAVED_REGS_MASK,-4
++	.set	noreorder
++	subu	$sp, $sp,4*12
++	sw	$s11,4*11($sp)
++	sw	$s10,4*10($sp)
++	sw	$s9, 4*9($sp)
++	sw	$s8, 4*8($sp)
++	sw	$s7, 4*7($sp)
++	sw	$s6, 4*6($sp)
++	sw	$s5, 4*5($sp)
++	sw	$s4, 4*4($sp)
++___
++$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi prologue
++	sw	$s3, 4*3($sp)
++	sw	$s2, 4*2($sp)
++	sw	$s1, 4*1($sp)
++	sw	$s0, 4*0($sp)
++___
++$code.=<<___;
++	.set	reorder
++
++	srl	$len,4			# number of complete blocks
++	li	$one,1
++	beqz	$len,.Labort
++
++#if defined(_MIPS_ARCH_MIPS32R6)
++	andi	$shr,$inp,3
++	subu	$inp,$inp,$shr		# align $inp
++	sll	$shr,$shr,3		# byte to bit offset
++#endif
++
++	lw	$h0,0($ctx)		# load hash value
++	lw	$h1,4($ctx)
++	lw	$h2,8($ctx)
++	lw	$h3,12($ctx)
++	lw	$h4,16($ctx)
++
++	lw	$r0,20($ctx)		# load key
++	lw	$r1,24($ctx)
++	lw	$r2,28($ctx)
++	lw	$r3,32($ctx)
++	lw	$rs1,36($ctx)
++	lw	$rs2,40($ctx)
++	lw	$rs3,44($ctx)
++
++	sll	$len,4
++	addu	$len,$len,$inp		# end of buffer
++	b	.Loop
++
++.align	4
++.Loop:
++#if defined(_MIPS_ARCH_MIPS32R6)
++	lw	$d0,0($inp)		# load input
++	lw	$d1,4($inp)
++	lw	$d2,8($inp)
++	lw	$d3,12($inp)
++	beqz	$shr,.Laligned_inp
++
++	lw	$t0,16($inp)
++	subu	$t1,$zero,$shr
++# ifdef	MIPSEB
++	sllv	$d0,$d0,$shr
++	srlv	$at,$d1,$t1
++	sllv	$d1,$d1,$shr
++	or	$d0,$d0,$at
++	srlv	$at,$d2,$t1
++	sllv	$d2,$d2,$shr
++	or	$d1,$d1,$at
++	srlv	$at,$d3,$t1
++	sllv	$d3,$d3,$shr
++	or	$d2,$d2,$at
++	srlv	$t0,$t0,$t1
++	or	$d3,$d3,$t0
++# else
++	srlv	$d0,$d0,$shr
++	sllv	$at,$d1,$t1
++	srlv	$d1,$d1,$shr
++	or	$d0,$d0,$at
++	sllv	$at,$d2,$t1
++	srlv	$d2,$d2,$shr
++	or	$d1,$d1,$at
++	sllv	$at,$d3,$t1
++	srlv	$d3,$d3,$shr
++	or	$d2,$d2,$at
++	sllv	$t0,$t0,$t1
++	or	$d3,$d3,$t0
++# endif
++.Laligned_inp:
++#else
++	lwl	$d0,0+MSB($inp)		# load input
++	lwl	$d1,4+MSB($inp)
++	lwl	$d2,8+MSB($inp)
++	lwl	$d3,12+MSB($inp)
++	lwr	$d0,0+LSB($inp)
++	lwr	$d1,4+LSB($inp)
++	lwr	$d2,8+LSB($inp)
++	lwr	$d3,12+LSB($inp)
++#endif
++#ifdef	MIPSEB
++# if defined(_MIPS_ARCH_MIPS32R2)
++	wsbh	$d0,$d0			# byte swap
++	wsbh	$d1,$d1
++	wsbh	$d2,$d2
++	wsbh	$d3,$d3
++	rotr	$d0,$d0,16
++	rotr	$d1,$d1,16
++	rotr	$d2,$d2,16
++	rotr	$d3,$d3,16
++# else
++	srl	$at,$d0,24		# byte swap
++	srl	$t0,$d0,8
++	andi	$t1,$d0,0xFF00
++	sll	$d0,$d0,24
++	andi	$t0,0xFF00
++	sll	$t1,$t1,8
++	or	$d0,$at
++	 srl	$at,$d1,24
++	or	$t0,$t1
++	 srl	$t1,$d1,8
++	or	$d0,$t0
++	 andi	$t0,$d1,0xFF00
++	 sll	$d1,$d1,24
++	 andi	$t1,0xFF00
++	 sll	$t0,$t0,8
++	 or	$d1,$at
++	srl	$at,$d2,24
++	 or	$t1,$t0
++	srl	$t0,$d2,8
++	 or	$d1,$t1
++	andi	$t1,$d2,0xFF00
++	sll	$d2,$d2,24
++	andi	$t0,0xFF00
++	sll	$t1,$t1,8
++	or	$d2,$at
++	 srl	$at,$d3,24
++	or	$t0,$t1
++	 srl	$t1,$d3,8
++	or	$d2,$t0
++	 andi	$t0,$d3,0xFF00
++	 sll	$d3,$d3,24
++	 andi	$t1,0xFF00
++	 sll	$t0,$t0,8
++	 or	$d3,$at
++	 or	$t1,$t0
++	 or	$d3,$t1
++# endif
++#endif
++	srl	$t0,$h4,2		# modulo-scheduled reduction
++	andi	$h4,$h4,3
++	sll	$at,$t0,2
++
++	addu	$d0,$d0,$h0		# accumulate input
++	 addu	$t0,$t0,$at
++	sltu	$h0,$d0,$h0
++	addu	$d0,$d0,$t0		# ... and residue
++	sltu	$at,$d0,$t0
++
++	addu	$d1,$d1,$h1
++	 addu	$h0,$h0,$at		# carry
++	sltu	$h1,$d1,$h1
++	addu	$d1,$d1,$h0
++	sltu	$h0,$d1,$h0
++
++	addu	$d2,$d2,$h2
++	 addu	$h1,$h1,$h0		# carry
++	sltu	$h2,$d2,$h2
++	addu	$d2,$d2,$h1
++	sltu	$h1,$d2,$h1
++
++	addu	$d3,$d3,$h3
++	 addu	$h2,$h2,$h1		# carry
++	sltu	$h3,$d3,$h3
++	addu	$d3,$d3,$h2
++
++#if defined(_MIPS_ARCH_MIPS32R2) && !defined(_MIPS_ARCH_MIPS32R6)
++	multu	$r0,$d0			# d0*r0
++	 sltu	$h2,$d3,$h2
++	maddu	$rs3,$d1		# d1*s3
++	 addu	$h3,$h3,$h2		# carry
++	maddu	$rs2,$d2		# d2*s2
++	 addu	$h4,$h4,$padbit
++	maddu	$rs1,$d3		# d3*s1
++	 addu	$h4,$h4,$h3
++	mfhi	$at
++	mflo	$h0
++
++	multu	$r1,$d0			# d0*r1
++	maddu	$r0,$d1			# d1*r0
++	maddu	$rs3,$d2		# d2*s3
++	maddu	$rs2,$d3		# d3*s2
++	maddu	$rs1,$h4		# h4*s1
++	maddu	$at,$one		# hi*1
++	mfhi	$at
++	mflo	$h1
++
++	multu	$r2,$d0			# d0*r2
++	maddu	$r1,$d1			# d1*r1
++	maddu	$r0,$d2			# d2*r0
++	maddu	$rs3,$d3		# d3*s3
++	maddu	$rs2,$h4		# h4*s2
++	maddu	$at,$one		# hi*1
++	mfhi	$at
++	mflo	$h2
++
++	mul	$t0,$r0,$h4		# h4*r0
++
++	multu	$r3,$d0			# d0*r3
++	maddu	$r2,$d1			# d1*r2
++	maddu	$r1,$d2			# d2*r1
++	maddu	$r0,$d3			# d3*r0
++	maddu	$rs3,$h4		# h4*s3
++	maddu	$at,$one		# hi*1
++	mfhi	$at
++	mflo	$h3
++
++	 addiu	$inp,$inp,16
++
++	addu	$h4,$t0,$at
++#else
++	multu	($r0,$d0)		# d0*r0
++	mflo	($h0,$r0,$d0)
++	mfhi	($h1,$r0,$d0)
++
++	 sltu	$h2,$d3,$h2
++	 addu	$h3,$h3,$h2		# carry
++
++	multu	($rs3,$d1)		# d1*s3
++	mflo	($at,$rs3,$d1)
++	mfhi	($t0,$rs3,$d1)
++
++	 addu	$h4,$h4,$padbit
++	 addiu	$inp,$inp,16
++	 addu	$h4,$h4,$h3
++
++	multu	($rs2,$d2)		# d2*s2
++	mflo	($a3,$rs2,$d2)
++	mfhi	($t1,$rs2,$d2)
++	 addu	$h0,$h0,$at
++	 addu	$h1,$h1,$t0
++	multu	($rs1,$d3)		# d3*s1
++	 sltu	$at,$h0,$at
++	 addu	$h1,$h1,$at
++
++	mflo	($at,$rs1,$d3)
++	mfhi	($t0,$rs1,$d3)
++	 addu	$h0,$h0,$a3
++	 addu	$h1,$h1,$t1
++	multu	($r1,$d0)		# d0*r1
++	 sltu	$a3,$h0,$a3
++	 addu	$h1,$h1,$a3
++
++
++	mflo	($a3,$r1,$d0)
++	mfhi	($h2,$r1,$d0)
++	 addu	$h0,$h0,$at
++	 addu	$h1,$h1,$t0
++	multu	($r0,$d1)		# d1*r0
++	 sltu	$at,$h0,$at
++	 addu	$h1,$h1,$at
++
++	mflo	($at,$r0,$d1)
++	mfhi	($t0,$r0,$d1)
++	 addu	$h1,$h1,$a3
++	 sltu	$a3,$h1,$a3
++	multu	($rs3,$d2)		# d2*s3
++	 addu	$h2,$h2,$a3
++
++	mflo	($a3,$rs3,$d2)
++	mfhi	($t1,$rs3,$d2)
++	 addu	$h1,$h1,$at
++	 addu	$h2,$h2,$t0
++	multu	($rs2,$d3)		# d3*s2
++	 sltu	$at,$h1,$at
++	 addu	$h2,$h2,$at
++
++	mflo	($at,$rs2,$d3)
++	mfhi	($t0,$rs2,$d3)
++	 addu	$h1,$h1,$a3
++	 addu	$h2,$h2,$t1
++	multu	($rs1,$h4)		# h4*s1
++	 sltu	$a3,$h1,$a3
++	 addu	$h2,$h2,$a3
++
++	mflo	($a3,$rs1,$h4)
++	 addu	$h1,$h1,$at
++	 addu	$h2,$h2,$t0
++	multu	($r2,$d0)		# d0*r2
++	 sltu	$at,$h1,$at
++	 addu	$h2,$h2,$at
++
++
++	mflo	($at,$r2,$d0)
++	mfhi	($h3,$r2,$d0)
++	 addu	$h1,$h1,$a3
++	 sltu	$a3,$h1,$a3
++	multu	($r1,$d1)		# d1*r1
++	 addu	$h2,$h2,$a3
++
++	mflo	($a3,$r1,$d1)
++	mfhi	($t1,$r1,$d1)
++	 addu	$h2,$h2,$at
++	 sltu	$at,$h2,$at
++	multu	($r0,$d2)		# d2*r0
++	 addu	$h3,$h3,$at
++
++	mflo	($at,$r0,$d2)
++	mfhi	($t0,$r0,$d2)
++	 addu	$h2,$h2,$a3
++	 addu	$h3,$h3,$t1
++	multu	($rs3,$d3)		# d3*s3
++	 sltu	$a3,$h2,$a3
++	 addu	$h3,$h3,$a3
++
++	mflo	($a3,$rs3,$d3)
++	mfhi	($t1,$rs3,$d3)
++	 addu	$h2,$h2,$at
++	 addu	$h3,$h3,$t0
++	multu	($rs2,$h4)		# h4*s2
++	 sltu	$at,$h2,$at
++	 addu	$h3,$h3,$at
++
++	mflo	($at,$rs2,$h4)
++	 addu	$h2,$h2,$a3
++	 addu	$h3,$h3,$t1
++	multu	($r3,$d0)		# d0*r3
++	 sltu	$a3,$h2,$a3
++	 addu	$h3,$h3,$a3
++
++
++	mflo	($a3,$r3,$d0)
++	mfhi	($t1,$r3,$d0)
++	 addu	$h2,$h2,$at
++	 sltu	$at,$h2,$at
++	multu	($r2,$d1)		# d1*r2
++	 addu	$h3,$h3,$at
++
++	mflo	($at,$r2,$d1)
++	mfhi	($t0,$r2,$d1)
++	 addu	$h3,$h3,$a3
++	 sltu	$a3,$h3,$a3
++	multu	($r0,$d3)		# d3*r0
++	 addu	$t1,$t1,$a3
++
++	mflo	($a3,$r0,$d3)
++	mfhi	($d3,$r0,$d3)
++	 addu	$h3,$h3,$at
++	 addu	$t1,$t1,$t0
++	multu	($r1,$d2)		# d2*r1
++	 sltu	$at,$h3,$at
++	 addu	$t1,$t1,$at
++
++	mflo	($at,$r1,$d2)
++	mfhi	($t0,$r1,$d2)
++	 addu	$h3,$h3,$a3
++	 addu	$t1,$t1,$d3
++	multu	($rs3,$h4)		# h4*s3
++	 sltu	$a3,$h3,$a3
++	 addu	$t1,$t1,$a3
++
++	mflo	($a3,$rs3,$h4)
++	 addu	$h3,$h3,$at
++	 addu	$t1,$t1,$t0
++	multu	($r0,$h4)		# h4*r0
++	 sltu	$at,$h3,$at
++	 addu	$t1,$t1,$at
++
++
++	mflo	($h4,$r0,$h4)
++	 addu	$h3,$h3,$a3
++	 sltu	$a3,$h3,$a3
++	 addu	$t1,$t1,$a3
++	addu	$h4,$h4,$t1
++
++	li	$padbit,1		# if we loop, padbit is 1
++#endif
++	bne	$inp,$len,.Loop
++
++	sw	$h0,0($ctx)		# store hash value
++	sw	$h1,4($ctx)
++	sw	$h2,8($ctx)
++	sw	$h3,12($ctx)
++	sw	$h4,16($ctx)
++
++	.set	noreorder
++.Labort:
++	lw	$s11,4*11($sp)
++	lw	$s10,4*10($sp)
++	lw	$s9, 4*9($sp)
++	lw	$s8, 4*8($sp)
++	lw	$s7, 4*7($sp)
++	lw	$s6, 4*6($sp)
++	lw	$s5, 4*5($sp)
++	lw	$s4, 4*4($sp)
++___
++$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi prologue
++	lw	$s3, 4*3($sp)
++	lw	$s2, 4*2($sp)
++	lw	$s1, 4*1($sp)
++	lw	$s0, 4*0($sp)
++___
++$code.=<<___;
++	jr	$ra
++	addu	$sp,$sp,4*12
++.end	poly1305_blocks
++___
++}
++{
++my ($ctx,$mac,$nonce,$tmp4) = ($a0,$a1,$a2,$a3);
++
++$code.=<<___;
++.align	5
++.globl	poly1305_emit
++.ent	poly1305_emit
++poly1305_emit:
++	.frame	$sp,0,$ra
++	.set	reorder
++
++	lw	$tmp4,16($ctx)
++	lw	$tmp0,0($ctx)
++	lw	$tmp1,4($ctx)
++	lw	$tmp2,8($ctx)
++	lw	$tmp3,12($ctx)
++
++	li	$in0,-4			# final reduction
++	srl	$ctx,$tmp4,2
++	and	$in0,$in0,$tmp4
++	andi	$tmp4,$tmp4,3
++	addu	$ctx,$ctx,$in0
++
++	addu	$tmp0,$tmp0,$ctx
++	sltu	$ctx,$tmp0,$ctx
++	 addiu	$in0,$tmp0,5		# compare to modulus
++	addu	$tmp1,$tmp1,$ctx
++	 sltiu	$in1,$in0,5
++	sltu	$ctx,$tmp1,$ctx
++	 addu	$in1,$in1,$tmp1
++	addu	$tmp2,$tmp2,$ctx
++	 sltu	$in2,$in1,$tmp1
++	sltu	$ctx,$tmp2,$ctx
++	 addu	$in2,$in2,$tmp2
++	addu	$tmp3,$tmp3,$ctx
++	 sltu	$in3,$in2,$tmp2
++	sltu	$ctx,$tmp3,$ctx
++	 addu	$in3,$in3,$tmp3
++	addu	$tmp4,$tmp4,$ctx
++	 sltu	$ctx,$in3,$tmp3
++	 addu	$ctx,$tmp4
++
++	srl	$ctx,2			# see if it carried/borrowed
++	subu	$ctx,$zero,$ctx
++
++	xor	$in0,$tmp0
++	xor	$in1,$tmp1
++	xor	$in2,$tmp2
++	xor	$in3,$tmp3
++	and	$in0,$ctx
++	and	$in1,$ctx
++	and	$in2,$ctx
++	and	$in3,$ctx
++	xor	$in0,$tmp0
++	xor	$in1,$tmp1
++	xor	$in2,$tmp2
++	xor	$in3,$tmp3
++
++	lw	$tmp0,0($nonce)		# load nonce
++	lw	$tmp1,4($nonce)
++	lw	$tmp2,8($nonce)
++	lw	$tmp3,12($nonce)
++
++	addu	$in0,$tmp0		# accumulate nonce
++	sltu	$ctx,$in0,$tmp0
++
++	addu	$in1,$tmp1
++	sltu	$tmp1,$in1,$tmp1
++	addu	$in1,$ctx
++	sltu	$ctx,$in1,$ctx
++	addu	$ctx,$tmp1
++
++	addu	$in2,$tmp2
++	sltu	$tmp2,$in2,$tmp2
++	addu	$in2,$ctx
++	sltu	$ctx,$in2,$ctx
++	addu	$ctx,$tmp2
++
++	addu	$in3,$tmp3
++	addu	$in3,$ctx
++
++	srl	$tmp0,$in0,8		# write mac value
++	srl	$tmp1,$in0,16
++	srl	$tmp2,$in0,24
++	sb	$in0, 0($mac)
++	sb	$tmp0,1($mac)
++	srl	$tmp0,$in1,8
++	sb	$tmp1,2($mac)
++	srl	$tmp1,$in1,16
++	sb	$tmp2,3($mac)
++	srl	$tmp2,$in1,24
++	sb	$in1, 4($mac)
++	sb	$tmp0,5($mac)
++	srl	$tmp0,$in2,8
++	sb	$tmp1,6($mac)
++	srl	$tmp1,$in2,16
++	sb	$tmp2,7($mac)
++	srl	$tmp2,$in2,24
++	sb	$in2, 8($mac)
++	sb	$tmp0,9($mac)
++	srl	$tmp0,$in3,8
++	sb	$tmp1,10($mac)
++	srl	$tmp1,$in3,16
++	sb	$tmp2,11($mac)
++	srl	$tmp2,$in3,24
++	sb	$in3, 12($mac)
++	sb	$tmp0,13($mac)
++	sb	$tmp1,14($mac)
++	sb	$tmp2,15($mac)
++
++	jr	$ra
++.end	poly1305_emit
++.rdata
++.asciiz	"Poly1305 for MIPS32, CRYPTOGAMS by \@dot-asm"
++.align	2
++___
++}
++}}}
++
++$output=pop and open STDOUT,">$output";
++print $code;
++close STDOUT;
+--- a/crypto/Kconfig
++++ b/crypto/Kconfig
+@@ -707,6 +707,11 @@ config CRYPTO_POLY1305_X86_64
+ 	  in IETF protocols. This is the x86_64 assembler implementation using SIMD
+ 	  instructions.
+ 
++config CRYPTO_POLY1305_MIPS
++	tristate "Poly1305 authenticator algorithm (MIPS optimized)"
++	depends on CPU_MIPS32 || (CPU_MIPS64 && 64BIT)
++	select CRYPTO_ARCH_HAVE_LIB_POLY1305
++
+ config CRYPTO_MD4
+ 	tristate "MD4 digest algorithm"
+ 	select CRYPTO_HASH
+--- a/lib/crypto/Kconfig
++++ b/lib/crypto/Kconfig
+@@ -39,6 +39,7 @@ config CRYPTO_LIB_DES
+ 
+ config CRYPTO_LIB_POLY1305_RSIZE
+ 	int
++	default 2 if MIPS
+ 	default 4 if X86_64
+ 	default 9 if ARM || ARM64
+ 	default 1
author	Jason A. Donenfeld <Jason@zx2c4.com>	2021-02-19 14:29:04 +0100
committer	David Bauer <mail@david-bauer.net>	2021-02-26 20:41:01 +0100
commit	3888fa78802354ab7bbd19b7d061fd80a16ce06b (patch)
tree	2225a6313cb6482f0cb9c09df662a0d44197350e /target/linux/generic/backport-5.4/080-wireguard-0020-crypto-mips-poly1305-incorporate-OpenSSL-CRYPTOGAMS-.patch
parent	7d4143234c4dfdd050ebc64ec8231f9d81ea65af (diff)
download	upstream-3888fa78802354ab7bbd19b7d061fd80a16ce06b.tar.gz upstream-3888fa78802354ab7bbd19b7d061fd80a16ce06b.tar.bz2 upstream-3888fa78802354ab7bbd19b7d061fd80a16ce06b.zip