path: root/target/linux/generic/backport-5.4/041-v5.5-arm64-Implement-optimised-checksum-routine.patch
author    Daniel Golle <daniel@makrotopia.org>    2022-03-21 01:16:48 +0000
committer Daniel Golle <daniel@makrotopia.org>    2022-03-21 13:11:56 +0000
commit    786bf7fdaca4c75e7eba6e9aa3a8b5775fd21186 (patch)
tree      926fecb2b1f6ce1e42ba7ef4c7aab8e68dfd214c /target/linux/generic/backport-5.4/041-v5.5-arm64-Implement-optimised-checksum-routine.patch
parent    9470160c350d15f765c33d6c1db15d6c4709a64c (diff)
kernel: delete Linux 5.4 config and patches
As the upcoming release will be based on Linux 5.10 only, remove all kernel configuration as well as patches for Linux 5.4. There were no targets still actively using Linux 5.4.

Signed-off-by: Daniel Golle <daniel@makrotopia.org>
(cherry picked from commit 3a14580411adfb75f9a44eded9f41245b9e44606)
Diffstat (limited to 'target/linux/generic/backport-5.4/041-v5.5-arm64-Implement-optimised-checksum-routine.patch')
-rw-r--r--  target/linux/generic/backport-5.4/041-v5.5-arm64-Implement-optimised-checksum-routine.patch | 176
1 file changed, 0 insertions, 176 deletions
diff --git a/target/linux/generic/backport-5.4/041-v5.5-arm64-Implement-optimised-checksum-routine.patch b/target/linux/generic/backport-5.4/041-v5.5-arm64-Implement-optimised-checksum-routine.patch
deleted file mode 100644
index 00ec7d0207..0000000000
--- a/target/linux/generic/backport-5.4/041-v5.5-arm64-Implement-optimised-checksum-routine.patch
+++ /dev/null
@@ -1,176 +0,0 @@
-From: Robin Murphy <robin.murphy@arm.com>
-Date: Wed, 15 Jan 2020 16:42:39 +0000
-Subject: [PATCH] arm64: Implement optimised checksum routine
-
-Apparently there exist certain workloads which rely heavily on software
-checksumming, for which the generic do_csum() implementation becomes a
-significant bottleneck. Therefore let's give arm64 its own optimised
-version - for ease of maintenance this foregoes assembly or intrinsics,
-and is thus not actually arm64-specific, but does rely heavily on C
-idioms that translate well to the A64 ISA and the typical load/store
-capabilities of most ARMv8 CPU cores.
-
-The resulting increase in checksum throughput scales nicely with buffer
-size, tending towards 4x for a small in-order core (Cortex-A53), and up
-to 6x or more for an aggressive big core (Ampere eMAG).
-
-Reported-by: Lingyan Huang <huanglingyan2@huawei.com>
-Tested-by: Lingyan Huang <huanglingyan2@huawei.com>
-Signed-off-by: Robin Murphy <robin.murphy@arm.com>
-Signed-off-by: Will Deacon <will@kernel.org>
----
- create mode 100644 arch/arm64/lib/csum.c
-
---- a/arch/arm64/include/asm/checksum.h
-+++ b/arch/arm64/include/asm/checksum.h
-@@ -36,6 +36,9 @@ static inline __sum16 ip_fast_csum(const
- }
- #define ip_fast_csum ip_fast_csum
-
-+extern unsigned int do_csum(const unsigned char *buff, int len);
-+#define do_csum do_csum
-+
- #include <asm-generic/checksum.h>
-
- #endif /* __ASM_CHECKSUM_H */
---- a/arch/arm64/lib/Makefile
-+++ b/arch/arm64/lib/Makefile
-@@ -1,9 +1,9 @@
- # SPDX-License-Identifier: GPL-2.0
- lib-y := clear_user.o delay.o copy_from_user.o \
- copy_to_user.o copy_in_user.o copy_page.o \
-- clear_page.o memchr.o memcpy.o memmove.o memset.o \
-- memcmp.o strcmp.o strncmp.o strlen.o strnlen.o \
-- strchr.o strrchr.o tishift.o
-+ clear_page.o csum.o memchr.o memcpy.o memmove.o \
-+ memset.o memcmp.o strcmp.o strncmp.o strlen.o \
-+ strnlen.o strchr.o strrchr.o tishift.o
-
- ifeq ($(CONFIG_KERNEL_MODE_NEON), y)
- obj-$(CONFIG_XOR_BLOCKS) += xor-neon.o
---- /dev/null
-+++ b/arch/arm64/lib/csum.c
-@@ -0,0 +1,123 @@
-+// SPDX-License-Identifier: GPL-2.0-only
-+// Copyright (C) 2019-2020 Arm Ltd.
-+
-+#include <linux/compiler.h>
-+#include <linux/kasan-checks.h>
-+#include <linux/kernel.h>
-+
-+#include <net/checksum.h>
-+
-+/* Looks dumb, but generates nice-ish code */
-+static u64 accumulate(u64 sum, u64 data)
-+{
-+ __uint128_t tmp = (__uint128_t)sum + data;
-+ return tmp + (tmp >> 64);
-+}
-+
-+unsigned int do_csum(const unsigned char *buff, int len)
-+{
-+ unsigned int offset, shift, sum;
-+ const u64 *ptr;
-+ u64 data, sum64 = 0;
-+
-+ offset = (unsigned long)buff & 7;
-+ /*
-+ * This is to all intents and purposes safe, since rounding down cannot
-+ * result in a different page or cache line being accessed, and @buff
-+ * should absolutely not be pointing to anything read-sensitive. We do,
-+ * however, have to be careful not to piss off KASAN, which means using
-+ * unchecked reads to accommodate the head and tail, for which we'll
-+ * compensate with an explicit check up-front.
-+ */
-+ kasan_check_read(buff, len);
-+ ptr = (u64 *)(buff - offset);
-+ len = len + offset - 8;
-+
-+ /*
-+ * Head: zero out any excess leading bytes. Shifting back by the same
-+ * amount should be at least as fast as any other way of handling the
-+ * odd/even alignment, and means we can ignore it until the very end.
-+ */
-+ shift = offset * 8;
-+ data = READ_ONCE_NOCHECK(*ptr++);
-+#ifdef __LITTLE_ENDIAN
-+ data = (data >> shift) << shift;
-+#else
-+ data = (data << shift) >> shift;
-+#endif
-+
-+ /*
-+ * Body: straightforward aligned loads from here on (the paired loads
-+ * underlying the quadword type still only need dword alignment). The
-+ * main loop strictly excludes the tail, so the second loop will always
-+ * run at least once.
-+ */
-+ while (unlikely(len > 64)) {
-+ __uint128_t tmp1, tmp2, tmp3, tmp4;
-+
-+ tmp1 = READ_ONCE_NOCHECK(*(__uint128_t *)ptr);
-+ tmp2 = READ_ONCE_NOCHECK(*(__uint128_t *)(ptr + 2));
-+ tmp3 = READ_ONCE_NOCHECK(*(__uint128_t *)(ptr + 4));
-+ tmp4 = READ_ONCE_NOCHECK(*(__uint128_t *)(ptr + 6));
-+
-+ len -= 64;
-+ ptr += 8;
-+
-+ /* This is the "don't dump the carry flag into a GPR" idiom */
-+ tmp1 += (tmp1 >> 64) | (tmp1 << 64);
-+ tmp2 += (tmp2 >> 64) | (tmp2 << 64);
-+ tmp3 += (tmp3 >> 64) | (tmp3 << 64);
-+ tmp4 += (tmp4 >> 64) | (tmp4 << 64);
-+ tmp1 = ((tmp1 >> 64) << 64) | (tmp2 >> 64);
-+ tmp1 += (tmp1 >> 64) | (tmp1 << 64);
-+ tmp3 = ((tmp3 >> 64) << 64) | (tmp4 >> 64);
-+ tmp3 += (tmp3 >> 64) | (tmp3 << 64);
-+ tmp1 = ((tmp1 >> 64) << 64) | (tmp3 >> 64);
-+ tmp1 += (tmp1 >> 64) | (tmp1 << 64);
-+ tmp1 = ((tmp1 >> 64) << 64) | sum64;
-+ tmp1 += (tmp1 >> 64) | (tmp1 << 64);
-+ sum64 = tmp1 >> 64;
-+ }
-+ while (len > 8) {
-+ __uint128_t tmp;
-+
-+ sum64 = accumulate(sum64, data);
-+ tmp = READ_ONCE_NOCHECK(*(__uint128_t *)ptr);
-+
-+ len -= 16;
-+ ptr += 2;
-+
-+#ifdef __LITTLE_ENDIAN
-+ data = tmp >> 64;
-+ sum64 = accumulate(sum64, tmp);
-+#else
-+ data = tmp;
-+ sum64 = accumulate(sum64, tmp >> 64);
-+#endif
-+ }
-+ if (len > 0) {
-+ sum64 = accumulate(sum64, data);
-+ data = READ_ONCE_NOCHECK(*ptr);
-+ len -= 8;
-+ }
-+ /*
-+ * Tail: zero any over-read bytes similarly to the head, again
-+ * preserving odd/even alignment.
-+ */
-+ shift = len * -8;
-+#ifdef __LITTLE_ENDIAN
-+ data = (data << shift) >> shift;
-+#else
-+ data = (data >> shift) << shift;
-+#endif
-+ sum64 = accumulate(sum64, data);
-+
-+ /* Finally, folding */
-+ sum64 += (sum64 >> 32) | (sum64 << 32);
-+ sum = sum64 >> 32;
-+ sum += (sum >> 16) | (sum << 16);
-+ if (offset & 1)
-+ return (u16)swab32(sum);
-+
-+ return sum >> 16;
-+}
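The accumulate() helper and the final folding sequence are the heart of the deleted routine. As a rough standalone illustration (not part of the patch or of any kernel API), the user-space sketch below reproduces the end-around-carry add through a 128-bit temporary and the 64-to-32-to-16-bit fold, then checks the result against a naive RFC 1071 style word sum. It assumes a 64-bit GCC/Clang target with __uint128_t support; names such as fold() and csum_ref() are made up for the example.

/*
 * Standalone sketch of the two folding tricks the deleted csum.c relies on:
 * accumulate() performs an end-around-carry add through a 128-bit temporary,
 * and fold() reduces the 64-bit running sum to the 16-bit ones' complement
 * checksum the same way the routine's epilogue does.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* End-around-carry add: any carry out of the low 64 bits is added back in */
static uint64_t accumulate(uint64_t sum, uint64_t data)
{
	__uint128_t tmp = (__uint128_t)sum + data;

	return (uint64_t)(tmp + (tmp >> 64));
}

/* Fold 64 -> 32 -> 16 bits with wrapped carries, as in the patch's epilogue */
static uint16_t fold(uint64_t sum64)
{
	uint32_t sum;

	sum64 += (sum64 >> 32) | (sum64 << 32);	/* top 32 bits = folded sum */
	sum = (uint32_t)(sum64 >> 32);
	sum += (sum >> 16) | (sum << 16);	/* top 16 bits = folded sum */
	return (uint16_t)(sum >> 16);
}

/* Reference: RFC 1071 style sum of host-order 16-bit words, then fold */
static uint16_t csum_ref(const unsigned char *buf, size_t len)
{
	uint64_t sum = 0;

	for (size_t i = 0; i < len; i += 2) {
		uint16_t word;

		memcpy(&word, buf + i, sizeof(word));
		sum += word;
	}
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)sum;
}

int main(void)
{
	unsigned char buf[64];
	uint64_t sum64 = 0;

	for (size_t i = 0; i < sizeof(buf); i++)
		buf[i] = (unsigned char)(i * 37 + 1);

	/* 8 bytes per step through the end-around-carry accumulator */
	for (size_t i = 0; i < sizeof(buf); i += 8) {
		uint64_t word;

		memcpy(&word, buf + i, sizeof(word));
		sum64 = accumulate(sum64, word);
	}

	/* grouping does not matter for a ones' complement sum */
	assert(fold(sum64) == csum_ref(buf, sizeof(buf)));
	printf("folded checksum: 0x%04x\n", (unsigned int)fold(sum64));
	return 0;
}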
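The head and tail handling in the routine rounds the buffer pointer down to an aligned 64-bit load and then masks off the stray leading (or trailing) bytes by shifting the loaded word out and back by 8 * offset bits. The little-endian user-space sketch below (again not taken from the patch) shows the head case and checks it against a plain zero-padded copy; memcpy stands in for the kernel's READ_ONCE_NOCHECK() aligned load.

/*
 * Little-endian sketch of the head-masking trick: round the pointer down to
 * an aligned 64-bit load, then shift the word right and back left by
 * 8 * offset bits so the bytes before the real start of the buffer
 * contribute zero to the checksum.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	/* an 8-byte-aligned arena; the "buffer" starts 3 bytes into it */
	uint64_t arena[2] = { 0x1122334455667788ULL, 0x99aabbccddeeff00ULL };
	const unsigned char *buff = (const unsigned char *)arena + 3;
	unsigned int offset = (uintptr_t)buff & 7;	/* == 3 here */
	unsigned int shift = offset * 8;
	uint64_t data, ref = 0;

	/* aligned load covering the buffer head plus `offset` stray bytes;
	 * memcpy stands in for the kernel's raw READ_ONCE_NOCHECK() load */
	memcpy(&data, buff - offset, sizeof(data));

	/* little-endian: the stray bytes are the low-order ones, mask them off */
	data = (data >> shift) << shift;

	/* reference: copy only the 8 - offset real bytes into a zeroed word */
	memcpy((unsigned char *)&ref + offset, buff, sizeof(ref) - offset);

	assert(data == ref);
	printf("masked head word: %016llx\n", (unsigned long long)data);
	return 0;
}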