author    Adrian Schmutzler <freifunk@adrianschmutzler.de>  2020-02-08 21:58:55 +0100
committer Adrian Schmutzler <freifunk@adrianschmutzler.de>  2020-02-14 14:10:51 +0100
commit    7d7aa2fd924c27829ec25f825481554dd81bce97 (patch)
tree      658b87b89331670266163e522ea5fb52535633cb /target/linux/bcm27xx/patches-4.19/950-0061-Improve-__copy_to_user-and-__copy_from_user-performa.patch
parent    e7bfda2c243e66a75ff966ba04c28b1590b5d24c (diff)
brcm2708: rename target to bcm27xx
This change makes the names of Broadcom targets consistent by using the common notation based on SoC/CPU ID (which is used internally anyway): bcmXXXX instead of brcmXXXX. This notation is already used for the target TITLE in make menuconfig; only the short target name used brcm so far.

Besides, since the subtargets range from bcm2708 to bcm2711, it seems appropriate to use bcm27xx instead of bcm2708 (again, as already done for BOARDNAME).

This also renames the packages brcm2708-userland and brcm2708-gpu-fw.

Signed-off-by: Adrian Schmutzler <freifunk@adrianschmutzler.de>
Acked-by: Álvaro Fernández Rojas <noltari@gmail.com>
Diffstat (limited to 'target/linux/bcm27xx/patches-4.19/950-0061-Improve-__copy_to_user-and-__copy_from_user-performa.patch')
-rw-r--r--  target/linux/bcm27xx/patches-4.19/950-0061-Improve-__copy_to_user-and-__copy_from_user-performa.patch  |  1549
1 file changed, 1549 insertions(+), 0 deletions(-)
diff --git a/target/linux/bcm27xx/patches-4.19/950-0061-Improve-__copy_to_user-and-__copy_from_user-performa.patch b/target/linux/bcm27xx/patches-4.19/950-0061-Improve-__copy_to_user-and-__copy_from_user-performa.patch
new file mode 100644
index 0000000000..430059cddf
--- /dev/null
+++ b/target/linux/bcm27xx/patches-4.19/950-0061-Improve-__copy_to_user-and-__copy_from_user-performa.patch
@@ -0,0 +1,1549 @@
+From e4d81adf1f2c84b229901cddb403d00010524b28 Mon Sep 17 00:00:00 2001
+From: popcornmix <popcornmix@gmail.com>
+Date: Mon, 28 Nov 2016 16:50:04 +0000
+Subject: [PATCH] Improve __copy_to_user and __copy_from_user
+ performance
+
+Provide a __copy_from_user that uses memcpy. On BCM2708, use
+optimised memcpy/memmove/memcmp/memset implementations.
+
+arch/arm: Add mmiocpy/set aliases for memcpy/set
+
+See: https://github.com/raspberrypi/linux/issues/1082
+
+copy_from_user: CPU_SW_DOMAIN_PAN compatibility
+
+The downstream copy_from_user acceleration must also play nice with
+CONFIG_CPU_SW_DOMAIN_PAN.
+
+See: https://github.com/raspberrypi/linux/issues/1381
+
+Signed-off-by: Phil Elwell <phil@raspberrypi.org>
+---
+ arch/arm/include/asm/string.h | 5 +
+ arch/arm/include/asm/uaccess.h | 3 +
+ arch/arm/lib/Makefile | 14 +-
+ arch/arm/lib/arm-mem.h | 159 +++++++++
+ arch/arm/lib/copy_from_user.S | 4 +-
+ arch/arm/lib/exports_rpi.c | 37 +++
+ arch/arm/lib/memcmp_rpi.S | 285 ++++++++++++++++
+ arch/arm/lib/memcpy_rpi.S | 61 ++++
+ arch/arm/lib/memcpymove.h | 506 +++++++++++++++++++++++++++++
+ arch/arm/lib/memmove_rpi.S | 61 ++++
+ arch/arm/lib/memset_rpi.S | 128 ++++++++
+ arch/arm/lib/uaccess_with_memcpy.c | 120 ++++++-
+ arch/arm/mach-bcm/Kconfig | 7 +
+ 13 files changed, 1385 insertions(+), 5 deletions(-)
+ create mode 100644 arch/arm/lib/arm-mem.h
+ create mode 100644 arch/arm/lib/exports_rpi.c
+ create mode 100644 arch/arm/lib/memcmp_rpi.S
+ create mode 100644 arch/arm/lib/memcpy_rpi.S
+ create mode 100644 arch/arm/lib/memcpymove.h
+ create mode 100644 arch/arm/lib/memmove_rpi.S
+ create mode 100644 arch/arm/lib/memset_rpi.S
+
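The heart of the change is a size-based dispatch added to arch/arm/lib/uaccess_with_memcpy.c. The sketch below condenses the arm_copy_from_user hunk near the end of this patch, shown up front for orientation: small copies take the original assembler routine, now also reachable as __copy_from_user_std, while large copies go through the page-pinning memcpy path; the uaccess_save_and_enable()/uaccess_restore() bracket around the copy is what keeps the fast path compatible with CONFIG_CPU_SW_DOMAIN_PAN.

    unsigned long __must_check
    arm_copy_from_user(void *to, const void __user *from, unsigned long n)
    {
            if (n < COPY_FROM_USER_THRESHOLD) {
                    /* Small copy: the LDM/STM assembler loop wins. */
                    unsigned long ua_flags = uaccess_save_and_enable();
                    n = __copy_from_user_std(to, from, n);
                    uaccess_restore(ua_flags);
            } else {
                    /* Large copy: pin the user pages, then memcpy(). */
                    n = __copy_from_user_memcpy(to, from, n);
            }
            return n;
    }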
+--- a/arch/arm/include/asm/string.h
++++ b/arch/arm/include/asm/string.h
+@@ -39,4 +39,9 @@ static inline void *memset64(uint64_t *p
+ return __memset64(p, v, n * 8, v >> 32);
+ }
+
++#ifdef CONFIG_BCM2835_FAST_MEMCPY
++#define __HAVE_ARCH_MEMCMP
++extern int memcmp(const void *, const void *, size_t);
++#endif
++
+ #endif
+--- a/arch/arm/include/asm/uaccess.h
++++ b/arch/arm/include/asm/uaccess.h
+@@ -514,6 +514,9 @@ do { \
+ extern unsigned long __must_check
+ arm_copy_from_user(void *to, const void __user *from, unsigned long n);
+
++extern unsigned long __must_check
++__copy_from_user_std(void *to, const void __user *from, unsigned long n);
++
+ static inline unsigned long __must_check
+ raw_copy_from_user(void *to, const void __user *from, unsigned long n)
+ {
+--- a/arch/arm/lib/Makefile
++++ b/arch/arm/lib/Makefile
+@@ -7,8 +7,8 @@
+
+ lib-y := backtrace.o changebit.o csumipv6.o csumpartial.o \
+ csumpartialcopy.o csumpartialcopyuser.o clearbit.o \
+- delay.o delay-loop.o findbit.o memchr.o memcpy.o \
+- memmove.o memset.o setbit.o \
++ delay.o delay-loop.o findbit.o memchr.o \
++ setbit.o \
+ strchr.o strrchr.o \
+ testchangebit.o testclearbit.o testsetbit.o \
+ ashldi3.o ashrdi3.o lshrdi3.o muldi3.o \
+@@ -19,6 +19,16 @@ lib-y := backtrace.o changebit.o csumip
+ mmu-y := clear_user.o copy_page.o getuser.o putuser.o \
+ copy_from_user.o copy_to_user.o
+
++# Choose optimised implementations for Raspberry Pi
++ifeq ($(CONFIG_BCM2835_FAST_MEMCPY),y)
++ CFLAGS_uaccess_with_memcpy.o += -DCOPY_FROM_USER_THRESHOLD=1600
++ CFLAGS_uaccess_with_memcpy.o += -DCOPY_TO_USER_THRESHOLD=672
++ obj-$(CONFIG_MODULES) += exports_rpi.o
++ lib-y += memcpy_rpi.o memmove_rpi.o memset_rpi.o memcmp_rpi.o
++else
++ lib-y += memcpy.o memmove.o memset.o
++endif
++
+ # using lib_ here won't override already available weak symbols
+ obj-$(CONFIG_UACCESS_WITH_MEMCPY) += uaccess_with_memcpy.o
+
+--- /dev/null
++++ b/arch/arm/lib/arm-mem.h
+@@ -0,0 +1,159 @@
++/*
++Copyright (c) 2013, Raspberry Pi Foundation
++Copyright (c) 2013, RISC OS Open Ltd
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in the
++ documentation and/or other materials provided with the distribution.
++ * Neither the name of the copyright holder nor the
++ names of its contributors may be used to endorse or promote products
++ derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++*/
++
++.macro myfunc fname
++ .func fname
++ .global fname
++fname:
++.endm
++
++.macro preload_leading_step1 backwards, ptr, base
++/* If the destination is already 16-byte aligned, then we need to preload
++ * between 0 and prefetch_distance (inclusive) cache lines ahead so there
++ * are no gaps when the inner loop starts.
++ */
++ .if backwards
++ sub ptr, base, #1
++ bic ptr, ptr, #31
++ .else
++ bic ptr, base, #31
++ .endif
++ .set OFFSET, 0
++ .rept prefetch_distance+1
++ pld [ptr, #OFFSET]
++ .if backwards
++ .set OFFSET, OFFSET-32
++ .else
++ .set OFFSET, OFFSET+32
++ .endif
++ .endr
++.endm
++
++.macro preload_leading_step2 backwards, ptr, base, leading_bytes, tmp
++/* However, if the destination is not 16-byte aligned, we may need to
++ * preload one more cache line than that. The question we need to ask is:
++ * are the leading bytes more than the amount by which the source
++ * pointer will be rounded down for preloading, and if so, by how many
++ * cache lines?
++ */
++ .if backwards
++/* Here we compare against how many bytes we are into the
++ * cache line, counting down from the highest such address.
++ * Effectively, we want to calculate
++ * leading_bytes = dst&15
++ * cacheline_offset = 31-((src-leading_bytes-1)&31)
++ * extra_needed = leading_bytes - cacheline_offset
++ * and test if extra_needed is <= 0, or rearranging:
++ * leading_bytes + (src-leading_bytes-1)&31 <= 31
++ */
++ mov tmp, base, lsl #32-5
++ sbc tmp, tmp, leading_bytes, lsl #32-5
++ adds tmp, tmp, leading_bytes, lsl #32-5
++ bcc 61f
++ pld [ptr, #-32*(prefetch_distance+1)]
++ .else
++/* Effectively, we want to calculate
++ * leading_bytes = (-dst)&15
++ * cacheline_offset = (src+leading_bytes)&31
++ * extra_needed = leading_bytes - cacheline_offset
++ * and test if extra_needed is <= 0.
++ */
++ mov tmp, base, lsl #32-5
++ add tmp, tmp, leading_bytes, lsl #32-5
++ rsbs tmp, tmp, leading_bytes, lsl #32-5
++ bls 61f
++ pld [ptr, #32*(prefetch_distance+1)]
++ .endif
++61:
++.endm
++
++.macro preload_trailing backwards, base, remain, tmp
++ /* We need either 0, 1 or 2 extra preloads */
++ .if backwards
++ rsb tmp, base, #0
++ mov tmp, tmp, lsl #32-5
++ .else
++ mov tmp, base, lsl #32-5
++ .endif
++ adds tmp, tmp, remain, lsl #32-5
++ adceqs tmp, tmp, #0
++ /* The instruction above has two effects: ensures Z is only
++ * set if C was clear (so Z indicates that both shifted quantities
++ * were 0), and clears C if Z was set (so C indicates that the sum
++ * of the shifted quantities was strictly greater than 32) */
++ beq 82f
++ .if backwards
++ sub tmp, base, #1
++ bic tmp, tmp, #31
++ .else
++ bic tmp, base, #31
++ .endif
++ bcc 81f
++ .if backwards
++ pld [tmp, #-32*(prefetch_distance+1)]
++81:
++ pld [tmp, #-32*prefetch_distance]
++ .else
++ pld [tmp, #32*(prefetch_distance+2)]
++81:
++ pld [tmp, #32*(prefetch_distance+1)]
++ .endif
++82:
++.endm
++
++.macro preload_all backwards, narrow_case, shift, base, remain, tmp0, tmp1
++ .if backwards
++ sub tmp0, base, #1
++ bic tmp0, tmp0, #31
++ pld [tmp0]
++ sub tmp1, base, remain, lsl #shift
++ .else
++ bic tmp0, base, #31
++ pld [tmp0]
++ add tmp1, base, remain, lsl #shift
++ sub tmp1, tmp1, #1
++ .endif
++ bic tmp1, tmp1, #31
++ cmp tmp1, tmp0
++ beq 92f
++ .if narrow_case
++ /* In this case, all the data fits in either 1 or 2 cache lines */
++ pld [tmp1]
++ .else
++91:
++ .if backwards
++ sub tmp0, tmp0, #32
++ .else
++ add tmp0, tmp0, #32
++ .endif
++ cmp tmp0, tmp1
++ pld [tmp0]
++ bne 91b
++ .endif
++92:
++.endm
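As a concrete check of the arithmetic in the preload_leading_step2 comment (forward case), here is the same computation in standalone C; the pointer values are made-up examples, not anything the kernel computes:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            /* Hypothetical example pointers, chosen so dst is misaligned. */
            uintptr_t dst = 0x1001, src = 0x203c;

            /* leading_bytes = (-dst)&15: bytes written before dst becomes
             * 16-byte aligned (here 15). */
            unsigned leading_bytes = (unsigned)(-dst) & 15;

            /* cacheline_offset = (src+leading_bytes)&31: how far into its
             * 32-byte cache line the source sits once the leading bytes
             * are consumed (here (0x204b & 31) = 11). */
            unsigned cacheline_offset = (unsigned)(src + leading_bytes) & 31;

            /* extra_needed = leading_bytes - cacheline_offset: the macro's
             * `bls 61f` skips the extra pld when this is <= 0. */
            if (leading_bytes > cacheline_offset)
                    printf("one extra cache line must be preloaded\n");
            return 0;
    }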
+--- a/arch/arm/lib/copy_from_user.S
++++ b/arch/arm/lib/copy_from_user.S
+@@ -89,7 +89,8 @@
+
+ .text
+
+-ENTRY(arm_copy_from_user)
++ENTRY(__copy_from_user_std)
++WEAK(arm_copy_from_user)
+ #ifdef CONFIG_CPU_SPECTRE
+ get_thread_info r3
+ ldr r3, [r3, #TI_ADDR_LIMIT]
+@@ -99,6 +100,7 @@ ENTRY(arm_copy_from_user)
+ #include "copy_template.S"
+
+ ENDPROC(arm_copy_from_user)
++ENDPROC(__copy_from_user_std)
+
+ .pushsection .fixup,"ax"
+ .align 0
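The ENTRY/WEAK pairing above mirrors what copy_to_user.S already does: the assembler body gains a second name (__copy_from_user_std) and its original name becomes a weak symbol, so the strong arm_copy_from_user defined in uaccess_with_memcpy.c wins at link time when CONFIG_UACCESS_WITH_MEMCPY is enabled. A minimal userspace illustration of the same linker rule, with hypothetical names (two files, built together with e.g. `gcc main.c weak_default.c`):

    /* weak_default.c -- stands in for the WEAK(...) assembler symbol. */
    #include <stdio.h>

    __attribute__((weak)) void do_copy(void)
    {
            puts("standard (assembler) path");
    }

    /* main.c -- a strong definition elsewhere overrides the weak one,
     * just as uaccess_with_memcpy.c overrides arm_copy_from_user. */
    #include <stdio.h>

    void do_copy(void)
    {
            puts("memcpy-accelerated path");
    }

    int main(void)
    {
            do_copy();      /* prints "memcpy-accelerated path" */
            return 0;
    }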
+--- /dev/null
++++ b/arch/arm/lib/exports_rpi.c
+@@ -0,0 +1,37 @@
++/**
++ * Copyright (c) 2014, Raspberry Pi (Trading) Ltd.
++ *
++ * Redistribution and use in source and binary forms, with or without
++ * modification, are permitted provided that the following conditions
++ * are met:
++ * 1. Redistributions of source code must retain the above copyright
++ * notice, this list of conditions, and the following disclaimer,
++ * without modification.
++ * 2. Redistributions in binary form must reproduce the above copyright
++ * notice, this list of conditions and the following disclaimer in the
++ * documentation and/or other materials provided with the distribution.
++ * 3. The names of the above-listed copyright holders may not be used
++ * to endorse or promote products derived from this software without
++ * specific prior written permission.
++ *
++ * ALTERNATIVELY, this software may be distributed under the terms of the
++ * GNU General Public License ("GPL") version 2, as published by the Free
++ * Software Foundation.
++ *
++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
++ * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
++ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
++ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
++ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
++ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
++ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
++ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++ */
++
++#include <linux/kernel.h>
++#include <linux/module.h>
++
++EXPORT_SYMBOL(memcmp);
+--- /dev/null
++++ b/arch/arm/lib/memcmp_rpi.S
+@@ -0,0 +1,285 @@
++/*
++Copyright (c) 2013, Raspberry Pi Foundation
++Copyright (c) 2013, RISC OS Open Ltd
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in the
++ documentation and/or other materials provided with the distribution.
++ * Neither the name of the copyright holder nor the
++ names of its contributors may be used to endorse or promote products
++ derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++*/
++
++#include <linux/linkage.h>
++#include "arm-mem.h"
++
++/* Prevent the stack from becoming executable */
++#if defined(__linux__) && defined(__ELF__)
++.section .note.GNU-stack,"",%progbits
++#endif
++
++ .text
++ .arch armv6
++ .object_arch armv4
++ .arm
++ .altmacro
++ .p2align 2
++
++.macro memcmp_process_head unaligned
++ .if unaligned
++ ldr DAT0, [S_1], #4
++ ldr DAT1, [S_1], #4
++ ldr DAT2, [S_1], #4
++ ldr DAT3, [S_1], #4
++ .else
++ ldmia S_1!, {DAT0, DAT1, DAT2, DAT3}
++ .endif
++ ldmia S_2!, {DAT4, DAT5, DAT6, DAT7}
++.endm
++
++.macro memcmp_process_tail
++ cmp DAT0, DAT4
++ cmpeq DAT1, DAT5
++ cmpeq DAT2, DAT6
++ cmpeq DAT3, DAT7
++ bne 200f
++.endm
++
++.macro memcmp_leading_31bytes
++ movs DAT0, OFF, lsl #31
++ ldrmib DAT0, [S_1], #1
++ ldrcsh DAT1, [S_1], #2
++ ldrmib DAT4, [S_2], #1
++ ldrcsh DAT5, [S_2], #2
++ movpl DAT0, #0
++ movcc DAT1, #0
++ movpl DAT4, #0
++ movcc DAT5, #0
++ submi N, N, #1
++ subcs N, N, #2
++ cmp DAT0, DAT4
++ cmpeq DAT1, DAT5
++ bne 200f
++ movs DAT0, OFF, lsl #29
++ ldrmi DAT0, [S_1], #4
++ ldrcs DAT1, [S_1], #4
++ ldrcs DAT2, [S_1], #4
++ ldrmi DAT4, [S_2], #4
++ ldmcsia S_2!, {DAT5, DAT6}
++ movpl DAT0, #0
++ movcc DAT1, #0
++ movcc DAT2, #0
++ movpl DAT4, #0
++ movcc DAT5, #0
++ movcc DAT6, #0
++ submi N, N, #4
++ subcs N, N, #8
++ cmp DAT0, DAT4
++ cmpeq DAT1, DAT5
++ cmpeq DAT2, DAT6
++ bne 200f
++ tst OFF, #16
++ beq 105f
++ memcmp_process_head 1
++ sub N, N, #16
++ memcmp_process_tail
++105:
++.endm
++
++.macro memcmp_trailing_15bytes unaligned
++ movs N, N, lsl #29
++ .if unaligned
++ ldrcs DAT0, [S_1], #4
++ ldrcs DAT1, [S_1], #4
++ .else
++ ldmcsia S_1!, {DAT0, DAT1}
++ .endif
++ ldrmi DAT2, [S_1], #4
++ ldmcsia S_2!, {DAT4, DAT5}
++ ldrmi DAT6, [S_2], #4
++ movcc DAT0, #0
++ movcc DAT1, #0
++ movpl DAT2, #0
++ movcc DAT4, #0
++ movcc DAT5, #0
++ movpl DAT6, #0
++ cmp DAT0, DAT4
++ cmpeq DAT1, DAT5
++ cmpeq DAT2, DAT6
++ bne 200f
++ movs N, N, lsl #2
++ ldrcsh DAT0, [S_1], #2
++ ldrmib DAT1, [S_1]
++ ldrcsh DAT4, [S_2], #2
++ ldrmib DAT5, [S_2]
++ movcc DAT0, #0
++ movpl DAT1, #0
++ movcc DAT4, #0
++ movpl DAT5, #0
++ cmp DAT0, DAT4
++ cmpeq DAT1, DAT5
++ bne 200f
++.endm
++
++.macro memcmp_long_inner_loop unaligned
++110:
++ memcmp_process_head unaligned
++ pld [S_2, #prefetch_distance*32 + 16]
++ memcmp_process_tail
++ memcmp_process_head unaligned
++ pld [S_1, OFF]
++ memcmp_process_tail
++ subs N, N, #32
++ bhs 110b
++ /* Just before the final (prefetch_distance+1) 32-byte blocks,
++ * deal with final preloads */
++ preload_trailing 0, S_1, N, DAT0
++ preload_trailing 0, S_2, N, DAT0
++ add N, N, #(prefetch_distance+2)*32 - 16
++120:
++ memcmp_process_head unaligned
++ memcmp_process_tail
++ subs N, N, #16
++ bhs 120b
++ /* Trailing words and bytes */
++ tst N, #15
++ beq 199f
++ memcmp_trailing_15bytes unaligned
++199: /* Reached end without detecting a difference */
++ mov a1, #0
++ setend le
++ pop {DAT1-DAT6, pc}
++.endm
++
++.macro memcmp_short_inner_loop unaligned
++ subs N, N, #16 /* simplifies inner loop termination */
++ blo 122f
++120:
++ memcmp_process_head unaligned
++ memcmp_process_tail
++ subs N, N, #16
++ bhs 120b
++122: /* Trailing words and bytes */
++ tst N, #15
++ beq 199f
++ memcmp_trailing_15bytes unaligned
++199: /* Reached end without detecting a difference */
++ mov a1, #0
++ setend le
++ pop {DAT1-DAT6, pc}
++.endm
++
++/*
++ * int memcmp(const void *s1, const void *s2, size_t n);
++ * On entry:
++ * a1 = pointer to buffer 1
++ * a2 = pointer to buffer 2
++ * a3 = number of bytes to compare (as unsigned chars)
++ * On exit:
++ * a1 = >0/=0/<0 if s1 >/=/< s2
++ */
++
++.set prefetch_distance, 2
++
++ENTRY(memcmp)
++ S_1 .req a1
++ S_2 .req a2
++ N .req a3
++ DAT0 .req a4
++ DAT1 .req v1
++ DAT2 .req v2
++ DAT3 .req v3
++ DAT4 .req v4
++ DAT5 .req v5
++ DAT6 .req v6
++ DAT7 .req ip
++ OFF .req lr
++
++ push {DAT1-DAT6, lr}
++ setend be /* lowest-addressed bytes are most significant */
++
++ /* To preload ahead as we go, we need at least (prefetch_distance+2) 32-byte blocks */
++ cmp N, #(prefetch_distance+3)*32 - 1
++ blo 170f
++
++ /* Long case */
++ /* Adjust N so that the decrement instruction can also test for
++ * inner loop termination. We want it to stop when there are
++ * (prefetch_distance+1) complete blocks to go. */
++ sub N, N, #(prefetch_distance+2)*32
++ preload_leading_step1 0, DAT0, S_1
++ preload_leading_step1 0, DAT1, S_2
++ tst S_2, #31
++ beq 154f
++ rsb OFF, S_2, #0 /* no need to AND with 15 here */
++ preload_leading_step2 0, DAT0, S_1, OFF, DAT2
++ preload_leading_step2 0, DAT1, S_2, OFF, DAT2
++ memcmp_leading_31bytes
++154: /* Second source now cacheline (32-byte) aligned; we have at
++ * least one prefetch to go. */
++ /* Prefetch offset is best selected such that it lies in the
++ * first 8 of each 32 bytes - but it's just as easy to aim for
++ * the first one */
++ and OFF, S_1, #31
++ rsb OFF, OFF, #32*prefetch_distance
++ tst S_1, #3
++ bne 140f
++ memcmp_long_inner_loop 0
++140: memcmp_long_inner_loop 1
++
++170: /* Short case */
++ teq N, #0
++ beq 199f
++ preload_all 0, 0, 0, S_1, N, DAT0, DAT1
++ preload_all 0, 0, 0, S_2, N, DAT0, DAT1
++ tst S_2, #3
++ beq 174f
++172: subs N, N, #1
++ blo 199f
++ ldrb DAT0, [S_1], #1
++ ldrb DAT4, [S_2], #1
++ cmp DAT0, DAT4
++ bne 200f
++ tst S_2, #3
++ bne 172b
++174: /* Second source now 4-byte aligned; we have 0 or more bytes to go */
++ tst S_1, #3
++ bne 140f
++ memcmp_short_inner_loop 0
++140: memcmp_short_inner_loop 1
++
++200: /* Difference found: determine sign. */
++ movhi a1, #1
++ movlo a1, #-1
++ setend le
++ pop {DAT1-DAT6, pc}
++
++ .unreq S_1
++ .unreq S_2
++ .unreq N
++ .unreq DAT0
++ .unreq DAT1
++ .unreq DAT2
++ .unreq DAT3
++ .unreq DAT4
++ .unreq DAT5
++ .unreq DAT6
++ .unreq DAT7
++ .unreq OFF
++ENDPROC(memcmp)
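One non-obvious detail above is `setend be`: the routine compares four words at a time, and switching to big-endian data access makes the lowest-addressed byte the most significant, so an unsigned word comparison yields the correct memcmp ordering. A standalone C sketch of the same trick on a little-endian machine (not patch code; __builtin_bswap32 is the GCC/Clang builtin):

    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>

    /* Word-at-a-time compare. On little-endian ARM, a plain unsigned
     * compare of two loaded words would rank the highest-addressed byte
     * first; byte-swapping both words (the C analogue of `setend be`)
     * restores memcmp's lexicographic order. */
    static int cmp4(const void *a, const void *b)
    {
            uint32_t x, y;
            memcpy(&x, a, 4);
            memcpy(&y, b, 4);
            x = __builtin_bswap32(x);
            y = __builtin_bswap32(y);
            return (x > y) - (x < y);
    }

    int main(void)
    {
            printf("%d %d\n", cmp4("abce", "abcd"), cmp4("abcd", "abcd"));
            /* prints "1 0", matching the sign of memcmp(..., 4) */
            return 0;
    }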
+--- /dev/null
++++ b/arch/arm/lib/memcpy_rpi.S
+@@ -0,0 +1,61 @@
++/*
++Copyright (c) 2013, Raspberry Pi Foundation
++Copyright (c) 2013, RISC OS Open Ltd
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in the
++ documentation and/or other materials provided with the distribution.
++ * Neither the name of the copyright holder nor the
++ names of its contributors may be used to endorse or promote products
++ derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++*/
++
++#include <linux/linkage.h>
++#include "arm-mem.h"
++#include "memcpymove.h"
++
++/* Prevent the stack from becoming executable */
++#if defined(__linux__) && defined(__ELF__)
++.section .note.GNU-stack,"",%progbits
++#endif
++
++ .text
++ .arch armv6
++ .object_arch armv4
++ .arm
++ .altmacro
++ .p2align 2
++
++/*
++ * void *memcpy(void * restrict s1, const void * restrict s2, size_t n);
++ * On entry:
++ * a1 = pointer to destination
++ * a2 = pointer to source
++ * a3 = number of bytes to copy
++ * On exit:
++ * a1 preserved
++ */
++
++.set prefetch_distance, 3
++
++ENTRY(mmiocpy)
++ENTRY(memcpy)
++ memcpy 0
++ENDPROC(memcpy)
++ENDPROC(mmiocpy)
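Stacking ENTRY(mmiocpy) above ENTRY(memcpy) simply gives the same code two symbol names; mmiocpy is the name arch/arm's MMIO helpers resolve to. The analogous idiom in C is GCC's alias attribute, sketched here with hypothetical names (the attribute requires the target to be defined in the same translation unit, and relies on ELF semantics):

    #include <stdio.h>
    #include <string.h>

    void *my_memcpy(void *d, const void *s, size_t n)
    {
            return memcpy(d, s, n);   /* stand-in implementation */
    }

    /* Second name for the same code, like ENTRY(mmiocpy)/ENTRY(memcpy). */
    void *my_mmiocpy(void *d, const void *s, size_t n)
            __attribute__((alias("my_memcpy")));

    int main(void)
    {
            char buf[6];
            my_mmiocpy(buf, "hello", 6);
            puts(buf);
            return 0;
    }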
+--- /dev/null
++++ b/arch/arm/lib/memcpymove.h
+@@ -0,0 +1,506 @@
++/*
++Copyright (c) 2013, Raspberry Pi Foundation
++Copyright (c) 2013, RISC OS Open Ltd
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in the
++ documentation and/or other materials provided with the distribution.
++ * Neither the name of the copyright holder nor the
++ names of its contributors may be used to endorse or promote products
++ derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++*/
++
++.macro unaligned_words backwards, align, use_pld, words, r0, r1, r2, r3, r4, r5, r6, r7, r8
++ .if words == 1
++ .if backwards
++ mov r1, r0, lsl #32-align*8
++ ldr r0, [S, #-4]!
++ orr r1, r1, r0, lsr #align*8
++ str r1, [D, #-4]!
++ .else
++ mov r0, r1, lsr #align*8
++ ldr r1, [S, #4]!
++ orr r0, r0, r1, lsl #32-align*8
++ str r0, [D], #4
++ .endif
++ .elseif words == 2
++ .if backwards
++ ldr r1, [S, #-4]!
++ mov r2, r0, lsl #32-align*8
++ ldr r0, [S, #-4]!
++ orr r2, r2, r1, lsr #align*8
++ mov r1, r1, lsl #32-align*8
++ orr r1, r1, r0, lsr #align*8
++ stmdb D!, {r1, r2}
++ .else
++ ldr r1, [S, #4]!
++ mov r0, r2, lsr #align*8
++ ldr r2, [S, #4]!
++ orr r0, r0, r1, lsl #32-align*8
++ mov r1, r1, lsr #align*8
++ orr r1, r1, r2, lsl #32-align*8
++ stmia D!, {r0, r1}
++ .endif
++ .elseif words == 4
++ .if backwards
++ ldmdb S!, {r2, r3}
++ mov r4, r0, lsl #32-align*8
++ ldmdb S!, {r0, r1}
++ orr r4, r4, r3, lsr #align*8
++ mov r3, r3, lsl #32-align*8
++ orr r3, r3, r2, lsr #align*8
++ mov r2, r2, lsl #32-align*8
++ orr r2, r2, r1, lsr #align*8
++ mov r1, r1, lsl #32-align*8
++ orr r1, r1, r0, lsr #align*8
++ stmdb D!, {r1, r2, r3, r4}
++ .else
++ ldmib S!, {r1, r2}
++ mov r0, r4, lsr #align*8
++ ldmib S!, {r3, r4}
++ orr r0, r0, r1, lsl #32-align*8
++ mov r1, r1, lsr #align*8
++ orr r1, r1, r2, lsl #32-align*8
++ mov r2, r2, lsr #align*8
++ orr r2, r2, r3, lsl #32-align*8
++ mov r3, r3, lsr #align*8
++ orr r3, r3, r4, lsl #32-align*8
++ stmia D!, {r0, r1, r2, r3}
++ .endif
++ .elseif words == 8
++ .if backwards
++ ldmdb S!, {r4, r5, r6, r7}
++ mov r8, r0, lsl #32-align*8
++ ldmdb S!, {r0, r1, r2, r3}
++ .if use_pld
++ pld [S, OFF]
++ .endif
++ orr r8, r8, r7, lsr #align*8
++ mov r7, r7, lsl #32-align*8
++ orr r7, r7, r6, lsr #align*8
++ mov r6, r6, lsl #32-align*8
++ orr r6, r6, r5, lsr #align*8
++ mov r5, r5, lsl #32-align*8
++ orr r5, r5, r4, lsr #align*8
++ mov r4, r4, lsl #32-align*8
++ orr r4, r4, r3, lsr #align*8
++ mov r3, r3, lsl #32-align*8
++ orr r3, r3, r2, lsr #align*8
++ mov r2, r2, lsl #32-align*8
++ orr r2, r2, r1, lsr #align*8
++ mov r1, r1, lsl #32-align*8
++ orr r1, r1, r0, lsr #align*8
++ stmdb D!, {r5, r6, r7, r8}
++ stmdb D!, {r1, r2, r3, r4}
++ .else
++ ldmib S!, {r1, r2, r3, r4}
++ mov r0, r8, lsr #align*8
++ ldmib S!, {r5, r6, r7, r8}
++ .if use_pld
++ pld [S, OFF]
++ .endif
++ orr r0, r0, r1, lsl #32-align*8
++ mov r1, r1, lsr #align*8
++ orr r1, r1, r2, lsl #32-align*8
++ mov r2, r2, lsr #align*8
++ orr r2, r2, r3, lsl #32-align*8
++ mov r3, r3, lsr #align*8
++ orr r3, r3, r4, lsl #32-align*8
++ mov r4, r4, lsr #align*8
++ orr r4, r4, r5, lsl #32-align*8
++ mov r5, r5, lsr #align*8
++ orr r5, r5, r6, lsl #32-align*8
++ mov r6, r6, lsr #align*8
++ orr r6, r6, r7, lsl #32-align*8
++ mov r7, r7, lsr #align*8
++ orr r7, r7, r8, lsl #32-align*8
++ stmia D!, {r0, r1, r2, r3}
++ stmia D!, {r4, r5, r6, r7}
++ .endif
++ .endif
++.endm
++
++.macro memcpy_leading_15bytes backwards, align
++ movs DAT1, DAT2, lsl #31
++ sub N, N, DAT2
++ .if backwards
++ ldrmib DAT0, [S, #-1]!
++ ldrcsh DAT1, [S, #-2]!
++ strmib DAT0, [D, #-1]!
++ strcsh DAT1, [D, #-2]!
++ .else
++ ldrmib DAT0, [S], #1
++ ldrcsh DAT1, [S], #2
++ strmib DAT0, [D], #1
++ strcsh DAT1, [D], #2
++ .endif
++ movs DAT1, DAT2, lsl #29
++ .if backwards
++ ldrmi DAT0, [S, #-4]!
++ .if align == 0
++ ldmcsdb S!, {DAT1, DAT2}
++ .else
++ ldrcs DAT2, [S, #-4]!
++ ldrcs DAT1, [S, #-4]!
++ .endif
++ strmi DAT0, [D, #-4]!
++ stmcsdb D!, {DAT1, DAT2}
++ .else
++ ldrmi DAT0, [S], #4
++ .if align == 0
++ ldmcsia S!, {DAT1, DAT2}
++ .else
++ ldrcs DAT1, [S], #4
++ ldrcs DAT2, [S], #4
++ .endif
++ strmi DAT0, [D], #4
++ stmcsia D!, {DAT1, DAT2}
++ .endif
++.endm
++
++.macro memcpy_trailing_15bytes backwards, align
++ movs N, N, lsl #29
++ .if backwards
++ .if align == 0
++ ldmcsdb S!, {DAT0, DAT1}
++ .else
++ ldrcs DAT1, [S, #-4]!
++ ldrcs DAT0, [S, #-4]!
++ .endif
++ ldrmi DAT2, [S, #-4]!
++ stmcsdb D!, {DAT0, DAT1}
++ strmi DAT2, [D, #-4]!
++ .else
++ .if align == 0
++ ldmcsia S!, {DAT0, DAT1}
++ .else
++ ldrcs DAT0, [S], #4
++ ldrcs DAT1, [S], #4
++ .endif
++ ldrmi DAT2, [S], #4
++ stmcsia D!, {DAT0, DAT1}
++ strmi DAT2, [D], #4
++ .endif
++ movs N, N, lsl #2
++ .if backwards
++ ldrcsh DAT0, [S, #-2]!
++ ldrmib DAT1, [S, #-1]
++ strcsh DAT0, [D, #-2]!
++ strmib DAT1, [D, #-1]
++ .else
++ ldrcsh DAT0, [S], #2
++ ldrmib DAT1, [S]
++ strcsh DAT0, [D], #2
++ strmib DAT1, [D]
++ .endif
++.endm
++
++.macro memcpy_long_inner_loop backwards, align
++ .if align != 0
++ .if backwards
++ ldr DAT0, [S, #-align]!
++ .else
++ ldr LAST, [S, #-align]!
++ .endif
++ .endif
++110:
++ .if align == 0
++ .if backwards
++ ldmdb S!, {DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, LAST}
++ pld [S, OFF]
++ stmdb D!, {DAT4, DAT5, DAT6, LAST}
++ stmdb D!, {DAT0, DAT1, DAT2, DAT3}
++ .else
++ ldmia S!, {DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, LAST}
++ pld [S, OFF]
++ stmia D!, {DAT0, DAT1, DAT2, DAT3}
++ stmia D!, {DAT4, DAT5, DAT6, LAST}
++ .endif
++ .else
++ unaligned_words backwards, align, 1, 8, DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, DAT7, LAST
++ .endif
++ subs N, N, #32
++ bhs 110b
++ /* Just before the final (prefetch_distance+1) 32-byte blocks, deal with final preloads */
++ preload_trailing backwards, S, N, OFF
++ add N, N, #(prefetch_distance+2)*32 - 32
++120:
++ .if align == 0
++ .if backwards
++ ldmdb S!, {DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, LAST}
++ stmdb D!, {DAT4, DAT5, DAT6, LAST}
++ stmdb D!, {DAT0, DAT1, DAT2, DAT3}
++ .else
++ ldmia S!, {DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, LAST}
++ stmia D!, {DAT0, DAT1, DAT2, DAT3}
++ stmia D!, {DAT4, DAT5, DAT6, LAST}
++ .endif
++ .else
++ unaligned_words backwards, align, 0, 8, DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, DAT7, LAST
++ .endif
++ subs N, N, #32
++ bhs 120b
++ tst N, #16
++ .if align == 0
++ .if backwards
++ ldmnedb S!, {DAT0, DAT1, DAT2, LAST}
++ stmnedb D!, {DAT0, DAT1, DAT2, LAST}
++ .else
++ ldmneia S!, {DAT0, DAT1, DAT2, LAST}
++ stmneia D!, {DAT0, DAT1, DAT2, LAST}
++ .endif
++ .else
++ beq 130f
++ unaligned_words backwards, align, 0, 4, DAT0, DAT1, DAT2, DAT3, LAST
++130:
++ .endif
++ /* Trailing words and bytes */
++ tst N, #15
++ beq 199f
++ .if align != 0
++ add S, S, #align
++ .endif
++ memcpy_trailing_15bytes backwards, align
++199:
++ pop {DAT3, DAT4, DAT5, DAT6, DAT7}
++ pop {D, DAT1, DAT2, pc}
++.endm
++
++.macro memcpy_medium_inner_loop backwards, align
++120:
++ .if backwards
++ .if align == 0
++ ldmdb S!, {DAT0, DAT1, DAT2, LAST}
++ .else
++ ldr LAST, [S, #-4]!
++ ldr DAT2, [S, #-4]!
++ ldr DAT1, [S, #-4]!
++ ldr DAT0, [S, #-4]!
++ .endif
++ stmdb D!, {DAT0, DAT1, DAT2, LAST}
++ .else
++ .if align == 0
++ ldmia S!, {DAT0, DAT1, DAT2, LAST}
++ .else
++ ldr DAT0, [S], #4
++ ldr DAT1, [S], #4
++ ldr DAT2, [S], #4
++ ldr LAST, [S], #4
++ .endif
++ stmia D!, {DAT0, DAT1, DAT2, LAST}
++ .endif
++ subs N, N, #16
++ bhs 120b
++ /* Trailing words and bytes */
++ tst N, #15
++ beq 199f
++ memcpy_trailing_15bytes backwards, align
++199:
++ pop {D, DAT1, DAT2, pc}
++.endm
++
++.macro memcpy_short_inner_loop backwards, align
++ tst N, #16
++ .if backwards
++ .if align == 0
++ ldmnedb S!, {DAT0, DAT1, DAT2, LAST}
++ .else
++ ldrne LAST, [S, #-4]!
++ ldrne DAT2, [S, #-4]!
++ ldrne DAT1, [S, #-4]!
++ ldrne DAT0, [S, #-4]!
++ .endif
++ stmnedb D!, {DAT0, DAT1, DAT2, LAST}
++ .else
++ .if align == 0
++ ldmneia S!, {DAT0, DAT1, DAT2, LAST}
++ .else
++ ldrne DAT0, [S], #4
++ ldrne DAT1, [S], #4
++ ldrne DAT2, [S], #4
++ ldrne LAST, [S], #4
++ .endif
++ stmneia D!, {DAT0, DAT1, DAT2, LAST}
++ .endif
++ memcpy_trailing_15bytes backwards, align
++199:
++ pop {D, DAT1, DAT2, pc}
++.endm
++
++.macro memcpy backwards
++ D .req a1
++ S .req a2
++ N .req a3
++ DAT0 .req a4
++ DAT1 .req v1
++ DAT2 .req v2
++ DAT3 .req v3
++ DAT4 .req v4
++ DAT5 .req v5
++ DAT6 .req v6
++ DAT7 .req sl
++ LAST .req ip
++ OFF .req lr
++
++ .cfi_startproc
++
++ push {D, DAT1, DAT2, lr}
++
++ .cfi_def_cfa_offset 16
++ .cfi_rel_offset D, 0
++ .cfi_undefined S
++ .cfi_undefined N
++ .cfi_undefined DAT0
++ .cfi_rel_offset DAT1, 4
++ .cfi_rel_offset DAT2, 8
++ .cfi_undefined LAST
++ .cfi_rel_offset lr, 12
++
++ .if backwards
++ add D, D, N
++ add S, S, N
++ .endif
++
++ /* See if we're guaranteed to have at least one 16-byte aligned 16-byte write */
++ cmp N, #31
++ blo 170f
++ /* To preload ahead as we go, we need at least (prefetch_distance+2) 32-byte blocks */
++ cmp N, #(prefetch_distance+3)*32 - 1
++ blo 160f
++
++ /* Long case */
++ push {DAT3, DAT4, DAT5, DAT6, DAT7}
++
++ .cfi_def_cfa_offset 36
++ .cfi_rel_offset D, 20
++ .cfi_rel_offset DAT1, 24
++ .cfi_rel_offset DAT2, 28
++ .cfi_rel_offset DAT3, 0
++ .cfi_rel_offset DAT4, 4
++ .cfi_rel_offset DAT5, 8
++ .cfi_rel_offset DAT6, 12
++ .cfi_rel_offset DAT7, 16
++ .cfi_rel_offset lr, 32
++
++ /* Adjust N so that the decrement instruction can also test for
++ * inner loop termination. We want it to stop when there are
++ * (prefetch_distance+1) complete blocks to go. */
++ sub N, N, #(prefetch_distance+2)*32
++ preload_leading_step1 backwards, DAT0, S
++ .if backwards
++ /* Bug in GAS: it accepts, but mis-assembles the instruction
++ * ands DAT2, D, #60, 2
++ * which sets DAT2 to the number of leading bytes until destination is aligned and also clears C (sets borrow)
++ */
++ .word 0xE210513C
++ beq 154f
++ .else
++ ands DAT2, D, #15
++ beq 154f
++ rsb DAT2, DAT2, #16 /* number of leading bytes until destination aligned */
++ .endif
++ preload_leading_step2 backwards, DAT0, S, DAT2, OFF
++ memcpy_leading_15bytes backwards, 1
++154: /* Destination now 16-byte aligned; we have at least one prefetch as well as at least one 16-byte output block */
++ /* Prefetch offset is best selected such that it lies in the first 8 of each 32 bytes - but it's just as easy to aim for the first one */
++ .if backwards
++ rsb OFF, S, #3
++ and OFF, OFF, #28
++ sub OFF, OFF, #32*(prefetch_distance+1)
++ .else
++ and OFF, S, #28
++ rsb OFF, OFF, #32*prefetch_distance
++ .endif
++ movs DAT0, S, lsl #31
++ bhi 157f
++ bcs 156f
++ bmi 155f
++ memcpy_long_inner_loop backwards, 0
++155: memcpy_long_inner_loop backwards, 1
++156: memcpy_long_inner_loop backwards, 2
++157: memcpy_long_inner_loop backwards, 3
++
++ .cfi_def_cfa_offset 16
++ .cfi_rel_offset D, 0
++ .cfi_rel_offset DAT1, 4
++ .cfi_rel_offset DAT2, 8
++ .cfi_same_value DAT3
++ .cfi_same_value DAT4
++ .cfi_same_value DAT5
++ .cfi_same_value DAT6
++ .cfi_same_value DAT7
++ .cfi_rel_offset lr, 12
++
++160: /* Medium case */
++ preload_all backwards, 0, 0, S, N, DAT2, OFF
++ sub N, N, #16 /* simplifies inner loop termination */
++ .if backwards
++ ands DAT2, D, #15
++ beq 164f
++ .else
++ ands DAT2, D, #15
++ beq 164f
++ rsb DAT2, DAT2, #16
++ .endif
++ memcpy_leading_15bytes backwards, align
++164: /* Destination now 16-byte aligned; we have at least one 16-byte output block */
++ tst S, #3
++ bne 140f
++ memcpy_medium_inner_loop backwards, 0
++140: memcpy_medium_inner_loop backwards, 1
++
++170: /* Short case, less than 31 bytes, so no guarantee of at least one 16-byte block */
++ teq N, #0
++ beq 199f
++ preload_all backwards, 1, 0, S, N, DAT2, LAST
++ tst D, #3
++ beq 174f
++172: subs N, N, #1
++ blo 199f
++ .if backwards
++ ldrb DAT0, [S, #-1]!
++ strb DAT0, [D, #-1]!
++ .else
++ ldrb DAT0, [S], #1
++ strb DAT0, [D], #1
++ .endif
++ tst D, #3
++ bne 172b
++174: /* Destination now 4-byte aligned; we have 0 or more output bytes to go */
++ tst S, #3
++ bne 140f
++ memcpy_short_inner_loop backwards, 0
++140: memcpy_short_inner_loop backwards, 1
++
++ .cfi_endproc
++
++ .unreq D
++ .unreq S
++ .unreq N
++ .unreq DAT0
++ .unreq DAT1
++ .unreq DAT2
++ .unreq DAT3
++ .unreq DAT4
++ .unreq DAT5
++ .unreq DAT6
++ .unreq DAT7
++ .unreq LAST
++ .unreq OFF
++.endm
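The unaligned_words macro above is the heart of the misaligned case: the source is read with aligned word loads only, and every output word is stitched together from two neighbouring source words with an LSR/LSL pair and an ORR. The same scheme in standalone C (a sketch: little-endian assumed, as on the BCM2708, and align restricted to 1..3 as in the macro; the memcpy() calls compile to single word loads):

    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>

    /* Forward copy of nwords 32-bit words from a source lying `align`
     * bytes past a word boundary: each output word is assembled from
     * two adjacent source words, mirroring the mov-lsr/orr-lsl pairs
     * of unaligned_words. */
    static void copy_unaligned(uint32_t *dst, const uint8_t *src,
                               unsigned nwords, unsigned align /* 1..3 */)
    {
            const uint8_t *s = src - align;  /* word-aligned by contract */
            uint32_t cur, next;

            memcpy(&cur, s, 4);
            s += 4;
            while (nwords--) {
                    memcpy(&next, s, 4);
                    s += 4;
                    *dst++ = (cur >> (align * 8)) | (next << (32 - align * 8));
                    cur = next;
            }
    }

    int main(void)
    {
            uint8_t buf[12] = { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55,
                                0x66, 0x77, 0x88, 0x99, 0x00, 0x00 };
            uint32_t out[2];

            copy_unaligned(out, buf + 1, 2, 1);
            printf("%08x %08x\n", out[0], out[1]); /* 44332211 88776655 */
            return 0;
    }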
+--- /dev/null
++++ b/arch/arm/lib/memmove_rpi.S
+@@ -0,0 +1,61 @@
++/*
++Copyright (c) 2013, Raspberry Pi Foundation
++Copyright (c) 2013, RISC OS Open Ltd
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in the
++ documentation and/or other materials provided with the distribution.
++ * Neither the name of the copyright holder nor the
++ names of its contributors may be used to endorse or promote products
++ derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++*/
++
++#include <linux/linkage.h>
++#include "arm-mem.h"
++#include "memcpymove.h"
++
++/* Prevent the stack from becoming executable */
++#if defined(__linux__) && defined(__ELF__)
++.section .note.GNU-stack,"",%progbits
++#endif
++
++ .text
++ .arch armv6
++ .object_arch armv4
++ .arm
++ .altmacro
++ .p2align 2
++
++/*
++ * void *memmove(void *s1, const void *s2, size_t n);
++ * On entry:
++ * a1 = pointer to destination
++ * a2 = pointer to source
++ * a3 = number of bytes to copy
++ * On exit:
++ * a1 preserved
++ */
++
++.set prefetch_distance, 3
++
++ENTRY(memmove)
++ cmp a2, a1
++ bpl memcpy /* pl works even over -1 - 0 and 0x7fffffff - 0x80000000 boundaries */
++ memcpy 1
++ENDPROC(memmove)
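memmove therefore costs a single compare on the non-overlapping path: only when the destination starts above the source does it fall through into the backwards copier (`memcpy 1`), and the `pl` condition keeps the comparison valid across the signed-wraparound cases noted in the comment. A plain C rendering of the dispatch (a sketch only: the byte-wise loop stands in for the optimised backwards path, and the raw pointer comparison is meaningful only within one object):

    #include <stdio.h>
    #include <string.h>

    /* Byte-wise backwards copy, standing in for `memcpy 1`. */
    static void *copy_backwards(void *dst, const void *src, size_t n)
    {
            char *d = (char *)dst + n;
            const char *s = (const char *)src + n;

            while (n--)
                    *--d = *--s;
            return dst;
    }

    /* The dispatch memmove_rpi.S performs: a forward memcpy is safe
     * unless the destination starts above the source. */
    void *my_memmove(void *dst, const void *src, size_t n)
    {
            if ((const char *)src >= (const char *)dst)
                    return memcpy(dst, src, n);
            return copy_backwards(dst, src, n);
    }

    int main(void)
    {
            char buf[] = "abcdef";

            my_memmove(buf + 1, buf, 5);    /* overlapping, dst > src */
            puts(buf);                      /* prints "aabcde" */
            return 0;
    }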
+--- /dev/null
++++ b/arch/arm/lib/memset_rpi.S
+@@ -0,0 +1,128 @@
++/*
++Copyright (c) 2013, Raspberry Pi Foundation
++Copyright (c) 2013, RISC OS Open Ltd
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in the
++ documentation and/or other materials provided with the distribution.
++ * Neither the name of the copyright holder nor the
++ names of its contributors may be used to endorse or promote products
++ derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++*/
++
++#include <linux/linkage.h>
++#include "arm-mem.h"
++
++/* Prevent the stack from becoming executable */
++#if defined(__linux__) && defined(__ELF__)
++.section .note.GNU-stack,"",%progbits
++#endif
++
++ .text
++ .arch armv6
++ .object_arch armv4
++ .arm
++ .altmacro
++ .p2align 2
++
++/*
++ * void *memset(void *s, int c, size_t n);
++ * On entry:
++ * a1 = pointer to buffer to fill
++ * a2 = byte pattern to fill with (caller-narrowed)
++ * a3 = number of bytes to fill
++ * On exit:
++ * a1 preserved
++ */
++ENTRY(mmioset)
++ENTRY(memset)
++ENTRY(__memset32)
++ENTRY(__memset64)
++
++ S .req a1
++ DAT0 .req a2
++ N .req a3
++ DAT1 .req a4
++ DAT2 .req ip
++ DAT3 .req lr
++
++ orr DAT0, DAT0, DAT0, lsl #8
++ push {S, lr}
++ orr DAT0, DAT0, DAT0, lsl #16
++ mov DAT1, DAT0
++
++ /* See if we're guaranteed to have at least one 16-byte aligned 16-byte write */
++ cmp N, #31
++ blo 170f
++
++161: sub N, N, #16 /* simplifies inner loop termination */
++ /* Leading words and bytes */
++ tst S, #15
++ beq 164f
++ rsb DAT3, S, #0 /* bits 0-3 = number of leading bytes until aligned */
++ movs DAT2, DAT3, lsl #31
++ submi N, N, #1
++ strmib DAT0, [S], #1
++ subcs N, N, #2
++ strcsh DAT0, [S], #2
++ movs DAT2, DAT3, lsl #29
++ submi N, N, #4
++ strmi DAT0, [S], #4
++ subcs N, N, #8
++ stmcsia S!, {DAT0, DAT1}
++164: /* Delayed set up of DAT2 and DAT3 so we could use them as scratch registers above */
++ mov DAT2, DAT0
++ mov DAT3, DAT0
++ /* Now the inner loop of 16-byte stores */
++165: stmia S!, {DAT0, DAT1, DAT2, DAT3}
++ subs N, N, #16
++ bhs 165b
++166: /* Trailing words and bytes */
++ movs N, N, lsl #29
++ stmcsia S!, {DAT0, DAT1}
++ strmi DAT0, [S], #4
++ movs N, N, lsl #2
++ strcsh DAT0, [S], #2
++ strmib DAT0, [S]
++199: pop {S, pc}
++
++170: /* Short case */
++ mov DAT2, DAT0
++ mov DAT3, DAT0
++ tst S, #3
++ beq 174f
++172: subs N, N, #1
++ blo 199b
++ strb DAT0, [S], #1
++ tst S, #3
++ bne 172b
++174: tst N, #16
++ stmneia S!, {DAT0, DAT1, DAT2, DAT3}
++ b 166b
++
++ .unreq S
++ .unreq DAT0
++ .unreq N
++ .unreq DAT1
++ .unreq DAT2
++ .unreq DAT3
++ENDPROC(__memset64)
++ENDPROC(__memset32)
++ENDPROC(memset)
++ENDPROC(mmioset)
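The two ORRs at the top of memset fan the fill byte out across a full register before the loop starts, so the STM inner loop can store 16 identical bytes per iteration. The same splat in C:

    #include <stdio.h>
    #include <stdint.h>

    /* Replicate the fill byte across a word, exactly as the two ORRs in
     * memset_rpi.S do, so each 32-bit store sets four bytes at once. */
    static uint32_t splat(uint8_t c)
    {
            uint32_t v = c;

            v |= v << 8;    /* orr DAT0, DAT0, DAT0, lsl #8  */
            v |= v << 16;   /* orr DAT0, DAT0, DAT0, lsl #16 */
            return v;
    }

    int main(void)
    {
            printf("%08x\n", splat(0xab));  /* prints "abababab" */
            return 0;
    }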
+--- a/arch/arm/lib/uaccess_with_memcpy.c
++++ b/arch/arm/lib/uaccess_with_memcpy.c
+@@ -22,6 +22,14 @@
+ #include <asm/current.h>
+ #include <asm/page.h>
+
++#ifndef COPY_FROM_USER_THRESHOLD
++#define COPY_FROM_USER_THRESHOLD 64
++#endif
++
++#ifndef COPY_TO_USER_THRESHOLD
++#define COPY_TO_USER_THRESHOLD 64
++#endif
++
+ static int
+ pin_page_for_write(const void __user *_addr, pte_t **ptep, spinlock_t **ptlp)
+ {
+@@ -84,7 +92,44 @@ pin_page_for_write(const void __user *_a
+ return 1;
+ }
+
+-static unsigned long noinline
++static int
++pin_page_for_read(const void __user *_addr, pte_t **ptep, spinlock_t **ptlp)
++{
++ unsigned long addr = (unsigned long)_addr;
++ pgd_t *pgd;
++ pmd_t *pmd;
++ pte_t *pte;
++ pud_t *pud;
++ spinlock_t *ptl;
++
++ pgd = pgd_offset(current->mm, addr);
++ if (unlikely(pgd_none(*pgd) || pgd_bad(*pgd)))
++ {
++ return 0;
++ }
++ pud = pud_offset(pgd, addr);
++ if (unlikely(pud_none(*pud) || pud_bad(*pud)))
++ {
++ return 0;
++ }
++
++ pmd = pmd_offset(pud, addr);
++ if (unlikely(pmd_none(*pmd) || pmd_bad(*pmd)))
++ return 0;
++
++ pte = pte_offset_map_lock(current->mm, pmd, addr, &ptl);
++ if (unlikely(!pte_present(*pte) || !pte_young(*pte))) {
++ pte_unmap_unlock(pte, ptl);
++ return 0;
++ }
++
++ *ptep = pte;
++ *ptlp = ptl;
++
++ return 1;
++}
++
++unsigned long noinline
+ __copy_to_user_memcpy(void __user *to, const void *from, unsigned long n)
+ {
+ unsigned long ua_flags;
+@@ -137,6 +182,57 @@ out:
+ return n;
+ }
+
++unsigned long noinline
++__copy_from_user_memcpy(void *to, const void __user *from, unsigned long n)
++{
++ unsigned long ua_flags;
++ int atomic;
++
++ if (unlikely(segment_eq(get_fs(), KERNEL_DS))) {
++ memcpy(to, (const void *)from, n);
++ return 0;
++ }
++
++ /* the mmap semaphore is taken only if not in an atomic context */
++ atomic = in_atomic();
++
++ if (!atomic)
++ down_read(&current->mm->mmap_sem);
++ while (n) {
++ pte_t *pte;
++ spinlock_t *ptl;
++ int tocopy;
++
++ while (!pin_page_for_read(from, &pte, &ptl)) {
++ char temp;
++ if (!atomic)
++ up_read(&current->mm->mmap_sem);
++ if (__get_user(temp, (char __user *)from))
++ goto out;
++ if (!atomic)
++ down_read(&current->mm->mmap_sem);
++ }
++
++ tocopy = (~(unsigned long)from & ~PAGE_MASK) + 1;
++ if (tocopy > n)
++ tocopy = n;
++
++ ua_flags = uaccess_save_and_enable();
++ memcpy(to, (const void *)from, tocopy);
++ uaccess_restore(ua_flags);
++ to += tocopy;
++ from += tocopy;
++ n -= tocopy;
++
++ pte_unmap_unlock(pte, ptl);
++ }
++ if (!atomic)
++ up_read(&current->mm->mmap_sem);
++
++out:
++ return n;
++}
++
+ unsigned long
+ arm_copy_to_user(void __user *to, const void *from, unsigned long n)
+ {
+@@ -147,7 +243,7 @@ arm_copy_to_user(void __user *to, const
+ * With frame pointer disabled, tail call optimization kicks in
+ * as well making this test almost invisible.
+ */
+- if (n < 64) {
++ if (n < COPY_TO_USER_THRESHOLD) {
+ unsigned long ua_flags = uaccess_save_and_enable();
+ n = __copy_to_user_std(to, from, n);
+ uaccess_restore(ua_flags);
+@@ -157,6 +253,26 @@ arm_copy_to_user(void __user *to, const
+ }
+ return n;
+ }
++
++unsigned long __must_check
++arm_copy_from_user(void *to, const void __user *from, unsigned long n)
++{
++ /*
++ * This test is stubbed out of the main function above to keep
++ * the overhead for small copies low by avoiding a large
++ * register dump on the stack just to reload them right away.
++ * With frame pointer disabled, tail call optimization kicks in
++ * as well making this test almost invisible.
++ */
++	if (n < COPY_FROM_USER_THRESHOLD) {
++ unsigned long ua_flags = uaccess_save_and_enable();
++ n = __copy_from_user_std(to, from, n);
++ uaccess_restore(ua_flags);
++ } else {
++ n = __copy_from_user_memcpy(to, from, n);
++ }
++ return n;
++}
+
+ static unsigned long noinline
+ __clear_user_memset(void __user *addr, unsigned long n)
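In __copy_from_user_memcpy above, the per-iteration chunk size `(~(unsigned long)from & ~PAGE_MASK) + 1` is the number of bytes from `from` to the end of its page, so each pin_page_for_read()/memcpy() round never crosses a page boundary. A quick standalone check of that expression (4 KiB pages; the address is a made-up example):

    #include <stdio.h>

    #define PAGE_SIZE 4096UL
    #define PAGE_MASK (~(PAGE_SIZE - 1))

    int main(void)
    {
            unsigned long from = 0x12345f00;  /* example user address */

            /* Same expression as __copy_from_user_memcpy: bytes left in
             * the current page, i.e. PAGE_SIZE - (from % PAGE_SIZE). */
            unsigned long tocopy = (~from & ~PAGE_MASK) + 1;

            printf("%lu\n", tocopy);          /* prints 256 */
            return 0;
    }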
+--- a/arch/arm/mach-bcm/Kconfig
++++ b/arch/arm/mach-bcm/Kconfig
+@@ -187,6 +187,13 @@ config ARCH_BCM_53573
+ The base chip is BCM53573 and there are some packaging modifications
+ like BCM47189 and BCM47452.
+
++config BCM2835_FAST_MEMCPY
++ bool "Enable optimized __copy_to_user and __copy_from_user"
++ depends on ARCH_BCM2835 && ARCH_MULTI_V6
++ default y
++ help
++ Optimized versions of __copy_to_user and __copy_from_user for Pi1.
++
+ config ARCH_BCM_63XX
+ bool "Broadcom BCM63xx DSL SoC"
+ depends on ARCH_MULTI_V7