author    Adrian Schmutzler <freifunk@adrianschmutzler.de>  2020-02-08 21:58:55 +0100
committer Adrian Schmutzler <freifunk@adrianschmutzler.de>  2020-02-14 14:10:51 +0100
commit    7d7aa2fd924c27829ec25f825481554dd81bce97 (patch)
tree      658b87b89331670266163e522ea5fb52535633cb /target/linux/bcm27xx/patches-4.19/950-0061-Improve-__copy_to_user-and-__copy_from_user-performa.patch
parent    e7bfda2c243e66a75ff966ba04c28b1590b5d24c (diff)
brcm2708: rename target to bcm27xx
This change makes the names of Broadcom targets consistent by using the common notation based on SoC/CPU ID (which is used internally anyway): bcmXXXX instead of brcmXXXX. This notation is already used for the target TITLE in make menuconfig; only the short target name used brcm so far.

Besides, since the subtargets range from bcm2708 to bcm2711, it seems appropriate to use bcm27xx instead of bcm2708 (again, as already done for BOARDNAME).

This also renames the packages brcm2708-userland and brcm2708-gpu-fw.

Signed-off-by: Adrian Schmutzler <freifunk@adrianschmutzler.de>
Acked-by: Álvaro Fernández Rojas <noltari@gmail.com>
Diffstat (limited to 'target/linux/bcm27xx/patches-4.19/950-0061-Improve-__copy_to_user-and-__copy_from_user-performa.patch')
-rw-r--r--  target/linux/bcm27xx/patches-4.19/950-0061-Improve-__copy_to_user-and-__copy_from_user-performa.patch  |  1549
1 file changed, 1549 insertions(+), 0 deletions(-)
diff --git a/target/linux/bcm27xx/patches-4.19/950-0061-Improve-__copy_to_user-and-__copy_from_user-performa.patch b/target/linux/bcm27xx/patches-4.19/950-0061-Improve-__copy_to_user-and-__copy_from_user-performa.patch
new file mode 100644
index 0000000000..430059cddf
--- /dev/null
+++ b/target/linux/bcm27xx/patches-4.19/950-0061-Improve-__copy_to_user-and-__copy_from_user-performa.patch
@@ -0,0 +1,1549 @@
+From e4d81adf1f2c84b229901cddb403d00010524b28 Mon Sep 17 00:00:00 2001
+From: popcornmix <popcornmix@gmail.com>
+Date: Mon, 28 Nov 2016 16:50:04 +0000
+Subject: [PATCH] Improve __copy_to_user and __copy_from_user
+ performance
+
+Provide a __copy_from_user that uses memcpy. On BCM2708, use
+optimised memcpy/memmove/memcmp/memset implementations.
+
+arch/arm: Add mmiocpy/set aliases for memcpy/set
+
+See: https://github.com/raspberrypi/linux/issues/1082
+
+copy_from_user: CPU_SW_DOMAIN_PAN compatibility
+
+The downstream copy_from_user acceleration must also play nice with
+CONFIG_CPU_SW_DOMAIN_PAN.
+
+See: https://github.com/raspberrypi/linux/issues/1381
+
+Signed-off-by: Phil Elwell <phil@raspberrypi.org>
+---
+ arch/arm/include/asm/string.h | 5 +
+ arch/arm/include/asm/uaccess.h | 3 +
+ arch/arm/lib/Makefile | 14 +-
+ arch/arm/lib/arm-mem.h | 159 +++++++++
+ arch/arm/lib/copy_from_user.S | 4 +-
+ arch/arm/lib/exports_rpi.c | 37 +++
+ arch/arm/lib/memcmp_rpi.S | 285 ++++++++++++++++
+ arch/arm/lib/memcpy_rpi.S | 61 ++++
+ arch/arm/lib/memcpymove.h | 506 +++++++++++++++++++++++++++++
+ arch/arm/lib/memmove_rpi.S | 61 ++++
+ arch/arm/lib/memset_rpi.S | 128 ++++++++
+ arch/arm/lib/uaccess_with_memcpy.c | 120 ++++++-
+ arch/arm/mach-bcm/Kconfig | 7 +
+ 13 files changed, 1385 insertions(+), 5 deletions(-)
+ create mode 100644 arch/arm/lib/arm-mem.h
+ create mode 100644 arch/arm/lib/exports_rpi.c
+ create mode 100644 arch/arm/lib/memcmp_rpi.S
+ create mode 100644 arch/arm/lib/memcpy_rpi.S
+ create mode 100644 arch/arm/lib/memcpymove.h
+ create mode 100644 arch/arm/lib/memmove_rpi.S
+ create mode 100644 arch/arm/lib/memset_rpi.S
+
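The heart of the change is a size-based dispatch added to arch/arm/lib/uaccess_with_memcpy.c. The sketch below condenses the arm_copy_from_user hunk near the end of this patch, shown up front for orientation: small copies take the original assembler routine, now also reachable as __copy_from_user_std, while large copies go through the page-pinning memcpy path; the uaccess_save_and_enable()/uaccess_restore() bracket around the copy is what keeps the fast path compatible with CONFIG_CPU_SW_DOMAIN_PAN.

    unsigned long __must_check
    arm_copy_from_user(void *to, const void __user *from, unsigned long n)
    {
            if (n < COPY_FROM_USER_THRESHOLD) {
                    /* Small copy: the LDM/STM assembler loop wins. */
                    unsigned long ua_flags = uaccess_save_and_enable();
                    n = __copy_from_user_std(to, from, n);
                    uaccess_restore(ua_flags);
            } else {
                    /* Large copy: pin the user pages, then memcpy(). */
                    n = __copy_from_user_memcpy(to, from, n);
            }
            return n;
    }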
+--- a/arch/arm/include/asm/string.h
++++ b/arch/arm/include/asm/string.h
+@@ -39,4 +39,9 @@ static inline void *memset64(uint64_t *p
+ return __memset64(p, v, n * 8, v >> 32);
+ }
+
++#ifdef CONFIG_BCM2835_FAST_MEMCPY
++#define __HAVE_ARCH_MEMCMP
++extern int memcmp(const void *, const void *, size_t);
++#endif
++
+ #endif
+--- a/arch/arm/include/asm/uaccess.h
++++ b/arch/arm/include/asm/uaccess.h
+@@ -514,6 +514,9 @@ do { \
+ extern unsigned long __must_check
+ arm_copy_from_user(void *to, const void __user *from, unsigned long n);
+
++extern unsigned long __must_check
++__copy_from_user_std(void *to, const void __user *from, unsigned long n);
++
+ static inline unsigned long __must_check
+ raw_copy_from_user(void *to, const void __user *from, unsigned long n)
+ {
+--- a/arch/arm/lib/Makefile
++++ b/arch/arm/lib/Makefile
+@@ -7,8 +7,8 @@
+
+ lib-y := backtrace.o changebit.o csumipv6.o csumpartial.o \
+ csumpartialcopy.o csumpartialcopyuser.o clearbit.o \
+- delay.o delay-loop.o findbit.o memchr.o memcpy.o \
+- memmove.o memset.o setbit.o \
++ delay.o delay-loop.o findbit.o memchr.o \
++ setbit.o \
+ strchr.o strrchr.o \
+ testchangebit.o testclearbit.o testsetbit.o \
+ ashldi3.o ashrdi3.o lshrdi3.o muldi3.o \
+@@ -19,6 +19,16 @@ lib-y := backtrace.o changebit.o csumip
+ mmu-y := clear_user.o copy_page.o getuser.o putuser.o \
+ copy_from_user.o copy_to_user.o
+
++# Choose optimised implementations for Raspberry Pi
++ifeq ($(CONFIG_BCM2835_FAST_MEMCPY),y)
++ CFLAGS_uaccess_with_memcpy.o += -DCOPY_FROM_USER_THRESHOLD=1600
++ CFLAGS_uaccess_with_memcpy.o += -DCOPY_TO_USER_THRESHOLD=672
++ obj-$(CONFIG_MODULES) += exports_rpi.o
++ lib-y += memcpy_rpi.o memmove_rpi.o memset_rpi.o memcmp_rpi.o
++else
++ lib-y += memcpy.o memmove.o memset.o
++endif
++
+ # using lib_ here won't override already available weak symbols
+ obj-$(CONFIG_UACCESS_WITH_MEMCPY) += uaccess_with_memcpy.o
+
+--- /dev/null
++++ b/arch/arm/lib/arm-mem.h
+@@ -0,0 +1,159 @@
++/*
++Copyright (c) 2013, Raspberry Pi Foundation
++Copyright (c) 2013, RISC OS Open Ltd
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in the
++ documentation and/or other materials provided with the distribution.
++ * Neither the name of the copyright holder nor the
++ names of its contributors may be used to endorse or promote products
++ derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++*/
++
++.macro myfunc fname
++ .func fname
++ .global fname
++fname:
++.endm
++
++.macro preload_leading_step1 backwards, ptr, base
++/* If the destination is already 16-byte aligned, then we need to preload
++ * between 0 and prefetch_distance (inclusive) cache lines ahead so there
++ * are no gaps when the inner loop starts.
++ */
++ .if backwards
++ sub ptr, base, #1
++ bic ptr, ptr, #31
++ .else
++ bic ptr, base, #31
++ .endif
++ .set OFFSET, 0
++ .rept prefetch_distance+1
++ pld [ptr, #OFFSET]
++ .if backwards
++ .set OFFSET, OFFSET-32
++ .else
++ .set OFFSET, OFFSET+32
++ .endif
++ .endr
++.endm
++
++.macro preload_leading_step2 backwards, ptr, base, leading_bytes, tmp
++/* However, if the destination is not 16-byte aligned, we may need to
++ * preload one more cache line than that. The question we need to ask is:
++ * are the leading bytes more than the amount by which the source
++ * pointer will be rounded down for preloading, and if so, by how many
++ * cache lines?
++ */
++ .if backwards
++/* Here we compare against how many bytes we are into the
++ * cache line, counting down from the highest such address.
++ * Effectively, we want to calculate
++ * leading_bytes = dst&15
++ * cacheline_offset = 31-((src-leading_bytes-1)&31)
++ * extra_needed = leading_bytes - cacheline_offset
++ * and test if extra_needed is <= 0, or rearranging:
++ * leading_bytes + (src-leading_bytes-1)&31 <= 31
++ */
++ mov tmp, base, lsl #32-5
++ sbc tmp, tmp, leading_bytes, lsl #32-5
++ adds tmp, tmp, leading_bytes, lsl #32-5
++ bcc 61f
++ pld [ptr, #-32*(prefetch_distance+1)]
++ .else
++/* Effectively, we want to calculate
++ * leading_bytes = (-dst)&15
++ * cacheline_offset = (src+leading_bytes)&31
++ * extra_needed = leading_bytes - cacheline_offset
++ * and test if extra_needed is <= 0.
++ */
++ mov tmp, base, lsl #32-5
++ add tmp, tmp, leading_bytes, lsl #32-5
++ rsbs tmp, tmp, leading_bytes, lsl #32-5
++ bls 61f
++ pld [ptr, #32*(prefetch_distance+1)]
++ .endif
++61:
++.endm
++
++.macro preload_trailing backwards, base, remain, tmp
++ /* We need either 0, 1 or 2 extra preloads */
++ .if backwards
++ rsb tmp, base, #0
++ mov tmp, tmp, lsl #32-5
++ .else
++ mov tmp, base, lsl #32-5
++ .endif
++ adds tmp, tmp, remain, lsl #32-5
++ adceqs tmp, tmp, #0
++ /* The instruction above has two effects: ensures Z is only
++ * set if C was clear (so Z indicates that both shifted quantities
++ * were 0), and clears C if Z was set (so C indicates that the sum
++ * of the shifted quantities was strictly greater than 32) */
++ beq 82f
++ .if backwards
++ sub tmp, base, #1
++ bic tmp, tmp, #31
++ .else
++ bic tmp, base, #31
++ .endif
++ bcc 81f
++ .if backwards
++ pld [tmp, #-32*(prefetch_distance+1)]
++81:
++ pld [tmp, #-32*prefetch_distance]
++ .else
++ pld [tmp, #32*(prefetch_distance+2)]
++81:
++ pld [tmp, #32*(prefetch_distance+1)]
++ .endif
++82:
++.endm
++
++.macro preload_all backwards, narrow_case, shift, base, remain, tmp0, tmp1
++ .if backwards
++ sub tmp0, base, #1
++ bic tmp0, tmp0, #31
++ pld [tmp0]
++ sub tmp1, base, remain, lsl #shift
++ .else
++ bic tmp0, base, #31
++ pld [tmp0]
++ add tmp1, base, remain, lsl #shift
++ sub tmp1, tmp1, #1
++ .endif
++ bic tmp1, tmp1, #31
++ cmp tmp1, tmp0
++ beq 92f
++ .if narrow_case
++ /* In this case, all the data fits in either 1 or 2 cache lines */
++ pld [tmp1]
++ .else
++91:
++ .if backwards
++ sub tmp0, tmp0, #32
++ .else
++ add tmp0, tmp0, #32
++ .endif
++ cmp tmp0, tmp1
++ pld [tmp0]
++ bne 91b
++ .endif
++92:
++.endm
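As a concrete check of the arithmetic in the preload_leading_step2 comment (forward case), here is the same computation in standalone C; the pointer values are made-up examples, not anything the kernel computes:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            /* Hypothetical example pointers, chosen so dst is misaligned. */
            uintptr_t dst = 0x1001, src = 0x203c;

            /* leading_bytes = (-dst)&15: bytes written before dst becomes
             * 16-byte aligned (here 15). */
            unsigned leading_bytes = (unsigned)(-dst) & 15;

            /* cacheline_offset = (src+leading_bytes)&31: how far into its
             * 32-byte cache line the source sits once the leading bytes
             * are consumed (here (0x204b & 31) = 11). */
            unsigned cacheline_offset = (unsigned)(src + leading_bytes) & 31;

            /* extra_needed = leading_bytes - cacheline_offset: the macro's
             * `bls 61f` skips the extra pld when this is <= 0. */
            if (leading_bytes > cacheline_offset)
                    printf("one extra cache line must be preloaded\n");
            return 0;
    }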
+--- a/arch/arm/lib/copy_from_user.S
++++ b/arch/arm/lib/copy_from_user.S
+@@ -89,7 +89,8 @@
+
+ .text
+
+-ENTRY(arm_copy_from_user)
++ENTRY(__copy_from_user_std)
++WEAK(arm_copy_from_user)
+ #ifdef CONFIG_CPU_SPECTRE
+ get_thread_info r3
+ ldr r3, [r3, #TI_ADDR_LIMIT]
+@@ -99,6 +100,7 @@ ENTRY(arm_copy_from_user)
+ #include "copy_template.S"
+
+ ENDPROC(arm_copy_from_user)
++ENDPROC(__copy_from_user_std)
+
+ .pushsection .fixup,"ax"
+ .align 0
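The ENTRY/WEAK pairing above mirrors what copy_to_user.S already does: the assembler body gains a second name (__copy_from_user_std) and its original name becomes a weak symbol, so the strong arm_copy_from_user defined in uaccess_with_memcpy.c wins at link time when CONFIG_UACCESS_WITH_MEMCPY is enabled. A minimal userspace illustration of the same linker rule, with hypothetical names (two files, built together with e.g. `gcc main.c weak_default.c`):

    /* weak_default.c -- stands in for the WEAK(...) assembler symbol. */
    #include <stdio.h>

    __attribute__((weak)) void do_copy(void)
    {
            puts("standard (assembler) path");
    }

    /* main.c -- a strong definition elsewhere overrides the weak one,
     * just as uaccess_with_memcpy.c overrides arm_copy_from_user. */
    #include <stdio.h>

    void do_copy(void)
    {
            puts("memcpy-accelerated path");
    }

    int main(void)
    {
            do_copy();      /* prints "memcpy-accelerated path" */
            return 0;
    }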
+--- /dev/null
++++ b/arch/arm/lib/exports_rpi.c
+@@ -0,0 +1,37 @@
++/**
++ * Copyright (c) 2014, Raspberry Pi (Trading) Ltd.
++ *
++ * Redistribution and use in source and binary forms, with or without
++ * modification, are permitted provided that the following conditions
++ * are met:
++ * 1. Redistributions of source code must retain the above copyright
++ * notice, this list of conditions, and the following disclaimer,
++ * without modification.
++ * 2. Redistributions in binary form must reproduce the above copyright
++ * notice, this list of conditions and the following disclaimer in the
++ * documentation and/or other materials provided with the distribution.
++ * 3. The names of the above-listed copyright holders may not be used
++ * to endorse or promote products derived from this software without
++ * specific prior written permission.
++ *
++ * ALTERNATIVELY, this software may be distributed under the terms of the
++ * GNU General Public License ("GPL") version 2, as published by the Free
++ * Software Foundation.
++ *
++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
++ * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
++ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
++ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
++ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
++ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
++ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
++ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++ */
++
++#include <linux/kernel.h>
++#include <linux/module.h>
++
++EXPORT_SYMBOL(memcmp);
+--- /dev/null
++++ b/arch/arm/lib/memcmp_rpi.S
+@@ -0,0 +1,285 @@
++/*
++Copyright (c) 2013, Raspberry Pi Foundation
++Copyright (c) 2013, RISC OS Open Ltd
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in the
++ documentation and/or other materials provided with the distribution.
++ * Neither the name of the copyright holder nor the
++ names of its contributors may be used to endorse or promote products
++ derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++*/
++
++#include <linux/linkage.h>
++#include "arm-mem.h"
++
++/* Prevent the stack from becoming executable */
++#if defined(__linux__) && defined(__ELF__)
++.section .note.GNU-stack,"",%progbits
++#endif
++
++ .text
++ .arch armv6
++ .object_arch armv4
++ .arm
++ .altmacro
++ .p2align 2
++
++.macro memcmp_process_head unaligned
++ .if unaligned
++ ldr DAT0, [S_1], #4
++ ldr DAT1, [S_1], #4
++ ldr DAT2, [S_1], #4
++ ldr DAT3, [S_1], #4
++ .else
++ ldmia S_1!, {DAT0, DAT1, DAT2, DAT3}
++ .endif
++ ldmia S_2!, {DAT4, DAT5, DAT6, DAT7}
++.endm
++
++.macro memcmp_process_tail
++ cmp DAT0, DAT4
++ cmpeq DAT1, DAT5
++ cmpeq DAT2, DAT6
++ cmpeq DAT3, DAT7
++ bne 200f
++.endm
++
++.macro memcmp_leading_31bytes
++ movs DAT0, OFF, lsl #31
++ ldrmib DAT0, [S_1], #1
++ ldrcsh DAT1, [S_1], #2
++ ldrmib DAT4, [S_2], #1
++ ldrcsh DAT5, [S_2], #2
++ movpl DAT0, #0
++ movcc DAT1, #0
++ movpl DAT4, #0
++ movcc DAT5, #0
++ submi N, N, #1
++ subcs N, N, #2
++ cmp DAT0, DAT4
++ cmpeq DAT1, DAT5
++ bne 200f
++ movs DAT0, OFF, lsl #29
++ ldrmi DAT0, [S_1], #4
++ ldrcs DAT1, [S_1], #4
++ ldrcs DAT2, [S_1], #4
++ ldrmi DAT4, [S_2], #4
++ ldmcsia S_2!, {DAT5, DAT6}
++ movpl DAT0, #0
++ movcc DAT1, #0
++ movcc DAT2, #0
++ movpl DAT4, #0
++ movcc DAT5, #0
++ movcc DAT6, #0
++ submi N, N, #4
++ subcs N, N, #8
++ cmp DAT0, DAT4
++ cmpeq DAT1, DAT5
++ cmpeq DAT2, DAT6
++ bne 200f
++ tst OFF, #16
++ beq 105f
++ memcmp_process_head 1
++ sub N, N, #16
++ memcmp_process_tail
++105:
++.endm
++
++.macro memcmp_trailing_15bytes unaligned
++ movs N, N, lsl #29
++ .if unaligned
++ ldrcs DAT0, [S_1], #4
++ ldrcs DAT1, [S_1], #4
++ .else
++ ldmcsia S_1!, {DAT0, DAT1}
++ .endif
++ ldrmi DAT2, [S_1], #4
++ ldmcsia S_2!, {DAT4, DAT5}
++ ldrmi DAT6, [S_2], #4
++ movcc DAT0, #0
++ movcc DAT1, #0
++ movpl DAT2, #0
++ movcc DAT4, #0
++ movcc DAT5, #0
++ movpl DAT6, #0
++ cmp DAT0, DAT4
++ cmpeq DAT1, DAT5
++ cmpeq DAT2, DAT6
++ bne 200f
++ movs N, N, lsl #2
++ ldrcsh DAT0, [S_1], #2
++ ldrmib DAT1, [S_1]
++ ldrcsh DAT4, [S_2], #2
++ ldrmib DAT5, [S_2]
++ movcc DAT0, #0
++ movpl DAT1, #0
++ movcc DAT4, #0
++ movpl DAT5, #0
++ cmp DAT0, DAT4
++ cmpeq DAT1, DAT5
++ bne 200f
++.endm
++
++.macro memcmp_long_inner_loop unaligned
++110:
++ memcmp_process_head unaligned
++ pld [S_2, #prefetch_distance*32 + 16]
++ memcmp_process_tail
++ memcmp_process_head unaligned
++ pld [S_1, OFF]
++ memcmp_process_tail
++ subs N, N, #32
++ bhs 110b
++ /* Just before the final (prefetch_distance+1) 32-byte blocks,
++ * deal with final preloads */
++ preload_trailing 0, S_1, N, DAT0
++ preload_trailing 0, S_2, N, DAT0
++ add N, N, #(prefetch_distance+2)*32 - 16
++120:
++ memcmp_process_head unaligned
++ memcmp_process_tail
++ subs N, N, #16
++ bhs 120b
++ /* Trailing words and bytes */
++ tst N, #15
++ beq 199f
++ memcmp_trailing_15bytes unaligned
++199: /* Reached end without detecting a difference */
++ mov a1, #0
++ setend le
++ pop {DAT1-DAT6, pc}
++.endm
++
++.macro memcmp_short_inner_loop unaligned
++ subs N, N, #16 /* simplifies inner loop termination */
++ blo 122f
++120:
++ memcmp_process_head unaligned
++ memcmp_process_tail
++ subs N, N, #16
++ bhs 120b
++122: /* Trailing words and bytes */
++ tst N, #15
++ beq 199f
++ memcmp_trailing_15bytes unaligned
++199: /* Reached end without detecting a difference */
++ mov a1, #0
++ setend le
++ pop {DAT1-DAT6, pc}
++.endm
++
++/*
++ * int memcmp(const void *s1, const void *s2, size_t n);
++ * On entry:
++ * a1 = pointer to buffer 1
++ * a2 = pointer to buffer 2
++ * a3 = number of bytes to compare (as unsigned chars)
++ * On exit:
++ * a1 = >0/=0/<0 if s1 >/=/< s2
++ */
++
++.set prefetch_distance, 2
++
++ENTRY(memcmp)
++ S_1 .req a1
++ S_2 .req a2
++ N .req a3
++ DAT0 .req a4
++ DAT1 .req v1
++ DAT2 .req v2
++ DAT3 .req v3
++ DAT4 .req v4
++ DAT5 .req v5
++ DAT6 .req v6
++ DAT7 .req ip
++ OFF .req lr
++
++ push {DAT1-DAT6, lr}
++ setend be /* lowest-addressed bytes are most significant */
++
++ /* To preload ahead as we go, we need at least (prefetch_distance+2) 32-byte blocks */
++ cmp N, #(prefetch_distance+3)*32 - 1
++ blo 170f
++
++ /* Long case */
++ /* Adjust N so that the decrement instruction can also test for
++ * inner loop termination. We want it to stop when there are
++ * (prefetch_distance+1) complete blocks to go. */
++ sub N, N, #(prefetch_distance+2)*32
++ preload_leading_step1 0, DAT0, S_1
++ preload_leading_step1 0, DAT1, S_2
++ tst S_2, #31
++ beq 154f
++ rsb OFF, S_2, #0 /* no need to AND with 15 here */
++ preload_leading_step2 0, DAT0, S_1, OFF, DAT2
++ preload_leading_step2 0, DAT1, S_2, OFF, DAT2
++ memcmp_leading_31bytes
++154: /* Second source now cacheline (32-byte) aligned; we have at
++ * least one prefetch to go. */
++ /* Prefetch offset is best selected such that it lies in the
++ * first 8 of each 32 bytes - but it's just as easy to aim for
++ * the first one */
++ and OFF, S_1, #31
++ rsb OFF, OFF, #32*prefetch_distance
++ tst S_1, #3
++ bne 140f
++ memcmp_long_inner_loop 0
++140: memcmp_long_inner_loop 1
++
++170: /* Short case */
++ teq N, #0
++ beq 199f
++ preload_all 0, 0, 0, S_1, N, DAT0, DAT1
++ preload_all 0, 0, 0, S_2, N, DAT0, DAT1
++ tst S_2, #3
++ beq 174f
++172: subs N, N, #1
++ blo 199f
++ ldrb DAT0, [S_1], #1
++ ldrb DAT4, [S_2], #1
++ cmp DAT0, DAT4
++ bne 200f
++ tst S_2, #3
++ bne 172b
++174: /* Second source now 4-byte aligned; we have 0 or more bytes to go */
++ tst S_1, #3
++ bne 140f
++ memcmp_short_inner_loop 0
++140: memcmp_short_inner_loop 1
++
++200: /* Difference found: determine sign. */
++ movhi a1, #1
++ movlo a1, #-1
++ setend le
++ pop {DAT1-DAT6, pc}
++
++ .unreq S_1
++ .unreq S_2
++ .unreq N
++ .unreq DAT0
++ .unreq DAT1
++ .unreq DAT2
++ .unreq DAT3
++ .unreq DAT4
++ .unreq DAT5
++ .unreq DAT6
++ .unreq DAT7
++ .unreq OFF
++ENDPROC(memcmp)
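One non-obvious detail above is `setend be`: the routine compares four words at a time, and switching to big-endian data access makes the lowest-addressed byte the most significant, so an unsigned word comparison yields the correct memcmp ordering. A standalone C sketch of the same trick on a little-endian machine (not patch code; __builtin_bswap32 is the GCC/Clang builtin):

    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>

    /* Word-at-a-time compare. On little-endian ARM, a plain unsigned
     * compare of two loaded words would rank the highest-addressed byte
     * first; byte-swapping both words (the C analogue of `setend be`)
     * restores memcmp's lexicographic order. */
    static int cmp4(const void *a, const void *b)
    {
            uint32_t x, y;
            memcpy(&x, a, 4);
            memcpy(&y, b, 4);
            x = __builtin_bswap32(x);
            y = __builtin_bswap32(y);
            return (x > y) - (x < y);
    }

    int main(void)
    {
            printf("%d %d\n", cmp4("abce", "abcd"), cmp4("abcd", "abcd"));
            /* prints "1 0", matching the sign of memcmp(..., 4) */
            return 0;
    }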
+--- /dev/null
++++ b/arch/arm/lib/memcpy_rpi.S
+@@ -0,0 +1,61 @@
++/*
++Copyright (c) 2013, Raspberry Pi Foundation
++Copyright (c) 2013, RISC OS Open Ltd
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in the
++ documentation and/or other materials provided with the distribution.
++ * Neither the name of the copyright holder nor the
++ names of its contributors may be used to endorse or promote products
++ derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++*/
++
++#include <linux/linkage.h>
++#include "arm-mem.h"
++#include "memcpymove.h"
++
++/* Prevent the stack from becoming executable */
++#if defined(__linux__) && defined(__ELF__)
++.section .note.GNU-stack,"",%progbits
++#endif
++
++ .text
++ .arch armv6
++ .object_arch armv4
++ .arm
++ .altmacro
++ .p2align 2
++
++/*
++ * void *memcpy(void * restrict s1, const void * restrict s2, size_t n);
++ * On entry:
++ * a1 = pointer to destination
++ * a2 = pointer to source
++ * a3 = number of bytes to copy
++ * On exit:
++ * a1 preserved
++ */
++
++.set prefetch_distance, 3
++
++ENTRY(mmiocpy)
++ENTRY(memcpy)
++ memcpy 0
++ENDPROC(memcpy)
++ENDPROC(mmiocpy)
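Stacking ENTRY(mmiocpy) above ENTRY(memcpy) simply gives the same code two symbol names; mmiocpy is the name arch/arm's MMIO helpers resolve to. The analogous idiom in C is GCC's alias attribute, sketched here with hypothetical names (the attribute requires the target to be defined in the same translation unit, and relies on ELF semantics):

    #include <stdio.h>
    #include <string.h>

    void *my_memcpy(void *d, const void *s, size_t n)
    {
            return memcpy(d, s, n);   /* stand-in implementation */
    }

    /* Second name for the same code, like ENTRY(mmiocpy)/ENTRY(memcpy). */
    void *my_mmiocpy(void *d, const void *s, size_t n)
            __attribute__((alias("my_memcpy")));

    int main(void)
    {
            char buf[6];
            my_mmiocpy(buf, "hello", 6);
            puts(buf);
            return 0;
    }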
+--- /dev/null
++++ b/arch/arm/lib/memcpymove.h
+@@ -0,0 +1,506 @@
++/*
++Copyright (c) 2013, Raspberry Pi Foundation
++Copyright (c) 2013, RISC OS Open Ltd
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in the
++ documentation and/or other materials provided with the distribution.
++ * Neither the name of the copyright holder nor the
++ names of its contributors may be used to endorse or promote products
++ derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++*/
++
++.macro unaligned_words backwards, align, use_pld, words, r0, r1, r2, r3, r4, r5, r6, r7, r8
++ .if words == 1
++ .if backwards
++ mov r1, r0, lsl #32-align*8
++ ldr r0, [S, #-4]!
++ orr r1, r1, r0, lsr #align*8
++ str r1, [D, #-4]!
++ .else
++ mov r0, r1, lsr #align*8
++ ldr r1, [S, #4]!
++ orr r0, r0, r1, lsl #32-align*8
++ str r0, [D], #4
++ .endif
++ .elseif words == 2
++ .if backwards
++ ldr r1, [S, #-4]!
++ mov r2, r0, lsl #32-align*8
++ ldr r0, [S, #-4]!
++ orr r2, r2, r1, lsr #align*8
++ mov r1, r1, lsl #32-align*8
++ orr r1, r1, r0, lsr #align*8
++ stmdb D!, {r1, r2}
++ .else
++ ldr r1, [S, #4]!
++ mov r0, r2, lsr #align*8
++ ldr r2, [S, #4]!
++ orr r0, r0, r1, lsl #32-align*8
++ mov r1, r1, lsr #align*8
++ orr r1, r1, r2, lsl #32-align*8
++ stmia D!, {r0, r1}
++ .endif
++ .elseif words == 4
++ .if backwards
++ ldmdb S!, {r2, r3}
++ mov r4, r0, lsl #32-align*8
++ ldmdb S!, {r0, r1}
++ orr r4, r4, r3, lsr #align*8
++ mov r3, r3, lsl #32-align*8
++ orr r3, r3, r2, lsr #align*8
++ mov r2, r2, lsl #32-align*8
++ orr r2, r2, r1, lsr #align*8
++ mov r1, r1, lsl #32-align*8
++ orr r1, r1, r0, lsr #align*8
++ stmdb D!, {r1, r2, r3, r4}
++ .else
++ ldmib S!, {r1, r2}
++ mov r0, r4, lsr #align*8
++ ldmib S!, {r3, r4}
++ orr r0, r0, r1, lsl #32-align*8
++ mov r1, r1, lsr #align*8
++ orr r1, r1, r2, lsl #32-align*8
++ mov r2, r2, lsr #align*8
++ orr r2, r2, r3, lsl #32-align*8
++ mov r3, r3, lsr #align*8
++ orr r3, r3, r4, lsl #32-align*8
++ stmia D!, {r0, r1, r2, r3}
++ .endif
++ .elseif words == 8
++ .if backwards
++ ldmdb S!, {r4, r5, r6, r7}
++ mov r8, r0, lsl #32-align*8
++ ldmdb S!, {r0, r1, r2, r3}
++ .if use_pld
++ pld [S, OFF]
++ .endif
++ orr r8, r8, r7, lsr #align*8
++ mov r7, r7, lsl #32-align*8
++ orr r7, r7, r6, lsr #align*8
++ mov r6, r6, lsl #32-align*8
++ orr r6, r6, r5, lsr #align*8
++ mov r5, r5, lsl #32-align*8
++ orr r5, r5, r4, lsr #align*8
++ mov r4, r4, lsl #32-align*8
++ orr r4, r4, r3, lsr #align*8
++ mov r3, r3, lsl #32-align*8
++ orr r3, r3, r2, lsr #align*8
++ mov r2, r2, lsl #32-align*8
++ orr r2, r2, r1, lsr #align*8
++ mov r1, r1, lsl #32-align*8
++ orr r1, r1, r0, lsr #align*8
++ stmdb D!, {r5, r6, r7, r8}
++ stmdb D!, {r1, r2, r3, r4}
++ .else
++ ldmib S!, {r1, r2, r3, r4}
++ mov r0, r8, lsr #align*8
++ ldmib S!, {r5, r6, r7, r8}
++ .if use_pld
++ pld [S, OFF]
++ .endif
++ orr r0, r0, r1, lsl #32-align*8
++ mov r1, r1, lsr #align*8
++ orr r1, r1, r2, lsl #32-align*8
++ mov r2, r2, lsr #align*8
++ orr r2, r2, r3, lsl #32-align*8
++ mov r3, r3, lsr #align*8
++ orr r3, r3, r4, lsl #32-align*8
++ mov r4, r4, lsr #align*8
++ orr r4, r4, r5, lsl #32-align*8
++ mov r5, r5, lsr #align*8
++ orr r5, r5, r6, lsl #32-align*8
++ mov r6, r6, lsr #align*8
++ orr r6, r6, r7, lsl #32-align*8
++ mov r7, r7, lsr #align*8
++ orr r7, r7, r8, lsl #32-align*8
++ stmia D!, {r0, r1, r2, r3}
++ stmia D!, {r4, r5, r6, r7}
++ .endif
++ .endif
++.endm
++
++.macro memcpy_leading_15bytes backwards, align
++ movs DAT1, DAT2, lsl #31
++ sub N, N, DAT2
++ .if backwards
++ ldrmib DAT0, [S, #-1]!
++ ldrcsh DAT1, [S, #-2]!
++ strmib DAT0, [D, #-1]!
++ strcsh DAT1, [D, #-2]!
++ .else
++ ldrmib DAT0, [S], #1
++ ldrcsh DAT1, [S], #2
++ strmib DAT0, [D], #1
++ strcsh DAT1, [D], #2
++ .endif
++ movs DAT1, DAT2, lsl #29
++ .if backwards
++ ldrmi DAT0, [S, #-4]!
++ .if align == 0
++ ldmcsdb S!, {DAT1, DAT2}
++ .else
++ ldrcs DAT2, [S, #-4]!
++ ldrcs DAT1, [S, #-4]!
++ .endif
++ strmi DAT0, [D, #-4]!
++ stmcsdb D!, {DAT1, DAT2}
++ .else
++ ldrmi DAT0, [S], #4
++ .if align == 0
++ ldmcsia S!, {DAT1, DAT2}
++ .else
++ ldrcs DAT1, [S], #4
++ ldrcs DAT2, [S], #4
++ .endif
++ strmi DAT0, [D], #4
++ stmcsia D!, {DAT1, DAT2}
++ .endif
++.endm
++
++.macro memcpy_trailing_15bytes backwards, align
++ movs N, N, lsl #29
++ .if backwards
++ .if align == 0
++ ldmcsdb S!, {DAT0, DAT1}
++ .else
++ ldrcs DAT1, [S, #-4]!
++ ldrcs DAT0, [S, #-4]!
++ .endif
++ ldrmi DAT2, [S, #-4]!
++ stmcsdb D!, {DAT0, DAT1}
++ strmi DAT2, [D, #-4]!
++ .else
++ .if align == 0
++ ldmcsia S!, {DAT0, DAT1}
++ .else
++ ldrcs DAT0, [S], #4
++ ldrcs DAT1, [S], #4
++ .endif
++ ldrmi DAT2, [S], #4
++ stmcsia D!, {DAT0, DAT1}
++ strmi DAT2, [D], #4
++ .endif
++ movs N, N, lsl #2
++ .if backwards
++ ldrcsh DAT0, [S, #-2]!
++ ldrmib DAT1, [S, #-1]
++ strcsh DAT0, [D, #-2]!
++ strmib DAT1, [D, #-1]
++ .else
++ ldrcsh DAT0, [S], #2
++ ldrmib DAT1, [S]
++ strcsh DAT0, [D], #2
++ strmib DAT1, [D]
++ .endif
++.endm
++
++.macro memcpy_long_inner_loop backwards, align
++ .if align != 0
++ .if backwards
++ ldr DAT0, [S, #-align]!
++ .else
++ ldr LAST, [S, #-align]!
++ .endif
++ .endif
++110:
++ .if align == 0
++ .if backwards
++ ldmdb S!, {DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, LAST}
++ pld [S, OFF]
++ stmdb D!, {DAT4, DAT5, DAT6, LAST}
++ stmdb D!, {DAT0, DAT1, DAT2, DAT3}
++ .else
++ ldmia S!, {DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, LAST}
++ pld [S, OFF]
++ stmia D!, {DAT0, DAT1, DAT2, DAT3}
++ stmia D!, {DAT4, DAT5, DAT6, LAST}
++ .endif
++ .else
++ unaligned_words backwards, align, 1, 8, DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, DAT7, LAST
++ .endif
++ subs N, N, #32
++ bhs 110b
++ /* Just before the final (prefetch_distance+1) 32-byte blocks, deal with final preloads */
++ preload_trailing backwards, S, N, OFF
++ add N, N, #(prefetch_distance+2)*32 - 32
++120:
++ .if align == 0
++ .if backwards
++ ldmdb S!, {DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, LAST}
++ stmdb D!, {DAT4, DAT5, DAT6, LAST}
++ stmdb D!, {DAT0, DAT1, DAT2, DAT3}
++ .else
++ ldmia S!, {DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, LAST}
++ stmia D!, {DAT0, DAT1, DAT2, DAT3}
++ stmia D!, {DAT4, DAT5, DAT6, LAST}
++ .endif
++ .else
++ unaligned_words backwards, align, 0, 8, DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, DAT7, LAST
++ .endif
++ subs N, N, #32
++ bhs 120b
++ tst N, #16
++ .if align == 0
++ .if backwards
++ ldmnedb S!, {DAT0, DAT1, DAT2, LAST}
++ stmnedb D!, {DAT0, DAT1, DAT2, LAST}
++ .else
++ ldmneia S!, {DAT0, DAT1, DAT2, LAST}
++ stmneia D!, {DAT0, DAT1, DAT2, LAST}
++ .endif
++ .else
++ beq 130f
++ unaligned_words backwards, align, 0, 4, DAT0, DAT1, DAT2, DAT3, LAST
++130:
++ .endif
++ /* Trailing words and bytes */
++ tst N, #15
++ beq 199f
++ .if align != 0
++ add S, S, #align
++ .endif
++ memcpy_trailing_15bytes backwards, align
++199:
++ pop {DAT3, DAT4, DAT5, DAT6, DAT7}
++ pop {D, DAT1, DAT2, pc}
++.endm
++
++.macro memcpy_medium_inner_loop backwards, align
++120:
++ .if backwards
++ .if align == 0
++ ldmdb S!, {DAT0, DAT1, DAT2, LAST}
++ .else
++ ldr LAST, [S, #-4]!
++ ldr DAT2, [S, #-4]!
++ ldr DAT1, [S, #-4]!
++ ldr DAT0, [S, #-4]!
++ .endif
++ stmdb D!, {DAT0, DAT1, DAT2, LAST}
++ .else
++ .if align == 0
++ ldmia S!, {DAT0, DAT1, DAT2, LAST}
++ .else
++ ldr DAT0, [S], #4
++ ldr DAT1, [S], #4
++ ldr DAT2, [S], #4
++ ldr LAST, [S], #4
++ .endif
++ stmia D!, {DAT0, DAT1, DAT2, LAST}
++ .endif
++ subs N, N, #16
++ bhs 120b
++ /* Trailing words and bytes */
++ tst N, #15
++ beq 199f
++ memcpy_trailing_15bytes backwards, align
++199:
++ pop {D, DAT1, DAT2, pc}
++.endm
++
++.macro memcpy_short_inner_loop backwards, align
++ tst N, #16
++ .if backwards
++ .if align == 0
++ ldmnedb S!, {DAT0, DAT1, DAT2, LAST}
++ .else
++ ldrne LAST, [S, #-4]!
++ ldrne DAT2, [S, #-4]!
++ ldrne DAT1, [S, #-4]!
++ ldrne DAT0, [S, #-4]!
++ .endif
++ stmnedb D!, {DAT0, DAT1, DAT2, LAST}
++ .else
++ .if align == 0
++ ldmneia S!, {DAT0, DAT1, DAT2, LAST}
++ .else
++ ldrne DAT0, [S], #4
++ ldrne DAT1, [S], #4
++ ldrne DAT2, [S], #4
++ ldrne LAST, [S], #4
++ .endif
++ stmneia D!, {DAT0, DAT1, DAT2, LAST}
++ .endif
++ memcpy_trailing_15bytes backwards, align
++199:
++ pop {D, DAT1, DAT2, pc}
++.endm
++
++.macro memcpy backwards
++ D .req a1
++ S .req a2
++ N .req a3
++ DAT0 .req a4
++ DAT1 .req v1
++ DAT2 .req v2
++ DAT3 .req v3
++ DAT4 .req v4
++ DAT5 .req v5
++ DAT6 .req v6
++ DAT7 .req sl
++ LAST .req ip
++ OFF .req lr
++
++ .cfi_startproc
++
++ push {D, DAT1, DAT2, lr}
++
++ .cfi_def_cfa_offset 16
++ .cfi_rel_offset D, 0
++ .cfi_undefined S
++ .cfi_undefined N
++ .cfi_undefined DAT0
++ .cfi_rel_offset DAT1, 4
++ .cfi_rel_offset DAT2, 8
++ .cfi_undefined LAST
++ .cfi_rel_offset lr, 12
++
++ .if backwards
++ add D, D, N
++ add S, S, N
++ .endif
++
++ /* See if we're guaranteed to have at least one 16-byte aligned 16-byte write */
++ cmp N, #31
++ blo 170f
++ /* To preload ahead as we go, we need at least (prefetch_distance+2) 32-byte blocks */
++ cmp N, #(prefetch_distance+3)*32 - 1
++ blo 160f
++
++ /* Long case */
++ push {DAT3, DAT4, DAT5, DAT6, DAT7}
++
++ .cfi_def_cfa_offset 36
++ .cfi_rel_offset D, 20
++ .cfi_rel_offset DAT1, 24
++ .cfi_rel_offset DAT2, 28
++ .cfi_rel_offset DAT3, 0
++ .cfi_rel_offset DAT4, 4
++ .cfi_rel_offset DAT5, 8
++ .cfi_rel_offset DAT6, 12
++ .cfi_rel_offset DAT7, 16
++ .cfi_rel_offset lr, 32
++
++ /* Adjust N so that the decrement instruction can also test for
++ * inner loop termination. We want it to stop when there are
++ * (prefetch_distance+1) complete blocks to go. */
++ sub N, N, #(prefetch_distance+2)*32
++ preload_leading_step1 backwards, DAT0, S
++ .if backwards
++ /* Bug in GAS: it accepts, but mis-assembles the instruction
++ * ands DAT2, D, #60, 2
++ * which sets DAT2 to the number of leading bytes until destination is aligned and also clears C (sets borrow)
++ */
++ .word 0xE210513C
++ beq 154f
++ .else
++ ands DAT2, D, #15
++ beq 154f
++ rsb DAT2, DAT2, #16 /* number of leading bytes until destination aligned */
++ .endif
++ preload_leading_step2 backwards, DAT0, S, DAT2, OFF
++ memcpy_leading_15bytes backwards, 1
++154: /* Destination now 16-byte aligned; we have at least one prefetch as well as at least one 16-byte output block */
++ /* Prefetch offset is best selected such that it lies in the first 8 of each 32 bytes - but it's just as easy to aim for the first one */
++ .if backwards
++ rsb OFF, S, #3
++ and OFF, OFF, #28
++ sub OFF, OFF, #32*(prefetch_distance+1)
++ .else
++ and OFF, S, #28
++ rsb OFF, OFF, #32*prefetch_distance
++ .endif
++ movs DAT0, S, lsl #31
++ bhi 157f
++ bcs 156f
++ bmi 155f
++ memcpy_long_inner_loop backwards, 0
++155: memcpy_long_inner_loop backwards, 1
++156: memcpy_long_inner_loop backwards, 2
++157: memcpy_long_inner_loop backwards, 3
++
++ .cfi_def_cfa_offset 16
++ .cfi_rel_offset D, 0
++ .cfi_rel_offset DAT1, 4
++ .cfi_rel_offset DAT2, 8
++ .cfi_same_value DAT3
++ .cfi_same_value DAT4
++ .cfi_same_value DAT5
++ .cfi_same_value DAT6
++ .cfi_same_value DAT7
++ .cfi_rel_offset lr, 12
++
++160: /* Medium case */
++ preload_all backwards, 0, 0, S, N, DAT2, OFF
++ sub N, N, #16 /* simplifies inner loop termination */
++ .if backwards
++ ands DAT2, D, #15
++ beq 164f
++ .else
++ ands DAT2, D, #15
++ beq 164f
++ rsb DAT2, DAT2, #16
++ .endif
++ memcpy_leading_15bytes backwards, align
++164: /* Destination now 16-byte aligned; we have at least one 16-byte output block */
++ tst S, #3
++ bne 140f
++ memcpy_medium_inner_loop backwards, 0
++140: memcpy_medium_inner_loop backwards, 1
++
++170: /* Short case, less than 31 bytes, so no guarantee of at least one 16-byte block */
++ teq N, #0
++ beq 199f
++ preload_all backwards, 1, 0, S, N, DAT2, LAST
++ tst D, #3
++ beq 174f
++172: subs N, N, #1
++ blo 199f
++ .if backwards
++ ldrb DAT0, [S, #-1]!
++ strb DAT0, [D, #-1]!
++ .else
++ ldrb DAT0, [S], #1
++ strb DAT0, [D], #1
++ .endif
++ tst D, #3
++ bne 172b
++174: /* Destination now 4-byte aligned; we have 0 or more output bytes to go */
++ tst S, #3
++ bne 140f
++ memcpy_short_inner_loop backwards, 0
++140: memcpy_short_inner_loop backwards, 1
++
++ .cfi_endproc
++
++ .unreq D
++ .unreq S
++ .unreq N
++ .unreq DAT0
++ .unreq DAT1
++ .unreq DAT2
++ .unreq DAT3
++ .unreq DAT4
++ .unreq DAT5
++ .unreq DAT6
++ .unreq DAT7
++ .unreq LAST
++ .unreq OFF
++.endm
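The unaligned_words macro above is the heart of the misaligned case: the source is read with aligned word loads only, and every output word is stitched together from two neighbouring source words with an LSR/LSL pair and an ORR. The same scheme in standalone C (a sketch: little-endian assumed, as on the BCM2708, and align restricted to 1..3 as in the macro; the memcpy() calls compile to single word loads):

    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>

    /* Forward copy of nwords 32-bit words from a source lying `align`
     * bytes past a word boundary: each output word is assembled from
     * two adjacent source words, mirroring the mov-lsr/orr-lsl pairs
     * of unaligned_words. */
    static void copy_unaligned(uint32_t *dst, const uint8_t *src,
                               unsigned nwords, unsigned align /* 1..3 */)
    {
            const uint8_t *s = src - align;  /* word-aligned by contract */
            uint32_t cur, next;

            memcpy(&cur, s, 4);
            s += 4;
            while (nwords--) {
                    memcpy(&next, s, 4);
                    s += 4;
                    *dst++ = (cur >> (align * 8)) | (next << (32 - align * 8));
                    cur = next;
            }
    }

    int main(void)
    {
            uint8_t buf[12] = { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55,
                                0x66, 0x77, 0x88, 0x99, 0x00, 0x00 };
            uint32_t out[2];

            copy_unaligned(out, buf + 1, 2, 1);
            printf("%08x %08x\n", out[0], out[1]); /* 44332211 88776655 */
            return 0;
    }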
+--- /dev/null
++++ b/arch/arm/lib/memmove_rpi.S
+@@ -0,0 +1,61 @@
++/*
++Copyright (c) 2013, Raspberry Pi Foundation
++Copyright (c) 2013, RISC OS Open Ltd
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in the
++ documentation and/or other materials provided with the distribution.
++ * Neither the name of the copyright holder nor the
++ names of its contributors may be used to endorse or promote products
++ derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++*/
++
++#include <linux/linkage.h>
++#include "arm-mem.h"
++#include "memcpymove.h"
++
++/* Prevent the stack from becoming executable */
++#if defined(__linux__) && defined(__ELF__)
++.section .note.GNU-stack,"",%progbits
++#endif
++
++ .text
++ .arch armv6
++ .object_arch armv4
++ .arm
++ .altmacro
++ .p2align 2
++
++/*
++ * void *memmove(void *s1, const void *s2, size_t n);
++ * On entry:
++ * a1 = pointer to destination
++ * a2 = pointer to source
++ * a3 = number of bytes to copy
++ * On exit:
++ * a1 preserved
++ */
++
++.set prefetch_distance, 3
++
++ENTRY(memmove)
++ cmp a2, a1
++ bpl memcpy /* pl works even over -1 - 0 and 0x7fffffff - 0x80000000 boundaries */
++ memcpy 1
++ENDPROC(memmove)
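memmove therefore costs a single compare on the non-overlapping path: only when the destination starts above the source does it fall through into the backwards copier (`memcpy 1`), and the `pl` condition keeps the comparison valid across the signed-wraparound cases noted in the comment. A plain C rendering of the dispatch (a sketch only: the byte-wise loop stands in for the optimised backwards path, and the raw pointer comparison is meaningful only within one object):

    #include <stdio.h>
    #include <string.h>

    /* Byte-wise backwards copy, standing in for `memcpy 1`. */
    static void *copy_backwards(void *dst, const void *src, size_t n)
    {
            char *d = (char *)dst + n;
            const char *s = (const char *)src + n;

            while (n--)
                    *--d = *--s;
            return dst;
    }

    /* The dispatch memmove_rpi.S performs: a forward memcpy is safe
     * unless the destination starts above the source. */
    void *my_memmove(void *dst, const void *src, size_t n)
    {
            if ((const char *)src >= (const char *)dst)
                    return memcpy(dst, src, n);
            return copy_backwards(dst, src, n);
    }

    int main(void)
    {
            char buf[] = "abcdef";

            my_memmove(buf + 1, buf, 5);    /* overlapping, dst > src */
            puts(buf);                      /* prints "aabcde" */
            return 0;
    }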
+--- /dev/null
++++ b/arch/arm/lib/memset_rpi.S
+@@ -0,0 +1,128 @@
++/*
++Copyright (c) 2013, Raspberry Pi Foundation
++Copyright (c) 2013, RISC OS Open Ltd
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in the
++ documentation and/or other materials provided with the distribution.
++ * Neither the name of the copyright holder nor the
++ names of its contributors may be used to endorse or promote products
++ derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++*/
++
++#include <linux/linkage.h>
++#include "arm-mem.h"
++
++/* Prevent the stack from becoming executable */
++#if defined(__linux__) && defined(__ELF__)
++.section .note.GNU-stack,"",%progbits
++#endif
++
++ .text
++ .arch armv6
++ .object_arch armv4
++ .arm
++ .altmacro
++ .p2align 2
++
++/*
++ * void *memset(void *s, int c, size_t n);
++ * On entry:
++ * a1 = pointer to buffer to fill
++ * a2 = byte pattern to fill with (caller-narrowed)
++ * a3 = number of bytes to fill
++ * On exit:
++ * a1 preserved
++ */
++ENTRY(mmioset)
++ENTRY(memset)
++ENTRY(__memset32)
++ENTRY(__memset64)
++
++ S .req a1
++ DAT0 .req a2
++ N .req a3
++ DAT1 .req a4
++ DAT2 .req ip
++ DAT3 .req lr
++
++ orr DAT0, DAT0, DAT0, lsl #8
++ push {S, lr}
++ orr DAT0, DAT0, DAT0, lsl #16
++ mov DAT1, DAT0
++
++ /* See if we're guaranteed to have at least one 16-byte aligned 16-byte write */
++ cmp N, #31
++ blo 170f
++
++161: sub N, N, #16 /* simplifies inner loop termination */
++ /* Leading words and bytes */
++ tst S, #15
++ beq 164f
++ rsb DAT3, S, #0 /* bits 0-3 = number of leading bytes until aligned */
++ movs DAT2, DAT3, lsl #31
++ submi N, N, #1
++ strmib DAT0, [S], #1
++ subcs N, N, #2
++ strcsh DAT0, [S], #2
++ movs DAT2, DAT3, lsl #29
++ submi N, N, #4
++ strmi DAT0, [S], #4
++ subcs N, N, #8
++ stmcsia S!, {DAT0, DAT1}
++164: /* Delayed set up of DAT2 and DAT3 so we could use them as scratch registers above */
++ mov DAT2, DAT0
++ mov DAT3, DAT0
++ /* Now the inner loop of 16-byte stores */
++165: stmia S!, {DAT0, DAT1, DAT2, DAT3}
++ subs N, N, #16
++ bhs 165b
++166: /* Trailing words and bytes */
++ movs N, N, lsl #29
++ stmcsia S!, {DAT0, DAT1}
++ strmi DAT0, [S], #4
++ movs N, N, lsl #2
++ strcsh DAT0, [S], #2
++ strmib DAT0, [S]
++199: pop {S, pc}
++
++170: /* Short case */
++ mov DAT2, DAT0
++ mov DAT3, DAT0
++ tst S, #3
++ beq 174f
++172: subs N, N, #1
++ blo 199b
++ strb DAT0, [S], #1
++ tst S, #3
++ bne 172b
++174: tst N, #16
++ stmneia S!, {DAT0, DAT1, DAT2, DAT3}
++ b 166b
++
++ .unreq S
++ .unreq DAT0
++ .unreq N
++ .unreq DAT1
++ .unreq DAT2
++ .unreq DAT3
++ENDPROC(__memset64)
++ENDPROC(__memset32)
++ENDPROC(memset)
++ENDPROC(mmioset)
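The two ORRs at the top of memset fan the fill byte out across a full register before the loop starts, so the STM inner loop can store 16 identical bytes per iteration. The same splat in C:

    #include <stdio.h>
    #include <stdint.h>

    /* Replicate the fill byte across a word, exactly as the two ORRs in
     * memset_rpi.S do, so each 32-bit store sets four bytes at once. */
    static uint32_t splat(uint8_t c)
    {
            uint32_t v = c;

            v |= v << 8;    /* orr DAT0, DAT0, DAT0, lsl #8  */
            v |= v << 16;   /* orr DAT0, DAT0, DAT0, lsl #16 */
            return v;
    }

    int main(void)
    {
            printf("%08x\n", splat(0xab));  /* prints "abababab" */
            return 0;
    }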
+--- a/arch/arm/lib/uaccess_with_memcpy.c
++++ b/arch/arm/lib/uaccess_with_memcpy.c
+@@ -22,6 +22,14 @@
+ #include <asm/current.h>
+ #include <asm/page.h>
+
++#ifndef COPY_FROM_USER_THRESHOLD
++#define COPY_FROM_USER_THRESHOLD 64
++#endif
++
++#ifndef COPY_TO_USER_THRESHOLD
++#define COPY_TO_USER_THRESHOLD 64
++#endif
++
+ static int
+ pin_page_for_write(const void __user *_addr, pte_t **ptep, spinlock_t **ptlp)
+ {
+@@ -84,7 +92,44 @@ pin_page_for_write(const void __user *_a
+ return 1;
+ }
+
+-static unsigned long noinline
++static int
++pin_page_for_read(const void __user *_addr, pte_t **ptep, spinlock_t **ptlp)
++{
++ unsigned long addr = (unsigned long)_addr;
++ pgd_t *pgd;
++ pmd_t *pmd;
++ pte_t *pte;
++ pud_t *pud;
++ spinlock_t *ptl;
++
++ pgd = pgd_offset(current->mm, addr);
++ if (unlikely(pgd_none(*pgd) || pgd_bad(*pgd)))
++ {
++ return 0;
++ }
++ pud = pud_offset(pgd, addr);
++ if (unlikely(pud_none(*pud) || pud_bad(*pud)))
++ {
++ return 0;
++ }
++
++ pmd = pmd_offset(pud, addr);
++ if (unlikely(pmd_none(*pmd) || pmd_bad(*pmd)))
++ return 0;
++
++ pte = pte_offset_map_lock(current->mm, pmd, addr, &ptl);
++ if (unlikely(!pte_present(*pte) || !pte_young(*pte))) {
++ pte_unmap_unlock(pte, ptl);
++ return 0;
++ }
++
++ *ptep = pte;
++ *ptlp = ptl;
++
++ return 1;
++}
++
++unsigned long noinline
+ __copy_to_user_memcpy(void __user *to, const void *from, unsigned long n)
+ {
+ unsigned long ua_flags;
+@@ -137,6 +182,57 @@ out:
+ return n;
+ }
+
++unsigned long noinline
++__copy_from_user_memcpy(void *to, const void __user *from, unsigned long n)
++{
++ unsigned long ua_flags;
++ int atomic;
++
++ if (unlikely(segment_eq(get_fs(), KERNEL_DS))) {
++ memcpy(to, (const void *)from, n);
++ return 0;
++ }
++
++ /* the mmap semaphore is taken only if not in an atomic context */
++ atomic = in_atomic();
++
++ if (!atomic)
++ down_read(&current->mm->mmap_sem);
++ while (n) {
++ pte_t *pte;
++ spinlock_t *ptl;
++ int tocopy;
++
++ while (!pin_page_for_read(from, &pte, &ptl)) {
++ char temp;
++ if (!atomic)
++ up_read(&current->mm->mmap_sem);
++ if (__get_user(temp, (char __user *)from))
++ goto out;
++ if (!atomic)
++ down_read(&current->mm->mmap_sem);
++ }
++
++ tocopy = (~(unsigned long)from & ~PAGE_MASK) + 1;
++ if (tocopy > n)
++ tocopy = n;
++
++ ua_flags = uaccess_save_and_enable();
++ memcpy(to, (const void *)from, tocopy);
++ uaccess_restore(ua_flags);
++ to += tocopy;
++ from += tocopy;
++ n -= tocopy;
++
++ pte_unmap_unlock(pte, ptl);
++ }
++ if (!atomic)
++ up_read(&current->mm->mmap_sem);
++
++out:
++ return n;
++}
++
+ unsigned long
+ arm_copy_to_user(void __user *to, const void *from, unsigned long n)
+ {
+@@ -147,7 +243,7 @@ arm_copy_to_user(void __user *to, const
+ * With frame pointer disabled, tail call optimization kicks in
+ * as well making this test almost invisible.
+ */
+- if (n < 64) {
++ if (n < COPY_TO_USER_THRESHOLD) {
+ unsigned long ua_flags = uaccess_save_and_enable();
+ n = __copy_to_user_std(to, from, n);
+ uaccess_restore(ua_flags);
+@@ -157,6 +253,26 @@ arm_copy_to_user(void __user *to, const
+ }
+ return n;
+ }
++
++unsigned long __must_check
++arm_copy_from_user(void *to, const void __user *from, unsigned long n)
++{
++ /*
++ * This test is stubbed out of the main function above to keep
++ * the overhead for small copies low by avoiding a large
++ * register dump on the stack just to reload them right away.
++ * With frame pointer disabled, tail call optimization kicks in
++ * as well making this test almost invisible.
++ */
++	if (n < COPY_FROM_USER_THRESHOLD) {
++ unsigned long ua_flags = uaccess_save_and_enable();
++ n = __copy_from_user_std(to, from, n);
++ uaccess_restore(ua_flags);
++ } else {
++ n = __copy_from_user_memcpy(to, from, n);
++ }
++ return n;
++}
+
+ static unsigned long noinline
+ __clear_user_memset(void __user *addr, unsigned long n)
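In __copy_from_user_memcpy above, the per-iteration chunk size `(~(unsigned long)from & ~PAGE_MASK) + 1` is the number of bytes from `from` to the end of its page, so each pin_page_for_read()/memcpy() round never crosses a page boundary. A quick standalone check of that expression (4 KiB pages; the address is a made-up example):

    #include <stdio.h>

    #define PAGE_SIZE 4096UL
    #define PAGE_MASK (~(PAGE_SIZE - 1))

    int main(void)
    {
            unsigned long from = 0x12345f00;  /* example user address */

            /* Same expression as __copy_from_user_memcpy: bytes left in
             * the current page, i.e. PAGE_SIZE - (from % PAGE_SIZE). */
            unsigned long tocopy = (~from & ~PAGE_MASK) + 1;

            printf("%lu\n", tocopy);          /* prints 256 */
            return 0;
    }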
+--- a/arch/arm/mach-bcm/Kconfig
++++ b/arch/arm/mach-bcm/Kconfig
+@@ -187,6 +187,13 @@ config ARCH_BCM_53573
+ The base chip is BCM53573 and there are some packaging modifications
+ like BCM47189 and BCM47452.
+
++config BCM2835_FAST_MEMCPY
++ bool "Enable optimized __copy_to_user and __copy_from_user"
++ depends on ARCH_BCM2835 && ARCH_MULTI_V6
++ default y
++ help
++ Optimized versions of __copy_to_user and __copy_from_user for Pi1.
++
+ config ARCH_BCM_63XX
+ bool "Broadcom BCM63xx DSL SoC"
+ depends on ARCH_MULTI_V7