From a1383655cfaa71609d6236ae0fcf3b6047462b98 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Fern=C3=A1ndez=20Rojas?= <noltari@gmail.com>
Date: Sat, 29 Feb 2020 09:25:20 +0100
Subject: bcm27xx: add linux 5.4 support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Tested on bcm2710 (Raspberry Pi 3B).

Signed-off-by: Álvaro Fernández Rojas <noltari@gmail.com>
---
 ...opy_to_user-and-__copy_from_user-performa.patch | 1549 ++++++++++++++++++++
 1 file changed, 1549 insertions(+)
 create mode 100644 target/linux/bcm27xx/patches-5.4/950-0056-Improve-__copy_to_user-and-__copy_from_user-performa.patch

diff --git a/target/linux/bcm27xx/patches-5.4/950-0056-Improve-__copy_to_user-and-__copy_from_user-performa.patch b/target/linux/bcm27xx/patches-5.4/950-0056-Improve-__copy_to_user-and-__copy_from_user-performa.patch
new file mode 100644
index 0000000000..9b0257d3a4
--- /dev/null
+++ b/target/linux/bcm27xx/patches-5.4/950-0056-Improve-__copy_to_user-and-__copy_from_user-performa.patch
@@ -0,0 +1,1549 @@
+From 857117cae13c214c709931c5f84e67249c7a3c81 Mon Sep 17 00:00:00 2001
+From: popcornmix <popcornmix@gmail.com>
+Date: Mon, 28 Nov 2016 16:50:04 +0000
+Subject: [PATCH] Improve __copy_to_user and __copy_from_user
+ performance
+
+Provide a __copy_from_user that uses memcpy. On BCM2708, use
+optimised memcpy/memmove/memcmp/memset implementations.
+
+arch/arm: Add mmiocpy/set aliases for memcpy/set
+
+See: https://github.com/raspberrypi/linux/issues/1082
+
+copy_from_user: CPU_SW_DOMAIN_PAN compatibility
+
+The downstream copy_from_user acceleration must also play nice with
+CONFIG_CPU_SW_DOMAIN_PAN.
+
+See: https://github.com/raspberrypi/linux/issues/1381
+
+Signed-off-by: Phil Elwell <phil@raspberrypi.org>
+---
+ arch/arm/include/asm/string.h      |   5 +
+ arch/arm/include/asm/uaccess.h     |   3 +
+ arch/arm/lib/Makefile              |  14 +-
+ arch/arm/lib/arm-mem.h             | 159 +++++
+ arch/arm/lib/copy_from_user.S      |   4 +-
+ arch/arm/lib/exports_rpi.c         |  37 +++
+ arch/arm/lib/memcmp_rpi.S          | 285 ++++++++++
+ arch/arm/lib/memcpy_rpi.S          |  61 ++++
+ arch/arm/lib/memcpymove.h          | 506 +++++++++++++++++++++++++++++
+ arch/arm/lib/memmove_rpi.S         |  61 ++++
+ arch/arm/lib/memset_rpi.S          | 128 ++++++++
+ arch/arm/lib/uaccess_with_memcpy.c | 120 ++++++-
+ arch/arm/mach-bcm/Kconfig          |   7 +
+ 13 files changed, 1385 insertions(+), 5 deletions(-)
+ create mode 100644 arch/arm/lib/arm-mem.h
+ create mode 100644 arch/arm/lib/exports_rpi.c
+ create mode 100644 arch/arm/lib/memcmp_rpi.S
+ create mode 100644 arch/arm/lib/memcpy_rpi.S
+ create mode 100644 arch/arm/lib/memcpymove.h
+ create mode 100644 arch/arm/lib/memmove_rpi.S
+ create mode 100644 arch/arm/lib/memset_rpi.S
+
+--- a/arch/arm/include/asm/string.h
++++ b/arch/arm/include/asm/string.h
+@@ -39,4 +39,9 @@ static inline void *memset64(uint64_t *p
+ 	return __memset64(p, v, n * 8, v >> 32);
+ }
+ 
++#ifdef CONFIG_BCM2835_FAST_MEMCPY
++#define __HAVE_ARCH_MEMCMP
++extern int memcmp(const void *, const void *, size_t);
++#endif
++
+ #endif
+--- a/arch/arm/include/asm/uaccess.h
++++ b/arch/arm/include/asm/uaccess.h
+@@ -512,6 +512,9 @@ do {									\
+ extern unsigned long __must_check
+ arm_copy_from_user(void *to, const void __user *from, unsigned long n);
+ 
++extern unsigned long __must_check
++__copy_from_user_std(void *to, const void __user *from, unsigned long n);
++
+ static inline unsigned long __must_check
+ raw_copy_from_user(void *to, const void __user *from, unsigned long n)
+ {
+--- a/arch/arm/lib/Makefile
++++ b/arch/arm/lib/Makefile
+@@ -7,8 +7,8 @@
+ 
+ lib-y		:= changebit.o csumipv6.o csumpartial.o   \
+ 		   csumpartialcopy.o csumpartialcopyuser.o clearbit.o \
+-		   delay.o delay-loop.o findbit.o memchr.o memcpy.o \
+-		   memmove.o memset.o setbit.o \
++		   delay.o delay-loop.o findbit.o memchr.o \
++		   setbit.o \
+ 		   strchr.o strrchr.o \
+ 		   testchangebit.o testclearbit.o testsetbit.o \
+ 		   ashldi3.o ashrdi3.o lshrdi3.o muldi3.o \
+@@ -25,6 +25,16 @@ else
+ lib-y	+= backtrace.o
+ endif
+ 
++# Choose optimised implementations for Raspberry Pi
++ifeq ($(CONFIG_BCM2835_FAST_MEMCPY),y)
++  CFLAGS_uaccess_with_memcpy.o += -DCOPY_FROM_USER_THRESHOLD=1600
++  CFLAGS_uaccess_with_memcpy.o += -DCOPY_TO_USER_THRESHOLD=672
++  obj-$(CONFIG_MODULES) += exports_rpi.o
++  lib-y += memcpy_rpi.o memmove_rpi.o memset_rpi.o memcmp_rpi.o
++else
++  lib-y += memcpy.o memmove.o memset.o
++endif
++
+ # using lib_ here won't override already available weak symbols
+ obj-$(CONFIG_UACCESS_WITH_MEMCPY) += uaccess_with_memcpy.o
+ 
+--- /dev/null
++++ b/arch/arm/lib/arm-mem.h
+@@ -0,0 +1,159 @@
++/*
++Copyright (c) 2013, Raspberry Pi Foundation
++Copyright (c) 2013, RISC OS Open Ltd
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++    * Redistributions of source code must retain the above copyright
++      notice, this list of conditions and the following disclaimer.
++    * Redistributions in binary form must reproduce the above copyright
++      notice, this list of conditions and the following disclaimer in the
++      documentation and/or other materials provided with the distribution.
++    * Neither the name of the copyright holder nor the
++      names of its contributors may be used to endorse or promote products
++      derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++*/
++
++.macro myfunc fname
++ .func fname
++ .global fname
++fname:
++.endm
++
++.macro preload_leading_step1  backwards, ptr, base
++/* If the destination is already 16-byte aligned, then we need to preload
++ * between 0 and prefetch_distance (inclusive) cache lines ahead so there
++ * are no gaps when the inner loop starts.
++ */
++ .if backwards
++        sub     ptr, base, #1
++        bic     ptr, ptr, #31
++ .else
++        bic     ptr, base, #31
++ .endif
++ .set OFFSET, 0
++ .rept prefetch_distance+1
++        pld     [ptr, #OFFSET]
++  .if backwards
++   .set OFFSET, OFFSET-32
++  .else
++   .set OFFSET, OFFSET+32
++  .endif
++ .endr
++.endm
++
++.macro preload_leading_step2  backwards, ptr, base, leading_bytes, tmp
++/* However, if the destination is not 16-byte aligned, we may need to
++ * preload one more cache line than that. The question we need to ask is:
++ * are the leading bytes more than the amount by which the source
++ * pointer will be rounded down for preloading, and if so, by how many
++ * cache lines?
++ */
++ .if backwards
++/* Here we compare against how many bytes we are into the
++ * cache line, counting down from the highest such address.
++ * Effectively, we want to calculate
++ *     leading_bytes = dst&15
++ *     cacheline_offset = 31-((src-leading_bytes-1)&31)
++ *     extra_needed = leading_bytes - cacheline_offset
++ * and test if extra_needed is <= 0, or rearranging:
++ *     leading_bytes + (src-leading_bytes-1)&31 <= 31
++ */
++        mov     tmp, base, lsl #32-5
++        sbc     tmp, tmp, leading_bytes, lsl #32-5
++        adds    tmp, tmp, leading_bytes, lsl #32-5
++        bcc     61f
++        pld     [ptr, #-32*(prefetch_distance+1)]
++ .else
++/* Effectively, we want to calculate
++ *     leading_bytes = (-dst)&15
++ *     cacheline_offset = (src+leading_bytes)&31
++ *     extra_needed = leading_bytes - cacheline_offset
++ * and test if extra_needed is <= 0.
++ */
++        mov     tmp, base, lsl #32-5
++        add     tmp, tmp, leading_bytes, lsl #32-5
++        rsbs    tmp, tmp, leading_bytes, lsl #32-5
++        bls     61f
++        pld     [ptr, #32*(prefetch_distance+1)]
++ .endif
++61:
++.endm
++
++.macro preload_trailing  backwards, base, remain, tmp
++        /* We need either 0, 1 or 2 extra preloads */
++ .if backwards
++        rsb     tmp, base, #0
++        mov     tmp, tmp, lsl #32-5
++ .else
++        mov     tmp, base, lsl #32-5
++ .endif
++        adds    tmp, tmp, remain, lsl #32-5
++        adceqs  tmp, tmp, #0
++        /* The instruction above has two effects: ensures Z is only
++         * set if C was clear (so Z indicates that both shifted quantities
++         * were 0), and clears C if Z was set (so C indicates that the sum
++         * of the shifted quantities was greater and not equal to 32) */
++        beq     82f
++ .if backwards
++        sub     tmp, base, #1
++        bic     tmp, tmp, #31
++ .else
++        bic     tmp, base, #31
++ .endif
++        bcc     81f
++ .if backwards
++        pld     [tmp, #-32*(prefetch_distance+1)]
++81:
++        pld     [tmp, #-32*prefetch_distance]
++ .else
++        pld     [tmp, #32*(prefetch_distance+2)]
++81:
++        pld     [tmp, #32*(prefetch_distance+1)]
++ .endif
++82:
++.endm
++
++.macro preload_all backwards, narrow_case, shift, base, remain, tmp0, tmp1
++ .if backwards
++        sub     tmp0, base, #1
++        bic     tmp0, tmp0, #31
++        pld     [tmp0]
++        sub     tmp1, base, remain, lsl #shift
++ .else
++        bic     tmp0, base, #31
++        pld     [tmp0]
++        add     tmp1, base, remain, lsl #shift
++        sub     tmp1, tmp1, #1
++ .endif
++        bic     tmp1, tmp1, #31
++        cmp     tmp1, tmp0
++        beq     92f
++ .if narrow_case
++        /* In this case, all the data fits in either 1 or 2 cache lines */
++        pld     [tmp1]
++ .else
++91:
++  .if backwards
++        sub     tmp0, tmp0, #32
++  .else
++        add     tmp0, tmp0, #32
++  .endif
++        cmp     tmp0, tmp1
++        pld     [tmp0]
++        bne     91b
++ .endif
++92:
++.endm
+--- a/arch/arm/lib/copy_from_user.S
++++ b/arch/arm/lib/copy_from_user.S
+@@ -107,7 +107,8 @@
+ 
+ 	.text
+ 
+-ENTRY(arm_copy_from_user)
++ENTRY(__copy_from_user_std)
++WEAK(arm_copy_from_user)
+ #ifdef CONFIG_CPU_SPECTRE
+ 	get_thread_info r3
+ 	ldr	r3, [r3, #TI_ADDR_LIMIT]
+@@ -117,6 +118,7 @@ ENTRY(arm_copy_from_user)
+ #include "copy_template.S"
+ 
+ ENDPROC(arm_copy_from_user)
++ENDPROC(__copy_from_user_std)
+ 
+ 	.pushsection .fixup,"ax"
+ 	.align 0
+--- /dev/null
++++ b/arch/arm/lib/exports_rpi.c
+@@ -0,0 +1,37 @@
++/**
++ * Copyright (c) 2014, Raspberry Pi (Trading) Ltd.
++ *
++ * Redistribution and use in source and binary forms, with or without
++ * modification, are permitted provided that the following conditions
++ * are met:
++ * 1. Redistributions of source code must retain the above copyright
++ *    notice, this list of conditions, and the following disclaimer,
++ *    without modification.
++ * 2. Redistributions in binary form must reproduce the above copyright
++ *    notice, this list of conditions and the following disclaimer in the
++ *    documentation and/or other materials provided with the distribution.
++ * 3. The names of the above-listed copyright holders may not be used
++ *    to endorse or promote products derived from this software without
++ *    specific prior written permission.
++ *
++ * ALTERNATIVELY, this software may be distributed under the terms of the
++ * GNU General Public License ("GPL") version 2, as published by the Free
++ * Software Foundation.
++ *
++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
++ * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
++ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
++ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
++ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
++ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
++ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
++ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++ */
++
++#include <linux/kernel.h>
++#include <linux/module.h>
++
++EXPORT_SYMBOL(memcmp);
+--- /dev/null
++++ b/arch/arm/lib/memcmp_rpi.S
+@@ -0,0 +1,285 @@
++/*
++Copyright (c) 2013, Raspberry Pi Foundation
++Copyright (c) 2013, RISC OS Open Ltd
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++    * Redistributions of source code must retain the above copyright
++      notice, this list of conditions and the following disclaimer.
++    * Redistributions in binary form must reproduce the above copyright
++      notice, this list of conditions and the following disclaimer in the
++      documentation and/or other materials provided with the distribution.
++    * Neither the name of the copyright holder nor the
++      names of its contributors may be used to endorse or promote products
++      derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++*/
++
++#include <linux/linkage.h>
++#include "arm-mem.h"
++
++/* Prevent the stack from becoming executable */
++#if defined(__linux__) && defined(__ELF__)
++.section .note.GNU-stack,"",%progbits
++#endif
++
++        .text
++        .arch armv6
++        .object_arch armv4
++        .arm
++        .altmacro
++        .p2align 2
++
++.macro memcmp_process_head  unaligned
++ .if unaligned
++        ldr     DAT0, [S_1], #4
++        ldr     DAT1, [S_1], #4
++        ldr     DAT2, [S_1], #4
++        ldr     DAT3, [S_1], #4
++ .else
++        ldmia   S_1!, {DAT0, DAT1, DAT2, DAT3}
++ .endif
++        ldmia   S_2!, {DAT4, DAT5, DAT6, DAT7}
++.endm
++
++.macro memcmp_process_tail
++        cmp     DAT0, DAT4
++        cmpeq   DAT1, DAT5
++        cmpeq   DAT2, DAT6
++        cmpeq   DAT3, DAT7
++        bne     200f
++.endm
++
++.macro memcmp_leading_31bytes
++        movs    DAT0, OFF, lsl #31
++        ldrmib  DAT0, [S_1], #1
++        ldrcsh  DAT1, [S_1], #2
++        ldrmib  DAT4, [S_2], #1
++        ldrcsh  DAT5, [S_2], #2
++        movpl   DAT0, #0
++        movcc   DAT1, #0
++        movpl   DAT4, #0
++        movcc   DAT5, #0
++        submi   N, N, #1
++        subcs   N, N, #2
++        cmp     DAT0, DAT4
++        cmpeq   DAT1, DAT5
++        bne     200f
++        movs    DAT0, OFF, lsl #29
++        ldrmi   DAT0, [S_1], #4
++        ldrcs   DAT1, [S_1], #4
++        ldrcs   DAT2, [S_1], #4
++        ldrmi   DAT4, [S_2], #4
++        ldmcsia S_2!, {DAT5, DAT6}
++        movpl   DAT0, #0
++        movcc   DAT1, #0
++        movcc   DAT2, #0
++        movpl   DAT4, #0
++        movcc   DAT5, #0
++        movcc   DAT6, #0
++        submi   N, N, #4
++        subcs   N, N, #8
++        cmp     DAT0, DAT4
++        cmpeq   DAT1, DAT5
++        cmpeq   DAT2, DAT6
++        bne     200f
++        tst     OFF, #16
++        beq     105f
++        memcmp_process_head  1
++        sub     N, N, #16
++        memcmp_process_tail
++105:
++.endm
++
++.macro memcmp_trailing_15bytes  unaligned
++        movs    N, N, lsl #29
++ .if unaligned
++        ldrcs   DAT0, [S_1], #4
++        ldrcs   DAT1, [S_1], #4
++ .else
++        ldmcsia S_1!, {DAT0, DAT1}
++ .endif
++        ldrmi   DAT2, [S_1], #4
++        ldmcsia S_2!, {DAT4, DAT5}
++        ldrmi   DAT6, [S_2], #4
++        movcc   DAT0, #0
++        movcc   DAT1, #0
++        movpl   DAT2, #0
++        movcc   DAT4, #0
++        movcc   DAT5, #0
++        movpl   DAT6, #0
++        cmp     DAT0, DAT4
++        cmpeq   DAT1, DAT5
++        cmpeq   DAT2, DAT6
++        bne     200f
++        movs    N, N, lsl #2
++        ldrcsh  DAT0, [S_1], #2
++        ldrmib  DAT1, [S_1]
++        ldrcsh  DAT4, [S_2], #2
++        ldrmib  DAT5, [S_2]
++        movcc   DAT0, #0
++        movpl   DAT1, #0
++        movcc   DAT4, #0
++        movpl   DAT5, #0
++        cmp     DAT0, DAT4
++        cmpeq   DAT1, DAT5
++        bne     200f
++.endm
++
++.macro memcmp_long_inner_loop  unaligned
++110:
++        memcmp_process_head  unaligned
++        pld     [S_2, #prefetch_distance*32 + 16]
++        memcmp_process_tail
++        memcmp_process_head  unaligned
++        pld     [S_1, OFF]
++        memcmp_process_tail
++        subs    N, N, #32
++        bhs     110b
++        /* Just before the final (prefetch_distance+1) 32-byte blocks,
++         * deal with final preloads */
++        preload_trailing  0, S_1, N, DAT0
++        preload_trailing  0, S_2, N, DAT0
++        add     N, N, #(prefetch_distance+2)*32 - 16
++120:
++        memcmp_process_head  unaligned
++        memcmp_process_tail
++        subs    N, N, #16
++        bhs     120b
++        /* Trailing words and bytes */
++        tst     N, #15
++        beq     199f
++        memcmp_trailing_15bytes  unaligned
++199:    /* Reached end without detecting a difference */
++        mov     a1, #0
++        setend  le
++        pop     {DAT1-DAT6, pc}
++.endm
++
++.macro memcmp_short_inner_loop  unaligned
++        subs    N, N, #16     /* simplifies inner loop termination */
++        blo     122f
++120:
++        memcmp_process_head  unaligned
++        memcmp_process_tail
++        subs    N, N, #16
++        bhs     120b
++122:    /* Trailing words and bytes */
++        tst     N, #15
++        beq     199f
++        memcmp_trailing_15bytes  unaligned
++199:    /* Reached end without detecting a difference */
++        mov     a1, #0
++        setend  le
++        pop     {DAT1-DAT6, pc}
++.endm
++
++/*
++ * int memcmp(const void *s1, const void *s2, size_t n);
++ * On entry:
++ * a1 = pointer to buffer 1
++ * a2 = pointer to buffer 2
++ * a3 = number of bytes to compare (as unsigned chars)
++ * On exit:
++ * a1 = >0/=0/<0 if s1 >/=/< s2
++ */
++
++.set prefetch_distance, 2
++
++ENTRY(memcmp)
++        S_1     .req    a1
++        S_2     .req    a2
++        N       .req    a3
++        DAT0    .req    a4
++        DAT1    .req    v1
++        DAT2    .req    v2
++        DAT3    .req    v3
++        DAT4    .req    v4
++        DAT5    .req    v5
++        DAT6    .req    v6
++        DAT7    .req    ip
++        OFF     .req    lr
++
++        push    {DAT1-DAT6, lr}
++        setend  be /* lowest-addressed bytes are most significant */
++
++        /* To preload ahead as we go, we need at least (prefetch_distance+2) 32-byte blocks */
++        cmp     N, #(prefetch_distance+3)*32 - 1
++        blo     170f
++
++        /* Long case */
++        /* Adjust N so that the decrement instruction can also test for
++         * inner loop termination. We want it to stop when there are
++         * (prefetch_distance+1) complete blocks to go. */
++        sub     N, N, #(prefetch_distance+2)*32
++        preload_leading_step1  0, DAT0, S_1
++        preload_leading_step1  0, DAT1, S_2
++        tst     S_2, #31
++        beq     154f
++        rsb     OFF, S_2, #0 /* no need to AND with 15 here */
++        preload_leading_step2  0, DAT0, S_1, OFF, DAT2
++        preload_leading_step2  0, DAT1, S_2, OFF, DAT2
++        memcmp_leading_31bytes
++154:    /* Second source now cacheline (32-byte) aligned; we have at
++         * least one prefetch to go. */
++        /* Prefetch offset is best selected such that it lies in the
++         * first 8 of each 32 bytes - but it's just as easy to aim for
++         * the first one */
++        and     OFF, S_1, #31
++        rsb     OFF, OFF, #32*prefetch_distance
++        tst     S_1, #3
++        bne     140f
++        memcmp_long_inner_loop  0
++140:    memcmp_long_inner_loop  1
++
++170:    /* Short case */
++        teq     N, #0
++        beq     199f
++        preload_all 0, 0, 0, S_1, N, DAT0, DAT1
++        preload_all 0, 0, 0, S_2, N, DAT0, DAT1
++        tst     S_2, #3
++        beq     174f
++172:    subs    N, N, #1
++        blo     199f
++        ldrb    DAT0, [S_1], #1
++        ldrb    DAT4, [S_2], #1
++        cmp     DAT0, DAT4
++        bne     200f
++        tst     S_2, #3
++        bne     172b
++174:    /* Second source now 4-byte aligned; we have 0 or more bytes to go */
++        tst     S_1, #3
++        bne     140f
++        memcmp_short_inner_loop  0
++140:    memcmp_short_inner_loop  1
++
++200:    /* Difference found: determine sign. */
++        movhi   a1, #1
++        movlo   a1, #-1
++        setend  le
++        pop     {DAT1-DAT6, pc}
++
++        .unreq  S_1
++        .unreq  S_2
++        .unreq  N
++        .unreq  DAT0
++        .unreq  DAT1
++        .unreq  DAT2
++        .unreq  DAT3
++        .unreq  DAT4
++        .unreq  DAT5
++        .unreq  DAT6
++        .unreq  DAT7
++        .unreq  OFF
++ENDPROC(memcmp)
+--- /dev/null
++++ b/arch/arm/lib/memcpy_rpi.S
+@@ -0,0 +1,61 @@
++/*
++Copyright (c) 2013, Raspberry Pi Foundation
++Copyright (c) 2013, RISC OS Open Ltd
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++    * Redistributions of source code must retain the above copyright
++      notice, this list of conditions and the following disclaimer.
++    * Redistributions in binary form must reproduce the above copyright
++      notice, this list of conditions and the following disclaimer in the
++      documentation and/or other materials provided with the distribution.
++    * Neither the name of the copyright holder nor the
++      names of its contributors may be used to endorse or promote products
++      derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++*/
++
++#include <linux/linkage.h>
++#include "arm-mem.h"
++#include "memcpymove.h"
++
++/* Prevent the stack from becoming executable */
++#if defined(__linux__) && defined(__ELF__)
++.section .note.GNU-stack,"",%progbits
++#endif
++
++        .text
++        .arch armv6
++        .object_arch armv4
++        .arm
++        .altmacro
++        .p2align 2
++
++/*
++ * void *memcpy(void * restrict s1, const void * restrict s2, size_t n);
++ * On entry:
++ * a1 = pointer to destination
++ * a2 = pointer to source
++ * a3 = number of bytes to copy
++ * On exit:
++ * a1 preserved
++ */
++
++.set prefetch_distance, 3
++
++ENTRY(mmiocpy)
++ENTRY(memcpy)
++        memcpy  0
++ENDPROC(memcpy)
++ENDPROC(mmiocpy)
+--- /dev/null
++++ b/arch/arm/lib/memcpymove.h
+@@ -0,0 +1,506 @@
++/*
++Copyright (c) 2013, Raspberry Pi Foundation
++Copyright (c) 2013, RISC OS Open Ltd
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++    * Redistributions of source code must retain the above copyright
++      notice, this list of conditions and the following disclaimer.
++    * Redistributions in binary form must reproduce the above copyright
++      notice, this list of conditions and the following disclaimer in the
++      documentation and/or other materials provided with the distribution.
++    * Neither the name of the copyright holder nor the
++      names of its contributors may be used to endorse or promote products
++      derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++*/
++
++.macro unaligned_words  backwards, align, use_pld, words, r0, r1, r2, r3, r4, r5, r6, r7, r8
++ .if words == 1
++  .if backwards
++        mov     r1, r0, lsl #32-align*8
++        ldr     r0, [S, #-4]!
++        orr     r1, r1, r0, lsr #align*8
++        str     r1, [D, #-4]!
++  .else
++        mov     r0, r1, lsr #align*8
++        ldr     r1, [S, #4]!
++        orr     r0, r0, r1, lsl #32-align*8
++        str     r0, [D], #4
++  .endif
++ .elseif words == 2
++  .if backwards
++        ldr     r1, [S, #-4]!
++        mov     r2, r0, lsl #32-align*8
++        ldr     r0, [S, #-4]!
++        orr     r2, r2, r1, lsr #align*8
++        mov     r1, r1, lsl #32-align*8
++        orr     r1, r1, r0, lsr #align*8
++        stmdb   D!, {r1, r2}
++  .else
++        ldr     r1, [S, #4]!
++        mov     r0, r2, lsr #align*8
++        ldr     r2, [S, #4]!
++        orr     r0, r0, r1, lsl #32-align*8
++        mov     r1, r1, lsr #align*8
++        orr     r1, r1, r2, lsl #32-align*8
++        stmia   D!, {r0, r1}
++  .endif
++ .elseif words == 4
++  .if backwards
++        ldmdb   S!, {r2, r3}
++        mov     r4, r0, lsl #32-align*8
++        ldmdb   S!, {r0, r1}
++        orr     r4, r4, r3, lsr #align*8
++        mov     r3, r3, lsl #32-align*8
++        orr     r3, r3, r2, lsr #align*8
++        mov     r2, r2, lsl #32-align*8
++        orr     r2, r2, r1, lsr #align*8
++        mov     r1, r1, lsl #32-align*8
++        orr     r1, r1, r0, lsr #align*8
++        stmdb   D!, {r1, r2, r3, r4}
++  .else
++        ldmib   S!, {r1, r2}
++        mov     r0, r4, lsr #align*8
++        ldmib   S!, {r3, r4}
++        orr     r0, r0, r1, lsl #32-align*8
++        mov     r1, r1, lsr #align*8
++        orr     r1, r1, r2, lsl #32-align*8
++        mov     r2, r2, lsr #align*8
++        orr     r2, r2, r3, lsl #32-align*8
++        mov     r3, r3, lsr #align*8
++        orr     r3, r3, r4, lsl #32-align*8
++        stmia   D!, {r0, r1, r2, r3}
++  .endif
++ .elseif words == 8
++  .if backwards
++        ldmdb   S!, {r4, r5, r6, r7}
++        mov     r8, r0, lsl #32-align*8
++        ldmdb   S!, {r0, r1, r2, r3}
++   .if use_pld
++        pld     [S, OFF]
++   .endif
++        orr     r8, r8, r7, lsr #align*8
++        mov     r7, r7, lsl #32-align*8
++        orr     r7, r7, r6, lsr #align*8
++        mov     r6, r6, lsl #32-align*8
++        orr     r6, r6, r5, lsr #align*8
++        mov     r5, r5, lsl #32-align*8
++        orr     r5, r5, r4, lsr #align*8
++        mov     r4, r4, lsl #32-align*8
++        orr     r4, r4, r3, lsr #align*8
++        mov     r3, r3, lsl #32-align*8
++        orr     r3, r3, r2, lsr #align*8
++        mov     r2, r2, lsl #32-align*8
++        orr     r2, r2, r1, lsr #align*8
++        mov     r1, r1, lsl #32-align*8
++        orr     r1, r1, r0, lsr #align*8
++        stmdb   D!, {r5, r6, r7, r8}
++        stmdb   D!, {r1, r2, r3, r4}
++  .else
++        ldmib   S!, {r1, r2, r3, r4}
++        mov     r0, r8, lsr #align*8
++        ldmib   S!, {r5, r6, r7, r8}
++   .if use_pld
++        pld     [S, OFF]
++   .endif
++        orr     r0, r0, r1, lsl #32-align*8
++        mov     r1, r1, lsr #align*8
++        orr     r1, r1, r2, lsl #32-align*8
++        mov     r2, r2, lsr #align*8
++        orr     r2, r2, r3, lsl #32-align*8
++        mov     r3, r3, lsr #align*8
++        orr     r3, r3, r4, lsl #32-align*8
++        mov     r4, r4, lsr #align*8
++        orr     r4, r4, r5, lsl #32-align*8
++        mov     r5, r5, lsr #align*8
++        orr     r5, r5, r6, lsl #32-align*8
++        mov     r6, r6, lsr #align*8
++        orr     r6, r6, r7, lsl #32-align*8
++        mov     r7, r7, lsr #align*8
++        orr     r7, r7, r8, lsl #32-align*8
++        stmia   D!, {r0, r1, r2, r3}
++        stmia   D!, {r4, r5, r6, r7}
++  .endif
++ .endif
++.endm
++
++.macro memcpy_leading_15bytes  backwards, align
++        movs    DAT1, DAT2, lsl #31
++        sub     N, N, DAT2
++ .if backwards
++        ldrmib  DAT0, [S, #-1]!
++        ldrcsh  DAT1, [S, #-2]!
++        strmib  DAT0, [D, #-1]!
++        strcsh  DAT1, [D, #-2]!
++ .else
++        ldrmib  DAT0, [S], #1
++        ldrcsh  DAT1, [S], #2
++        strmib  DAT0, [D], #1
++        strcsh  DAT1, [D], #2
++ .endif
++        movs    DAT1, DAT2, lsl #29
++ .if backwards
++        ldrmi   DAT0, [S, #-4]!
++  .if align == 0
++        ldmcsdb S!, {DAT1, DAT2}
++  .else
++        ldrcs   DAT2, [S, #-4]!
++        ldrcs   DAT1, [S, #-4]!
++  .endif
++        strmi   DAT0, [D, #-4]!
++        stmcsdb D!, {DAT1, DAT2}
++ .else
++        ldrmi   DAT0, [S], #4
++  .if align == 0
++        ldmcsia S!, {DAT1, DAT2}
++  .else
++        ldrcs   DAT1, [S], #4
++        ldrcs   DAT2, [S], #4
++  .endif
++        strmi   DAT0, [D], #4
++        stmcsia D!, {DAT1, DAT2}
++ .endif
++.endm
++
++.macro memcpy_trailing_15bytes  backwards, align
++        movs    N, N, lsl #29
++ .if backwards
++  .if align == 0
++        ldmcsdb S!, {DAT0, DAT1}
++  .else
++        ldrcs   DAT1, [S, #-4]!
++        ldrcs   DAT0, [S, #-4]!
++  .endif
++        ldrmi   DAT2, [S, #-4]!
++        stmcsdb D!, {DAT0, DAT1}
++        strmi   DAT2, [D, #-4]!
++ .else
++  .if align == 0
++        ldmcsia S!, {DAT0, DAT1}
++  .else
++        ldrcs   DAT0, [S], #4
++        ldrcs   DAT1, [S], #4
++  .endif
++        ldrmi   DAT2, [S], #4
++        stmcsia D!, {DAT0, DAT1}
++        strmi   DAT2, [D], #4
++ .endif
++        movs    N, N, lsl #2
++ .if backwards
++        ldrcsh  DAT0, [S, #-2]!
++        ldrmib  DAT1, [S, #-1]
++        strcsh  DAT0, [D, #-2]!
++        strmib  DAT1, [D, #-1]
++ .else
++        ldrcsh  DAT0, [S], #2
++        ldrmib  DAT1, [S]
++        strcsh  DAT0, [D], #2
++        strmib  DAT1, [D]
++ .endif
++.endm
++
++.macro memcpy_long_inner_loop  backwards, align
++ .if align != 0
++  .if backwards
++        ldr     DAT0, [S, #-align]!
++  .else
++        ldr     LAST, [S, #-align]!
++  .endif
++ .endif
++110:
++ .if align == 0
++  .if backwards
++        ldmdb   S!, {DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, LAST}
++        pld     [S, OFF]
++        stmdb   D!, {DAT4, DAT5, DAT6, LAST}
++        stmdb   D!, {DAT0, DAT1, DAT2, DAT3}
++  .else
++        ldmia   S!, {DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, LAST}
++        pld     [S, OFF]
++        stmia   D!, {DAT0, DAT1, DAT2, DAT3}
++        stmia   D!, {DAT4, DAT5, DAT6, LAST}
++  .endif
++ .else
++        unaligned_words  backwards, align, 1, 8, DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, DAT7, LAST
++ .endif
++        subs    N, N, #32
++        bhs     110b
++        /* Just before the final (prefetch_distance+1) 32-byte blocks, deal with final preloads */
++        preload_trailing  backwards, S, N, OFF
++        add     N, N, #(prefetch_distance+2)*32 - 32
++120:
++ .if align == 0
++  .if backwards
++        ldmdb   S!, {DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, LAST}
++        stmdb   D!, {DAT4, DAT5, DAT6, LAST}
++        stmdb   D!, {DAT0, DAT1, DAT2, DAT3}
++  .else
++        ldmia   S!, {DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, LAST}
++        stmia   D!, {DAT0, DAT1, DAT2, DAT3}
++        stmia   D!, {DAT4, DAT5, DAT6, LAST}
++  .endif
++ .else
++        unaligned_words  backwards, align, 0, 8, DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, DAT7, LAST
++ .endif
++        subs    N, N, #32
++        bhs     120b
++        tst     N, #16
++ .if align == 0
++  .if backwards
++        ldmnedb S!, {DAT0, DAT1, DAT2, LAST}
++        stmnedb D!, {DAT0, DAT1, DAT2, LAST}
++  .else
++        ldmneia S!, {DAT0, DAT1, DAT2, LAST}
++        stmneia D!, {DAT0, DAT1, DAT2, LAST}
++  .endif
++ .else
++        beq     130f
++        unaligned_words  backwards, align, 0, 4, DAT0, DAT1, DAT2, DAT3, LAST
++130:
++ .endif
++        /* Trailing words and bytes */
++        tst     N, #15
++        beq     199f
++ .if align != 0
++        add     S, S, #align
++ .endif
++        memcpy_trailing_15bytes  backwards, align
++199:
++        pop     {DAT3, DAT4, DAT5, DAT6, DAT7}
++        pop     {D, DAT1, DAT2, pc}
++.endm
++
++.macro memcpy_medium_inner_loop  backwards, align
++120:
++ .if backwards
++  .if align == 0
++        ldmdb   S!, {DAT0, DAT1, DAT2, LAST}
++  .else
++        ldr     LAST, [S, #-4]!
++        ldr     DAT2, [S, #-4]!
++        ldr     DAT1, [S, #-4]!
++        ldr     DAT0, [S, #-4]!
++  .endif
++        stmdb   D!, {DAT0, DAT1, DAT2, LAST}
++ .else
++  .if align == 0
++        ldmia   S!, {DAT0, DAT1, DAT2, LAST}
++  .else
++        ldr     DAT0, [S], #4
++        ldr     DAT1, [S], #4
++        ldr     DAT2, [S], #4
++        ldr     LAST, [S], #4
++  .endif
++        stmia   D!, {DAT0, DAT1, DAT2, LAST}
++ .endif
++        subs    N, N, #16
++        bhs     120b
++        /* Trailing words and bytes */
++        tst     N, #15
++        beq     199f
++        memcpy_trailing_15bytes  backwards, align
++199:
++        pop     {D, DAT1, DAT2, pc}
++.endm
++
++.macro memcpy_short_inner_loop  backwards, align
++        tst     N, #16
++ .if backwards
++  .if align == 0
++        ldmnedb S!, {DAT0, DAT1, DAT2, LAST}
++  .else
++        ldrne   LAST, [S, #-4]!
++        ldrne   DAT2, [S, #-4]!
++        ldrne   DAT1, [S, #-4]!
++        ldrne   DAT0, [S, #-4]!
++  .endif
++        stmnedb D!, {DAT0, DAT1, DAT2, LAST}
++ .else
++  .if align == 0
++        ldmneia S!, {DAT0, DAT1, DAT2, LAST}
++  .else
++        ldrne   DAT0, [S], #4
++        ldrne   DAT1, [S], #4
++        ldrne   DAT2, [S], #4
++        ldrne   LAST, [S], #4
++  .endif
++        stmneia D!, {DAT0, DAT1, DAT2, LAST}
++ .endif
++        memcpy_trailing_15bytes  backwards, align
++199:
++        pop     {D, DAT1, DAT2, pc}
++.endm
++
++.macro memcpy  backwards
++        D       .req    a1
++        S       .req    a2
++        N       .req    a3
++        DAT0    .req    a4
++        DAT1    .req    v1
++        DAT2    .req    v2
++        DAT3    .req    v3
++        DAT4    .req    v4
++        DAT5    .req    v5
++        DAT6    .req    v6
++        DAT7    .req    sl
++        LAST    .req    ip
++        OFF     .req    lr
++
++        .cfi_startproc
++
++        push    {D, DAT1, DAT2, lr}
++
++        .cfi_def_cfa_offset 16
++        .cfi_rel_offset D, 0
++        .cfi_undefined  S
++        .cfi_undefined  N
++        .cfi_undefined  DAT0
++        .cfi_rel_offset DAT1, 4
++        .cfi_rel_offset DAT2, 8
++        .cfi_undefined  LAST
++        .cfi_rel_offset lr, 12
++
++ .if backwards
++        add     D, D, N
++        add     S, S, N
++ .endif
++
++        /* See if we're guaranteed to have at least one 16-byte aligned 16-byte write */
++        cmp     N, #31
++        blo     170f
++        /* To preload ahead as we go, we need at least (prefetch_distance+2) 32-byte blocks */
++        cmp     N, #(prefetch_distance+3)*32 - 1
++        blo     160f
++
++        /* Long case */
++        push    {DAT3, DAT4, DAT5, DAT6, DAT7}
++
++        .cfi_def_cfa_offset 36
++        .cfi_rel_offset D, 20
++        .cfi_rel_offset DAT1, 24
++        .cfi_rel_offset DAT2, 28
++        .cfi_rel_offset DAT3, 0
++        .cfi_rel_offset DAT4, 4
++        .cfi_rel_offset DAT5, 8
++        .cfi_rel_offset DAT6, 12
++        .cfi_rel_offset DAT7, 16
++        .cfi_rel_offset lr, 32
++
++        /* Adjust N so that the decrement instruction can also test for
++         * inner loop termination. We want it to stop when there are
++         * (prefetch_distance+1) complete blocks to go. */
++        sub     N, N, #(prefetch_distance+2)*32
++        preload_leading_step1  backwards, DAT0, S
++ .if backwards
++        /* Bug in GAS: it accepts, but mis-assembles the instruction
++         * ands    DAT2, D, #60, 2
++         * which sets DAT2 to the number of leading bytes until destination is aligned and also clears C (sets borrow)
++         */
++        .word   0xE210513C
++        beq     154f
++ .else
++        ands    DAT2, D, #15
++        beq     154f
++        rsb     DAT2, DAT2, #16 /* number of leading bytes until destination aligned */
++ .endif
++        preload_leading_step2  backwards, DAT0, S, DAT2, OFF
++        memcpy_leading_15bytes backwards, 1
++154:    /* Destination now 16-byte aligned; we have at least one prefetch as well as at least one 16-byte output block */
++        /* Prefetch offset is best selected such that it lies in the first 8 of each 32 bytes - but it's just as easy to aim for the first one */
++ .if backwards
++        rsb     OFF, S, #3
++        and     OFF, OFF, #28
++        sub     OFF, OFF, #32*(prefetch_distance+1)
++ .else
++        and     OFF, S, #28
++        rsb     OFF, OFF, #32*prefetch_distance
++ .endif
++        movs    DAT0, S, lsl #31
++        bhi     157f
++        bcs     156f
++        bmi     155f
++        memcpy_long_inner_loop  backwards, 0
++155:    memcpy_long_inner_loop  backwards, 1
++156:    memcpy_long_inner_loop  backwards, 2
++157:    memcpy_long_inner_loop  backwards, 3
++
++        .cfi_def_cfa_offset 16
++        .cfi_rel_offset D, 0
++        .cfi_rel_offset DAT1, 4
++        .cfi_rel_offset DAT2, 8
++        .cfi_same_value DAT3
++        .cfi_same_value DAT4
++        .cfi_same_value DAT5
++        .cfi_same_value DAT6
++        .cfi_same_value DAT7
++        .cfi_rel_offset lr, 12
++
++160:    /* Medium case */
++        preload_all  backwards, 0, 0, S, N, DAT2, OFF
++        sub     N, N, #16     /* simplifies inner loop termination */
++ .if backwards
++        ands    DAT2, D, #15
++        beq     164f
++ .else
++        ands    DAT2, D, #15
++        beq     164f
++        rsb     DAT2, DAT2, #16
++ .endif
++        memcpy_leading_15bytes backwards, align
++164:    /* Destination now 16-byte aligned; we have at least one 16-byte output block */
++        tst     S, #3
++        bne     140f
++        memcpy_medium_inner_loop  backwards, 0
++140:    memcpy_medium_inner_loop  backwards, 1
++
++170:    /* Short case, less than 31 bytes, so no guarantee of at least one 16-byte block */
++        teq     N, #0
++        beq     199f
++        preload_all  backwards, 1, 0, S, N, DAT2, LAST
++        tst     D, #3
++        beq     174f
++172:    subs    N, N, #1
++        blo     199f
++ .if backwards
++        ldrb    DAT0, [S, #-1]!
++        strb    DAT0, [D, #-1]!
++ .else
++        ldrb    DAT0, [S], #1
++        strb    DAT0, [D], #1
++ .endif
++        tst     D, #3
++        bne     172b
++174:    /* Destination now 4-byte aligned; we have 0 or more output bytes to go */
++        tst     S, #3
++        bne     140f
++        memcpy_short_inner_loop  backwards, 0
++140:    memcpy_short_inner_loop  backwards, 1
++
++        .cfi_endproc
++
++        .unreq  D
++        .unreq  S
++        .unreq  N
++        .unreq  DAT0
++        .unreq  DAT1
++        .unreq  DAT2
++        .unreq  DAT3
++        .unreq  DAT4
++        .unreq  DAT5
++        .unreq  DAT6
++        .unreq  DAT7
++        .unreq  LAST
++        .unreq  OFF
++.endm
+--- /dev/null
++++ b/arch/arm/lib/memmove_rpi.S
+@@ -0,0 +1,61 @@
++/*
++Copyright (c) 2013, Raspberry Pi Foundation
++Copyright (c) 2013, RISC OS Open Ltd
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++    * Redistributions of source code must retain the above copyright
++      notice, this list of conditions and the following disclaimer.
++    * Redistributions in binary form must reproduce the above copyright
++      notice, this list of conditions and the following disclaimer in the
++      documentation and/or other materials provided with the distribution.
++    * Neither the name of the copyright holder nor the
++      names of its contributors may be used to endorse or promote products
++      derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++*/
++
++#include <linux/linkage.h>
++#include "arm-mem.h"
++#include "memcpymove.h"
++
++/* Prevent the stack from becoming executable */
++#if defined(__linux__) && defined(__ELF__)
++.section .note.GNU-stack,"",%progbits
++#endif
++
++        .text
++        .arch armv6
++        .object_arch armv4
++        .arm
++        .altmacro
++        .p2align 2
++
++/*
++ * void *memmove(void *s1, const void *s2, size_t n);
++ * On entry:
++ * a1 = pointer to destination
++ * a2 = pointer to source
++ * a3 = number of bytes to copy
++ * On exit:
++ * a1 preserved
++ */
++
++.set prefetch_distance, 3
++
++ENTRY(memmove)
++        cmp     a2, a1
++        bpl     memcpy /* pl works even over -1 - 0 and 0x7fffffff - 0x80000000 boundaries */
++        memcpy  1
++ENDPROC(memmove)
+--- /dev/null
++++ b/arch/arm/lib/memset_rpi.S
+@@ -0,0 +1,128 @@
++/*
++Copyright (c) 2013, Raspberry Pi Foundation
++Copyright (c) 2013, RISC OS Open Ltd
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++    * Redistributions of source code must retain the above copyright
++      notice, this list of conditions and the following disclaimer.
++    * Redistributions in binary form must reproduce the above copyright
++      notice, this list of conditions and the following disclaimer in the
++      documentation and/or other materials provided with the distribution.
++    * Neither the name of the copyright holder nor the
++      names of its contributors may be used to endorse or promote products
++      derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++*/
++
++#include <linux/linkage.h>
++#include "arm-mem.h"
++
++/* Prevent the stack from becoming executable */
++#if defined(__linux__) && defined(__ELF__)
++.section .note.GNU-stack,"",%progbits
++#endif
++
++        .text
++        .arch armv6
++        .object_arch armv4
++        .arm
++        .altmacro
++        .p2align 2
++
++/*
++ * void *memset(void *s, int c, size_t n);
++ * On entry:
++ * a1 = pointer to buffer to fill
++ * a2 = byte pattern to fill with (caller-narrowed)
++ * a3 = number of bytes to fill
++ * On exit:
++ * a1 preserved
++ */
++ENTRY(mmioset)
++ENTRY(memset)
++ENTRY(__memset32)
++ENTRY(__memset64)
++
++        S       .req    a1
++        DAT0    .req    a2
++        N       .req    a3
++        DAT1    .req    a4
++        DAT2    .req    ip
++        DAT3    .req    lr
++
++        orr     DAT0, DAT0, DAT0, lsl #8
++        push    {S, lr}
++        orr     DAT0, DAT0, DAT0, lsl #16
++        mov     DAT1, DAT0
++
++        /* See if we're guaranteed to have at least one 16-byte aligned 16-byte write */
++        cmp     N, #31
++        blo     170f
++
++161:    sub     N, N, #16 /* simplifies inner loop termination */
++        /* Leading words and bytes */
++        tst     S, #15
++        beq     164f
++        rsb     DAT3, S, #0 /* bits 0-3 = number of leading bytes until aligned */
++        movs    DAT2, DAT3, lsl #31
++        submi   N, N, #1
++        strmib  DAT0, [S], #1
++        subcs   N, N, #2
++        strcsh  DAT0, [S], #2
++        movs    DAT2, DAT3, lsl #29
++        submi   N, N, #4
++        strmi   DAT0, [S], #4
++        subcs   N, N, #8
++        stmcsia S!, {DAT0, DAT1}
++164:    /* Delayed set up of DAT2 and DAT3 so we could use them as scratch registers above */
++        mov     DAT2, DAT0
++        mov     DAT3, DAT0
++        /* Now the inner loop of 16-byte stores */
++165:    stmia   S!, {DAT0, DAT1, DAT2, DAT3}
++        subs    N, N, #16
++        bhs     165b
++166:    /* Trailing words and bytes */
++        movs    N, N, lsl #29
++        stmcsia S!, {DAT0, DAT1}
++        strmi   DAT0, [S], #4
++        movs    N, N, lsl #2
++        strcsh  DAT0, [S], #2
++        strmib  DAT0, [S]
++199:    pop     {S, pc}
++
++170:    /* Short case */
++        mov     DAT2, DAT0
++        mov     DAT3, DAT0
++        tst     S, #3
++        beq     174f
++172:    subs    N, N, #1
++        blo     199b
++        strb    DAT0, [S], #1
++        tst     S, #3
++        bne     172b
++174:    tst     N, #16
++        stmneia S!, {DAT0, DAT1, DAT2, DAT3}
++        b       166b
++
++        .unreq  S
++        .unreq  DAT0
++        .unreq  N
++        .unreq  DAT1
++        .unreq  DAT2
++        .unreq  DAT3
++ENDPROC(__memset64)
++ENDPROC(__memset32)
++ENDPROC(memset)
++ENDPROC(mmioset)
+--- a/arch/arm/lib/uaccess_with_memcpy.c
++++ b/arch/arm/lib/uaccess_with_memcpy.c
+@@ -19,6 +19,14 @@
+ #include <asm/current.h>
+ #include <asm/page.h>
+ 
++#ifndef COPY_FROM_USER_THRESHOLD
++#define COPY_FROM_USER_THRESHOLD 64
++#endif
++
++#ifndef COPY_TO_USER_THRESHOLD
++#define COPY_TO_USER_THRESHOLD 64
++#endif
++
+ static int
+ pin_page_for_write(const void __user *_addr, pte_t **ptep, spinlock_t **ptlp)
+ {
+@@ -81,7 +89,44 @@ pin_page_for_write(const void __user *_a
+ 	return 1;
+ }
+ 
+-static unsigned long noinline
++static int
++pin_page_for_read(const void __user *_addr, pte_t **ptep, spinlock_t **ptlp)
++{
++	unsigned long addr = (unsigned long)_addr;
++	pgd_t *pgd;
++	pmd_t *pmd;
++	pte_t *pte;
++	pud_t *pud;
++	spinlock_t *ptl;
++
++	pgd = pgd_offset(current->mm, addr);
++	if (unlikely(pgd_none(*pgd) || pgd_bad(*pgd)))
++	{
++		return 0;
++	}
++	pud = pud_offset(pgd, addr);
++	if (unlikely(pud_none(*pud) || pud_bad(*pud)))
++	{
++		return 0;
++	}
++
++	pmd = pmd_offset(pud, addr);
++	if (unlikely(pmd_none(*pmd) || pmd_bad(*pmd)))
++		return 0;
++
++	pte = pte_offset_map_lock(current->mm, pmd, addr, &ptl);
++	if (unlikely(!pte_present(*pte) || !pte_young(*pte))) {
++		pte_unmap_unlock(pte, ptl);
++		return 0;
++	}
++
++	*ptep = pte;
++	*ptlp = ptl;
++
++	return 1;
++}
++
++unsigned long noinline
+ __copy_to_user_memcpy(void __user *to, const void *from, unsigned long n)
+ {
+ 	unsigned long ua_flags;
+@@ -134,6 +179,57 @@ out:
+ 	return n;
+ }
+ 
++unsigned long noinline
++__copy_from_user_memcpy(void *to, const void __user *from, unsigned long n)
++{
++	unsigned long ua_flags;
++	int atomic;
++
++	if (unlikely(segment_eq(get_fs(), KERNEL_DS))) {
++		memcpy(to, (const void *)from, n);
++		return 0;
++	}
++
++	/* the mmap semaphore is taken only if not in an atomic context */
++	atomic = in_atomic();
++
++	if (!atomic)
++		down_read(&current->mm->mmap_sem);
++	while (n) {
++		pte_t *pte;
++		spinlock_t *ptl;
++		int tocopy;
++
++		while (!pin_page_for_read(from, &pte, &ptl)) {
++			char temp;
++			if (!atomic)
++				up_read(&current->mm->mmap_sem);
++			if (__get_user(temp, (char __user *)from))
++				goto out;
++			if (!atomic)
++				down_read(&current->mm->mmap_sem);
++		}
++
++		tocopy = (~(unsigned long)from & ~PAGE_MASK) + 1;
++		if (tocopy > n)
++			tocopy = n;
++
++		ua_flags = uaccess_save_and_enable();
++		memcpy(to, (const void *)from, tocopy);
++		uaccess_restore(ua_flags);
++		to += tocopy;
++		from += tocopy;
++		n -= tocopy;
++
++		pte_unmap_unlock(pte, ptl);
++	}
++	if (!atomic)
++		up_read(&current->mm->mmap_sem);
++
++out:
++	return n;
++}
++
+ unsigned long
+ arm_copy_to_user(void __user *to, const void *from, unsigned long n)
+ {
+@@ -144,7 +240,7 @@ arm_copy_to_user(void __user *to, const
+ 	 * With frame pointer disabled, tail call optimization kicks in
+ 	 * as well making this test almost invisible.
+ 	 */
+-	if (n < 64) {
++	if (n < COPY_TO_USER_THRESHOLD) {
+ 		unsigned long ua_flags = uaccess_save_and_enable();
+ 		n = __copy_to_user_std(to, from, n);
+ 		uaccess_restore(ua_flags);
+@@ -154,6 +250,26 @@ arm_copy_to_user(void __user *to, const
+ 	}
+ 	return n;
+ }
++
++unsigned long __must_check
++arm_copy_from_user(void *to, const void __user *from, unsigned long n)
++{
++	/*
++	 * This test is stubbed out of the main function above to keep
++	 * the overhead for small copies low by avoiding a large
++	 * register dump on the stack just to reload them right away.
++	 * With frame pointer disabled, tail call optimization kicks in
++	 * as well making this test almost invisible.
++	 */
++	if (n < COPY_FROM_USER_THRESHOLD) {
++		unsigned long ua_flags = uaccess_save_and_enable();
++		n = __copy_from_user_std(to, from, n);
++		uaccess_restore(ua_flags);
++	} else {
++		n = __copy_from_user_memcpy(to, from, n);
++	}
++	return n;
++}
+ 
+ static unsigned long noinline
+ __clear_user_memset(void __user *addr, unsigned long n)
+--- a/arch/arm/mach-bcm/Kconfig
++++ b/arch/arm/mach-bcm/Kconfig
+@@ -188,6 +188,13 @@ config ARCH_BCM_53573
+ 	  The base chip is BCM53573 and there are some packaging modifications
+ 	  like BCM47189 and BCM47452.
+ 
++config BCM2835_FAST_MEMCPY
++	bool "Enable optimized __copy_to_user and __copy_from_user"
++	depends on ARCH_BCM2835 && ARCH_MULTI_V6
++	default y
++	help
++	  Optimized versions of __copy_to_user and __copy_from_user for Pi1.
++
+ config ARCH_BCM_63XX
+ 	bool "Broadcom BCM63xx DSL SoC"
+ 	depends on ARCH_MULTI_V7
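
Note on the memcmp word-compare trick in memcmp_rpi.S above: memcmp() must order
buffers byte-by-byte from the lowest address up, but a little-endian word load
puts the lowest-addressed byte in the least significant position. The assembler
solves this by switching the CPU to big-endian data order ("setend be") around
its word compares, so a plain unsigned word comparison yields the memcmp
ordering. The C sketch below shows the same idea using an explicit byte swap
instead of setend; it is illustrative only and not part of the patch
(memcmp_word is a hypothetical name):

    #include <stdint.h>
    #include <string.h>

    /* Compare word-at-a-time, restoring address order with a byte swap when a
     * mismatching word is found. Returns the sign only, as memcmp permits. */
    static int memcmp_word(const void *s1, const void *s2, size_t n)
    {
            const unsigned char *a = s1, *b = s2;

            while (n >= 4) {
                    uint32_t wa, wb;

                    memcpy(&wa, a, 4);      /* handles unaligned pointers */
                    memcpy(&wb, b, 4);
                    if (wa != wb) {
                            /* On little-endian, bswap makes the lowest-addressed
                             * byte most significant - the "setend be" effect. */
                            wa = __builtin_bswap32(wa);
                            wb = __builtin_bswap32(wb);
                            return wa < wb ? -1 : 1;
                    }
                    a += 4; b += 4; n -= 4;
            }
            while (n--) {
                    if (*a != *b)
                            return *a < *b ? -1 : 1;
                    a++; b++;
            }
            return 0;
    }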
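
The COPY_FROM_USER_THRESHOLD=1600 / COPY_TO_USER_THRESHOLD=672 crossover points
can be sanity-checked from userspace, since write() to a page-cache-backed file
exercises the copy_from_user path and read() the copy_to_user path. A rough,
self-contained benchmark sketch follows (not part of the patch; the bench.tmp
file name, transfer sizes, and 64 MiB per-size volume are arbitrary choices):

    /* bench_uaccess.c - time read()/write() around the copy thresholds.
     * Build: gcc -O2 -o bench_uaccess bench_uaccess.c */
    #include <fcntl.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <time.h>
    #include <unistd.h>

    static double now_sec(void)
    {
            struct timespec ts;
            clock_gettime(CLOCK_MONOTONIC, &ts);
            return ts.tv_sec + ts.tv_nsec / 1e9;
    }

    int main(void)
    {
            /* sizes straddle the 672-byte and 1600-byte thresholds */
            static const size_t sizes[] = { 64, 512, 672, 1600, 4096, 65536 };
            char *buf = malloc(65536);
            int fd = open("bench.tmp", O_RDWR | O_CREAT | O_TRUNC, 0600);
            size_t i;

            if (fd < 0 || !buf)
                    return 1;
            memset(buf, 0x5a, 65536);

            for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
                    size_t sz = sizes[i], total = 0;
                    double t0 = now_sec();

                    /* pwrite() copies from user space, pread() copies back to
                     * user space; offset 0 keeps the data in the page cache. */
                    while (total < (64u << 20)) {
                            if (pwrite(fd, buf, sz, 0) != (ssize_t)sz ||
                                pread(fd, buf, sz, 0) != (ssize_t)sz)
                                    return 1;
                            total += 2 * sz;
                    }
                    printf("%6zu bytes: %.1f MB/s\n",
                           sz, total / (now_sec() - t0) / 1e6);
            }
            close(fd);
            unlink("bench.tmp");
            free(buf);
            return 0;
    }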