OpenWrt – Rev 4

From ebd731dd71ec9728a5a87ec1cd695be15828c32c Mon Sep 17 00:00:00 2001
From: popcornmix <popcornmix@gmail.com>
Date: Mon, 28 Nov 2016 16:50:04 +0000
Subject: [PATCH] Improve __copy_to_user and __copy_from_user performance

Provide a __copy_from_user that uses memcpy. On BCM2708, use
optimised memcpy/memmove/memcmp/memset implementations.

arch/arm: Add mmiocpy/set aliases for memcpy/set

See: https://github.com/raspberrypi/linux/issues/1082

copy_from_user: CPU_SW_DOMAIN_PAN compatibility

The downstream copy_from_user acceleration must also play nice with
CONFIG_CPU_SW_DOMAIN_PAN.

See: https://github.com/raspberrypi/linux/issues/1381

Signed-off-by: Phil Elwell <phil@raspberrypi.org>
---
 arch/arm/include/asm/string.h      |   5 +
 arch/arm/include/asm/uaccess.h     |   3 +
 arch/arm/lib/Makefile              |  15 +-
 arch/arm/lib/arm-mem.h             | 159 ++++++++++++
 arch/arm/lib/copy_from_user.S      |   4 +-
 arch/arm/lib/exports_rpi.c         |  37 +++
 arch/arm/lib/memcmp_rpi.S          | 285 +++++++++++++++++++++
 arch/arm/lib/memcpy_rpi.S          |  61 +++++
 arch/arm/lib/memcpymove.h          | 506 +++++++++++++++++++++++++++++++++++++
 arch/arm/lib/memmove_rpi.S         |  61 +++++
 arch/arm/lib/memset_rpi.S          | 123 +++++++++
 arch/arm/lib/uaccess_with_memcpy.c | 120 ++++++++-
 arch/arm/mach-bcm/Kconfig          |   7 +
 13 files changed, 1380 insertions(+), 6 deletions(-)
 create mode 100644 arch/arm/lib/arm-mem.h
 create mode 100644 arch/arm/lib/exports_rpi.c
 create mode 100644 arch/arm/lib/memcmp_rpi.S
 create mode 100644 arch/arm/lib/memcpy_rpi.S
 create mode 100644 arch/arm/lib/memcpymove.h
 create mode 100644 arch/arm/lib/memmove_rpi.S
 create mode 100644 arch/arm/lib/memset_rpi.S
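
In outline, the uaccess change is a size-based dispatch: copies below a
threshold keep using the existing assembly routines, while larger ones go
through a memcpy-based path that pins each user page. A minimal C sketch of
that dispatch for the copy_to_user direction, using identifiers from this
patch (the thresholds come from the -D flags in the Makefile hunk, 672/1600
bytes, and fall back to 64 when CONFIG_BCM2835_FAST_MEMCPY is not set); the
copy_from_user direction is analogous with __copy_from_user_std() and
__copy_from_user_memcpy():

  unsigned long
  arm_copy_to_user(void __user *to, const void *from, unsigned long n)
  {
          if (n < COPY_TO_USER_THRESHOLD) {
                  /* Small copy: the tuned assembly routine wins. */
                  unsigned long ua_flags = uaccess_save_and_enable();
                  n = __copy_to_user_std(to, from, n);
                  uaccess_restore(ua_flags);
          } else {
                  /* Large copy: pin the user pages and use the
                   * optimised memcpy() instead. */
                  n = __copy_to_user_memcpy(to, from, n);
          }
          return n;       /* bytes NOT copied, as usual for uaccess */
  }
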

--- a/arch/arm/include/asm/string.h
+++ b/arch/arm/include/asm/string.h
@@ -24,6 +24,11 @@ extern void * memchr(const void *, int,
 #define __HAVE_ARCH_MEMSET
 extern void * memset(void *, int, __kernel_size_t);
 
+#ifdef CONFIG_BCM2835_FAST_MEMCPY
+#define __HAVE_ARCH_MEMCMP
+extern int memcmp(const void *, const void *, size_t);
+#endif
+
 extern void __memzero(void *ptr, __kernel_size_t n);
 
 #define memset(p,v,n)                                                  \
--- a/arch/arm/include/asm/uaccess.h
+++ b/arch/arm/include/asm/uaccess.h
@@ -477,6 +477,9 @@ do {                                                                        \
 extern unsigned long __must_check
 arm_copy_from_user(void *to, const void __user *from, unsigned long n);
 
+extern unsigned long __must_check
+__copy_from_user_std(void *to, const void __user *from, unsigned long n);
+
 static inline unsigned long __must_check
 __arch_copy_from_user(void *to, const void __user *from, unsigned long n)
 {
--- a/arch/arm/lib/Makefile
+++ b/arch/arm/lib/Makefile
@@ -6,9 +6,8 @@
 
 lib-y          := backtrace.o changebit.o csumipv6.o csumpartial.o   \
                   csumpartialcopy.o csumpartialcopyuser.o clearbit.o \
-                  delay.o delay-loop.o findbit.o memchr.o memcpy.o   \
-                  memmove.o memset.o memzero.o setbit.o              \
-                  strchr.o strrchr.o                                 \
+                  delay.o delay-loop.o findbit.o memchr.o memzero.o  \
+                  setbit.o strchr.o strrchr.o                        \
                   testchangebit.o testclearbit.o testsetbit.o        \
                   ashldi3.o ashrdi3.o lshrdi3.o muldi3.o             \
                   ucmpdi2.o lib1funcs.o div64.o                      \
@@ -18,6 +17,16 @@ lib-y                := backtrace.o changebit.o csumip
 mmu-y          := clear_user.o copy_page.o getuser.o putuser.o       \
                   copy_from_user.o copy_to_user.o
 
+# Choose optimised implementations for Raspberry Pi
+ifeq ($(CONFIG_BCM2835_FAST_MEMCPY),y)
+  CFLAGS_uaccess_with_memcpy.o += -DCOPY_FROM_USER_THRESHOLD=1600
+  CFLAGS_uaccess_with_memcpy.o += -DCOPY_TO_USER_THRESHOLD=672
+  obj-$(CONFIG_MODULES) += exports_rpi.o
+  lib-y        += memcpy_rpi.o memmove_rpi.o memset_rpi.o memcmp_rpi.o
+else
+  lib-y        += memcpy.o memmove.o memset.o
+endif
+
 # using lib_ here won't override already available weak symbols
 obj-$(CONFIG_UACCESS_WITH_MEMCPY) += uaccess_with_memcpy.o
 
--- /dev/null
+++ b/arch/arm/lib/arm-mem.h
@@ -0,0 +1,159 @@
+/*
+Copyright (c) 2013, Raspberry Pi Foundation
+Copyright (c) 2013, RISC OS Open Ltd
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the copyright holder nor the
+      names of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+.macro myfunc fname
+ .func fname
+ .global fname
+fname:
+.endm
+
+.macro preload_leading_step1  backwards, ptr, base
+/* If the destination is already 16-byte aligned, then we need to preload
+ * between 0 and prefetch_distance (inclusive) cache lines ahead so there
+ * are no gaps when the inner loop starts.
+ */
+ .if backwards
+        sub     ptr, base, #1
+        bic     ptr, ptr, #31
+ .else
+        bic     ptr, base, #31
+ .endif
+ .set OFFSET, 0
+ .rept prefetch_distance+1
+        pld     [ptr, #OFFSET]
+  .if backwards
+   .set OFFSET, OFFSET-32
+  .else
+   .set OFFSET, OFFSET+32
+  .endif
+ .endr
+.endm
+
+.macro preload_leading_step2  backwards, ptr, base, leading_bytes, tmp
+/* However, if the destination is not 16-byte aligned, we may need to
+ * preload one more cache line than that. The question we need to ask is:
+ * are the leading bytes more than the amount by which the source
+ * pointer will be rounded down for preloading, and if so, by how many
+ * cache lines?
+ */
+ .if backwards
+/* Here we compare against how many bytes we are into the
+ * cache line, counting down from the highest such address.
+ * Effectively, we want to calculate
+ *     leading_bytes = dst&15
+ *     cacheline_offset = 31-((src-leading_bytes-1)&31)
+ *     extra_needed = leading_bytes - cacheline_offset
+ * and test if extra_needed is <= 0, or rearranging:
+ *     leading_bytes + (src-leading_bytes-1)&31 <= 31
+ */
+        mov     tmp, base, lsl #32-5
+        sbc     tmp, tmp, leading_bytes, lsl #32-5
+        adds    tmp, tmp, leading_bytes, lsl #32-5
+        bcc     61f
+        pld     [ptr, #-32*(prefetch_distance+1)]
+ .else
+/* Effectively, we want to calculate
+ *     leading_bytes = (-dst)&15
+ *     cacheline_offset = (src+leading_bytes)&31
+ *     extra_needed = leading_bytes - cacheline_offset
+ * and test if extra_needed is <= 0.
+ */
+        mov     tmp, base, lsl #32-5
+        add     tmp, tmp, leading_bytes, lsl #32-5
+        rsbs    tmp, tmp, leading_bytes, lsl #32-5
+        bls     61f
+        pld     [ptr, #32*(prefetch_distance+1)]
+ .endif
+61:
+.endm
+
+.macro preload_trailing  backwards, base, remain, tmp
+        /* We need either 0, 1 or 2 extra preloads */
+ .if backwards
+        rsb     tmp, base, #0
+        mov     tmp, tmp, lsl #32-5
+ .else
+        mov     tmp, base, lsl #32-5
+ .endif
+        adds    tmp, tmp, remain, lsl #32-5
+        adceqs  tmp, tmp, #0
+        /* The instruction above has two effects: ensures Z is only
+         * set if C was clear (so Z indicates that both shifted quantities
+         * were 0), and clears C if Z was set (so C indicates that the sum
+         * of the shifted quantities was greater and not equal to 32) */
+        beq     82f
+ .if backwards
+        sub     tmp, base, #1
+        bic     tmp, tmp, #31
+ .else
+        bic     tmp, base, #31
+ .endif
+        bcc     81f
+ .if backwards
+        pld     [tmp, #-32*(prefetch_distance+1)]
+81:
+        pld     [tmp, #-32*prefetch_distance]
+ .else
+        pld     [tmp, #32*(prefetch_distance+2)]
+81:
+        pld     [tmp, #32*(prefetch_distance+1)]
+ .endif
+82:
+.endm
+
+.macro preload_all    backwards, narrow_case, shift, base, remain, tmp0, tmp1
+ .if backwards
+        sub     tmp0, base, #1
+        bic     tmp0, tmp0, #31
+        pld     [tmp0]
+        sub     tmp1, base, remain, lsl #shift
+ .else
+        bic     tmp0, base, #31
+        pld     [tmp0]
+        add     tmp1, base, remain, lsl #shift
+        sub     tmp1, tmp1, #1
+ .endif
+        bic     tmp1, tmp1, #31
+        cmp     tmp1, tmp0
+        beq     92f
+ .if narrow_case
+        /* In this case, all the data fits in either 1 or 2 cache lines */
+        pld     [tmp1]
+ .else
+91:
+  .if backwards
+        sub     tmp0, tmp0, #32
+  .else
+        add     tmp0, tmp0, #32
+  .endif
+        cmp     tmp0, tmp1
+        pld     [tmp0]
+        bne     91b
+ .endif
+92:
+.endm
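
The preload_leading_step2 comments above reduce the "do we need one extra
preload?" question to simple modular arithmetic; the assembly evaluates the
same condition with the operands shifted into the top five bits so the flags
from a single add/rsb give the answer. A small C restatement of the
forwards-direction test, with a hypothetical helper name, for illustration:

  /* Forwards copy: does preloading need one cache line beyond the
   * prefetch_distance lines already issued by preload_leading_step1?
   * (Mirrors the comment in preload_leading_step2.) */
  static int needs_extra_preload(unsigned long src, unsigned long dst)
  {
          unsigned long leading_bytes    = (-dst) & 15;                /* bytes until dst is 16-byte aligned */
          unsigned long cacheline_offset = (src + leading_bytes) & 31; /* offset into the 32-byte line */
          long extra_needed = (long)leading_bytes - (long)cacheline_offset;

          return extra_needed > 0;     /* if so, issue one more pld */
  }
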
--- a/arch/arm/lib/copy_from_user.S
+++ b/arch/arm/lib/copy_from_user.S
@@ -89,11 +89,13 @@
 
        .text
 
-ENTRY(arm_copy_from_user)
+ENTRY(__copy_from_user_std)
+WEAK(arm_copy_from_user)
 
 #include "copy_template.S"
 
 ENDPROC(arm_copy_from_user)
+ENDPROC(__copy_from_user_std)
 
        .pushsection .fixup,"ax"
        .align 0
--- /dev/null
+++ b/arch/arm/lib/exports_rpi.c
@@ -0,0 +1,37 @@
+/**
+ * Copyright (c) 2014, Raspberry Pi (Trading) Ltd.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions, and the following disclaimer,
+ *    without modification.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The names of the above-listed copyright holders may not be used
+ *    to endorse or promote products derived from this software without
+ *    specific prior written permission.
+ *
+ * ALTERNATIVELY, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2, as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+ * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+EXPORT_SYMBOL(memcmp);
--- /dev/null
+++ b/arch/arm/lib/memcmp_rpi.S
@@ -0,0 +1,285 @@
+/*
+Copyright (c) 2013, Raspberry Pi Foundation
+Copyright (c) 2013, RISC OS Open Ltd
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the copyright holder nor the
+      names of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <linux/linkage.h>
+#include "arm-mem.h"
+
+/* Prevent the stack from becoming executable */
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
+
+    .text
+    .arch armv6
+    .object_arch armv4
+    .arm
+    .altmacro
+    .p2align 2
+
+.macro memcmp_process_head  unaligned
+ .if unaligned
+        ldr     DAT0, [S_1], #4
+        ldr     DAT1, [S_1], #4
+        ldr     DAT2, [S_1], #4
+        ldr     DAT3, [S_1], #4
+ .else
+        ldmia   S_1!, {DAT0, DAT1, DAT2, DAT3}
+ .endif
+        ldmia   S_2!, {DAT4, DAT5, DAT6, DAT7}
+.endm
+
+.macro memcmp_process_tail
+        cmp     DAT0, DAT4
+        cmpeq   DAT1, DAT5
+        cmpeq   DAT2, DAT6
+        cmpeq   DAT3, DAT7
+        bne     200f
+.endm
+
+.macro memcmp_leading_31bytes
+        movs    DAT0, OFF, lsl #31
+        ldrmib  DAT0, [S_1], #1
+        ldrcsh  DAT1, [S_1], #2
+        ldrmib  DAT4, [S_2], #1
+        ldrcsh  DAT5, [S_2], #2
+        movpl   DAT0, #0
+        movcc   DAT1, #0
+        movpl   DAT4, #0
+        movcc   DAT5, #0
+        submi   N, N, #1
+        subcs   N, N, #2
+        cmp     DAT0, DAT4
+        cmpeq   DAT1, DAT5
+        bne     200f
+        movs    DAT0, OFF, lsl #29
+        ldrmi   DAT0, [S_1], #4
+        ldrcs   DAT1, [S_1], #4
+        ldrcs   DAT2, [S_1], #4
+        ldrmi   DAT4, [S_2], #4
+        ldmcsia S_2!, {DAT5, DAT6}
+        movpl   DAT0, #0
+        movcc   DAT1, #0
+        movcc   DAT2, #0
+        movpl   DAT4, #0
+        movcc   DAT5, #0
+        movcc   DAT6, #0
+        submi   N, N, #4
+        subcs   N, N, #8
+        cmp     DAT0, DAT4
+        cmpeq   DAT1, DAT5
+        cmpeq   DAT2, DAT6
+        bne     200f
+        tst     OFF, #16
+        beq     105f
+        memcmp_process_head  1
+        sub     N, N, #16
+        memcmp_process_tail
+105:
+.endm
+
+.macro memcmp_trailing_15bytes  unaligned
+        movs    N, N, lsl #29
+ .if unaligned
+        ldrcs   DAT0, [S_1], #4
+        ldrcs   DAT1, [S_1], #4
+ .else
+        ldmcsia S_1!, {DAT0, DAT1}
+ .endif
+        ldrmi   DAT2, [S_1], #4
+        ldmcsia S_2!, {DAT4, DAT5}
+        ldrmi   DAT6, [S_2], #4
+        movcc   DAT0, #0
+        movcc   DAT1, #0
+        movpl   DAT2, #0
+        movcc   DAT4, #0
+        movcc   DAT5, #0
+        movpl   DAT6, #0
+        cmp     DAT0, DAT4
+        cmpeq   DAT1, DAT5
+        cmpeq   DAT2, DAT6
+        bne     200f
+        movs    N, N, lsl #2
+        ldrcsh  DAT0, [S_1], #2
+        ldrmib  DAT1, [S_1]
+        ldrcsh  DAT4, [S_2], #2
+        ldrmib  DAT5, [S_2]
+        movcc   DAT0, #0
+        movpl   DAT1, #0
+        movcc   DAT4, #0
+        movpl   DAT5, #0
+        cmp     DAT0, DAT4
+        cmpeq   DAT1, DAT5
+        bne     200f
+.endm
+
+.macro memcmp_long_inner_loop  unaligned
+110:
+        memcmp_process_head  unaligned
+        pld     [S_2, #prefetch_distance*32 + 16]
+        memcmp_process_tail
+        memcmp_process_head  unaligned
+        pld     [S_1, OFF]
+        memcmp_process_tail
+        subs    N, N, #32
+        bhs     110b
+        /* Just before the final (prefetch_distance+1) 32-byte blocks,
+         * deal with final preloads */
+        preload_trailing  0, S_1, N, DAT0
+        preload_trailing  0, S_2, N, DAT0
+        add     N, N, #(prefetch_distance+2)*32 - 16
+120:
+        memcmp_process_head  unaligned
+        memcmp_process_tail
+        subs    N, N, #16
+        bhs     120b
+        /* Trailing words and bytes */
+        tst     N, #15
+        beq     199f
+        memcmp_trailing_15bytes  unaligned
+199:    /* Reached end without detecting a difference */
+        mov     a1, #0
+        setend  le
+        pop     {DAT1-DAT6, pc}
+.endm
+
+.macro memcmp_short_inner_loop  unaligned
+        subs    N, N, #16     /* simplifies inner loop termination */
+        blo     122f
+120:
+        memcmp_process_head  unaligned
+        memcmp_process_tail
+        subs    N, N, #16
+        bhs     120b
+122:    /* Trailing words and bytes */
+        tst     N, #15
+        beq     199f
+        memcmp_trailing_15bytes  unaligned
+199:    /* Reached end without detecting a difference */
+        mov     a1, #0
+        setend  le
+        pop     {DAT1-DAT6, pc}
+.endm
+
+/*
+ * int memcmp(const void *s1, const void *s2, size_t n);
+ * On entry:
+ * a1 = pointer to buffer 1
+ * a2 = pointer to buffer 2
+ * a3 = number of bytes to compare (as unsigned chars)
+ * On exit:
+ * a1 = >0/=0/<0 if s1 >/=/< s2
+ */
+
+.set prefetch_distance, 2
+
+ENTRY(memcmp)
+        S_1     .req    a1
+        S_2     .req    a2
+        N       .req    a3
+        DAT0    .req    a4
+        DAT1    .req    v1
+        DAT2    .req    v2
+        DAT3    .req    v3
+        DAT4    .req    v4
+        DAT5    .req    v5
+        DAT6    .req    v6
+        DAT7    .req    ip
+        OFF     .req    lr
+
+        push    {DAT1-DAT6, lr}
+        setend  be /* lowest-addressed bytes are most significant */
+
+        /* To preload ahead as we go, we need at least (prefetch_distance+2) 32-byte blocks */
+        cmp     N, #(prefetch_distance+3)*32 - 1
+        blo     170f
+
+        /* Long case */
+        /* Adjust N so that the decrement instruction can also test for
+         * inner loop termination. We want it to stop when there are
+         * (prefetch_distance+1) complete blocks to go. */
+        sub     N, N, #(prefetch_distance+2)*32
+        preload_leading_step1  0, DAT0, S_1
+        preload_leading_step1  0, DAT1, S_2
+        tst     S_2, #31
+        beq     154f
+        rsb     OFF, S_2, #0 /* no need to AND with 15 here */
+        preload_leading_step2  0, DAT0, S_1, OFF, DAT2
+        preload_leading_step2  0, DAT1, S_2, OFF, DAT2
+        memcmp_leading_31bytes
+154:    /* Second source now cacheline (32-byte) aligned; we have at
+         * least one prefetch to go. */
+        /* Prefetch offset is best selected such that it lies in the
+         * first 8 of each 32 bytes - but it's just as easy to aim for
+         * the first one */
+        and     OFF, S_1, #31
+        rsb     OFF, OFF, #32*prefetch_distance
+        tst     S_1, #3
+        bne     140f
+        memcmp_long_inner_loop  0
+140:    memcmp_long_inner_loop  1
+
+170:    /* Short case */
+        teq     N, #0
+        beq     199f
+        preload_all 0, 0, 0, S_1, N, DAT0, DAT1
+        preload_all 0, 0, 0, S_2, N, DAT0, DAT1
+        tst     S_2, #3
+        beq     174f
+172:    subs    N, N, #1
+        blo     199f
+        ldrb    DAT0, [S_1], #1
+        ldrb    DAT4, [S_2], #1
+        cmp     DAT0, DAT4
+        bne     200f
+        tst     S_2, #3
+        bne     172b
+174:    /* Second source now 4-byte aligned; we have 0 or more bytes to go */
+        tst     S_1, #3
+        bne     140f
+        memcmp_short_inner_loop  0
+140:    memcmp_short_inner_loop  1
+
+200:    /* Difference found: determine sign. */
+        movhi   a1, #1
+        movlo   a1, #-1
+        setend  le
+        pop     {DAT1-DAT6, pc}
+
+        .unreq  S_1
+        .unreq  S_2
+        .unreq  N
+        .unreq  DAT0
+        .unreq  DAT1
+        .unreq  DAT2
+        .unreq  DAT3
+        .unreq  DAT4
+        .unreq  DAT5
+        .unreq  DAT6
+        .unreq  DAT7
+        .unreq  OFF
+ENDPROC(memcmp)
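
Functionally the routine above must behave like a byte-wise memcmp; the
"setend be" trick makes the lowest-addressed byte the most significant one in
each loaded word, so a word comparison orders the same way a byte loop would.
A byte-wise C reference of the documented contract (hypothetical helper name,
useful as a mental model or for differential testing against the assembly):

  /* Reference semantics for memcmp(): compare as unsigned chars and
   * return <0, 0 or >0 according to the first differing byte. */
  int memcmp_ref(const void *s1, const void *s2, size_t n)
  {
          const unsigned char *a = s1, *b = s2;

          while (n--) {
                  if (*a != *b)
                          return *a < *b ? -1 : 1;
                  a++;
                  b++;
          }
          return 0;
  }
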
--- /dev/null
+++ b/arch/arm/lib/memcpy_rpi.S
@@ -0,0 +1,61 @@
+/*
+Copyright (c) 2013, Raspberry Pi Foundation
+Copyright (c) 2013, RISC OS Open Ltd
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the copyright holder nor the
+      names of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <linux/linkage.h>
+#include "arm-mem.h"
+#include "memcpymove.h"
+
+/* Prevent the stack from becoming executable */
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
+
+    .text
+    .arch armv6
+    .object_arch armv4
+    .arm
+    .altmacro
+    .p2align 2
+
+/*
+ * void *memcpy(void * restrict s1, const void * restrict s2, size_t n);
+ * On entry:
+ * a1 = pointer to destination
+ * a2 = pointer to source
+ * a3 = number of bytes to copy
+ * On exit:
+ * a1 preserved
+ */
+
+.set prefetch_distance, 3
+
+ENTRY(mmiocpy)
+ENTRY(memcpy)
+        memcpy  0
+ENDPROC(memcpy)
+ENDPROC(mmiocpy)
--- /dev/null
+++ b/arch/arm/lib/memcpymove.h
@@ -0,0 +1,506 @@
+/*
+Copyright (c) 2013, Raspberry Pi Foundation
+Copyright (c) 2013, RISC OS Open Ltd
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the copyright holder nor the
+      names of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+.macro unaligned_words  backwards, align, use_pld, words, r0, r1, r2, r3, r4, r5, r6, r7, r8
+ .if words == 1
+  .if backwards
+        mov     r1, r0, lsl #32-align*8
+        ldr     r0, [S, #-4]!
+        orr     r1, r1, r0, lsr #align*8
+        str     r1, [D, #-4]!
+  .else
+        mov     r0, r1, lsr #align*8
+        ldr     r1, [S, #4]!
+        orr     r0, r0, r1, lsl #32-align*8
+        str     r0, [D], #4
+  .endif
+ .elseif words == 2
+  .if backwards
+        ldr     r1, [S, #-4]!
+        mov     r2, r0, lsl #32-align*8
+        ldr     r0, [S, #-4]!
+        orr     r2, r2, r1, lsr #align*8
+        mov     r1, r1, lsl #32-align*8
+        orr     r1, r1, r0, lsr #align*8
+        stmdb   D!, {r1, r2}
+  .else
+        ldr     r1, [S, #4]!
+        mov     r0, r2, lsr #align*8
+        ldr     r2, [S, #4]!
+        orr     r0, r0, r1, lsl #32-align*8
+        mov     r1, r1, lsr #align*8
+        orr     r1, r1, r2, lsl #32-align*8
+        stmia   D!, {r0, r1}
+  .endif
+ .elseif words == 4
+  .if backwards
+        ldmdb   S!, {r2, r3}
+        mov     r4, r0, lsl #32-align*8
+        ldmdb   S!, {r0, r1}
+        orr     r4, r4, r3, lsr #align*8
+        mov     r3, r3, lsl #32-align*8
+        orr     r3, r3, r2, lsr #align*8
+        mov     r2, r2, lsl #32-align*8
+        orr     r2, r2, r1, lsr #align*8
+        mov     r1, r1, lsl #32-align*8
+        orr     r1, r1, r0, lsr #align*8
+        stmdb   D!, {r1, r2, r3, r4}
+  .else
+        ldmib   S!, {r1, r2}
+        mov     r0, r4, lsr #align*8
+        ldmib   S!, {r3, r4}
+        orr     r0, r0, r1, lsl #32-align*8
+        mov     r1, r1, lsr #align*8
+        orr     r1, r1, r2, lsl #32-align*8
+        mov     r2, r2, lsr #align*8
+        orr     r2, r2, r3, lsl #32-align*8
+        mov     r3, r3, lsr #align*8
+        orr     r3, r3, r4, lsl #32-align*8
+        stmia   D!, {r0, r1, r2, r3}
+  .endif
+ .elseif words == 8
+  .if backwards
+        ldmdb   S!, {r4, r5, r6, r7}
+        mov     r8, r0, lsl #32-align*8
+        ldmdb   S!, {r0, r1, r2, r3}
+   .if use_pld
+        pld     [S, OFF]
+   .endif
+        orr     r8, r8, r7, lsr #align*8
+        mov     r7, r7, lsl #32-align*8
+        orr     r7, r7, r6, lsr #align*8
+        mov     r6, r6, lsl #32-align*8
+        orr     r6, r6, r5, lsr #align*8
+        mov     r5, r5, lsl #32-align*8
+        orr     r5, r5, r4, lsr #align*8
+        mov     r4, r4, lsl #32-align*8
+        orr     r4, r4, r3, lsr #align*8
+        mov     r3, r3, lsl #32-align*8
+        orr     r3, r3, r2, lsr #align*8
+        mov     r2, r2, lsl #32-align*8
+        orr     r2, r2, r1, lsr #align*8
+        mov     r1, r1, lsl #32-align*8
+        orr     r1, r1, r0, lsr #align*8
+        stmdb   D!, {r5, r6, r7, r8}
+        stmdb   D!, {r1, r2, r3, r4}
+  .else
+        ldmib   S!, {r1, r2, r3, r4}
+        mov     r0, r8, lsr #align*8
+        ldmib   S!, {r5, r6, r7, r8}
+   .if use_pld
+        pld     [S, OFF]
+   .endif
+        orr     r0, r0, r1, lsl #32-align*8
+        mov     r1, r1, lsr #align*8
+        orr     r1, r1, r2, lsl #32-align*8
+        mov     r2, r2, lsr #align*8
+        orr     r2, r2, r3, lsl #32-align*8
+        mov     r3, r3, lsr #align*8
+        orr     r3, r3, r4, lsl #32-align*8
+        mov     r4, r4, lsr #align*8
+        orr     r4, r4, r5, lsl #32-align*8
+        mov     r5, r5, lsr #align*8
+        orr     r5, r5, r6, lsl #32-align*8
+        mov     r6, r6, lsr #align*8
+        orr     r6, r6, r7, lsl #32-align*8
+        mov     r7, r7, lsr #align*8
+        orr     r7, r7, r8, lsl #32-align*8
+        stmia   D!, {r0, r1, r2, r3}
+        stmia   D!, {r4, r5, r6, r7}
+  .endif
+ .endif
+.endm
+
+.macro memcpy_leading_15bytes  backwards, align
+        movs    DAT1, DAT2, lsl #31
+        sub     N, N, DAT2
+ .if backwards
+        ldrmib  DAT0, [S, #-1]!
+        ldrcsh  DAT1, [S, #-2]!
+        strmib  DAT0, [D, #-1]!
+        strcsh  DAT1, [D, #-2]!
+ .else
+        ldrmib  DAT0, [S], #1
+        ldrcsh  DAT1, [S], #2
+        strmib  DAT0, [D], #1
+        strcsh  DAT1, [D], #2
+ .endif
+        movs    DAT1, DAT2, lsl #29
+ .if backwards
+        ldrmi   DAT0, [S, #-4]!
+  .if align == 0
+        ldmcsdb S!, {DAT1, DAT2}
+  .else
+        ldrcs   DAT2, [S, #-4]!
+        ldrcs   DAT1, [S, #-4]!
+  .endif
+        strmi   DAT0, [D, #-4]!
+        stmcsdb D!, {DAT1, DAT2}
+ .else
+        ldrmi   DAT0, [S], #4
+  .if align == 0
+        ldmcsia S!, {DAT1, DAT2}
+  .else
+        ldrcs   DAT1, [S], #4
+        ldrcs   DAT2, [S], #4
+  .endif
+        strmi   DAT0, [D], #4
+        stmcsia D!, {DAT1, DAT2}
+ .endif
+.endm
+
+.macro memcpy_trailing_15bytes  backwards, align
+        movs    N, N, lsl #29
+ .if backwards
+  .if align == 0
+        ldmcsdb S!, {DAT0, DAT1}
+  .else
+        ldrcs   DAT1, [S, #-4]!
+        ldrcs   DAT0, [S, #-4]!
+  .endif
+        ldrmi   DAT2, [S, #-4]!
+        stmcsdb D!, {DAT0, DAT1}
+        strmi   DAT2, [D, #-4]!
+ .else
+  .if align == 0
+        ldmcsia S!, {DAT0, DAT1}
+  .else
+        ldrcs   DAT0, [S], #4
+        ldrcs   DAT1, [S], #4
+  .endif
+        ldrmi   DAT2, [S], #4
+        stmcsia D!, {DAT0, DAT1}
+        strmi   DAT2, [D], #4
+ .endif
+        movs    N, N, lsl #2
+ .if backwards
+        ldrcsh  DAT0, [S, #-2]!
+        ldrmib  DAT1, [S, #-1]
+        strcsh  DAT0, [D, #-2]!
+        strmib  DAT1, [D, #-1]
+ .else
+        ldrcsh  DAT0, [S], #2
+        ldrmib  DAT1, [S]
+        strcsh  DAT0, [D], #2
+        strmib  DAT1, [D]
+ .endif
+.endm
+
+.macro memcpy_long_inner_loop  backwards, align
+ .if align != 0
+  .if backwards
+        ldr     DAT0, [S, #-align]!
+  .else
+        ldr     LAST, [S, #-align]!
+  .endif
+ .endif
+110:
+ .if align == 0
+  .if backwards
+        ldmdb   S!, {DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, LAST}
+        pld     [S, OFF]
+        stmdb   D!, {DAT4, DAT5, DAT6, LAST}
+        stmdb   D!, {DAT0, DAT1, DAT2, DAT3}
+  .else
+        ldmia   S!, {DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, LAST}
+        pld     [S, OFF]
+        stmia   D!, {DAT0, DAT1, DAT2, DAT3}
+        stmia   D!, {DAT4, DAT5, DAT6, LAST}
+  .endif
+ .else
+        unaligned_words  backwards, align, 1, 8, DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, DAT7, LAST
+ .endif
+        subs    N, N, #32
+        bhs     110b
+        /* Just before the final (prefetch_distance+1) 32-byte blocks, deal with final preloads */
+        preload_trailing  backwards, S, N, OFF
+        add     N, N, #(prefetch_distance+2)*32 - 32
+120:
+ .if align == 0
+  .if backwards
+        ldmdb   S!, {DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, LAST}
+        stmdb   D!, {DAT4, DAT5, DAT6, LAST}
+        stmdb   D!, {DAT0, DAT1, DAT2, DAT3}
+  .else
+        ldmia   S!, {DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, LAST}
+        stmia   D!, {DAT0, DAT1, DAT2, DAT3}
+        stmia   D!, {DAT4, DAT5, DAT6, LAST}
+  .endif
+ .else
+        unaligned_words  backwards, align, 0, 8, DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, DAT7, LAST
+ .endif
+        subs    N, N, #32
+        bhs     120b
+        tst     N, #16
+ .if align == 0
+  .if backwards
+        ldmnedb S!, {DAT0, DAT1, DAT2, LAST}
+        stmnedb D!, {DAT0, DAT1, DAT2, LAST}
+  .else
+        ldmneia S!, {DAT0, DAT1, DAT2, LAST}
+        stmneia D!, {DAT0, DAT1, DAT2, LAST}
+  .endif
+ .else
+        beq     130f
+        unaligned_words  backwards, align, 0, 4, DAT0, DAT1, DAT2, DAT3, LAST
+130:
+ .endif
+        /* Trailing words and bytes */
+        tst      N, #15
+        beq      199f
+ .if align != 0
+        add     S, S, #align
+ .endif
+        memcpy_trailing_15bytes  backwards, align
+199:
+        pop     {DAT3, DAT4, DAT5, DAT6, DAT7}
+        pop     {D, DAT1, DAT2, pc}
+.endm
+
+.macro memcpy_medium_inner_loop  backwards, align
+120:
+ .if backwards
+  .if align == 0
+        ldmdb   S!, {DAT0, DAT1, DAT2, LAST}
+  .else
+        ldr     LAST, [S, #-4]!
+        ldr     DAT2, [S, #-4]!
+        ldr     DAT1, [S, #-4]!
+        ldr     DAT0, [S, #-4]!
+  .endif
+        stmdb   D!, {DAT0, DAT1, DAT2, LAST}
+ .else
+  .if align == 0
+        ldmia   S!, {DAT0, DAT1, DAT2, LAST}
+  .else
+        ldr     DAT0, [S], #4
+        ldr     DAT1, [S], #4
+        ldr     DAT2, [S], #4
+        ldr     LAST, [S], #4
+  .endif
+        stmia   D!, {DAT0, DAT1, DAT2, LAST}
+ .endif
+        subs     N, N, #16
+        bhs      120b
+        /* Trailing words and bytes */
+        tst      N, #15
+        beq      199f
+        memcpy_trailing_15bytes  backwards, align
+199:
+        pop     {D, DAT1, DAT2, pc}
+.endm
+
+.macro memcpy_short_inner_loop  backwards, align
+        tst     N, #16
+ .if backwards
+  .if align == 0
+        ldmnedb S!, {DAT0, DAT1, DAT2, LAST}
+  .else
+        ldrne   LAST, [S, #-4]!
+        ldrne   DAT2, [S, #-4]!
+        ldrne   DAT1, [S, #-4]!
+        ldrne   DAT0, [S, #-4]!
+  .endif
+        stmnedb D!, {DAT0, DAT1, DAT2, LAST}
+ .else
+  .if align == 0
+        ldmneia S!, {DAT0, DAT1, DAT2, LAST}
+  .else
+        ldrne   DAT0, [S], #4
+        ldrne   DAT1, [S], #4
+        ldrne   DAT2, [S], #4
+        ldrne   LAST, [S], #4
+  .endif
+        stmneia D!, {DAT0, DAT1, DAT2, LAST}
+ .endif
+        memcpy_trailing_15bytes  backwards, align
+199:
+        pop     {D, DAT1, DAT2, pc}
+.endm
+
+.macro memcpy backwards
+        D       .req    a1
+        S       .req    a2
+        N       .req    a3
+        DAT0    .req    a4
+        DAT1    .req    v1
+        DAT2    .req    v2
+        DAT3    .req    v3
+        DAT4    .req    v4
+        DAT5    .req    v5
+        DAT6    .req    v6
+        DAT7    .req    sl
+        LAST    .req    ip
+        OFF     .req    lr
+
+        .cfi_startproc
+
+        push    {D, DAT1, DAT2, lr}
+
+        .cfi_def_cfa_offset 16
+        .cfi_rel_offset D, 0
+        .cfi_undefined  S
+        .cfi_undefined  N
+        .cfi_undefined  DAT0
+        .cfi_rel_offset DAT1, 4
+        .cfi_rel_offset DAT2, 8
+        .cfi_undefined  LAST
+        .cfi_rel_offset lr, 12
+
+ .if backwards
+        add     D, D, N
+        add     S, S, N
+ .endif
+
+        /* See if we're guaranteed to have at least one 16-byte aligned 16-byte write */
+        cmp     N, #31
+        blo     170f
+        /* To preload ahead as we go, we need at least (prefetch_distance+2) 32-byte blocks */
+        cmp     N, #(prefetch_distance+3)*32 - 1
+        blo     160f
+
+        /* Long case */
+        push    {DAT3, DAT4, DAT5, DAT6, DAT7}
+
+        .cfi_def_cfa_offset 36
+        .cfi_rel_offset D, 20
+        .cfi_rel_offset DAT1, 24
+        .cfi_rel_offset DAT2, 28
+        .cfi_rel_offset DAT3, 0
+        .cfi_rel_offset DAT4, 4
+        .cfi_rel_offset DAT5, 8
+        .cfi_rel_offset DAT6, 12
+        .cfi_rel_offset DAT7, 16
+        .cfi_rel_offset lr, 32
+
+        /* Adjust N so that the decrement instruction can also test for
+         * inner loop termination. We want it to stop when there are
+         * (prefetch_distance+1) complete blocks to go. */
+        sub     N, N, #(prefetch_distance+2)*32
+        preload_leading_step1  backwards, DAT0, S
+ .if backwards
+        /* Bug in GAS: it accepts, but mis-assembles the instruction
+         * ands    DAT2, D, #60, 2
+         * which sets DAT2 to the number of leading bytes until destination is aligned and also clears C (sets borrow)
+         */
+        .word   0xE210513C
+        beq     154f
+ .else
+        ands    DAT2, D, #15
+        beq     154f
+        rsb     DAT2, DAT2, #16 /* number of leading bytes until destination aligned */
+ .endif
+        preload_leading_step2  backwards, DAT0, S, DAT2, OFF
+        memcpy_leading_15bytes backwards, 1
+154:    /* Destination now 16-byte aligned; we have at least one prefetch as well as at least one 16-byte output block */
+        /* Prefetch offset is best selected such that it lies in the first 8 of each 32 bytes - but it's just as easy to aim for the first one */
+ .if backwards
+        rsb     OFF, S, #3
+        and     OFF, OFF, #28
+        sub     OFF, OFF, #32*(prefetch_distance+1)
+ .else
+        and     OFF, S, #28
+        rsb     OFF, OFF, #32*prefetch_distance
+ .endif
+        movs    DAT0, S, lsl #31
+        bhi     157f
+        bcs     156f
+        bmi     155f
+        memcpy_long_inner_loop  backwards, 0
+155:    memcpy_long_inner_loop  backwards, 1
+156:    memcpy_long_inner_loop  backwards, 2
+157:    memcpy_long_inner_loop  backwards, 3
+
+        .cfi_def_cfa_offset 16
+        .cfi_rel_offset D, 0
+        .cfi_rel_offset DAT1, 4
+        .cfi_rel_offset DAT2, 8
+        .cfi_same_value DAT3
+        .cfi_same_value DAT4
+        .cfi_same_value DAT5
+        .cfi_same_value DAT6
+        .cfi_same_value DAT7
+        .cfi_rel_offset lr, 12
+
+160:    /* Medium case */
+        preload_all  backwards, 0, 0, S, N, DAT2, OFF
+        sub     N, N, #16     /* simplifies inner loop termination */
+ .if backwards
+        ands    DAT2, D, #15
+        beq     164f
+ .else
+        ands    DAT2, D, #15
+        beq     164f
+        rsb     DAT2, DAT2, #16
+ .endif
+        memcpy_leading_15bytes backwards, align
+164:    /* Destination now 16-byte aligned; we have at least one 16-byte output block */
+        tst     S, #3
+        bne     140f
+        memcpy_medium_inner_loop  backwards, 0
+140:    memcpy_medium_inner_loop  backwards, 1
+
+170:    /* Short case, less than 31 bytes, so no guarantee of at least one 16-byte block */
+        teq     N, #0
+        beq     199f
+        preload_all  backwards, 1, 0, S, N, DAT2, LAST
+        tst     D, #3
+        beq     174f
+172:    subs    N, N, #1
+        blo     199f
+ .if backwards
+        ldrb    DAT0, [S, #-1]!
+        strb    DAT0, [D, #-1]!
+ .else
+        ldrb    DAT0, [S], #1
+        strb    DAT0, [D], #1
+ .endif
+        tst     D, #3
+        bne     172b
+174:    /* Destination now 4-byte aligned; we have 0 or more output bytes to go */
+        tst     S, #3
+        bne     140f
+        memcpy_short_inner_loop  backwards, 0
+140:    memcpy_short_inner_loop  backwards, 1
+
+        .cfi_endproc
+
+        .unreq  D
+        .unreq  S
+        .unreq  N
+        .unreq  DAT0
+        .unreq  DAT1
+        .unreq  DAT2
+        .unreq  DAT3
+        .unreq  DAT4
+        .unreq  DAT5
+        .unreq  DAT6
+        .unreq  DAT7
+        .unreq  LAST
+        .unreq  OFF
+.endm
--- /dev/null
+++ b/arch/arm/lib/memmove_rpi.S
@@ -0,0 +1,61 @@
+/*
+Copyright (c) 2013, Raspberry Pi Foundation
+Copyright (c) 2013, RISC OS Open Ltd
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the copyright holder nor the
+      names of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <linux/linkage.h>
+#include "arm-mem.h"
+#include "memcpymove.h"
+
+/* Prevent the stack from becoming executable */
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
+
+    .text
+    .arch armv6
+    .object_arch armv4
+    .arm
+    .altmacro
+    .p2align 2
+
+/*
+ * void *memmove(void *s1, const void *s2, size_t n);
+ * On entry:
+ * a1 = pointer to destination
+ * a2 = pointer to source
+ * a3 = number of bytes to copy
+ * On exit:
+ * a1 preserved
+ */
+
+.set prefetch_distance, 3
+
+ENTRY(memmove)
+        cmp     a2, a1
+        bpl     memcpy  /* pl works even over -1 - 0 and 0x7fffffff - 0x80000000 boundaries */
+        memcpy  1
+ENDPROC(memmove)
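
memmove() only has to copy backwards when the destination lies above the
source, so the "cmp a2, a1 / bpl memcpy" above tail-calls the forwards
memcpy() whenever src >= dst and otherwise expands the same macro with
backwards=1. A C sketch of that decision, where copy_forwards() and
copy_backwards() are hypothetical stand-ins for the two macro expansions:

  void *memmove_sketch(void *dst, const void *src, size_t n)
  {
          if ((const char *)src >= (char *)dst) {
                  /* Safe to copy ascending, even if the regions overlap. */
                  copy_forwards(dst, src, n);      /* same path as memcpy() */
          } else {
                  /* dst > src: copy descending so bytes are not
                   * overwritten before they are read. */
                  copy_backwards(dst, src, n);
          }
          return dst;
  }
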
--- /dev/null
+++ b/arch/arm/lib/memset_rpi.S
@@ -0,0 +1,123 @@
+/*
+Copyright (c) 2013, Raspberry Pi Foundation
+Copyright (c) 2013, RISC OS Open Ltd
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the copyright holder nor the
+      names of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <linux/linkage.h>
+#include "arm-mem.h"
+
+/* Prevent the stack from becoming executable */
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
+
+    .text
+    .arch armv6
+    .object_arch armv4
+    .arm
+    .altmacro
+    .p2align 2
+
+/*
+ *  void *memset(void *s, int c, size_t n);
+ *  On entry:
+ *  a1 = pointer to buffer to fill
+ *  a2 = byte pattern to fill with (caller-narrowed)
+ *  a3 = number of bytes to fill
+ *  On exit:
+ *  a1 preserved
+ */
+ENTRY(mmioset)
+ENTRY(memset)
+        S       .req    a1
+        DAT0    .req    a2
+        N       .req    a3
+        DAT1    .req    a4
+        DAT2    .req    ip
+        DAT3    .req    lr
+
+        orr     DAT0, DAT0, lsl #8
+        push    {S, lr}
+        orr     DAT0, DAT0, lsl #16
+        mov     DAT1, DAT0
+
+        /* See if we're guaranteed to have at least one 16-byte aligned 16-byte write */
+        cmp     N, #31
+        blo     170f
+
+161:    sub     N, N, #16     /* simplifies inner loop termination */
+        /* Leading words and bytes */
+        tst     S, #15
+        beq     164f
+        rsb     DAT3, S, #0   /* bits 0-3 = number of leading bytes until aligned */
+        movs    DAT2, DAT3, lsl #31
+        submi   N, N, #1
+        strmib  DAT0, [S], #1
+        subcs   N, N, #2
+        strcsh  DAT0, [S], #2
+        movs    DAT2, DAT3, lsl #29
+        submi   N, N, #4
+        strmi   DAT0, [S], #4
+        subcs   N, N, #8
+        stmcsia S!, {DAT0, DAT1}
+164:    /* Delayed set up of DAT2 and DAT3 so we could use them as scratch registers above */
+        mov     DAT2, DAT0
+        mov     DAT3, DAT0
+        /* Now the inner loop of 16-byte stores */
+165:    stmia   S!, {DAT0, DAT1, DAT2, DAT3}
+        subs    N, N, #16
+        bhs     165b
+166:    /* Trailing words and bytes */
+        movs    N, N, lsl #29
+        stmcsia S!, {DAT0, DAT1}
+        strmi   DAT0, [S], #4
+        movs    N, N, lsl #2
+        strcsh  DAT0, [S], #2
+        strmib  DAT0, [S]
+199:    pop     {S, pc}
+
+170:    /* Short case */
+        mov     DAT2, DAT0
+        mov     DAT3, DAT0
+        tst     S, #3
+        beq     174f
+172:    subs    N, N, #1
+        blo     199b
+        strb    DAT0, [S], #1
+        tst     S, #3
+        bne     172b
+174:    tst     N, #16
+        stmneia S!, {DAT0, DAT1, DAT2, DAT3}
+        b       166b
+
+        .unreq  S
+        .unreq  DAT0
+        .unreq  N
+        .unreq  DAT1
+        .unreq  DAT2
+        .unreq  DAT3
+ENDPROC(memset)
+ENDPROC(mmioset)
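
The first two orr instructions replicate the fill byte across a 32-bit word
before the stm-based loops take over. A C restatement of that setup only (the
assembly assumes a2 was already narrowed to a byte by the caller, so the
explicit mask here just keeps the sketch self-contained):

  /* Replicate the low byte of c into all four bytes of a word,
   * matching the "orr DAT0, DAT0, lsl #8 / #16" pair above. */
  unsigned int replicate_byte(unsigned int c)
  {
          c &= 0xff;          /* memset() takes an int but uses only the low byte */
          c |= c << 8;        /* 0x000000ab -> 0x0000abab */
          c |= c << 16;       /* 0x0000abab -> 0xabababab */
          return c;
  }
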
--- a/arch/arm/lib/uaccess_with_memcpy.c
+++ b/arch/arm/lib/uaccess_with_memcpy.c
@@ -22,6 +22,14 @@
 #include <asm/current.h>
 #include <asm/page.h>
 
+#ifndef COPY_FROM_USER_THRESHOLD
+#define COPY_FROM_USER_THRESHOLD 64
+#endif
+
+#ifndef COPY_TO_USER_THRESHOLD
+#define COPY_TO_USER_THRESHOLD 64
+#endif
+
 static int
 pin_page_for_write(const void __user *_addr, pte_t **ptep, spinlock_t **ptlp)
 {
@@ -84,7 +92,44 @@ pin_page_for_write(const void __user *_a
        return 1;
 }
 
-static unsigned long noinline
+static int
+pin_page_for_read(const void __user *_addr, pte_t **ptep, spinlock_t **ptlp)
+{
+       unsigned long addr = (unsigned long)_addr;
+       pgd_t *pgd;
+       pmd_t *pmd;
+       pte_t *pte;
+       pud_t *pud;
+       spinlock_t *ptl;
+
+       pgd = pgd_offset(current->mm, addr);
+       if (unlikely(pgd_none(*pgd) || pgd_bad(*pgd)))
+       {
+               return 0;
+       }
+       pud = pud_offset(pgd, addr);
+       if (unlikely(pud_none(*pud) || pud_bad(*pud)))
+       {
+               return 0;
+       }
+
+       pmd = pmd_offset(pud, addr);
+       if (unlikely(pmd_none(*pmd) || pmd_bad(*pmd)))
+               return 0;
+
+       pte = pte_offset_map_lock(current->mm, pmd, addr, &ptl);
+       if (unlikely(!pte_present(*pte) || !pte_young(*pte))) {
+               pte_unmap_unlock(pte, ptl);
+               return 0;
+       }
+
+       *ptep = pte;
+       *ptlp = ptl;
+
+       return 1;
+}
+
+unsigned long noinline
 __copy_to_user_memcpy(void __user *to, const void *from, unsigned long n)
 {
        unsigned long ua_flags;
@@ -137,6 +182,57 @@ out:
        return n;
 }
 
+unsigned long noinline
+__copy_from_user_memcpy(void *to, const void __user *from, unsigned long n)
+{
+       unsigned long ua_flags;
+       int atomic;
+
+       if (unlikely(segment_eq(get_fs(), KERNEL_DS))) {
+               memcpy(to, (const void *)from, n);
+               return 0;
+       }
+
+       /* the mmap semaphore is taken only if not in an atomic context */
+       atomic = in_atomic();
+
+       if (!atomic)
+               down_read(&current->mm->mmap_sem);
+       while (n) {
+               pte_t *pte;
+               spinlock_t *ptl;
+               int tocopy;
+
+               while (!pin_page_for_read(from, &pte, &ptl)) {
+                       char temp;
+                       if (!atomic)
+                               up_read(&current->mm->mmap_sem);
+                       if (__get_user(temp, (char __user *)from))
+                               goto out;
+                       if (!atomic)
+                               down_read(&current->mm->mmap_sem);
+               }
+
+               tocopy = (~(unsigned long)from & ~PAGE_MASK) + 1;
+               if (tocopy > n)
+                       tocopy = n;
+
+               ua_flags = uaccess_save_and_enable();
+               memcpy(to, (const void *)from, tocopy);
+               uaccess_restore(ua_flags);
+               to += tocopy;
+               from += tocopy;
+               n -= tocopy;
+
+               pte_unmap_unlock(pte, ptl);
+       }
+       if (!atomic)
+               up_read(&current->mm->mmap_sem);
+
+out:
+       return n;
+}
+
 unsigned long
 arm_copy_to_user(void __user *to, const void *from, unsigned long n)
 {
@@ -147,7 +243,7 @@ arm_copy_to_user(void __user *to, const
         * With frame pointer disabled, tail call optimization kicks in
         * as well making this test almost invisible.
         */
-       if (n < 64) {
+       if (n < COPY_TO_USER_THRESHOLD) {
                unsigned long ua_flags = uaccess_save_and_enable();
                n = __copy_to_user_std(to, from, n);
                uaccess_restore(ua_flags);
@@ -156,6 +252,26 @@ arm_copy_to_user(void __user *to, const
        }
        return n;
 }
+
+unsigned long __must_check
+arm_copy_from_user(void *to, const void __user *from, unsigned long n)
+{
+       /*
+        * This test is stubbed out of the main function above to keep
+        * the overhead for small copies low by avoiding a large
+        * register dump on the stack just to reload them right away.
+        * With frame pointer disabled, tail call optimization kicks in
+        * as well making this test almost invisible.
+        */
+       if (n < COPY_TO_USER_THRESHOLD) {
+               unsigned long ua_flags = uaccess_save_and_enable();
+               n = __copy_from_user_std(to, from, n);
+               uaccess_restore(ua_flags);
+       } else {
+               n = __copy_from_user_memcpy(to, from, n);
+       }
+       return n;
+}
        
 static unsigned long noinline
 __clear_user_memset(void __user *addr, unsigned long n)
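
One line in __copy_from_user_memcpy() deserves a note:
"tocopy = (~(unsigned long)from & ~PAGE_MASK) + 1" is the number of bytes
from 'from' up to the end of its page, which guarantees each memcpy() stays
within the single page whose pte was just pinned. A small C restatement
(hypothetical helper name):

  /* Bytes remaining in the page containing 'from', i.e. the most that
   * can be copied while the corresponding pte stays pinned.
   * Equivalent formulation: PAGE_SIZE - (from & ~PAGE_MASK). */
  static unsigned long bytes_to_page_end(unsigned long from)
  {
          return (~from & ~PAGE_MASK) + 1;
  }
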
--- a/arch/arm/mach-bcm/Kconfig
+++ b/arch/arm/mach-bcm/Kconfig
@@ -174,6 +174,13 @@ config ARCH_BCM_53573
          The base chip is BCM53573 and there are some packaging modifications
          like BCM47189 and BCM47452.
 
+config BCM2835_FAST_MEMCPY
+       bool "Enable optimized __copy_to_user and __copy_from_user"
+       depends on ARCH_BCM2835 && ARCH_MULTI_V6
+       default y
+       help
+         Optimized versions of __copy_to_user and __copy_from_user for Pi1.
+
 config ARCH_BCM_63XX
        bool "Broadcom BCM63xx DSL SoC"
        depends on ARCH_MULTI_V7