generic-poky/meta/recipes-core/glibc/glibc-2.10.1/arm-memcpy.patch

Upstream-Status: Inappropriate [not used]
--- /dev/null 2004-02-02 20:32:13.000000000 +0000
+++ sysdeps/arm/memmove.S 2004-03-20 18:37:23.000000000 +0000
@@ -0,0 +1,251 @@
+/*
+ * Optimized memmove implementation for ARM processors
+ *
+ * Author: Nicolas Pitre
+ * Created: Dec 23, 2003
+ * Copyright: (C) MontaVista Software, Inc.
+ *
+ * This file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This file is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ */
+
+#include <sysdep.h>
+
+
+/*
+ * Endian independent macros for shifting bytes within registers.
+ */
+#ifndef __ARMEB__
+#define pull lsr
+#define push lsl
+#else
+#define pull lsl
+#define push lsr
+#endif
+
+/*
+ * Enable data preload for architectures that support it (ARMv5 and above)
+ */
+#if defined(__ARM_ARCH_5__) || \
+ defined(__ARM_ARCH_5T__) || \
+ defined(__ARM_ARCH_5TE__)
+#define PLD(code...) code
+#else
+#define PLD(code...)
+#endif
+
+
+/* char * memmove (char *dst, const char *src, size_t n) */
+ENTRY(memmove)
+ subs ip, r0, r1
+ cmphi r2, ip
+ bls memcpy(PLT)
+
+ stmfd sp!, {r0, r4, lr}
+ add r1, r1, r2
+ add r0, r0, r2
+ subs r2, r2, #4
+ blt 25f
+ ands ip, r0, #3
+ PLD( pld [r1, #-4] )
+ bne 26f
+ ands ip, r1, #3
+ bne 27f
+
+19: subs r2, r2, #4
+ blt 24f
+ subs r2, r2, #8
+ blt 23f
+ subs r2, r2, #16
+ blt 22f
+
+ PLD( pld [r1, #-32] )
+ PLD( subs r2, r2, #96 )
+ stmfd sp!, {r5 - r8}
+ PLD( blt 21f )
+
+ PLD( @ cache alignment )
+ PLD( ands ip, r1, #31 )
+ PLD( pld [r1, #-64] )
+ PLD( beq 20f )
+ PLD( cmp r2, ip )
+ PLD( pld [r1, #-96] )
+ PLD( blt 20f )
+ PLD( cmp ip, #16 )
+ PLD( sub r2, r2, ip )
+ PLD( ldmgedb r1!, {r3 - r6} )
+ PLD( stmgedb r0!, {r3 - r6} )
+ PLD( beq 20f )
+ PLD( and ip, ip, #15 )
+ PLD( cmp ip, #8 )
+ PLD( ldr r3, [r1, #-4]! )
+ PLD( ldrge r4, [r1, #-4]! )
+ PLD( ldrgt r5, [r1, #-4]! )
+ PLD( str r3, [r0, #-4]! )
+ PLD( strge r4, [r0, #-4]! )
+ PLD( strgt r5, [r0, #-4]! )
+
+20: PLD( pld [r1, #-96] )
+ PLD( pld [r1, #-128] )
+21: ldmdb r1!, {r3, r4, ip, lr}
+ subs r2, r2, #32
+ stmdb r0!, {r3, r4, ip, lr}
+ ldmdb r1!, {r3, r4, ip, lr}
+ stmgedb r0!, {r3, r4, ip, lr}
+ ldmgedb r1!, {r3, r4, ip, lr}
+ stmgedb r0!, {r3, r4, ip, lr}
+ ldmgedb r1!, {r3, r4, ip, lr}
+ subges r2, r2, #32
+ stmdb r0!, {r3, r4, ip, lr}
+ bge 20b
+ PLD( cmn r2, #96 )
+ PLD( bge 21b )
+ PLD( add r2, r2, #96 )
+ tst r2, #31
+ ldmfd sp!, {r5 - r8}
+ ldmeqfd sp!, {r0, r4, pc}
+
+ tst r2, #16
+22: ldmnedb r1!, {r3, r4, ip, lr}
+ stmnedb r0!, {r3, r4, ip, lr}
+
+ tst r2, #8
+23: ldmnedb r1!, {r3, r4}
+ stmnedb r0!, {r3, r4}
+
+ tst r2, #4
+24: ldrne r3, [r1, #-4]!
+ strne r3, [r0, #-4]!
+
+25: ands r2, r2, #3
+ ldmeqfd sp!, {r0, r4, pc}
+
+ cmp r2, #2
+ ldrb r3, [r1, #-1]
+ ldrgeb r4, [r1, #-2]
+ ldrgtb ip, [r1, #-3]
+ strb r3, [r0, #-1]
+ strgeb r4, [r0, #-2]
+ strgtb ip, [r0, #-3]
+ ldmfd sp!, {r0, r4, pc}
+
+26: cmp ip, #2
+ ldrb r3, [r1, #-1]!
+ ldrgeb r4, [r1, #-1]!
+ ldrgtb lr, [r1, #-1]!
+ strb r3, [r0, #-1]!
+ strgeb r4, [r0, #-1]!
+ strgtb lr, [r0, #-1]!
+ subs r2, r2, ip
+ blt 25b
+ ands ip, r1, #3
+ beq 19b
+
+27: bic r1, r1, #3
+ cmp ip, #2
+ ldr r3, [r1]
+ beq 35f
+ blt 36f
+
+
+ .macro backward_copy_shift push pull
+
+ cmp r2, #12
+ PLD( pld [r1, #-4] )
+ blt 33f
+ subs r2, r2, #28
+ stmfd sp!, {r5 - r9}
+ blt 31f
+
+ PLD( subs r2, r2, #96 )
+ PLD( pld [r1, #-32] )
+ PLD( blt 30f )
+ PLD( pld [r1, #-64] )
+
+ PLD( @ cache alignment )
+ PLD( ands ip, r1, #31 )
+ PLD( pld [r1, #-96] )
+ PLD( beq 29f )
+ PLD( cmp r2, ip )
+ PLD( pld [r1, #-128] )
+ PLD( blt 29f )
+ PLD( sub r2, r2, ip )
+28: PLD( mov r4, r3, push #\push )
+ PLD( ldr r3, [r1, #-4]! )
+ PLD( subs ip, ip, #4 )
+ PLD( orr r4, r4, r3, pull #\pull )
+ PLD( str r4, [r0, #-4]! )
+ PLD( bgt 28b )
+
+29: PLD( pld [r1, #-128] )
+30: mov lr, r3, push #\push
+ ldmdb r1!, {r3 - r9, ip}
+ subs r2, r2, #32
+ orr lr, lr, ip, pull #\pull
+ mov ip, ip, push #\push
+ orr ip, ip, r9, pull #\pull
+ mov r9, r9, push #\push
+ orr r9, r9, r8, pull #\pull
+ mov r8, r8, push #\push
+ orr r8, r8, r7, pull #\pull
+ mov r7, r7, push #\push
+ orr r7, r7, r6, pull #\pull
+ mov r6, r6, push #\push
+ orr r6, r6, r5, pull #\pull
+ mov r5, r5, push #\push
+ orr r5, r5, r4, pull #\pull
+ mov r4, r4, push #\push
+ orr r4, r4, r3, pull #\pull
+ stmdb r0!, {r4 - r9, ip, lr}
+ bge 29b
+ PLD( cmn r2, #96 )
+ PLD( bge 30b )
+ PLD( add r2, r2, #96 )
+ cmn r2, #16
+ blt 32f
+31: mov r7, r3, push #\push
+ ldmdb r1!, {r3 - r6}
+ sub r2, r2, #16
+ orr r7, r7, r6, pull #\pull
+ mov r6, r6, push #\push
+ orr r6, r6, r5, pull #\pull
+ mov r5, r5, push #\push
+ orr r5, r5, r4, pull #\pull
+ mov r4, r4, push #\push
+ orr r4, r4, r3, pull #\pull
+ stmdb r0!, {r4 - r7}
+32: adds r2, r2, #28
+ ldmfd sp!, {r5 - r9}
+ blt 34f
+33: mov r4, r3, push #\push
+ ldr r3, [r1, #-4]!
+ subs r2, r2, #4
+ orr r4, r4, r3, pull #\pull
+ str r4, [r0, #-4]!
+ bge 33b
+34:
+ .endm
+
+
+ backward_copy_shift push=8 pull=24
+ add r1, r1, #3
+ b 25b
+
+35: backward_copy_shift push=16 pull=16
+ add r1, r1, #2
+ b 25b
+
+36: backward_copy_shift push=24 pull=8
+ add r1, r1, #1
+ b 25b
+
+ .size memmove, . - memmove
+END(memmove)
+libc_hidden_builtin_def (memmove)
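For reference, the dispatch at ENTRY(memmove) above (subs ip, r0, r1 /
cmphi r2, ip / bls memcpy(PLT)) falls through to memcpy whenever a forward
copy cannot clobber the source, and runs the backward-copying body only on
destructive overlap. A minimal C sketch of the same logic follows; the
function name is illustrative and not part of the patch:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

void *memmove_sketch(void *dst, const void *src, size_t n)
{
    uintptr_t d = (uintptr_t)dst, s = (uintptr_t)src;

    /* Forward copy is safe when dst is below src, or far enough above
       it that the regions cannot collide (n <= dst - src).  This is
       what the subs/cmphi/bls sequence tests with one flag setting. */
    if (d <= s || n <= d - s)
        return memcpy(dst, src, n);

    /* Destructive overlap: copy backwards, highest address first,
       as the assembly above does a word (or eight words) at a time. */
    unsigned char *dp = (unsigned char *)dst + n;
    const unsigned char *sp = (const unsigned char *)src + n;
    while (n--)
        *--dp = *--sp;
    return dst;
}

The d <= s case is made explicit here; the assembly gets it for free
because cmphi only executes when the subtraction flagged dst > src.
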
--- /dev/null 2004-02-02 20:32:13.000000000 +0000
+++ sysdeps/arm/bcopy.S 2004-03-20 18:37:48.000000000 +0000
@@ -0,0 +1,255 @@
+/*
+ * Optimized bcopy implementation for ARM processors
+ *
+ * Author: Nicolas Pitre
+ * Created: Dec 23, 2003
+ * Copyright: (C) MontaVista Software, Inc.
+ *
+ * This file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This file is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ */
+
+#include <sysdep.h>
+
+
+/*
+ * Endian independent macros for shifting bytes within registers.
+ */
+#ifndef __ARMEB__
+#define pull lsr
+#define push lsl
+#else
+#define pull lsl
+#define push lsr
+#endif
+
+/*
+ * Enable data preload for architectures that support it (ARMv5 and above)
+ */
+#if defined(__ARM_ARCH_5__) || \
+ defined(__ARM_ARCH_5T__) || \
+ defined(__ARM_ARCH_5TE__)
+#define PLD(code...) code
+#else
+#define PLD(code...)
+#endif
+
+dst .req r1
+src .req r0
+
+/* void bcopy (const char *src, char *dst, size_t size) */
+ENTRY(bcopy)
+ subs ip, dst, src
+ cmphi r2, ip
+ movls r3, r0
+ movls r0, r1
+ movls r1, r3
+ bls memcpy(PLT)
+
+ stmfd sp!, {r4, lr}
+ add src, src, r2
+ add dst, dst, r2
+ subs r2, r2, #4
+ blt 25f
+ ands ip, dst, #3
+ PLD( pld [src, #-4] )
+ bne 26f
+ ands ip, src, #3
+ bne 27f
+
+19: subs r2, r2, #4
+ blt 24f
+ subs r2, r2, #8
+ blt 23f
+ subs r2, r2, #16
+ blt 22f
+
+ PLD( pld [src, #-32] )
+ PLD( subs r2, r2, #96 )
+ stmfd sp!, {r5 - r8}
+ PLD( blt 21f )
+
+ PLD( @ cache alignment )
+ PLD( ands ip, src, #31 )
+ PLD( pld [src, #-64] )
+ PLD( beq 20f )
+ PLD( cmp r2, ip )
+ PLD( pld [src, #-96] )
+ PLD( blt 20f )
+ PLD( cmp ip, #16 )
+ PLD( sub r2, r2, ip )
+ PLD( ldmgedb src!, {r3 - r6} )
+ PLD( stmgedb dst!, {r3 - r6} )
+ PLD( beq 20f )
+ PLD( and ip, ip, #15 )
+ PLD( cmp ip, #8 )
+ PLD( ldr r3, [src, #-4]! )
+ PLD( ldrge r4, [src, #-4]! )
+ PLD( ldrgt r5, [src, #-4]! )
+ PLD( str r3, [dst, #-4]! )
+ PLD( strge r4, [dst, #-4]! )
+ PLD( strgt r5, [dst, #-4]! )
+
+20: PLD( pld [src, #-96] )
+ PLD( pld [src, #-128] )
+21: ldmdb src!, {r3, r4, ip, lr}
+ subs r2, r2, #32
+ stmdb dst!, {r3, r4, ip, lr}
+ ldmdb src!, {r3, r4, ip, lr}
+ stmgedb dst!, {r3, r4, ip, lr}
+ ldmgedb src!, {r3, r4, ip, lr}
+ stmgedb dst!, {r3, r4, ip, lr}
+ ldmgedb src!, {r3, r4, ip, lr}
+ subges r2, r2, #32
+ stmdb dst!, {r3, r4, ip, lr}
+ bge 20b
+ PLD( cmn r2, #96 )
+ PLD( bge 21b )
+ PLD( add r2, r2, #96 )
+ tst r2, #31
+ ldmfd sp!, {r5 - r8}
+ ldmeqfd sp!, {r4, pc}
+
+ tst r2, #16
+22: ldmnedb src!, {r3, r4, ip, lr}
+ stmnedb dst!, {r3, r4, ip, lr}
+
+ tst r2, #8
+23: ldmnedb src!, {r3, r4}
+ stmnedb dst!, {r3, r4}
+
+ tst r2, #4
+24: ldrne r3, [src, #-4]!
+ strne r3, [dst, #-4]!
+
+25: ands r2, r2, #3
+ ldmeqfd sp!, {r4, pc}
+
+ cmp r2, #2
+ ldrb r3, [src, #-1]
+ ldrgeb r4, [src, #-2]
+ ldrgtb ip, [src, #-3]
+ strb r3, [dst, #-1]
+ strgeb r4, [dst, #-2]
+ strgtb ip, [dst, #-3]
+ ldmfd sp!, {r4, pc}
+
+26: cmp ip, #2
+ ldrb r3, [src, #-1]!
+ ldrgeb r4, [src, #-1]!
+ ldrgtb lr, [src, #-1]!
+ strb r3, [dst, #-1]!
+ strgeb r4, [dst, #-1]!
+ strgtb lr, [dst, #-1]!
+ subs r2, r2, ip
+ blt 25b
+ ands ip, src, #3
+ beq 19b
+
+27: bic src, src, #3
+ cmp ip, #2
+ ldr r3, [src]
+ beq 35f
+ blt 36f
+
+
+ .macro backward_copy_shift push pull
+
+ cmp r2, #12
+ PLD( pld [src, #-4] )
+ blt 33f
+ subs r2, r2, #28
+ stmfd sp!, {r5 - r9}
+ blt 31f
+
+ PLD( subs r2, r2, #96 )
+ PLD( pld [src, #-32] )
+ PLD( blt 30f )
+ PLD( pld [src, #-64] )
+
+ PLD( @ cache alignment )
+ PLD( ands ip, src, #31 )
+ PLD( pld [src, #-96] )
+ PLD( beq 29f )
+ PLD( cmp r2, ip )
+ PLD( pld [src, #-128] )
+ PLD( blt 29f )
+ PLD( sub r2, r2, ip )
+28: PLD( mov r4, r3, push #\push )
+ PLD( ldr r3, [src, #-4]! )
+ PLD( subs ip, ip, #4 )
+ PLD( orr r4, r4, r3, pull #\pull )
+ PLD( str r4, [dst, #-4]! )
+ PLD( bgt 28b )
+
+29: PLD( pld [src, #-128] )
+30: mov lr, r3, push #\push
+ ldmdb src!, {r3 - r9, ip}
+ subs r2, r2, #32
+ orr lr, lr, ip, pull #\pull
+ mov ip, ip, push #\push
+ orr ip, ip, r9, pull #\pull
+ mov r9, r9, push #\push
+ orr r9, r9, r8, pull #\pull
+ mov r8, r8, push #\push
+ orr r8, r8, r7, pull #\pull
+ mov r7, r7, push #\push
+ orr r7, r7, r6, pull #\pull
+ mov r6, r6, push #\push
+ orr r6, r6, r5, pull #\pull
+ mov r5, r5, push #\push
+ orr r5, r5, r4, pull #\pull
+ mov r4, r4, push #\push
+ orr r4, r4, r3, pull #\pull
+ stmdb dst!, {r4 - r9, ip, lr}
+ bge 29b
+ PLD( cmn r2, #96 )
+ PLD( bge 30b )
+ PLD( add r2, r2, #96 )
+ cmn r2, #16
+ blt 32f
+31: mov r7, r3, push #\push
+ ldmdb src!, {r3 - r6}
+ sub r2, r2, #16
+ orr r7, r7, r6, pull #\pull
+ mov r6, r6, push #\push
+ orr r6, r6, r5, pull #\pull
+ mov r5, r5, push #\push
+ orr r5, r5, r4, pull #\pull
+ mov r4, r4, push #\push
+ orr r4, r4, r3, pull #\pull
+ stmdb dst!, {r4 - r7}
+32: adds r2, r2, #28
+ ldmfd sp!, {r5 - r9}
+ blt 34f
+33: mov r4, r3, push #\push
+ ldr r3, [src, #-4]!
+ subs r2, r2, #4
+ orr r4, r4, r3, pull #\pull
+ str r4, [dst, #-4]!
+ bge 33b
+34:
+ .endm
+
+
+ backward_copy_shift push=8 pull=24
+ add src, src, #3
+ b 25b
+
+35: backward_copy_shift push=16 pull=16
+ add src, src, #2
+ b 25b
+
+36: backward_copy_shift push=24 pull=8
+ add src, src, #1
+ b 25b
+
+ .size bcopy, . - bcopy
+END(bcopy)
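bcopy differs from memmove only in argument order and in returning
nothing, which is why the file above aliases dst .req r1 and src .req r0
and swaps r0/r1 (the three movls instructions) only on the
non-overlapping path before tail-calling memcpy. Functionally the whole
file reduces to this portable C sketch (the name is illustrative):

#include <stddef.h>
#include <string.h>

void bcopy_sketch(const void *src, void *dst, size_t n)
{
    /* bcopy: reversed argument order, return value discarded */
    memmove(dst, src, n);
}
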
--- /dev/null 2004-02-02 20:32:13.000000000 +0000
+++ sysdeps/arm/memcpy.S 2004-05-02 14:33:22.000000000 +0100
@@ -0,0 +1,242 @@
+/*
+ * Optimized memcpy implementation for ARM processors
+ *
+ * Author: Nicolas Pitre
+ * Created: Dec 23, 2003
+ * Copyright: (C) MontaVista Software, Inc.
+ *
+ * This file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This file is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ */
+
+#include <sysdep.h>
+
+
+/*
+ * Endian independent macros for shifting bytes within registers.
+ */
+#ifndef __ARMEB__
+#define pull lsr
+#define push lsl
+#else
+#define pull lsl
+#define push lsr
+#endif
+
+/*
+ * Enable data preload for architectures that support it (ARMv5 and above)
+ */
+#if defined(__ARM_ARCH_5__) || \
+ defined(__ARM_ARCH_5T__) || \
+ defined(__ARM_ARCH_5TE__)
+#define PLD(code...) code
+#else
+#define PLD(code...)
+#endif
+
+
+/* char * memcpy (char *dst, const char *src, size_t n) */
+
+ENTRY(memcpy)
+ subs r2, r2, #4
+ stmfd sp!, {r0, r4, lr}
+ blt 7f
+ ands ip, r0, #3
+ PLD( pld [r1, #0] )
+ bne 8f
+ ands ip, r1, #3
+ bne 9f
+
+1: subs r2, r2, #4
+ blt 6f
+ subs r2, r2, #8
+ blt 5f
+ subs r2, r2, #16
+ blt 4f
+
+ PLD( subs r2, r2, #65 )
+ stmfd sp!, {r5 - r8}
+ PLD( blt 3f )
+ PLD( pld [r1, #32] )
+
+ PLD( @ cache alignment )
+ PLD( ands ip, r1, #31 )
+ PLD( pld [r1, #64] )
+ PLD( beq 2f )
+ PLD( rsb ip, ip, #32 )
+ PLD( cmp r2, ip )
+ PLD( pld [r1, #96] )
+ PLD( blt 2f )
+ PLD( cmp ip, #16 )
+ PLD( sub r2, r2, ip )
+ PLD( ldmgeia r1!, {r3 - r6} )
+ PLD( stmgeia r0!, {r3 - r6} )
+ PLD( beq 2f )
+ PLD( and ip, ip, #15 )
+ PLD( cmp ip, #8 )
+ PLD( ldr r3, [r1], #4 )
+ PLD( ldrge r4, [r1], #4 )
+ PLD( ldrgt r5, [r1], #4 )
+ PLD( str r3, [r0], #4 )
+ PLD( strge r4, [r0], #4 )
+ PLD( strgt r5, [r0], #4 )
+
+2: PLD( pld [r1, #96] )
+3: ldmia r1!, {r3 - r8, ip, lr}
+ subs r2, r2, #32
+ stmia r0!, {r3 - r8, ip, lr}
+ bge 2b
+ PLD( cmn r2, #65 )
+ PLD( bge 3b )
+ PLD( add r2, r2, #65 )
+ tst r2, #31
+ ldmfd sp!, {r5 - r8}
+ ldmeqfd sp!, {r0, r4, pc}
+
+ tst r2, #16
+4: ldmneia r1!, {r3, r4, ip, lr}
+ stmneia r0!, {r3, r4, ip, lr}
+
+ tst r2, #8
+5: ldmneia r1!, {r3, r4}
+ stmneia r0!, {r3, r4}
+
+ tst r2, #4
+6: ldrne r3, [r1], #4
+ strne r3, [r0], #4
+
+7: ands r2, r2, #3
+ ldmeqfd sp!, {r0, r4, pc}
+
+ cmp r2, #2
+ ldrb r3, [r1], #1
+ ldrgeb r4, [r1], #1
+ ldrgtb ip, [r1]
+ strb r3, [r0], #1
+ strgeb r4, [r0], #1
+ strgtb ip, [r0]
+ ldmfd sp!, {r0, r4, pc}
+
+8: rsb ip, ip, #4
+ cmp ip, #2
+ ldrb r3, [r1], #1
+ ldrgeb r4, [r1], #1
+ ldrgtb lr, [r1], #1
+ strb r3, [r0], #1
+ strgeb r4, [r0], #1
+ strgtb lr, [r0], #1
+ subs r2, r2, ip
+ blt 7b
+ ands ip, r1, #3
+ beq 1b
+
+9: bic r1, r1, #3
+ cmp ip, #2
+ ldr lr, [r1], #4
+ beq 17f
+ bgt 18f
+
+
+ .macro forward_copy_shift pull push
+
+ cmp r2, #12
+ PLD( pld [r1, #0] )
+ blt 15f
+ subs r2, r2, #28
+ stmfd sp!, {r5 - r9}
+ blt 13f
+
+ PLD( subs r2, r2, #97 )
+ PLD( blt 12f )
+ PLD( pld [r1, #32] )
+
+ PLD( @ cache alignment )
+ PLD( rsb ip, r1, #36 )
+ PLD( pld [r1, #64] )
+ PLD( ands ip, ip, #31 )
+ PLD( pld [r1, #96] )
+ PLD( beq 11f )
+ PLD( cmp r2, ip )
+ PLD( pld [r1, #128] )
+ PLD( blt 11f )
+ PLD( sub r2, r2, ip )
+10: PLD( mov r3, lr, pull #\pull )
+ PLD( ldr lr, [r1], #4 )
+ PLD( subs ip, ip, #4 )
+ PLD( orr r3, r3, lr, push #\push )
+ PLD( str r3, [r0], #4 )
+ PLD( bgt 10b )
+
+11: PLD( pld [r1, #128] )
+12: mov r3, lr, pull #\pull
+ ldmia r1!, {r4 - r9, ip, lr}
+ subs r2, r2, #32
+ orr r3, r3, r4, push #\push
+ mov r4, r4, pull #\pull
+ orr r4, r4, r5, push #\push
+ mov r5, r5, pull #\pull
+ orr r5, r5, r6, push #\push
+ mov r6, r6, pull #\pull
+ orr r6, r6, r7, push #\push
+ mov r7, r7, pull #\pull
+ orr r7, r7, r8, push #\push
+ mov r8, r8, pull #\pull
+ orr r8, r8, r9, push #\push
+ mov r9, r9, pull #\pull
+ orr r9, r9, ip, push #\push
+ mov ip, ip, pull #\pull
+ orr ip, ip, lr, push #\push
+ stmia r0!, {r3 - r9, ip}
+ bge 11b
+ PLD( cmn r2, #97 )
+ PLD( bge 12b )
+ PLD( add r2, r2, #97 )
+ cmn r2, #16
+ blt 14f
+13: mov r3, lr, pull #\pull
+ ldmia r1!, {r4 - r6, lr}
+ sub r2, r2, #16
+ orr r3, r3, r4, push #\push
+ mov r4, r4, pull #\pull
+ orr r4, r4, r5, push #\push
+ mov r5, r5, pull #\pull
+ orr r5, r5, r6, push #\push
+ mov r6, r6, pull #\pull
+ orr r6, r6, lr, push #\push
+ stmia r0!, {r3 - r6}
+14: adds r2, r2, #28
+ ldmfd sp!, {r5 - r9}
+ blt 16f
+15: mov r3, lr, pull #\pull
+ ldr lr, [r1], #4
+ subs r2, r2, #4
+ orr r3, r3, lr, push #\push
+ str r3, [r0], #4
+ bge 15b
+16:
+ .endm
+
+
+ forward_copy_shift pull=8 push=24
+ sub r1, r1, #3
+ b 7b
+
+17: forward_copy_shift pull=16 push=16
+ sub r1, r1, #2
+ b 7b
+
+18: forward_copy_shift pull=24 push=8
+ sub r1, r1, #1
+ b 7b
+
+ .size memcpy, . - memcpy
+END(memcpy)
+libc_hidden_builtin_def (memcpy)
+
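The forward_copy_shift macro above is the heart of the misaligned case:
after bic r1, r1, #3 rounds the source down to a word boundary, every
iteration loads only aligned words and splices adjacent words together
with the endian-dependent pull/push shifts defined at the top of each
file. A minimal little-endian C sketch of the pull=8/push=24 variant
(source one byte past a word boundary); the function name and the
standalone framing are illustrative, not glibc's:

#include <stddef.h>
#include <stdint.h>

static void copy_shift1_le(uint32_t *dst, const uint8_t *src, size_t nwords)
{
    /* Round src down to its enclosing aligned word, as bic r1, r1, #3
       does, and prime the loop with one aligned load (ldr lr, [r1], #4). */
    const uint32_t *s = (const uint32_t *)((uintptr_t)src & ~(uintptr_t)3);
    uint32_t cur = *s++;

    while (nwords--) {
        uint32_t next = *s++;
        /* mov r3, lr, lsr #8 ; orr r3, r3, rN, lsl #24 */
        *dst++ = (cur >> 8) | (next << 24);
        cur = next;
    }
}

On big-endian the shift directions swap, which is exactly what the
#ifndef __ARMEB__ block encodes. Like the assembly, the sketch may read
up to three bytes past the end of the source, but never beyond its last
aligned word, so it cannot cross into a new page.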