sh: Optimised memset for SH4
author Stuart Menefy <stuart.menefy@st.com>
Tue, 27 Oct 2009 15:14:06 +0000 (15:14 +0000)
committer Paul Mundt <lethal@linux-sh.org>
Tue, 24 Nov 2009 07:28:43 +0000 (16:28 +0900)
Optimised version of memset for the SH4 which uses movca.l.

Signed-off-by: Stuart Menefy <stuart.menefy@st.com>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
arch/sh/lib/Makefile
arch/sh/lib/memset-sh4.S [new file with mode: 0644]

index a969b47c54637518f651a2cdba8575b84f58ec45..dab4d2129812365deb1d64cf5975fa23efc8381b 100644 (file)
@@ -2,7 +2,7 @@
 # Makefile for SuperH-specific library files..
 #
 
-lib-y  = delay.o memset.o memmove.o memchr.o \
+lib-y  = delay.o memmove.o memchr.o \
         checksum.o strlen.o div64.o div64-generic.o
 
 # Extracted from libgcc
@@ -23,8 +23,11 @@ obj-y                                += io.o
 memcpy-y                       := memcpy.o
 memcpy-$(CONFIG_CPU_SH4)       := memcpy-sh4.o
 
+memset-y                       := memset.o
+memset-$(CONFIG_CPU_SH4)       := memset-sh4.o
+
 lib-$(CONFIG_MMU)              += copy_page.o __clear_user.o
 lib-$(CONFIG_MCOUNT)           += mcount.o
-lib-y                          += $(memcpy-y) $(udivsi3-y)
+lib-y                          += $(memcpy-y) $(memset-y) $(udivsi3-y)
 
 EXTRA_CFLAGS += -Werror
diff --git a/arch/sh/lib/memset-sh4.S b/arch/sh/lib/memset-sh4.S
new file mode 100644 (file)
index 0000000..1a6e32c
--- /dev/null
@@ -0,0 +1,107 @@
+/*
+ * "memset" implementation for SH4
+ *
+ * Copyright (C) 1999  Niibe Yutaka
+ * Copyright (c) 2009  STMicroelectronics Limited
+ * Author: Stuart Menefy <stuart.menefy@st.com>
+ */
+
+/*
+ *            void *memset(void *s, int c, size_t n);
+ */
+
+#include <linux/linkage.h>
+
+ENTRY(memset)                  ! in: r4 = s, r5 = c, r6 = n   out: r0 = s
+       mov     #12,r0
+       add     r6,r4           ! r4 = s + n: the buffer is filled backwards from its end
+       cmp/gt  r6,r0           ! T = (12 > n), signed compare
+       bt/s    40f             ! n < 12: too small, set a byte at a time
+        mov    r4,r0           ! (delay slot) r0 = end address, for the alignment test
+       and     #3,r0           ! r0 = bytes lying above the last 4-byte boundary
+       cmp/eq  #0,r0
+       bt/s    2f              ! end address is already word aligned
+        sub    r0,r6           ! (delay slot) those bytes come off the count (no-op if r0 = 0)
+1:                             ! byte stores until r4 is 4-byte aligned (r0 is 1..3 here)
+       dt      r0              ! r0 = r0 - 1, T = (r0 == 0)
+       bf/s    1b
+        mov.b  r5,@-r4         ! (delay slot) runs on every pass, including the last
+2:                             ! make VVVV: replicate the fill byte into all 4 bytes of r5
+       extu.b  r5,r5           ! keep only the low byte of c
+       swap.b  r5,r0           !   V0
+       or      r0,r5           !   VV
+       swap.w  r5,r0           ! VV00
+       or      r0,r5           ! VVVV
+
+       ! Check if enough bytes need to be set to be worth the big loop
+       mov     #0x40, r0       ! (MT)
+       cmp/gt  r6,r0           ! (MT)  64 > len => slow loop
+
+       bt/s    22f             ! fewer than 64 bytes left: go straight to the 8-byte loop
+        mov    r6,r0           ! (delay slot) r0 = remaining length, as label 22 expects
+
+       ! align the dst to the cache block size if necessary
+       mov     r4, r3
+       mov     #~(0x1f), r1    ! r1 = mask that clears the low 5 address bits
+
+       and     r3, r1          ! r1 = r4 rounded down to a 32-byte boundary
+       cmp/eq  r3, r1          ! T = (r4 is already on that boundary)
+
+       bt/s    11f             ! dst is already aligned
+        sub    r1, r3          ! r3-r1 -> r3 (bytes above the boundary, a multiple of 4)
+       shlr2   r3              ! number of loops (one word store per pass)
+
+10:    mov.l   r5,@-r4         ! word stores down to the 32-byte boundary
+       dt      r3
+       bf/s    10b
+        add    #-4, r6         ! (delay slot) account for the word just stored
+
+11:    ! dst is 32byte aligned
+       mov     r6,r2
+       mov     #-5,r0          ! negative count makes shld shift right
+       shld    r0,r2           ! number of loops (r6 >> 5, whole 32-byte blocks; at least 1 here)
+
+       add     #-32, r4        ! r4 = base of the topmost block to fill
+       mov     r5, r0          ! movca.l takes its source operand in r0
+12:                            ! one 32-byte block per pass
+       movca.l r0,@r4          ! first word of the block, stored with cache block allocation
+       mov.l   r5,@(4, r4)
+       mov.l   r5,@(8, r4)
+       mov.l   r5,@(12,r4)
+       mov.l   r5,@(16,r4)
+       mov.l   r5,@(20,r4)
+       add     #-0x20, r6      ! 32 bytes done (placed between stores, presumably for scheduling)
+       mov.l   r5,@(24,r4)
+       dt      r2              ! T = (no blocks left); mov.l below leaves T untouched
+       mov.l   r5,@(28,r4)
+       bf/s    12b
+        add    #-32, r4        ! (delay slot) step down one block, also after the last pass
+
+       add     #32, r4         ! undo the extra step taken by the final delay slot
+       mov     #8, r0
+       cmp/ge  r0, r6          ! T = (r6 >= 8)
+       bf      40f             ! fewer than 8 bytes left: finish with the byte loop
+
+       mov     r6,r0           ! r0 = remaining length, as label 22 expects
+22:                            ! entry: r0 = r6 = bytes left (at least 8), r4 word aligned
+       shlr2   r0
+       shlr    r0              ! r0 = r6 >> 3
+3:
+       dt      r0
+       mov.l   r5,@-r4         ! set 8-byte at once
+       bf/s    3b
+        mov.l  r5,@-r4         ! (delay slot) second word of the 8-byte pair
+       !
+       mov     #7,r0
+       and     r0,r6           ! r6 = leftover bytes (0..7) for the byte loop
+
+       ! fill bytes (length may be zero)
+40:    tst     r6,r6           ! T = (r6 == 0)
+       bt      5f              ! nothing left to set
+4:
+       dt      r6
+       bf/s    4b
+        mov.b  r5,@-r4         ! (delay slot) one byte per pass
+5:
+       rts
+        mov    r4,r0           ! (delay slot) r4 has walked back down to s: return it