memcpy-y := memcpy.o
memcpy-$(CONFIG_CPU_SH4) := memcpy-sh4.o
-lib-y += $(memcpy-y)
+lib-$(CONFIG_MMU) += copy_page.o clear_page.o
+lib-y += $(memcpy-y)
EXTRA_CFLAGS += -Werror
--- /dev/null
+/*
+ * __clear_user_page, __clear_user, clear_page implementation for SuperH
+ *
+ * Copyright (C) 2001 Kaz Kojima
+ * Copyright (C) 2001, 2002 Niibe Yutaka
+ * Copyright (C) 2006 Paul Mundt
+ */
+#include <linux/linkage.h>
+#include <asm/page.h>
+
+/*
+ * clear_page_slow
+ * @to: P1 address
+ *
+ * void clear_page_slow(void *to)
+ */
+
+/*
+ * r0 --- scratch
+ * r4 --- to
+ * r5 --- to + PAGE_SIZE
+ */
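+
+/*
+ * Each pass of the loop below clears one 32-byte cache line: the first
+ * store (movca.l on SH-4) allocates the line without reading it from
+ * memory, the seven pre-decrement stores fill in the rest and, on SH-4,
+ * ocbwb writes the line back.  A rough C-level sketch of the same loop:
+ *
+ *	unsigned long *p = to, *end = to + PAGE_SIZE;
+ *	while (p < end)
+ *		*p++ = 0;
+ */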
+ENTRY(clear_page_slow)
+ mov r4,r5
+ mov.l .Llimit,r0
+ add r0,r5
+ mov #0,r0
+ !
+1:
+#if defined(CONFIG_CPU_SH3)
+ mov.l r0,@r4
+#elif defined(CONFIG_CPU_SH4)
+ movca.l r0,@r4
+ mov r4,r1
+#endif
+ add #32,r4
+ mov.l r0,@-r4
+ mov.l r0,@-r4
+ mov.l r0,@-r4
+ mov.l r0,@-r4
+ mov.l r0,@-r4
+ mov.l r0,@-r4
+ mov.l r0,@-r4
+#if defined(CONFIG_CPU_SH4)
+ ocbwb @r1
+#endif
+ cmp/eq r5,r4
+ bf/s 1b
+ add #28,r4
+ !
+ rts
+ nop
+.Llimit: .long (PAGE_SIZE-28)
+
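+/*
+ * __clear_user
+ * r4 --- destination address (user space)
+ * r5 --- length in bytes
+ *
+ * Returns 0 on success, or the number of bytes left uncleared if one of
+ * the tagged stores faults (see .Lbad_clear_user and the __ex_table
+ * entries at the end of this file).
+ */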
+ENTRY(__clear_user)
+ !
+ mov #0, r0
+ mov #0xe0, r1 ! 0xffffffe0
+ !
+ ! r4..(r4+31)&~31 -------- not aligned [ Area 0 ]
+ ! (r4+31)&~31..(r4+r5)&~31 -------- aligned [ Area 1 ]
+ ! (r4+r5)&~31..r4+r5 -------- not aligned [ Area 2 ]
+ !
+ ! Clear area 0
+ mov r4, r2
+ !
+ tst r1, r5 ! length < 32
+ bt .Larea2 ! skip to remainder
+ !
+ add #31, r2
+ and r1, r2
+ cmp/eq r4, r2
+ bt .Larea1
+ mov r2, r3
+ sub r4, r3
+ mov r3, r7
+ mov r4, r2
+ !
+.L0: dt r3
+0: mov.b r0, @r2
+ bf/s .L0
+ add #1, r2
+ !
+ sub r7, r5
+ mov r2, r4
+.Larea1:
+ mov r4, r3
+ add r5, r3
+ and r1, r3
+ cmp/hi r2, r3
+ bf .Larea2
+ !
+ ! Clear area 1
+#if defined(CONFIG_CPU_SH4)
+1: movca.l r0, @r2
+#else
+1: mov.l r0, @r2
+#endif
+ add #4, r2
+2: mov.l r0, @r2
+ add #4, r2
+3: mov.l r0, @r2
+ add #4, r2
+4: mov.l r0, @r2
+ add #4, r2
+5: mov.l r0, @r2
+ add #4, r2
+6: mov.l r0, @r2
+ add #4, r2
+7: mov.l r0, @r2
+ add #4, r2
+8: mov.l r0, @r2
+ add #4, r2
+ cmp/hi r2, r3
+ bt/s 1b
+ nop
+ !
+ ! Clear area 2
+.Larea2:
+ mov r4, r3
+ add r5, r3
+ cmp/hs r3, r2
+ bt/s .Ldone
+ sub r2, r3
+.L2: dt r3
+9: mov.b r0, @r2
+ bf/s .L2
+ add #1, r2
+ !
+.Ldone: rts
+ mov #0, r0 ! return 0 as normal return
+
+ ! return the number of bytes remaining
+.Lbad_clear_user:
+ mov r4, r0
+ add r5, r0
+ rts
+ sub r2, r0
+
+.section __ex_table,"a"
+ .align 2
+ .long 0b, .Lbad_clear_user
+ .long 1b, .Lbad_clear_user
+ .long 2b, .Lbad_clear_user
+ .long 3b, .Lbad_clear_user
+ .long 4b, .Lbad_clear_user
+ .long 5b, .Lbad_clear_user
+ .long 6b, .Lbad_clear_user
+ .long 7b, .Lbad_clear_user
+ .long 8b, .Lbad_clear_user
+ .long 9b, .Lbad_clear_user
+.previous
--- /dev/null
+/*
+ * copy_page, __copy_user_page, __copy_user implementation for SuperH
+ *
+ * Copyright (C) 2001 Niibe Yutaka & Kaz Kojima
+ * Copyright (C) 2002 Toshinobu Sugioka
+ * Copyright (C) 2006 Paul Mundt
+ */
+#include <linux/linkage.h>
+#include <asm/page.h>
+
+/*
+ * copy_page
+ * @to: P1 address
+ * @from: P1 address
+ *
+ * void copy_page(void *to, void *from)
+ */
+
+/*
+ * r0, r1, r2, r3, r4, r5, r6, r7 --- scratch
+ * r8 --- from + PAGE_SIZE
+ * r9 --- not used
+ * r10 --- to
+ * r11 --- from
+ */
+ENTRY(copy_page)
+ mov.l r8,@-r15
+ mov.l r10,@-r15
+ mov.l r11,@-r15
+ mov r4,r10
+ mov r5,r11
+ mov r5,r8
+ mov.l .Lpsz,r0
+ add r0,r8
+ !
+1: mov.l @r11+,r0
+ mov.l @r11+,r1
+ mov.l @r11+,r2
+ mov.l @r11+,r3
+ mov.l @r11+,r4
+ mov.l @r11+,r5
+ mov.l @r11+,r6
+ mov.l @r11+,r7
+#if defined(CONFIG_CPU_SH3)
+ mov.l r0,@r10
+#elif defined(CONFIG_CPU_SH4)
+ movca.l r0,@r10
+ mov r10,r0
+#endif
+ add #32,r10
+ mov.l r7,@-r10
+ mov.l r6,@-r10
+ mov.l r5,@-r10
+ mov.l r4,@-r10
+ mov.l r3,@-r10
+ mov.l r2,@-r10
+ mov.l r1,@-r10
+#if defined(CONFIG_CPU_SH4)
+ ocbwb @r0
+#endif
+ cmp/eq r11,r8
+ bf/s 1b
+ add #28,r10
+ !
+ mov.l @r15+,r11
+ mov.l @r15+,r10
+ mov.l @r15+,r8
+ rts
+ nop
+
+ .align 2
+.Lpsz: .long PAGE_SIZE
+/*
+ * __kernel_size_t __copy_user(void *to, const void *from, __kernel_size_t n);
+ * Return the number of bytes NOT copied
+ */
+#define EX(...) \
+ 9999: __VA_ARGS__ ; \
+ .section __ex_table, "a"; \
+ .long 9999b, 6000f ; \
+ .previous
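+
+/*
+ * EX() tags a user access that may fault: label 9999 marks the
+ * instruction, and the __ex_table entry sends a fault on it to the
+ * local fixup at 6000, which computes the number of bytes not copied
+ * (r3 holds the last destination address, r4 the current one) and
+ * returns that count.
+ */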
+ENTRY(__copy_user)
+ ! Check if small number of bytes
+ mov #11,r0
+ mov r4,r3
+ cmp/gt r0,r6 ! r6 (len) > r0 (11)
+ bf/s .L_cleanup_loop_no_pop
+ add r6,r3 ! last destination address
+
+ ! Calculate bytes needed to longword-align src
+ mov.l r11,@-r15
+ neg r5,r0
+ mov.l r10,@-r15
+ add #4,r0
+ mov.l r9,@-r15
+ and #3,r0
+ mov.l r8,@-r15
+ tst r0,r0
+ bt 2f
+
+1:
+ ! Copy bytes to longword-align src
+EX( mov.b @r5+,r1 )
+ dt r0
+ add #-1,r6
+EX( mov.b r1,@r4 )
+ bf/s 1b
+ add #1,r4
+
+ ! Jump to appropriate routine depending on dest
+2: mov #3,r1
+ mov r6, r2
+ and r4,r1
+ shlr2 r2
+ shll2 r1
+ mova .L_jump_tbl,r0
+ mov.l @(r0,r1),r1
+ jmp @r1
+ nop
+
+ .align 2
+.L_jump_tbl:
+ .long .L_dest00
+ .long .L_dest01
+ .long .L_dest10
+ .long .L_dest11
+
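+/*
+ * On entry to the .L_dest* routines below, r5 (src) is longword
+ * aligned, the low two bits of the destination address selected the
+ * routine via .L_jump_tbl, and r2 = r6 >> 2 is the number of whole
+ * longwords still to copy.
+ */
+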
+/*
+ * Come here if there are less than 12 bytes to copy
+ *
+ * Keep the branch target close, so the bf/s displacement doesn't
+ * overflow and result in a more expensive branch being inserted. This
+ * is the fast path for small copies; copies dispatched through the
+ * jump table fall back to the default slow-path cleanup. -PFM.
+ */
+.L_cleanup_loop_no_pop:
+ tst r6,r6 ! Check explicitly for zero
+ bt 1f
+
+2:
+EX( mov.b @r5+,r0 )
+ dt r6
+EX( mov.b r0,@r4 )
+ bf/s 2b
+ add #1,r4
+
+1: mov #0,r0 ! normal return
+5000:
+
+# Exception handler:
+.section .fixup, "ax"
+6000:
+ mov.l 8000f,r1
+ mov r3,r0
+ jmp @r1
+ sub r4,r0
+ .align 2
+8000: .long 5000b
+
+.previous
+ rts
+ nop
+
+! Destination = 00
+
+.L_dest00:
+ ! Skip the large copy for small transfers
+ mov #(32+32-4), r0
+ cmp/gt r6, r0 ! r0 (60) > r6 (len)
+ bt 1f
+
+ ! Align dest to a 32 byte boundary
+ neg r4,r0
+ add #0x20, r0
+ and #0x1f, r0
+ tst r0, r0
+ bt 2f
+
+ sub r0, r6
+ shlr2 r0
+3:
+EX( mov.l @r5+,r1 )
+ dt r0
+EX( mov.l r1,@r4 )
+ bf/s 3b
+ add #4,r4
+
+2:
+EX( mov.l @r5+,r0 )
+EX( mov.l @r5+,r1 )
+EX( mov.l @r5+,r2 )
+EX( mov.l @r5+,r7 )
+EX( mov.l @r5+,r8 )
+EX( mov.l @r5+,r9 )
+EX( mov.l @r5+,r10 )
+EX( mov.l @r5+,r11 )
+#ifdef CONFIG_CPU_SH4
+EX( movca.l r0,@r4 )
+#else
+EX( mov.l r0,@r4 )
+#endif
+ add #-32, r6
+EX( mov.l r1,@(4,r4) )
+ mov #32, r0
+EX( mov.l r2,@(8,r4) )
+ cmp/gt r6, r0 ! r0 (32) > r6 (len)
+EX( mov.l r7,@(12,r4) )
+EX( mov.l r8,@(16,r4) )
+EX( mov.l r9,@(20,r4) )
+EX( mov.l r10,@(24,r4) )
+EX( mov.l r11,@(28,r4) )
+ bf/s 2b
+ add #32,r4
+
+1: mov r6, r0
+ shlr2 r0
+ tst r0, r0
+ bt .L_cleanup
+1:
+EX( mov.l @r5+,r1 )
+ dt r0
+EX( mov.l r1,@r4 )
+ bf/s 1b
+ add #4,r4
+
+ bra .L_cleanup
+ nop
+
+! Destination = 10
+
+.L_dest10:
+ mov r2,r7
+ shlr2 r7
+ shlr r7
+ tst r7,r7
+ mov #7,r0
+ bt/s 1f
+ and r0,r2
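+ !
+ ! r7 = number of 32-byte source blocks (longword count >> 3); r2 keeps
+ ! the leftover longwords for the loop at 1f below. xtrct splices the
+ ! halves of consecutive source longwords so that the word-aligned
+ ! destination can be written with longword stores.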
+2:
+ dt r7
+#ifdef CONFIG_CPU_LITTLE_ENDIAN
+EX( mov.l @r5+,r0 )
+EX( mov.l @r5+,r1 )
+EX( mov.l @r5+,r8 )
+EX( mov.l @r5+,r9 )
+EX( mov.l @r5+,r10 )
+EX( mov.w r0,@r4 )
+ add #2,r4
+ xtrct r1,r0
+ xtrct r8,r1
+ xtrct r9,r8
+ xtrct r10,r9
+
+EX( mov.l r0,@r4 )
+EX( mov.l r1,@(4,r4) )
+EX( mov.l r8,@(8,r4) )
+EX( mov.l r9,@(12,r4) )
+
+EX( mov.l @r5+,r1 )
+EX( mov.l @r5+,r8 )
+EX( mov.l @r5+,r0 )
+ xtrct r1,r10
+ xtrct r8,r1
+ xtrct r0,r8
+ shlr16 r0
+EX( mov.l r10,@(16,r4) )
+EX( mov.l r1,@(20,r4) )
+EX( mov.l r8,@(24,r4) )
+EX( mov.w r0,@(28,r4) )
+ bf/s 2b
+ add #30,r4
+#else
+EX( mov.l @(28,r5),r0 )
+EX( mov.l @(24,r5),r8 )
+EX( mov.l @(20,r5),r9 )
+EX( mov.l @(16,r5),r10 )
+EX( mov.w r0,@(30,r4) )
+ add #-2,r4
+ xtrct r8,r0
+ xtrct r9,r8
+ xtrct r10,r9
+EX( mov.l r0,@(28,r4) )
+EX( mov.l r8,@(24,r4) )
+EX( mov.l r9,@(20,r4) )
+
+EX( mov.l @(12,r5),r0 )
+EX( mov.l @(8,r5),r8 )
+ xtrct r0,r10
+EX( mov.l @(4,r5),r9 )
+EX(	mov.l	r10,@(16,r4)	)
+EX( mov.l @r5,r10 )
+ xtrct r8,r0
+ xtrct r9,r8
+ xtrct r10,r9
+EX( mov.l r0,@(12,r4) )
+EX( mov.l r8,@(8,r4) )
+ swap.w r10,r0
+EX( mov.l r9,@(4,r4) )
+EX( mov.w r0,@(2,r4) )
+
+ add #32,r5
+ bf/s 2b
+ add #34,r4
+#endif
+ tst r2,r2
+ bt .L_cleanup
+
+1: ! Read longword, write two words per iteration
+EX( mov.l @r5+,r0 )
+ dt r2
+#ifdef CONFIG_CPU_LITTLE_ENDIAN
+EX( mov.w r0,@r4 )
+ shlr16 r0
+EX( mov.w r0,@(2,r4) )
+#else
+EX( mov.w r0,@(2,r4) )
+ shlr16 r0
+EX( mov.w r0,@r4 )
+#endif
+ bf/s 1b
+ add #4,r4
+
+ bra .L_cleanup
+ nop
+
+! Destination = 01 or 11
+
+.L_dest01:
+.L_dest11:
+ ! Read longword, write byte, word, byte per iteration
+EX( mov.l @r5+,r0 )
+ dt r2
+#ifdef CONFIG_CPU_LITTLE_ENDIAN
+EX( mov.b r0,@r4 )
+ shlr8 r0
+ add #1,r4
+EX( mov.w r0,@r4 )
+ shlr16 r0
+EX( mov.b r0,@(2,r4) )
+ bf/s .L_dest01
+ add #3,r4
+#else
+EX( mov.b r0,@(3,r4) )
+ shlr8 r0
+ swap.w r0,r7
+EX( mov.b r7,@r4 )
+ add #1,r4
+EX( mov.w r0,@r4 )
+ bf/s .L_dest01
+ add #3,r4
+#endif
+
+! Cleanup last few bytes
+.L_cleanup:
+ mov r6,r0
+ and #3,r0
+ tst r0,r0
+ bt .L_exit
+ mov r0,r6
+
+.L_cleanup_loop:
+EX( mov.b @r5+,r0 )
+ dt r6
+EX( mov.b r0,@r4 )
+ bf/s .L_cleanup_loop
+ add #1,r4
+
+.L_exit:
+ mov #0,r0 ! normal return
+
+5000:
+
+# Exception handler:
+.section .fixup, "ax"
+6000:
+ mov.l 8000f,r1
+ mov r3,r0
+ jmp @r1
+ sub r4,r0
+ .align 2
+8000: .long 5000b
+
+.previous
+ mov.l @r15+,r8
+ mov.l @r15+,r9
+ mov.l @r15+,r10
+ rts
+ mov.l @r15+,r11
# Panic should really be compiled as PIC
lib-y := udelay.o c-checksum.o dbg.o panic.o memcpy.o copy_user_memcpy.o \
- page_copy.o page_clear.o
+ copy_page.o clear_page.o
--- /dev/null
+/*
+ Copyright 2003 Richard Curnow, SuperH (UK) Ltd.
+
+ This file is subject to the terms and conditions of the GNU General Public
+ License. See the file "COPYING" in the main directory of this archive
+ for more details.
+
+ Tight version of memset for the case of just clearing a page. It turns out
+ that having the allocos spaced out slightly due to the increment/branch
+ pair causes them to contend less for access to the cache. Similarly,
+ keeping the stores apart from the allocos causes less contention. => Do two
+ separate loops. Do multiple stores per loop to amortise the
+ increment/branch cost a little.
+
+ Parameters:
+ r2 : destination effective address (start of page)
+
+ Always clears 4096 bytes.
+
+ Note: alloco is guarded by synco to avoid the TAKum03020 erratum.
+
+*/
+
+ .section .text..SHmedia32,"ax"
+ .little
+
+ .balign 8
+ .global clear_page
+clear_page:
+ pta/l 1f, tr1
+ pta/l 2f, tr2
+ ptabs/l r18, tr0
+
+ movi 4096, r7
+ add r2, r7, r7
+ add r2, r63, r6
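+
+ ! r7 = start + 4096 (end of page), r6 = running pointer.  The first
+ ! loop preallocates each cache line (alloco), the second fills it with
+ ! zeroes; r63 always reads as zero, so storing r63 stores zero.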
+1:
+ alloco r6, 0
+ synco ! TAKum03020
+ addi r6, 32, r6
+ bgt/l r7, r6, tr1
+
+ add r2, r63, r6
+2:
+ st.q r6, 0, r63
+ st.q r6, 8, r63
+ st.q r6, 16, r63
+ st.q r6, 24, r63
+ addi r6, 32, r6
+ bgt/l r7, r6, tr2
+
+ blink tr0, r63
+
+
--- /dev/null
+/*
+ Copyright 2003 Richard Curnow, SuperH (UK) Ltd.
+
+ This file is subject to the terms and conditions of the GNU General Public
+ License. See the file "COPYING" in the main directory of this archive
+ for more details.
+
+ Tight version of memcpy for the case of just copying a page.
+ Prefetch strategy empirically optimised against RTL simulations
+ of SH5-101 cut2 eval chip with Cayman board DDR memory.
+
+ Parameters:
+ r2 : destination effective address (start of page)
+ r3 : source effective address (start of page)
+
+ Always copies 4096 bytes.
+
+ Points to review.
+ * Currently the prefetch is 4 lines ahead and the alloco is 2 lines ahead.
+ It seems like the prefetch needs to be at least 4 lines ahead to get
+ the data into the cache in time, and the allocos contend with outstanding
+ prefetches for the same cache set, so it's better to have the numbers
+ different.
+ */
+
+ .section .text..SHmedia32,"ax"
+ .little
+
+ .balign 8
+ .global copy_page
+copy_page:
+
+ /* Copy 4096 bytes worth of data from r3 to r2.
+ Do prefetches 4 lines ahead.
+ Do alloco 2 lines ahead */
+
+ pta 1f, tr1
+ pta 2f, tr2
+ pta 3f, tr3
+ ptabs r18, tr0
+
+#if 0
+ /* TAKum03020 */
+ ld.q r3, 0x00, r63
+ ld.q r3, 0x20, r63
+ ld.q r3, 0x40, r63
+ ld.q r3, 0x60, r63
+#endif
+ alloco r2, 0x00
+ synco ! TAKum03020
+ alloco r2, 0x20
+ synco ! TAKum03020
+
+ movi 3968, r6
+ add r2, r6, r6
+ addi r6, 64, r7
+ addi r7, 64, r8
+ sub r3, r2, r60
+ addi r60, 8, r61
+ addi r61, 8, r62
+ addi r62, 8, r23
+ addi r60, 0x80, r22
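+
+ /* After the setup above:
+      r6  = dst + 4096 - 128  (last 4 lines: no prefetch)
+      r7  = dst + 4096 - 64   (last 2 lines: no alloco)
+      r8  = dst + 4096        (end of page)
+      r60/r61/r62/r23 = (src - dst) + 0/8/16/24  (ldx.q load offsets)
+      r22 = (src - dst) + 0x80 (prefetch 4 lines ahead) */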
+
+/* Minimal code size. The extra branches inside the loop don't cost much
+ because they overlap with the time spent waiting for prefetches to
+ complete. */
+1:
+#if 0
+ /* TAKum03020 */
+ bge/u r2, r6, tr2 ! skip prefetch for last 4 lines
+ ldx.q r2, r22, r63 ! prefetch 4 lines hence
+#endif
+2:
+ bge/u r2, r7, tr3 ! skip alloco for last 2 lines
+ alloco r2, 0x40 ! alloc destination line 2 lines ahead
+ synco ! TAKum03020
+3:
+ ldx.q r2, r60, r36
+ ldx.q r2, r61, r37
+ ldx.q r2, r62, r38
+ ldx.q r2, r23, r39
+ st.q r2, 0, r36
+ st.q r2, 8, r37
+ st.q r2, 16, r38
+ st.q r2, 24, r39
+ addi r2, 32, r2
+ bgt/l r8, r2, tr1
+
+ blink tr0, r63 ! return
+++ /dev/null
-/*
- Copyright 2003 Richard Curnow, SuperH (UK) Ltd.
-
- This file is subject to the terms and conditions of the GNU General Public
- License. See the file "COPYING" in the main directory of this archive
- for more details.
-
- Tight version of memset for the case of just clearing a page. It turns out
- that having the alloco's spaced out slightly due to the increment/branch
- pair causes them to contend less for access to the cache. Similarly,
- keeping the stores apart from the allocos causes less contention. => Do two
- separate loops. Do multiple stores per loop to amortise the
- increment/branch cost a little.
-
- Parameters:
- r2 : source effective address (start of page)
-
- Always clears 4096 bytes.
-
- Note : alloco guarded by synco to avoid TAKum03020 erratum
-
-*/
-
- .section .text..SHmedia32,"ax"
- .little
-
- .balign 8
- .global clear_page
-clear_page:
- pta/l 1f, tr1
- pta/l 2f, tr2
- ptabs/l r18, tr0
-
- movi 4096, r7
- add r2, r7, r7
- add r2, r63, r6
-1:
- alloco r6, 0
- synco ! TAKum03020
- addi r6, 32, r6
- bgt/l r7, r6, tr1
-
- add r2, r63, r6
-2:
- st.q r6, 0, r63
- st.q r6, 8, r63
- st.q r6, 16, r63
- st.q r6, 24, r63
- addi r6, 32, r6
- bgt/l r7, r6, tr2
-
- blink tr0, r63
-
-
+++ /dev/null
-/*
- Copyright 2003 Richard Curnow, SuperH (UK) Ltd.
-
- This file is subject to the terms and conditions of the GNU General Public
- License. See the file "COPYING" in the main directory of this archive
- for more details.
-
- Tight version of mempy for the case of just copying a page.
- Prefetch strategy empirically optimised against RTL simulations
- of SH5-101 cut2 eval chip with Cayman board DDR memory.
-
- Parameters:
- r2 : destination effective address (start of page)
- r3 : source effective address (start of page)
-
- Always copies 4096 bytes.
-
- Points to review.
- * Currently the prefetch is 4 lines ahead and the alloco is 2 lines ahead.
- It seems like the prefetch needs to be at at least 4 lines ahead to get
- the data into the cache in time, and the allocos contend with outstanding
- prefetches for the same cache set, so it's better to have the numbers
- different.
- */
-
- .section .text..SHmedia32,"ax"
- .little
-
- .balign 8
- .global copy_page
-copy_page:
-
- /* Copy 4096 bytes worth of data from r3 to r2.
- Do prefetches 4 lines ahead.
- Do alloco 2 lines ahead */
-
- pta 1f, tr1
- pta 2f, tr2
- pta 3f, tr3
- ptabs r18, tr0
-
-#if 0
- /* TAKum03020 */
- ld.q r3, 0x00, r63
- ld.q r3, 0x20, r63
- ld.q r3, 0x40, r63
- ld.q r3, 0x60, r63
-#endif
- alloco r2, 0x00
- synco ! TAKum03020
- alloco r2, 0x20
- synco ! TAKum03020
-
- movi 3968, r6
- add r2, r6, r6
- addi r6, 64, r7
- addi r7, 64, r8
- sub r3, r2, r60
- addi r60, 8, r61
- addi r61, 8, r62
- addi r62, 8, r23
- addi r60, 0x80, r22
-
-/* Minimal code size. The extra branches inside the loop don't cost much
- because they overlap with the time spent waiting for prefetches to
- complete. */
-1:
-#if 0
- /* TAKum03020 */
- bge/u r2, r6, tr2 ! skip prefetch for last 4 lines
- ldx.q r2, r22, r63 ! prefetch 4 lines hence
-#endif
-2:
- bge/u r2, r7, tr3 ! skip alloco for last 2 lines
- alloco r2, 0x40 ! alloc destination line 2 lines ahead
- synco ! TAKum03020
-3:
- ldx.q r2, r60, r36
- ldx.q r2, r61, r37
- ldx.q r2, r62, r38
- ldx.q r2, r23, r39
- st.q r2, 0, r36
- st.q r2, 8, r37
- st.q r2, 16, r38
- st.q r2, 24, r39
- addi r2, 32, r2
- bgt/l r8, r2, tr1
-
- blink tr0, r63 ! return
endif
mmu-y := tlb-nommu.o pg-nommu.o
-mmu-$(CONFIG_MMU) := fault_32.o clear_page.o copy_page.o tlbflush_32.o \
- ioremap_32.o
+mmu-$(CONFIG_MMU) := fault_32.o tlbflush_32.o ioremap_32.o
obj-y += $(mmu-y)
+++ /dev/null
-/*
- * __clear_user_page, __clear_user, clear_page implementation of SuperH
- *
- * Copyright (C) 2001 Kaz Kojima
- * Copyright (C) 2001, 2002 Niibe Yutaka
- * Copyright (C) 2006 Paul Mundt
- */
-#include <linux/linkage.h>
-#include <asm/page.h>
-
-/*
- * clear_page_slow
- * @to: P1 address
- *
- * void clear_page_slow(void *to)
- */
-
-/*
- * r0 --- scratch
- * r4 --- to
- * r5 --- to + PAGE_SIZE
- */
-ENTRY(clear_page_slow)
- mov r4,r5
- mov.l .Llimit,r0
- add r0,r5
- mov #0,r0
- !
-1:
-#if defined(CONFIG_CPU_SH3)
- mov.l r0,@r4
-#elif defined(CONFIG_CPU_SH4)
- movca.l r0,@r4
- mov r4,r1
-#endif
- add #32,r4
- mov.l r0,@-r4
- mov.l r0,@-r4
- mov.l r0,@-r4
- mov.l r0,@-r4
- mov.l r0,@-r4
- mov.l r0,@-r4
- mov.l r0,@-r4
-#if defined(CONFIG_CPU_SH4)
- ocbwb @r1
-#endif
- cmp/eq r5,r4
- bf/s 1b
- add #28,r4
- !
- rts
- nop
-.Llimit: .long (PAGE_SIZE-28)
-
-ENTRY(__clear_user)
- !
- mov #0, r0
- mov #0xe0, r1 ! 0xffffffe0
- !
- ! r4..(r4+31)&~32 -------- not aligned [ Area 0 ]
- ! (r4+31)&~32..(r4+r5)&~32 -------- aligned [ Area 1 ]
- ! (r4+r5)&~32..r4+r5 -------- not aligned [ Area 2 ]
- !
- ! Clear area 0
- mov r4, r2
- !
- tst r1, r5 ! length < 32
- bt .Larea2 ! skip to remainder
- !
- add #31, r2
- and r1, r2
- cmp/eq r4, r2
- bt .Larea1
- mov r2, r3
- sub r4, r3
- mov r3, r7
- mov r4, r2
- !
-.L0: dt r3
-0: mov.b r0, @r2
- bf/s .L0
- add #1, r2
- !
- sub r7, r5
- mov r2, r4
-.Larea1:
- mov r4, r3
- add r5, r3
- and r1, r3
- cmp/hi r2, r3
- bf .Larea2
- !
- ! Clear area 1
-#if defined(CONFIG_CPU_SH4)
-1: movca.l r0, @r2
-#else
-1: mov.l r0, @r2
-#endif
- add #4, r2
-2: mov.l r0, @r2
- add #4, r2
-3: mov.l r0, @r2
- add #4, r2
-4: mov.l r0, @r2
- add #4, r2
-5: mov.l r0, @r2
- add #4, r2
-6: mov.l r0, @r2
- add #4, r2
-7: mov.l r0, @r2
- add #4, r2
-8: mov.l r0, @r2
- add #4, r2
- cmp/hi r2, r3
- bt/s 1b
- nop
- !
- ! Clear area 2
-.Larea2:
- mov r4, r3
- add r5, r3
- cmp/hs r3, r2
- bt/s .Ldone
- sub r2, r3
-.L2: dt r3
-9: mov.b r0, @r2
- bf/s .L2
- add #1, r2
- !
-.Ldone: rts
- mov #0, r0 ! return 0 as normal return
-
- ! return the number of bytes remained
-.Lbad_clear_user:
- mov r4, r0
- add r5, r0
- rts
- sub r2, r0
-
-.section __ex_table,"a"
- .align 2
- .long 0b, .Lbad_clear_user
- .long 1b, .Lbad_clear_user
- .long 2b, .Lbad_clear_user
- .long 3b, .Lbad_clear_user
- .long 4b, .Lbad_clear_user
- .long 5b, .Lbad_clear_user
- .long 6b, .Lbad_clear_user
- .long 7b, .Lbad_clear_user
- .long 8b, .Lbad_clear_user
- .long 9b, .Lbad_clear_user
-.previous
+++ /dev/null
-/*
- * copy_page, __copy_user_page, __copy_user implementation of SuperH
- *
- * Copyright (C) 2001 Niibe Yutaka & Kaz Kojima
- * Copyright (C) 2002 Toshinobu Sugioka
- * Copyright (C) 2006 Paul Mundt
- */
-#include <linux/linkage.h>
-#include <asm/page.h>
-
-/*
- * copy_page
- * @to: P1 address
- * @from: P1 address
- *
- * void copy_page(void *to, void *from)
- */
-
-/*
- * r0, r1, r2, r3, r4, r5, r6, r7 --- scratch
- * r8 --- from + PAGE_SIZE
- * r9 --- not used
- * r10 --- to
- * r11 --- from
- */
-ENTRY(copy_page)
- mov.l r8,@-r15
- mov.l r10,@-r15
- mov.l r11,@-r15
- mov r4,r10
- mov r5,r11
- mov r5,r8
- mov.l .Lpsz,r0
- add r0,r8
- !
-1: mov.l @r11+,r0
- mov.l @r11+,r1
- mov.l @r11+,r2
- mov.l @r11+,r3
- mov.l @r11+,r4
- mov.l @r11+,r5
- mov.l @r11+,r6
- mov.l @r11+,r7
-#if defined(CONFIG_CPU_SH3)
- mov.l r0,@r10
-#elif defined(CONFIG_CPU_SH4)
- movca.l r0,@r10
- mov r10,r0
-#endif
- add #32,r10
- mov.l r7,@-r10
- mov.l r6,@-r10
- mov.l r5,@-r10
- mov.l r4,@-r10
- mov.l r3,@-r10
- mov.l r2,@-r10
- mov.l r1,@-r10
-#if defined(CONFIG_CPU_SH4)
- ocbwb @r0
-#endif
- cmp/eq r11,r8
- bf/s 1b
- add #28,r10
- !
- mov.l @r15+,r11
- mov.l @r15+,r10
- mov.l @r15+,r8
- rts
- nop
-
- .align 2
-.Lpsz: .long PAGE_SIZE
-/*
- * __kernel_size_t __copy_user(void *to, const void *from, __kernel_size_t n);
- * Return the number of bytes NOT copied
- */
-#define EX(...) \
- 9999: __VA_ARGS__ ; \
- .section __ex_table, "a"; \
- .long 9999b, 6000f ; \
- .previous
-ENTRY(__copy_user)
- ! Check if small number of bytes
- mov #11,r0
- mov r4,r3
- cmp/gt r0,r6 ! r6 (len) > r0 (11)
- bf/s .L_cleanup_loop_no_pop
- add r6,r3 ! last destination address
-
- ! Calculate bytes needed to align to src
- mov.l r11,@-r15
- neg r5,r0
- mov.l r10,@-r15
- add #4,r0
- mov.l r9,@-r15
- and #3,r0
- mov.l r8,@-r15
- tst r0,r0
- bt 2f
-
-1:
- ! Copy bytes to long word align src
-EX( mov.b @r5+,r1 )
- dt r0
- add #-1,r6
-EX( mov.b r1,@r4 )
- bf/s 1b
- add #1,r4
-
- ! Jump to appropriate routine depending on dest
-2: mov #3,r1
- mov r6, r2
- and r4,r1
- shlr2 r2
- shll2 r1
- mova .L_jump_tbl,r0
- mov.l @(r0,r1),r1
- jmp @r1
- nop
-
- .align 2
-.L_jump_tbl:
- .long .L_dest00
- .long .L_dest01
- .long .L_dest10
- .long .L_dest11
-
-/*
- * Come here if there are less than 12 bytes to copy
- *
- * Keep the branch target close, so the bf/s callee doesn't overflow
- * and result in a more expensive branch being inserted. This is the
- * fast-path for small copies, the jump via the jump table will hit the
- * default slow-path cleanup. -PFM.
- */
-.L_cleanup_loop_no_pop:
- tst r6,r6 ! Check explicitly for zero
- bt 1f
-
-2:
-EX( mov.b @r5+,r0 )
- dt r6
-EX( mov.b r0,@r4 )
- bf/s 2b
- add #1,r4
-
-1: mov #0,r0 ! normal return
-5000:
-
-# Exception handler:
-.section .fixup, "ax"
-6000:
- mov.l 8000f,r1
- mov r3,r0
- jmp @r1
- sub r4,r0
- .align 2
-8000: .long 5000b
-
-.previous
- rts
- nop
-
-! Destination = 00
-
-.L_dest00:
- ! Skip the large copy for small transfers
- mov #(32+32-4), r0
- cmp/gt r6, r0 ! r0 (60) > r6 (len)
- bt 1f
-
- ! Align dest to a 32 byte boundary
- neg r4,r0
- add #0x20, r0
- and #0x1f, r0
- tst r0, r0
- bt 2f
-
- sub r0, r6
- shlr2 r0
-3:
-EX( mov.l @r5+,r1 )
- dt r0
-EX( mov.l r1,@r4 )
- bf/s 3b
- add #4,r4
-
-2:
-EX( mov.l @r5+,r0 )
-EX( mov.l @r5+,r1 )
-EX( mov.l @r5+,r2 )
-EX( mov.l @r5+,r7 )
-EX( mov.l @r5+,r8 )
-EX( mov.l @r5+,r9 )
-EX( mov.l @r5+,r10 )
-EX( mov.l @r5+,r11 )
-#ifdef CONFIG_CPU_SH4
-EX( movca.l r0,@r4 )
-#else
-EX( mov.l r0,@r4 )
-#endif
- add #-32, r6
-EX( mov.l r1,@(4,r4) )
- mov #32, r0
-EX( mov.l r2,@(8,r4) )
- cmp/gt r6, r0 ! r0 (32) > r6 (len)
-EX( mov.l r7,@(12,r4) )
-EX( mov.l r8,@(16,r4) )
-EX( mov.l r9,@(20,r4) )
-EX( mov.l r10,@(24,r4) )
-EX( mov.l r11,@(28,r4) )
- bf/s 2b
- add #32,r4
-
-1: mov r6, r0
- shlr2 r0
- tst r0, r0
- bt .L_cleanup
-1:
-EX( mov.l @r5+,r1 )
- dt r0
-EX( mov.l r1,@r4 )
- bf/s 1b
- add #4,r4
-
- bra .L_cleanup
- nop
-
-! Destination = 10
-
-.L_dest10:
- mov r2,r7
- shlr2 r7
- shlr r7
- tst r7,r7
- mov #7,r0
- bt/s 1f
- and r0,r2
-2:
- dt r7
-#ifdef CONFIG_CPU_LITTLE_ENDIAN
-EX( mov.l @r5+,r0 )
-EX( mov.l @r5+,r1 )
-EX( mov.l @r5+,r8 )
-EX( mov.l @r5+,r9 )
-EX( mov.l @r5+,r10 )
-EX( mov.w r0,@r4 )
- add #2,r4
- xtrct r1,r0
- xtrct r8,r1
- xtrct r9,r8
- xtrct r10,r9
-
-EX( mov.l r0,@r4 )
-EX( mov.l r1,@(4,r4) )
-EX( mov.l r8,@(8,r4) )
-EX( mov.l r9,@(12,r4) )
-
-EX( mov.l @r5+,r1 )
-EX( mov.l @r5+,r8 )
-EX( mov.l @r5+,r0 )
- xtrct r1,r10
- xtrct r8,r1
- xtrct r0,r8
- shlr16 r0
-EX( mov.l r10,@(16,r4) )
-EX( mov.l r1,@(20,r4) )
-EX( mov.l r8,@(24,r4) )
-EX( mov.w r0,@(28,r4) )
- bf/s 2b
- add #30,r4
-#else
-EX( mov.l @(28,r5),r0 )
-EX( mov.l @(24,r5),r8 )
-EX( mov.l @(20,r5),r9 )
-EX( mov.l @(16,r5),r10 )
-EX( mov.w r0,@(30,r4) )
- add #-2,r4
- xtrct r8,r0
- xtrct r9,r8
- xtrct r10,r9
-EX( mov.l r0,@(28,r4) )
-EX( mov.l r8,@(24,r4) )
-EX( mov.l r9,@(20,r4) )
-
-EX( mov.l @(12,r5),r0 )
-EX( mov.l @(8,r5),r8 )
- xtrct r0,r10
-EX( mov.l @(4,r5),r9 )
- mov.l r10,@(16,r4)
-EX( mov.l @r5,r10 )
- xtrct r8,r0
- xtrct r9,r8
- xtrct r10,r9
-EX( mov.l r0,@(12,r4) )
-EX( mov.l r8,@(8,r4) )
- swap.w r10,r0
-EX( mov.l r9,@(4,r4) )
-EX( mov.w r0,@(2,r4) )
-
- add #32,r5
- bf/s 2b
- add #34,r4
-#endif
- tst r2,r2
- bt .L_cleanup
-
-1: ! Read longword, write two words per iteration
-EX( mov.l @r5+,r0 )
- dt r2
-#ifdef CONFIG_CPU_LITTLE_ENDIAN
-EX( mov.w r0,@r4 )
- shlr16 r0
-EX( mov.w r0,@(2,r4) )
-#else
-EX( mov.w r0,@(2,r4) )
- shlr16 r0
-EX( mov.w r0,@r4 )
-#endif
- bf/s 1b
- add #4,r4
-
- bra .L_cleanup
- nop
-
-! Destination = 01 or 11
-
-.L_dest01:
-.L_dest11:
- ! Read longword, write byte, word, byte per iteration
-EX( mov.l @r5+,r0 )
- dt r2
-#ifdef CONFIG_CPU_LITTLE_ENDIAN
-EX( mov.b r0,@r4 )
- shlr8 r0
- add #1,r4
-EX( mov.w r0,@r4 )
- shlr16 r0
-EX( mov.b r0,@(2,r4) )
- bf/s .L_dest01
- add #3,r4
-#else
-EX( mov.b r0,@(3,r4) )
- shlr8 r0
- swap.w r0,r7
-EX( mov.b r7,@r4 )
- add #1,r4
-EX( mov.w r0,@r4 )
- bf/s .L_dest01
- add #3,r4
-#endif
-
-! Cleanup last few bytes
-.L_cleanup:
- mov r6,r0
- and #3,r0
- tst r0,r0
- bt .L_exit
- mov r0,r6
-
-.L_cleanup_loop:
-EX( mov.b @r5+,r0 )
- dt r6
-EX( mov.b r0,@r4 )
- bf/s .L_cleanup_loop
- add #1,r4
-
-.L_exit:
- mov #0,r0 ! normal return
-
-5000:
-
-# Exception handler:
-.section .fixup, "ax"
-6000:
- mov.l 8000f,r1
- mov r3,r0
- jmp @r1
- sub r4,r0
- .align 2
-8000: .long 5000b
-
-.previous
- mov.l @r15+,r8
- mov.l @r15+,r9
- mov.l @r15+,r10
- rts
- mov.l @r15+,r11