powerpc/32: optimise memset()
author Christophe Leroy <christophe.leroy@c-s.fr>
Wed, 23 Aug 2017 14:54:36 +0000 (16:54 +0200)
committer Michael Ellerman <mpe@ellerman.id.au>
Fri, 1 Sep 2017 06:42:46 +0000 (16:42 +1000)
There is no need to replicate the fill byte across a full 32-bit word
when the length is less than 4, as in that case we only do byte stores.
We can therefore immediately branch to the part handling it.
By separating it from the normal case, we are able to eliminate
a few actions on the destination pointer.

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
arch/powerpc/lib/copy_32.S

index a3ffeac69eca6ca911ed82c6f3476bbd23722ece..05aaee20590f9d0bf09129776513d7bcd1a38170 100644 (file)
@@ -91,17 +91,17 @@ EXPORT_SYMBOL(memset16)
  * replaced by a nop once cache is active. This is done in machine_init()
  */
 _GLOBAL(memset)
+       cmplwi  0,r5,4
+       blt     7f
+
        rlwimi  r4,r4,8,16,23
        rlwimi  r4,r4,16,0,15
 
-       addi    r6,r3,-4
-       cmplwi  0,r5,4
-       blt     7f
-       stwu    r4,4(r6)
+       stw     r4,0(r3)
        beqlr
-       andi.   r0,r6,3
+       andi.   r0,r3,3
        add     r5,r0,r5
-       subf    r6,r0,r6
+       subf    r6,r0,r3
        cmplwi  0,r4,0
        bne     2f      /* Use normal procedure if r4 is not zero */
 _GLOBAL(memset_nocache_branch)
@@ -132,13 +132,20 @@ _GLOBAL(memset_nocache_branch)
 1:     stwu    r4,4(r6)
        bdnz    1b
 6:     andi.   r5,r5,3
-7:     cmpwi   0,r5,0
        beqlr
        mtctr   r5
        addi    r6,r6,3
 8:     stbu    r4,1(r6)
        bdnz    8b
        blr
+
+7:     cmpwi   0,r5,0
+       beqlr
+       mtctr   r5
+       addi    r6,r3,-1
+9:     stbu    r4,1(r6)
+       bdnz    9b
+       blr
 EXPORT_SYMBOL(memset)
 
 /*