powerpc: Fix endian issues in VMX copy loops
author Anton Blanchard <anton@samba.org>
Mon, 23 Sep 2013 02:04:35 +0000 (12:04 +1000)
committer Benjamin Herrenschmidt <benh@kernel.crashing.org>
Fri, 11 Oct 2013 05:48:25 +0000 (16:48 +1100)
Fix the permute loops for little endian.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
arch/powerpc/lib/copyuser_power7.S
arch/powerpc/lib/memcpy_power7.S

index d1f11795a7ad64bd6bd05beb522e07e44eb46498..e8e9c36dc7844455c4b24356cdff5f9ed9e70aff 100644 (file)
  */
 #include <asm/ppc_asm.h>
 
+#ifdef __BIG_ENDIAN__          /* big endian: lvsl builds the permute control vector, vperm sources in given order */
+#define LVS(VRT,RA,RB)         lvsl    VRT,RA,RB
+#define VPERM(VRT,VRA,VRB,VRC) vperm   VRT,VRA,VRB,VRC
+#else                          /* little endian: lvsr builds the control vector, vperm sources VRA/VRB swapped */
+#define LVS(VRT,RA,RB)         lvsr    VRT,RA,RB
+#define VPERM(VRT,VRA,VRB,VRC) vperm   VRT,VRB,VRA,VRC
+#endif                         /* __BIG_ENDIAN__ */
+
        .macro err1
 100:
        .section __ex_table,"a"
@@ -552,13 +560,13 @@ err3;     stw     r7,4(r3)
        li      r10,32
        li      r11,48
 
-       lvsl    vr16,0,r4       /* Setup permute control vector */
+       LVS(vr16,0,r4)          /* Setup permute control vector */
 err3;  lvx     vr0,0,r4
        addi    r4,r4,16
 
        bf      cr7*4+3,5f
 err3;  lvx     vr1,r0,r4
-       vperm   vr8,vr0,vr1,vr16
+       VPERM(vr8,vr0,vr1,vr16)
        addi    r4,r4,16
 err3;  stvx    vr8,r0,r3
        addi    r3,r3,16
@@ -566,9 +574,9 @@ err3;       stvx    vr8,r0,r3
 
 5:     bf      cr7*4+2,6f
 err3;  lvx     vr1,r0,r4
-       vperm   vr8,vr0,vr1,vr16
+       VPERM(vr8,vr0,vr1,vr16)
 err3;  lvx     vr0,r4,r9
-       vperm   vr9,vr1,vr0,vr16
+       VPERM(vr9,vr1,vr0,vr16)
        addi    r4,r4,32
 err3;  stvx    vr8,r0,r3
 err3;  stvx    vr9,r3,r9
@@ -576,13 +584,13 @@ err3;     stvx    vr9,r3,r9
 
 6:     bf      cr7*4+1,7f
 err3;  lvx     vr3,r0,r4
-       vperm   vr8,vr0,vr3,vr16
+       VPERM(vr8,vr0,vr3,vr16)
 err3;  lvx     vr2,r4,r9
-       vperm   vr9,vr3,vr2,vr16
+       VPERM(vr9,vr3,vr2,vr16)
 err3;  lvx     vr1,r4,r10
-       vperm   vr10,vr2,vr1,vr16
+       VPERM(vr10,vr2,vr1,vr16)
 err3;  lvx     vr0,r4,r11
-       vperm   vr11,vr1,vr0,vr16
+       VPERM(vr11,vr1,vr0,vr16)
        addi    r4,r4,64
 err3;  stvx    vr8,r0,r3
 err3;  stvx    vr9,r3,r9
@@ -611,21 +619,21 @@ err3;     stvx    vr11,r3,r11
        .align  5
 8:
 err4;  lvx     vr7,r0,r4
-       vperm   vr8,vr0,vr7,vr16
+       VPERM(vr8,vr0,vr7,vr16)
 err4;  lvx     vr6,r4,r9
-       vperm   vr9,vr7,vr6,vr16
+       VPERM(vr9,vr7,vr6,vr16)
 err4;  lvx     vr5,r4,r10
-       vperm   vr10,vr6,vr5,vr16
+       VPERM(vr10,vr6,vr5,vr16)
 err4;  lvx     vr4,r4,r11
-       vperm   vr11,vr5,vr4,vr16
+       VPERM(vr11,vr5,vr4,vr16)
 err4;  lvx     vr3,r4,r12
-       vperm   vr12,vr4,vr3,vr16
+       VPERM(vr12,vr4,vr3,vr16)
 err4;  lvx     vr2,r4,r14
-       vperm   vr13,vr3,vr2,vr16
+       VPERM(vr13,vr3,vr2,vr16)
 err4;  lvx     vr1,r4,r15
-       vperm   vr14,vr2,vr1,vr16
+       VPERM(vr14,vr2,vr1,vr16)
 err4;  lvx     vr0,r4,r16
-       vperm   vr15,vr1,vr0,vr16
+       VPERM(vr15,vr1,vr0,vr16)
        addi    r4,r4,128
 err4;  stvx    vr8,r0,r3
 err4;  stvx    vr9,r3,r9
@@ -649,13 +657,13 @@ err4;     stvx    vr15,r3,r16
 
        bf      cr7*4+1,9f
 err3;  lvx     vr3,r0,r4
-       vperm   vr8,vr0,vr3,vr16
+       VPERM(vr8,vr0,vr3,vr16)
 err3;  lvx     vr2,r4,r9
-       vperm   vr9,vr3,vr2,vr16
+       VPERM(vr9,vr3,vr2,vr16)
 err3;  lvx     vr1,r4,r10
-       vperm   vr10,vr2,vr1,vr16
+       VPERM(vr10,vr2,vr1,vr16)
 err3;  lvx     vr0,r4,r11
-       vperm   vr11,vr1,vr0,vr16
+       VPERM(vr11,vr1,vr0,vr16)
        addi    r4,r4,64
 err3;  stvx    vr8,r0,r3
 err3;  stvx    vr9,r3,r9
@@ -665,9 +673,9 @@ err3;       stvx    vr11,r3,r11
 
 9:     bf      cr7*4+2,10f
 err3;  lvx     vr1,r0,r4
-       vperm   vr8,vr0,vr1,vr16
+       VPERM(vr8,vr0,vr1,vr16)
 err3;  lvx     vr0,r4,r9
-       vperm   vr9,vr1,vr0,vr16
+       VPERM(vr9,vr1,vr0,vr16)
        addi    r4,r4,32
 err3;  stvx    vr8,r0,r3
 err3;  stvx    vr9,r3,r9
@@ -675,7 +683,7 @@ err3;       stvx    vr9,r3,r9
 
 10:    bf      cr7*4+3,11f
 err3;  lvx     vr1,r0,r4
-       vperm   vr8,vr0,vr1,vr16
+       VPERM(vr8,vr0,vr1,vr16)
        addi    r4,r4,16
 err3;  stvx    vr8,r0,r3
        addi    r3,r3,16
index 0663630baf3b46373905d60d96e989fee6637aa8..e4177dbea6bd6a9e59e1cfc548195b1223b8eb0d 100644 (file)
 #include <asm/ppc_asm.h>
 
 _GLOBAL(memcpy_power7)
+
+#ifdef __BIG_ENDIAN__          /* big endian: lvsl builds the permute control vector, vperm sources in given order */
+#define LVS(VRT,RA,RB)         lvsl    VRT,RA,RB
+#define VPERM(VRT,VRA,VRB,VRC) vperm   VRT,VRA,VRB,VRC
+#else                          /* little endian: lvsr builds the control vector, vperm sources VRA/VRB swapped */
+#define LVS(VRT,RA,RB)         lvsr    VRT,RA,RB
+#define VPERM(VRT,VRA,VRB,VRC) vperm   VRT,VRB,VRA,VRC
+#endif                         /* __BIG_ENDIAN__ */
+
 #ifdef CONFIG_ALTIVEC
        cmpldi  r5,16
        cmpldi  cr1,r5,4096
@@ -485,13 +494,13 @@ _GLOBAL(memcpy_power7)
        li      r10,32
        li      r11,48
 
-       lvsl    vr16,0,r4       /* Setup permute control vector */
+       LVS(vr16,0,r4)          /* Setup permute control vector */
        lvx     vr0,0,r4
        addi    r4,r4,16
 
        bf      cr7*4+3,5f
        lvx     vr1,r0,r4
-       vperm   vr8,vr0,vr1,vr16
+       VPERM(vr8,vr0,vr1,vr16)
        addi    r4,r4,16
        stvx    vr8,r0,r3
        addi    r3,r3,16
@@ -499,9 +508,9 @@ _GLOBAL(memcpy_power7)
 
 5:     bf      cr7*4+2,6f
        lvx     vr1,r0,r4
-       vperm   vr8,vr0,vr1,vr16
+       VPERM(vr8,vr0,vr1,vr16)
        lvx     vr0,r4,r9
-       vperm   vr9,vr1,vr0,vr16
+       VPERM(vr9,vr1,vr0,vr16)
        addi    r4,r4,32
        stvx    vr8,r0,r3
        stvx    vr9,r3,r9
@@ -509,13 +518,13 @@ _GLOBAL(memcpy_power7)
 
 6:     bf      cr7*4+1,7f
        lvx     vr3,r0,r4
-       vperm   vr8,vr0,vr3,vr16
+       VPERM(vr8,vr0,vr3,vr16)
        lvx     vr2,r4,r9
-       vperm   vr9,vr3,vr2,vr16
+       VPERM(vr9,vr3,vr2,vr16)
        lvx     vr1,r4,r10
-       vperm   vr10,vr2,vr1,vr16
+       VPERM(vr10,vr2,vr1,vr16)
        lvx     vr0,r4,r11
-       vperm   vr11,vr1,vr0,vr16
+       VPERM(vr11,vr1,vr0,vr16)
        addi    r4,r4,64
        stvx    vr8,r0,r3
        stvx    vr9,r3,r9
@@ -544,21 +553,21 @@ _GLOBAL(memcpy_power7)
        .align  5
 8:
        lvx     vr7,r0,r4
-       vperm   vr8,vr0,vr7,vr16
+       VPERM(vr8,vr0,vr7,vr16)
        lvx     vr6,r4,r9
-       vperm   vr9,vr7,vr6,vr16
+       VPERM(vr9,vr7,vr6,vr16)
        lvx     vr5,r4,r10
-       vperm   vr10,vr6,vr5,vr16
+       VPERM(vr10,vr6,vr5,vr16)
        lvx     vr4,r4,r11
-       vperm   vr11,vr5,vr4,vr16
+       VPERM(vr11,vr5,vr4,vr16)
        lvx     vr3,r4,r12
-       vperm   vr12,vr4,vr3,vr16
+       VPERM(vr12,vr4,vr3,vr16)
        lvx     vr2,r4,r14
-       vperm   vr13,vr3,vr2,vr16
+       VPERM(vr13,vr3,vr2,vr16)
        lvx     vr1,r4,r15
-       vperm   vr14,vr2,vr1,vr16
+       VPERM(vr14,vr2,vr1,vr16)
        lvx     vr0,r4,r16
-       vperm   vr15,vr1,vr0,vr16
+       VPERM(vr15,vr1,vr0,vr16)
        addi    r4,r4,128
        stvx    vr8,r0,r3
        stvx    vr9,r3,r9
@@ -582,13 +591,13 @@ _GLOBAL(memcpy_power7)
 
        bf      cr7*4+1,9f
        lvx     vr3,r0,r4
-       vperm   vr8,vr0,vr3,vr16
+       VPERM(vr8,vr0,vr3,vr16)
        lvx     vr2,r4,r9
-       vperm   vr9,vr3,vr2,vr16
+       VPERM(vr9,vr3,vr2,vr16)
        lvx     vr1,r4,r10
-       vperm   vr10,vr2,vr1,vr16
+       VPERM(vr10,vr2,vr1,vr16)
        lvx     vr0,r4,r11
-       vperm   vr11,vr1,vr0,vr16
+       VPERM(vr11,vr1,vr0,vr16)
        addi    r4,r4,64
        stvx    vr8,r0,r3
        stvx    vr9,r3,r9
@@ -598,9 +607,9 @@ _GLOBAL(memcpy_power7)
 
 9:     bf      cr7*4+2,10f
        lvx     vr1,r0,r4
-       vperm   vr8,vr0,vr1,vr16
+       VPERM(vr8,vr0,vr1,vr16)
        lvx     vr0,r4,r9
-       vperm   vr9,vr1,vr0,vr16
+       VPERM(vr9,vr1,vr0,vr16)
        addi    r4,r4,32
        stvx    vr8,r0,r3
        stvx    vr9,r3,r9
@@ -608,7 +617,7 @@ _GLOBAL(memcpy_power7)
 
 10:    bf      cr7*4+3,11f
        lvx     vr1,r0,r4
-       vperm   vr8,vr0,vr1,vr16
+       VPERM(vr8,vr0,vr1,vr16)
        addi    r4,r4,16
        stvx    vr8,r0,r3
        addi    r3,r3,16