ARM: 7626/1: arm/crypto: Make asm SHA-1 and AES code Thumb-2 compatible
author Dave Martin <dave.martin@linaro.org>
Thu, 10 Jan 2013 11:20:15 +0000 (12:20 +0100)
committer Russell King <rmk+kernel@arm.linux.org.uk>
Sun, 13 Jan 2013 12:41:22 +0000 (12:41 +0000)
This patch fixes aes-armv4.S and sha1-armv4-large.S to work
natively in Thumb.  This allows ARM/Thumb interworking workarounds
to be removed.

I also take the opportunity to convert some explicit assembler
directives for exported functions to the standard
ENTRY()/ENDPROC().

For the code itself:

  * In sha1_block_data_order, use of TEQ with sp is deprecated in
    ARMv7 and not supported in Thumb.  For the branches back to
    .L_00_15 and .L_40_59, the TEQ is converted to a CMP, under the
    assumption that clobbering the C flag here will not cause
    incorrect behaviour.

    For the first branch back to .L_20_39_or_60_79 the C flag is
    important, so sp is moved temporarily into another register so
    that TEQ can be used for the comparison.

  * In the AES code, most forms of register-indexed addressing with
    shifts and rotates are not permitted for loads and stores in
    Thumb, so the address calculation is done using a separate
    instruction for the Thumb case.

The resulting code is unlikely to be optimally scheduled, but it
should not have a large impact given the overall size of the code.
I haven't run any benchmarks.

Signed-off-by: Dave Martin <dave.martin@linaro.org>
Tested-by: David McCullough <ucdevel@gmail.com> (ARM only)
Acked-by: David McCullough <ucdevel@gmail.com>
Acked-by: Nicolas Pitre <nico@linaro.org>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
arch/arm/crypto/aes-armv4.S
arch/arm/crypto/sha1-armv4-large.S

index e59b1d505d6c84ca738351a478901582e1f2ad0b..19d6cd6f29f98b95962cca5643dca61debdf7b8e 100644 (file)
@@ -34,8 +34,9 @@
 @ A little glue here to select the correct code below for the ARM CPU
 @ that is being targetted.
 
+#include <linux/linkage.h>
+
 .text
-.code  32
 
 .type  AES_Te,%object
 .align 5
@@ -145,10 +146,8 @@ AES_Te:
 
 @ void AES_encrypt(const unsigned char *in, unsigned char *out,
 @               const AES_KEY *key) {
-.global AES_encrypt
-.type   AES_encrypt,%function
 .align 5
-AES_encrypt:
+ENTRY(AES_encrypt)
        sub     r3,pc,#8                @ AES_encrypt
        stmdb   sp!,{r1,r4-r12,lr}
        mov     r12,r0          @ inp
@@ -239,15 +238,8 @@ AES_encrypt:
        strb    r6,[r12,#14]
        strb    r3,[r12,#15]
 #endif
-#if __ARM_ARCH__>=5
        ldmia   sp!,{r4-r12,pc}
-#else
-       ldmia   sp!,{r4-r12,lr}
-       tst     lr,#1
-       moveq   pc,lr                   @ be binary compatible with V4, yet
-       .word   0xe12fff1e                      @ interoperable with Thumb ISA:-)
-#endif
-.size  AES_encrypt,.-AES_encrypt
+ENDPROC(AES_encrypt)
 
 .type   _armv4_AES_encrypt,%function
 .align 2
@@ -386,10 +378,8 @@ _armv4_AES_encrypt:
        ldr     pc,[sp],#4              @ pop and return
 .size  _armv4_AES_encrypt,.-_armv4_AES_encrypt
 
-.global private_AES_set_encrypt_key
-.type   private_AES_set_encrypt_key,%function
 .align 5
-private_AES_set_encrypt_key:
+ENTRY(private_AES_set_encrypt_key)
 _armv4_AES_set_encrypt_key:
        sub     r3,pc,#8                @ AES_set_encrypt_key
        teq     r0,#0
@@ -658,15 +648,11 @@ _armv4_AES_set_encrypt_key:
 
 .Ldone:        mov     r0,#0
        ldmia   sp!,{r4-r12,lr}
-.Labrt:        tst     lr,#1
-       moveq   pc,lr                   @ be binary compatible with V4, yet
-       .word   0xe12fff1e                      @ interoperable with Thumb ISA:-)
-.size  private_AES_set_encrypt_key,.-private_AES_set_encrypt_key
+.Labrt:        mov     pc,lr
+ENDPROC(private_AES_set_encrypt_key)
 
-.global private_AES_set_decrypt_key
-.type   private_AES_set_decrypt_key,%function
 .align 5
-private_AES_set_decrypt_key:
+ENTRY(private_AES_set_decrypt_key)
        str     lr,[sp,#-4]!            @ push lr
 #if 0
        @ kernel does both of these in setkey so optimise this bit out by
@@ -748,15 +734,8 @@ private_AES_set_decrypt_key:
        bne     .Lmix
 
        mov     r0,#0
-#if __ARM_ARCH__>=5
        ldmia   sp!,{r4-r12,pc}
-#else
-       ldmia   sp!,{r4-r12,lr}
-       tst     lr,#1
-       moveq   pc,lr                   @ be binary compatible with V4, yet
-       .word   0xe12fff1e                      @ interoperable with Thumb ISA:-)
-#endif
-.size  private_AES_set_decrypt_key,.-private_AES_set_decrypt_key
+ENDPROC(private_AES_set_decrypt_key)
 
 .type  AES_Td,%object
 .align 5
@@ -862,10 +841,8 @@ AES_Td:
 
 @ void AES_decrypt(const unsigned char *in, unsigned char *out,
 @               const AES_KEY *key) {
-.global AES_decrypt
-.type   AES_decrypt,%function
 .align 5
-AES_decrypt:
+ENTRY(AES_decrypt)
        sub     r3,pc,#8                @ AES_decrypt
        stmdb   sp!,{r1,r4-r12,lr}
        mov     r12,r0          @ inp
@@ -956,15 +933,8 @@ AES_decrypt:
        strb    r6,[r12,#14]
        strb    r3,[r12,#15]
 #endif
-#if __ARM_ARCH__>=5
        ldmia   sp!,{r4-r12,pc}
-#else
-       ldmia   sp!,{r4-r12,lr}
-       tst     lr,#1
-       moveq   pc,lr                   @ be binary compatible with V4, yet
-       .word   0xe12fff1e                      @ interoperable with Thumb ISA:-)
-#endif
-.size  AES_decrypt,.-AES_decrypt
+ENDPROC(AES_decrypt)
 
 .type   _armv4_AES_decrypt,%function
 .align 2
@@ -1064,7 +1034,9 @@ _armv4_AES_decrypt:
        and     r9,lr,r1,lsr#8
 
        ldrb    r7,[r10,r7]             @ Td4[s1>>0]
-       ldrb    r1,[r10,r1,lsr#24]      @ Td4[s1>>24]
+ ARM(  ldrb    r1,[r10,r1,lsr#24]  )   @ Td4[s1>>24]
+ THUMB(        add     r1,r10,r1,lsr#24    )   @ Td4[s1>>24]
+ THUMB(        ldrb    r1,[r1]             )
        ldrb    r8,[r10,r8]             @ Td4[s1>>16]
        eor     r0,r7,r0,lsl#24
        ldrb    r9,[r10,r9]             @ Td4[s1>>8]
@@ -1077,7 +1049,9 @@ _armv4_AES_decrypt:
        ldrb    r8,[r10,r8]             @ Td4[s2>>0]
        and     r9,lr,r2,lsr#16
 
-       ldrb    r2,[r10,r2,lsr#24]      @ Td4[s2>>24]
+ ARM(  ldrb    r2,[r10,r2,lsr#24]  )   @ Td4[s2>>24]
+ THUMB(        add     r2,r10,r2,lsr#24    )   @ Td4[s2>>24]
+ THUMB(        ldrb    r2,[r2]             )
        eor     r0,r0,r7,lsl#8
        ldrb    r9,[r10,r9]             @ Td4[s2>>16]
        eor     r1,r8,r1,lsl#16
@@ -1090,7 +1064,9 @@ _armv4_AES_decrypt:
        and     r9,lr,r3                @ i2
 
        ldrb    r9,[r10,r9]             @ Td4[s3>>0]
-       ldrb    r3,[r10,r3,lsr#24]      @ Td4[s3>>24]
+ ARM(  ldrb    r3,[r10,r3,lsr#24]  )   @ Td4[s3>>24]
+ THUMB(        add     r3,r10,r3,lsr#24    )   @ Td4[s3>>24]
+ THUMB(        ldrb    r3,[r3]             )
        eor     r0,r0,r7,lsl#16
        ldr     r7,[r11,#0]
        eor     r1,r1,r8,lsl#8
index 7050ab133b9da66561d53f46f07f5974a16ccf7c..92c6eed7aac9cded01cd9bd259deba3bd6fda782 100644 (file)
 @ Profiler-assisted and platform-specific optimization resulted in 10%
 @ improvement on Cortex A8 core and 12.2 cycles per byte.
 
-.text
+#include <linux/linkage.h>
 
-.global        sha1_block_data_order
-.type  sha1_block_data_order,%function
+.text
 
 .align 2
-sha1_block_data_order:
+ENTRY(sha1_block_data_order)
        stmdb   sp!,{r4-r12,lr}
        add     r2,r1,r2,lsl#6  @ r2 to point at the end of r1
        ldmia   r0,{r3,r4,r5,r6,r7}
@@ -194,7 +193,7 @@ sha1_block_data_order:
        eor     r10,r10,r7,ror#2                @ F_00_19(B,C,D)
        str     r9,[r14,#-4]!
        add     r3,r3,r10                       @ E+=F_00_19(B,C,D)
-       teq     r14,sp
+       cmp     r14,sp
        bne     .L_00_15                @ [((11+4)*5+2)*3]
 #if __ARM_ARCH__<7
        ldrb    r10,[r1,#2]
@@ -374,7 +373,9 @@ sha1_block_data_order:
                                                @ F_xx_xx
        add     r3,r3,r9                        @ E+=X[i]
        add     r3,r3,r10                       @ E+=F_20_39(B,C,D)
-       teq     r14,sp                  @ preserve carry
+ ARM(  teq     r14,sp          )       @ preserve carry
+ THUMB(        mov     r11,sp          )
+ THUMB(        teq     r14,r11         )       @ preserve carry
        bne     .L_20_39_or_60_79       @ [+((12+3)*5+2)*4]
        bcs     .L_done                 @ [+((12+3)*5+2)*4], spare 300 bytes
 
@@ -466,7 +467,7 @@ sha1_block_data_order:
        add     r3,r3,r9                        @ E+=X[i]
        add     r3,r3,r10                       @ E+=F_40_59(B,C,D)
        add     r3,r3,r11,ror#2
-       teq     r14,sp
+       cmp     r14,sp
        bne     .L_40_59                @ [+((12+5)*5+2)*4]
 
        ldr     r8,.LK_60_79
@@ -485,19 +486,12 @@ sha1_block_data_order:
        teq     r1,r2
        bne     .Lloop                  @ [+18], total 1307
 
-#if __ARM_ARCH__>=5
        ldmia   sp!,{r4-r12,pc}
-#else
-       ldmia   sp!,{r4-r12,lr}
-       tst     lr,#1
-       moveq   pc,lr                   @ be binary compatible with V4, yet
-       .word   0xe12fff1e                      @ interoperable with Thumb ISA:-)
-#endif
 .align 2
 .LK_00_19:     .word   0x5a827999
 .LK_20_39:     .word   0x6ed9eba1
 .LK_40_59:     .word   0x8f1bbcdc
 .LK_60_79:     .word   0xca62c1d6
-.size  sha1_block_data_order,.-sha1_block_data_order
+ENDPROC(sha1_block_data_order)
 .asciz "SHA1 block transform for ARMv4, CRYPTOGAMS by <appro@openssl.org>"
 .align 2