arm64/crypto: issue aese/aesmc instructions in pairs
author: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Tue, 17 Mar 2015 18:05:13 +0000 (18:05 +0000)
committer: Will Deacon <will.deacon@arm.com>
Thu, 19 Mar 2015 10:43:57 +0000 (10:43 +0000)
This changes the AES core transform implementations to issue aese/aesmc
(and aesd/aesimc) in pairs. This enables a micro-architectural optimization
in recent Cortex-A5x cores that improves performance by 50-90%.

Measured performance in cycles per byte (Cortex-A57):

                CBC enc         CBC dec         CTR
  before        3.64            1.34            1.32
  after         1.95            0.85            0.93

Note that this results in a ~5% performance decrease for older cores.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Will Deacon <will.deacon@arm.com>
arch/arm64/crypto/aes-ce-ccm-core.S
arch/arm64/crypto/aes-ce.S

index 432e4841cd811b0a15c087d51fc325fec162c1de..a2a7fbcacc141ed595f31026510cecd459a733f0 100644 (file)
@@ -101,19 +101,19 @@ ENTRY(ce_aes_ccm_final)
 0:     mov     v4.16b, v3.16b
 1:     ld1     {v5.2d}, [x2], #16              /* load next round key */
        aese    v0.16b, v4.16b
-       aese    v1.16b, v4.16b
        aesmc   v0.16b, v0.16b
+       aese    v1.16b, v4.16b
        aesmc   v1.16b, v1.16b
 2:     ld1     {v3.2d}, [x2], #16              /* load next round key */
        aese    v0.16b, v5.16b
-       aese    v1.16b, v5.16b
        aesmc   v0.16b, v0.16b
+       aese    v1.16b, v5.16b
        aesmc   v1.16b, v1.16b
 3:     ld1     {v4.2d}, [x2], #16              /* load next round key */
        subs    w3, w3, #3
        aese    v0.16b, v3.16b
-       aese    v1.16b, v3.16b
        aesmc   v0.16b, v0.16b
+       aese    v1.16b, v3.16b
        aesmc   v1.16b, v1.16b
        bpl     1b
        aese    v0.16b, v4.16b
@@ -146,19 +146,19 @@ ENDPROC(ce_aes_ccm_final)
        ld1     {v5.2d}, [x10], #16             /* load 2nd round key */
 2:     /* inner loop: 3 rounds, 2x interleaved */
        aese    v0.16b, v4.16b
-       aese    v1.16b, v4.16b
        aesmc   v0.16b, v0.16b
+       aese    v1.16b, v4.16b
        aesmc   v1.16b, v1.16b
 3:     ld1     {v3.2d}, [x10], #16             /* load next round key */
        aese    v0.16b, v5.16b
-       aese    v1.16b, v5.16b
        aesmc   v0.16b, v0.16b
+       aese    v1.16b, v5.16b
        aesmc   v1.16b, v1.16b
 4:     ld1     {v4.2d}, [x10], #16             /* load next round key */
        subs    w7, w7, #3
        aese    v0.16b, v3.16b
-       aese    v1.16b, v3.16b
        aesmc   v0.16b, v0.16b
+       aese    v1.16b, v3.16b
        aesmc   v1.16b, v1.16b
        ld1     {v5.2d}, [x10], #16             /* load next round key */
        bpl     2b
index 685a18f731eb64b1de808fa124f7e28241ab8765..78f3cfe92c0872345992203a6501ba0460cc2c91 100644 (file)
 
        .macro          do_enc_Nx, de, mc, k, i0, i1, i2, i3
        aes\de          \i0\().16b, \k\().16b
-       .ifnb           \i1
-       aes\de          \i1\().16b, \k\().16b
-       .ifnb           \i3
-       aes\de          \i2\().16b, \k\().16b
-       aes\de          \i3\().16b, \k\().16b
-       .endif
-       .endif
        aes\mc          \i0\().16b, \i0\().16b
        .ifnb           \i1
+       aes\de          \i1\().16b, \k\().16b
        aes\mc          \i1\().16b, \i1\().16b
        .ifnb           \i3
+       aes\de          \i2\().16b, \k\().16b
        aes\mc          \i2\().16b, \i2\().16b
+       aes\de          \i3\().16b, \k\().16b
        aes\mc          \i3\().16b, \i3\().16b
        .endif
        .endif