From e5577b7a50f1c7c002a05082d2a650adf6baa8bf Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 24 Jul 2018 18:29:07 -0700 Subject: [PATCH] UPSTREAM: crypto: arm/chacha20 - always use vrev for 16-bit rotates The 4-way ChaCha20 NEON code implements 16-bit rotates with vrev32.16, but the one-way code (used on remainder blocks) implements it with vshl + vsri, which is slower. Switch the one-way code to vrev32.16 too. Signed-off-by: Eric Biggers Acked-by: Ard Biesheuvel Signed-off-by: Herbert Xu (cherry picked from commit 4e34e51f48ab7f77a4022aa810a786daa3eb3e22) Bug: 112008522 Test: As series, see Ic61c13b53facfd2173065be715a7ee5f3af8760b Change-Id: If6f8ea9545aa6ed0478e812d6e17400c186b003b Signed-off-by: Eric Biggers --- arch/arm/crypto/chacha20-neon-core.S | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/arch/arm/crypto/chacha20-neon-core.S b/arch/arm/crypto/chacha20-neon-core.S index 3fecb2124c35..451a849ad518 100644 --- a/arch/arm/crypto/chacha20-neon-core.S +++ b/arch/arm/crypto/chacha20-neon-core.S @@ -51,9 +51,8 @@ ENTRY(chacha20_block_xor_neon) .Ldoubleround: // x0 += x1, x3 = rotl32(x3 ^ x0, 16) vadd.i32 q0, q0, q1 - veor q4, q3, q0 - vshl.u32 q3, q4, #16 - vsri.u32 q3, q4, #16 + veor q3, q3, q0 + vrev32.16 q3, q3 // x2 += x3, x1 = rotl32(x1 ^ x2, 12) vadd.i32 q2, q2, q3 @@ -82,9 +81,8 @@ ENTRY(chacha20_block_xor_neon) // x0 += x1, x3 = rotl32(x3 ^ x0, 16) vadd.i32 q0, q0, q1 - veor q4, q3, q0 - vshl.u32 q3, q4, #16 - vsri.u32 q3, q4, #16 + veor q3, q3, q0 + vrev32.16 q3, q3 // x2 += x3, x1 = rotl32(x1 ^ x2, 12) vadd.i32 q2, q2, q3 -- 2.20.1