crypto: arm64/sha2 - integrate OpenSSL implementations of SHA256/SHA512
author Ard Biesheuvel <ard.biesheuvel@linaro.org>
Sun, 20 Nov 2016 11:42:01 +0000 (11:42 +0000)
committer Herbert Xu <herbert@gondor.apana.org.au>
Mon, 28 Nov 2016 11:58:05 +0000 (19:58 +0800)
This integrates the accelerated scalar and NEON implementations of
SHA-224/256, as well as the scalar implementation of SHA-384/512, from
the OpenSSL project.

Relative performance compared to the respective generic C versions:

                 |  SHA256-scalar  | SHA256-NEON* |  SHA512  |
     ------------+-----------------+--------------+----------+
     Cortex-A53  |      1.63x      |     1.63x    |   2.34x  |
     Cortex-A57  |      1.43x      |     1.59x    |   1.95x  |
     Cortex-A73  |      1.26x      |     1.56x    |     ?    |

The core crypto code was authored by Andy Polyakov of the OpenSSL
project; the upstream code was adapted in collaboration with him so
that this module can be built from the same version of sha512-armv8.pl.

The version in this patch was taken from OpenSSL commit 32bbb62ea634
("sha/asm/sha512-armv8.pl: fix big-endian support in __KERNEL__ case.")

* The core SHA algorithm is fundamentally sequential, but there is a
  secondary transformation involved, called the schedule update, which
  can be performed independently. The NEON version of SHA-224/SHA-256
  implements only this part of the algorithm using NEON instructions;
  the sequential part is always done using scalar instructions.
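
  To make the distinction concrete, here is a minimal C sketch of the
  schedule update (illustration only, not code from this patch; the
  helper names are ad hoc). It reads only earlier schedule words W[],
  never the working state a..h, which is why it can be offloaded to the
  vector unit while the rounds themselves remain scalar:

    #include <stdint.h>

    #define ROR32(x, n)  (((x) >> (n)) | ((x) << (32 - (n))))

    /* small sigma0/sigma1 from the SHA-256 specification (FIPS 180-4) */
    static uint32_t sigma0(uint32_t x)
    {
            return ROR32(x, 7) ^ ROR32(x, 18) ^ (x >> 3);
    }

    static uint32_t sigma1(uint32_t x)
    {
            return ROR32(x, 17) ^ ROR32(x, 19) ^ (x >> 10);
    }

    /*
     * Schedule update for round t (t >= 16) over a 16-word rolling
     * window: W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16].
     * It depends only on W[], so it can run on the vector unit while the
     * scalar rounds consume previously computed words.
     */
    static uint32_t sha256_schedule(uint32_t W[16], unsigned int t)
    {
            W[t & 15] += sigma1(W[(t - 2) & 15]) + W[(t - 7) & 15] +
                         sigma0(W[(t - 15) & 15]);
            return W[t & 15];
    }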

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
arch/arm64/crypto/Kconfig
arch/arm64/crypto/Makefile
arch/arm64/crypto/sha256-core.S_shipped [new file with mode: 0644]
arch/arm64/crypto/sha256-glue.c [new file with mode: 0644]
arch/arm64/crypto/sha512-armv8.pl [new file with mode: 0644]
arch/arm64/crypto/sha512-core.S_shipped [new file with mode: 0644]
arch/arm64/crypto/sha512-glue.c [new file with mode: 0644]

index 2cf32e9887e1b292a0ec9102761ce3a200b8fa0d..5f4a617e2957ebe5f989324928b2f50ebcc02598 100644
@@ -8,6 +8,14 @@ menuconfig ARM64_CRYPTO
 
 if ARM64_CRYPTO
 
+config CRYPTO_SHA256_ARM64
+       tristate "SHA-224/SHA-256 digest algorithm for arm64"
+       select CRYPTO_HASH
+
+config CRYPTO_SHA512_ARM64
+       tristate "SHA-384/SHA-512 digest algorithm for arm64"
+       select CRYPTO_HASH
+
 config CRYPTO_SHA1_ARM64_CE
        tristate "SHA-1 digest algorithm (ARMv8 Crypto Extensions)"
        depends on ARM64 && KERNEL_MODE_NEON
index abb79b3cfcfea158cdcaa8ac1ffcbd32699da9b0..7707867280823a8930e8f356cf868e2a48c691f5 100644
@@ -29,6 +29,12 @@ aes-ce-blk-y := aes-glue-ce.o aes-ce.o
 obj-$(CONFIG_CRYPTO_AES_ARM64_NEON_BLK) += aes-neon-blk.o
 aes-neon-blk-y := aes-glue-neon.o aes-neon.o
 
+obj-$(CONFIG_CRYPTO_SHA256_ARM64) += sha256-arm64.o
+sha256-arm64-y := sha256-glue.o sha256-core.o
+
+obj-$(CONFIG_CRYPTO_SHA512_ARM64) += sha512-arm64.o
+sha512-arm64-y := sha512-glue.o sha512-core.o
+
 AFLAGS_aes-ce.o                := -DINTERLEAVE=4
 AFLAGS_aes-neon.o      := -DINTERLEAVE=4
 
@@ -40,3 +46,14 @@ CFLAGS_crc32-arm64.o := -mcpu=generic+crc
 
 $(obj)/aes-glue-%.o: $(src)/aes-glue.c FORCE
        $(call if_changed_rule,cc_o_c)
+
+quiet_cmd_perlasm = PERLASM $@
+      cmd_perlasm = $(PERL) $(<) void $(@)
+
+$(src)/sha256-core.S_shipped: $(src)/sha512-armv8.pl
+       $(call cmd,perlasm)
+
+$(src)/sha512-core.S_shipped: $(src)/sha512-armv8.pl
+       $(call cmd,perlasm)
+
+.PRECIOUS: $(obj)/sha256-core.S $(obj)/sha512-core.S
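
The glue objects referenced above (sha256-glue.o / sha512-glue.o) come
from the new glue sources listed in this patch, whose contents are not
shown in this hunk. As a rough sketch only, assuming the kernel's
sha256_base helpers and the sha256_block_data_order entry point emitted
by the perlasm script (the actual sha256-glue.c in the patch may differ
in detail), wiring the scalar routine into the shash API looks roughly
like this:

    /*
     * Rough sketch only -- NOT the actual sha256-glue.c from this patch.
     * It assumes the kernel's sha256_base helpers and the scalar entry
     * point emitted by sha512-armv8.pl.
     */
    #include <crypto/internal/hash.h>
    #include <crypto/sha.h>
    #include <crypto/sha256_base.h>
    #include <linux/linkage.h>
    #include <linux/module.h>
    #include <linux/types.h>

    /* assumed signature of the perlasm-generated scalar routine */
    asmlinkage void sha256_block_data_order(u32 *digest, const void *data,
                                            unsigned int num_blks);

    static int sha256_arm64_update(struct shash_desc *desc, const u8 *data,
                                   unsigned int len)
    {
            /* feed whole blocks to the assembly routine, buffer the rest */
            return sha256_base_do_update(desc, data, len,
                            (sha256_block_fn *)sha256_block_data_order);
    }

    static int sha256_arm64_finup(struct shash_desc *desc, const u8 *data,
                                  unsigned int len, u8 *out)
    {
            if (len)
                    sha256_base_do_update(desc, data, len,
                                    (sha256_block_fn *)sha256_block_data_order);
            sha256_base_do_finalize(desc,
                            (sha256_block_fn *)sha256_block_data_order);
            return sha256_base_finish(desc, out);
    }

    static int sha256_arm64_final(struct shash_desc *desc, u8 *out)
    {
            return sha256_arm64_finup(desc, NULL, 0, out);
    }

    static struct shash_alg sha256_alg = {
            .digestsize             = SHA256_DIGEST_SIZE,
            .init                   = sha256_base_init,
            .update                 = sha256_arm64_update,
            .final                  = sha256_arm64_final,
            .finup                  = sha256_arm64_finup,
            .descsize               = sizeof(struct sha256_state),
            .base.cra_name          = "sha256",
            .base.cra_driver_name   = "sha256-arm64",
            .base.cra_priority      = 100,
            .base.cra_blocksize     = SHA256_BLOCK_SIZE,
            .base.cra_module        = THIS_MODULE,
    };

    static int __init sha256_arm64_mod_init(void)
    {
            return crypto_register_shash(&sha256_alg);
    }

    static void __exit sha256_arm64_mod_exit(void)
    {
            crypto_unregister_shash(&sha256_alg);
    }

    module_init(sha256_arm64_mod_init);
    module_exit(sha256_arm64_mod_exit);
    MODULE_LICENSE("GPL v2");

With the naming above, a successfully registered algorithm would appear
in /proc/crypto under driver "sha256-arm64"; the real glue files also
cover SHA-224/SHA-384 and the NEON variant, which this sketch omits.
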
diff --git a/arch/arm64/crypto/sha256-core.S_shipped b/arch/arm64/crypto/sha256-core.S_shipped
new file mode 100644
index 0000000..3ce82cc
--- /dev/null
@@ -0,0 +1,2061 @@
+// Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
+//
+// Licensed under the OpenSSL license (the "License").  You may not use
+// this file except in compliance with the License.  You can obtain a copy
+// in the file LICENSE in the source distribution or at
+// https://www.openssl.org/source/license.html
+
+// ====================================================================
+// Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+// project. The module is, however, dual licensed under OpenSSL and
+// CRYPTOGAMS licenses depending on where you obtain it. For further
+// details see http://www.openssl.org/~appro/cryptogams/.
+//
+// Permission to use under GPLv2 terms is granted.
+// ====================================================================
+//
+// SHA256/512 for ARMv8.
+//
+// Performance in cycles per processed byte and improvement coefficient
+// over code generated with "default" compiler:
+//
+//             SHA256-hw       SHA256(*)       SHA512
+// Apple A7    1.97            10.5 (+33%)     6.73 (-1%(**))
+// Cortex-A53  2.38            15.5 (+115%)    10.0 (+150%(***))
+// Cortex-A57  2.31            11.6 (+86%)     7.51 (+260%(***))
+// Denver      2.01            10.5 (+26%)     6.70 (+8%)
+// X-Gene                      20.0 (+100%)    12.8 (+300%(***))
+// Mongoose    2.36            13.0 (+50%)     8.36 (+33%)
+//
+// (*) Software SHA256 results are of lesser relevance, presented
+//     mostly for informational purposes.
+// (**)        The result is a trade-off: it's possible to improve it by
+//     10% (or by 1 cycle per round), but at the cost of 20% loss
+//     on Cortex-A53 (or by 4 cycles per round).
+// (***)       Super-impressive coefficients over gcc-generated code are
+//     indication of some compiler "pathology", most notably code
+//     generated with -mgeneral-regs-only is significantly faster
+//     and the gap is only 40-90%.
+//
+// October 2016.
+//
+// Originally it was reckoned that it makes no sense to implement NEON
+// version of SHA256 for 64-bit processors. This is because performance
+// improvement on most wide-spread Cortex-A5x processors was observed
+// to be marginal, same on Cortex-A53 and ~10% on A57. But then it was
+// observed that 32-bit NEON SHA256 performs significantly better than
+// 64-bit scalar version on *some* of the more recent processors. As
+// result 64-bit NEON version of SHA256 was added to provide best
+// all-round performance. For example it executes ~30% faster on X-Gene
+// and Mongoose. [For reference, NEON version of SHA512 is bound to
+// deliver much less improvement, likely *negative* on Cortex-A5x.
+// Which is why NEON support is limited to SHA256.]
+
+#ifndef        __KERNEL__
+# include "arm_arch.h"
+#endif
+
+.text
+
+.extern        OPENSSL_armcap_P
+.globl sha256_block_data_order
+.type  sha256_block_data_order,%function
+.align 6
+sha256_block_data_order:
+#ifndef        __KERNEL__
+# ifdef        __ILP32__
+       ldrsw   x16,.LOPENSSL_armcap_P
+# else
+       ldr     x16,.LOPENSSL_armcap_P
+# endif
+       adr     x17,.LOPENSSL_armcap_P
+       add     x16,x16,x17
+       ldr     w16,[x16]
+       tst     w16,#ARMV8_SHA256
+       b.ne    .Lv8_entry
+       tst     w16,#ARMV7_NEON
+       b.ne    .Lneon_entry
+#endif
+       stp     x29,x30,[sp,#-128]!
+       add     x29,sp,#0
+
+       stp     x19,x20,[sp,#16]
+       stp     x21,x22,[sp,#32]
+       stp     x23,x24,[sp,#48]
+       stp     x25,x26,[sp,#64]
+       stp     x27,x28,[sp,#80]
+       sub     sp,sp,#4*4
+
+       ldp     w20,w21,[x0]                            // load context
+       ldp     w22,w23,[x0,#2*4]
+       ldp     w24,w25,[x0,#4*4]
+       add     x2,x1,x2,lsl#6  // end of input
+       ldp     w26,w27,[x0,#6*4]
+       adr     x30,.LK256
+       stp     x0,x2,[x29,#96]
+
+.Loop:
+       ldp     w3,w4,[x1],#2*4
+       ldr     w19,[x30],#4                    // *K++
+       eor     w28,w21,w22                             // magic seed
+       str     x1,[x29,#112]
+#ifndef        __AARCH64EB__
+       rev     w3,w3                   // 0
+#endif
+       ror     w16,w24,#6
+       add     w27,w27,w19                     // h+=K[i]
+       eor     w6,w24,w24,ror#14
+       and     w17,w25,w24
+       bic     w19,w26,w24
+       add     w27,w27,w3                      // h+=X[i]
+       orr     w17,w17,w19                     // Ch(e,f,g)
+       eor     w19,w20,w21                     // a^b, b^c in next round
+       eor     w16,w16,w6,ror#11       // Sigma1(e)
+       ror     w6,w20,#2
+       add     w27,w27,w17                     // h+=Ch(e,f,g)
+       eor     w17,w20,w20,ror#9
+       add     w27,w27,w16                     // h+=Sigma1(e)
+       and     w28,w28,w19                     // (b^c)&=(a^b)
+       add     w23,w23,w27                     // d+=h
+       eor     w28,w28,w21                     // Maj(a,b,c)
+       eor     w17,w6,w17,ror#13       // Sigma0(a)
+       add     w27,w27,w28                     // h+=Maj(a,b,c)
+       ldr     w28,[x30],#4            // *K++, w19 in next round
+       //add   w27,w27,w17                     // h+=Sigma0(a)
+#ifndef        __AARCH64EB__
+       rev     w4,w4                   // 1
+#endif
+       ldp     w5,w6,[x1],#2*4
+       add     w27,w27,w17                     // h+=Sigma0(a)
+       ror     w16,w23,#6
+       add     w26,w26,w28                     // h+=K[i]
+       eor     w7,w23,w23,ror#14
+       and     w17,w24,w23
+       bic     w28,w25,w23
+       add     w26,w26,w4                      // h+=X[i]
+       orr     w17,w17,w28                     // Ch(e,f,g)
+       eor     w28,w27,w20                     // a^b, b^c in next round
+       eor     w16,w16,w7,ror#11       // Sigma1(e)
+       ror     w7,w27,#2
+       add     w26,w26,w17                     // h+=Ch(e,f,g)
+       eor     w17,w27,w27,ror#9
+       add     w26,w26,w16                     // h+=Sigma1(e)
+       and     w19,w19,w28                     // (b^c)&=(a^b)
+       add     w22,w22,w26                     // d+=h
+       eor     w19,w19,w20                     // Maj(a,b,c)
+       eor     w17,w7,w17,ror#13       // Sigma0(a)
+       add     w26,w26,w19                     // h+=Maj(a,b,c)
+       ldr     w19,[x30],#4            // *K++, w28 in next round
+       //add   w26,w26,w17                     // h+=Sigma0(a)
+#ifndef        __AARCH64EB__
+       rev     w5,w5                   // 2
+#endif
+       add     w26,w26,w17                     // h+=Sigma0(a)
+       ror     w16,w22,#6
+       add     w25,w25,w19                     // h+=K[i]
+       eor     w8,w22,w22,ror#14
+       and     w17,w23,w22
+       bic     w19,w24,w22
+       add     w25,w25,w5                      // h+=X[i]
+       orr     w17,w17,w19                     // Ch(e,f,g)
+       eor     w19,w26,w27                     // a^b, b^c in next round
+       eor     w16,w16,w8,ror#11       // Sigma1(e)
+       ror     w8,w26,#2
+       add     w25,w25,w17                     // h+=Ch(e,f,g)
+       eor     w17,w26,w26,ror#9
+       add     w25,w25,w16                     // h+=Sigma1(e)
+       and     w28,w28,w19                     // (b^c)&=(a^b)
+       add     w21,w21,w25                     // d+=h
+       eor     w28,w28,w27                     // Maj(a,b,c)
+       eor     w17,w8,w17,ror#13       // Sigma0(a)
+       add     w25,w25,w28                     // h+=Maj(a,b,c)
+       ldr     w28,[x30],#4            // *K++, w19 in next round
+       //add   w25,w25,w17                     // h+=Sigma0(a)
+#ifndef        __AARCH64EB__
+       rev     w6,w6                   // 3
+#endif
+       ldp     w7,w8,[x1],#2*4
+       add     w25,w25,w17                     // h+=Sigma0(a)
+       ror     w16,w21,#6
+       add     w24,w24,w28                     // h+=K[i]
+       eor     w9,w21,w21,ror#14
+       and     w17,w22,w21
+       bic     w28,w23,w21
+       add     w24,w24,w6                      // h+=X[i]
+       orr     w17,w17,w28                     // Ch(e,f,g)
+       eor     w28,w25,w26                     // a^b, b^c in next round
+       eor     w16,w16,w9,ror#11       // Sigma1(e)
+       ror     w9,w25,#2
+       add     w24,w24,w17                     // h+=Ch(e,f,g)
+       eor     w17,w25,w25,ror#9
+       add     w24,w24,w16                     // h+=Sigma1(e)
+       and     w19,w19,w28                     // (b^c)&=(a^b)
+       add     w20,w20,w24                     // d+=h
+       eor     w19,w19,w26                     // Maj(a,b,c)
+       eor     w17,w9,w17,ror#13       // Sigma0(a)
+       add     w24,w24,w19                     // h+=Maj(a,b,c)
+       ldr     w19,[x30],#4            // *K++, w28 in next round
+       //add   w24,w24,w17                     // h+=Sigma0(a)
+#ifndef        __AARCH64EB__
+       rev     w7,w7                   // 4
+#endif
+       add     w24,w24,w17                     // h+=Sigma0(a)
+       ror     w16,w20,#6
+       add     w23,w23,w19                     // h+=K[i]
+       eor     w10,w20,w20,ror#14
+       and     w17,w21,w20
+       bic     w19,w22,w20
+       add     w23,w23,w7                      // h+=X[i]
+       orr     w17,w17,w19                     // Ch(e,f,g)
+       eor     w19,w24,w25                     // a^b, b^c in next round
+       eor     w16,w16,w10,ror#11      // Sigma1(e)
+       ror     w10,w24,#2
+       add     w23,w23,w17                     // h+=Ch(e,f,g)
+       eor     w17,w24,w24,ror#9
+       add     w23,w23,w16                     // h+=Sigma1(e)
+       and     w28,w28,w19                     // (b^c)&=(a^b)
+       add     w27,w27,w23                     // d+=h
+       eor     w28,w28,w25                     // Maj(a,b,c)
+       eor     w17,w10,w17,ror#13      // Sigma0(a)
+       add     w23,w23,w28                     // h+=Maj(a,b,c)
+       ldr     w28,[x30],#4            // *K++, w19 in next round
+       //add   w23,w23,w17                     // h+=Sigma0(a)
+#ifndef        __AARCH64EB__
+       rev     w8,w8                   // 5
+#endif
+       ldp     w9,w10,[x1],#2*4
+       add     w23,w23,w17                     // h+=Sigma0(a)
+       ror     w16,w27,#6
+       add     w22,w22,w28                     // h+=K[i]
+       eor     w11,w27,w27,ror#14
+       and     w17,w20,w27
+       bic     w28,w21,w27
+       add     w22,w22,w8                      // h+=X[i]
+       orr     w17,w17,w28                     // Ch(e,f,g)
+       eor     w28,w23,w24                     // a^b, b^c in next round
+       eor     w16,w16,w11,ror#11      // Sigma1(e)
+       ror     w11,w23,#2
+       add     w22,w22,w17                     // h+=Ch(e,f,g)
+       eor     w17,w23,w23,ror#9
+       add     w22,w22,w16                     // h+=Sigma1(e)
+       and     w19,w19,w28                     // (b^c)&=(a^b)
+       add     w26,w26,w22                     // d+=h
+       eor     w19,w19,w24                     // Maj(a,b,c)
+       eor     w17,w11,w17,ror#13      // Sigma0(a)
+       add     w22,w22,w19                     // h+=Maj(a,b,c)
+       ldr     w19,[x30],#4            // *K++, w28 in next round
+       //add   w22,w22,w17                     // h+=Sigma0(a)
+#ifndef        __AARCH64EB__
+       rev     w9,w9                   // 6
+#endif
+       add     w22,w22,w17                     // h+=Sigma0(a)
+       ror     w16,w26,#6
+       add     w21,w21,w19                     // h+=K[i]
+       eor     w12,w26,w26,ror#14
+       and     w17,w27,w26
+       bic     w19,w20,w26
+       add     w21,w21,w9                      // h+=X[i]
+       orr     w17,w17,w19                     // Ch(e,f,g)
+       eor     w19,w22,w23                     // a^b, b^c in next round
+       eor     w16,w16,w12,ror#11      // Sigma1(e)
+       ror     w12,w22,#2
+       add     w21,w21,w17                     // h+=Ch(e,f,g)
+       eor     w17,w22,w22,ror#9
+       add     w21,w21,w16                     // h+=Sigma1(e)
+       and     w28,w28,w19                     // (b^c)&=(a^b)
+       add     w25,w25,w21                     // d+=h
+       eor     w28,w28,w23                     // Maj(a,b,c)
+       eor     w17,w12,w17,ror#13      // Sigma0(a)
+       add     w21,w21,w28                     // h+=Maj(a,b,c)
+       ldr     w28,[x30],#4            // *K++, w19 in next round
+       //add   w21,w21,w17                     // h+=Sigma0(a)
+#ifndef        __AARCH64EB__
+       rev     w10,w10                 // 7
+#endif
+       ldp     w11,w12,[x1],#2*4
+       add     w21,w21,w17                     // h+=Sigma0(a)
+       ror     w16,w25,#6
+       add     w20,w20,w28                     // h+=K[i]
+       eor     w13,w25,w25,ror#14
+       and     w17,w26,w25
+       bic     w28,w27,w25
+       add     w20,w20,w10                     // h+=X[i]
+       orr     w17,w17,w28                     // Ch(e,f,g)
+       eor     w28,w21,w22                     // a^b, b^c in next round
+       eor     w16,w16,w13,ror#11      // Sigma1(e)
+       ror     w13,w21,#2
+       add     w20,w20,w17                     // h+=Ch(e,f,g)
+       eor     w17,w21,w21,ror#9
+       add     w20,w20,w16                     // h+=Sigma1(e)
+       and     w19,w19,w28                     // (b^c)&=(a^b)
+       add     w24,w24,w20                     // d+=h
+       eor     w19,w19,w22                     // Maj(a,b,c)
+       eor     w17,w13,w17,ror#13      // Sigma0(a)
+       add     w20,w20,w19                     // h+=Maj(a,b,c)
+       ldr     w19,[x30],#4            // *K++, w28 in next round
+       //add   w20,w20,w17                     // h+=Sigma0(a)
+#ifndef        __AARCH64EB__
+       rev     w11,w11                 // 8
+#endif
+       add     w20,w20,w17                     // h+=Sigma0(a)
+       ror     w16,w24,#6
+       add     w27,w27,w19                     // h+=K[i]
+       eor     w14,w24,w24,ror#14
+       and     w17,w25,w24
+       bic     w19,w26,w24
+       add     w27,w27,w11                     // h+=X[i]
+       orr     w17,w17,w19                     // Ch(e,f,g)
+       eor     w19,w20,w21                     // a^b, b^c in next round
+       eor     w16,w16,w14,ror#11      // Sigma1(e)
+       ror     w14,w20,#2
+       add     w27,w27,w17                     // h+=Ch(e,f,g)
+       eor     w17,w20,w20,ror#9
+       add     w27,w27,w16                     // h+=Sigma1(e)
+       and     w28,w28,w19                     // (b^c)&=(a^b)
+       add     w23,w23,w27                     // d+=h
+       eor     w28,w28,w21                     // Maj(a,b,c)
+       eor     w17,w14,w17,ror#13      // Sigma0(a)
+       add     w27,w27,w28                     // h+=Maj(a,b,c)
+       ldr     w28,[x30],#4            // *K++, w19 in next round
+       //add   w27,w27,w17                     // h+=Sigma0(a)
+#ifndef        __AARCH64EB__
+       rev     w12,w12                 // 9
+#endif
+       ldp     w13,w14,[x1],#2*4
+       add     w27,w27,w17                     // h+=Sigma0(a)
+       ror     w16,w23,#6
+       add     w26,w26,w28                     // h+=K[i]
+       eor     w15,w23,w23,ror#14
+       and     w17,w24,w23
+       bic     w28,w25,w23
+       add     w26,w26,w12                     // h+=X[i]
+       orr     w17,w17,w28                     // Ch(e,f,g)
+       eor     w28,w27,w20                     // a^b, b^c in next round
+       eor     w16,w16,w15,ror#11      // Sigma1(e)
+       ror     w15,w27,#2
+       add     w26,w26,w17                     // h+=Ch(e,f,g)
+       eor     w17,w27,w27,ror#9
+       add     w26,w26,w16                     // h+=Sigma1(e)
+       and     w19,w19,w28                     // (b^c)&=(a^b)
+       add     w22,w22,w26                     // d+=h
+       eor     w19,w19,w20                     // Maj(a,b,c)
+       eor     w17,w15,w17,ror#13      // Sigma0(a)
+       add     w26,w26,w19                     // h+=Maj(a,b,c)
+       ldr     w19,[x30],#4            // *K++, w28 in next round
+       //add   w26,w26,w17                     // h+=Sigma0(a)
+#ifndef        __AARCH64EB__
+       rev     w13,w13                 // 10
+#endif
+       add     w26,w26,w17                     // h+=Sigma0(a)
+       ror     w16,w22,#6
+       add     w25,w25,w19                     // h+=K[i]
+       eor     w0,w22,w22,ror#14
+       and     w17,w23,w22
+       bic     w19,w24,w22
+       add     w25,w25,w13                     // h+=X[i]
+       orr     w17,w17,w19                     // Ch(e,f,g)
+       eor     w19,w26,w27                     // a^b, b^c in next round
+       eor     w16,w16,w0,ror#11       // Sigma1(e)
+       ror     w0,w26,#2
+       add     w25,w25,w17                     // h+=Ch(e,f,g)
+       eor     w17,w26,w26,ror#9
+       add     w25,w25,w16                     // h+=Sigma1(e)
+       and     w28,w28,w19                     // (b^c)&=(a^b)
+       add     w21,w21,w25                     // d+=h
+       eor     w28,w28,w27                     // Maj(a,b,c)
+       eor     w17,w0,w17,ror#13       // Sigma0(a)
+       add     w25,w25,w28                     // h+=Maj(a,b,c)
+       ldr     w28,[x30],#4            // *K++, w19 in next round
+       //add   w25,w25,w17                     // h+=Sigma0(a)
+#ifndef        __AARCH64EB__
+       rev     w14,w14                 // 11
+#endif
+       ldp     w15,w0,[x1],#2*4
+       add     w25,w25,w17                     // h+=Sigma0(a)
+       str     w6,[sp,#12]
+       ror     w16,w21,#6
+       add     w24,w24,w28                     // h+=K[i]
+       eor     w6,w21,w21,ror#14
+       and     w17,w22,w21
+       bic     w28,w23,w21
+       add     w24,w24,w14                     // h+=X[i]
+       orr     w17,w17,w28                     // Ch(e,f,g)
+       eor     w28,w25,w26                     // a^b, b^c in next round
+       eor     w16,w16,w6,ror#11       // Sigma1(e)
+       ror     w6,w25,#2
+       add     w24,w24,w17                     // h+=Ch(e,f,g)
+       eor     w17,w25,w25,ror#9
+       add     w24,w24,w16                     // h+=Sigma1(e)
+       and     w19,w19,w28                     // (b^c)&=(a^b)
+       add     w20,w20,w24                     // d+=h
+       eor     w19,w19,w26                     // Maj(a,b,c)
+       eor     w17,w6,w17,ror#13       // Sigma0(a)
+       add     w24,w24,w19                     // h+=Maj(a,b,c)
+       ldr     w19,[x30],#4            // *K++, w28 in next round
+       //add   w24,w24,w17                     // h+=Sigma0(a)
+#ifndef        __AARCH64EB__
+       rev     w15,w15                 // 12
+#endif
+       add     w24,w24,w17                     // h+=Sigma0(a)
+       str     w7,[sp,#0]
+       ror     w16,w20,#6
+       add     w23,w23,w19                     // h+=K[i]
+       eor     w7,w20,w20,ror#14
+       and     w17,w21,w20
+       bic     w19,w22,w20
+       add     w23,w23,w15                     // h+=X[i]
+       orr     w17,w17,w19                     // Ch(e,f,g)
+       eor     w19,w24,w25                     // a^b, b^c in next round
+       eor     w16,w16,w7,ror#11       // Sigma1(e)
+       ror     w7,w24,#2
+       add     w23,w23,w17                     // h+=Ch(e,f,g)
+       eor     w17,w24,w24,ror#9
+       add     w23,w23,w16                     // h+=Sigma1(e)
+       and     w28,w28,w19                     // (b^c)&=(a^b)
+       add     w27,w27,w23                     // d+=h
+       eor     w28,w28,w25                     // Maj(a,b,c)
+       eor     w17,w7,w17,ror#13       // Sigma0(a)
+       add     w23,w23,w28                     // h+=Maj(a,b,c)
+       ldr     w28,[x30],#4            // *K++, w19 in next round
+       //add   w23,w23,w17                     // h+=Sigma0(a)
+#ifndef        __AARCH64EB__
+       rev     w0,w0                   // 13
+#endif
+       ldp     w1,w2,[x1]
+       add     w23,w23,w17                     // h+=Sigma0(a)
+       str     w8,[sp,#4]
+       ror     w16,w27,#6
+       add     w22,w22,w28                     // h+=K[i]
+       eor     w8,w27,w27,ror#14
+       and     w17,w20,w27
+       bic     w28,w21,w27
+       add     w22,w22,w0                      // h+=X[i]
+       orr     w17,w17,w28                     // Ch(e,f,g)
+       eor     w28,w23,w24                     // a^b, b^c in next round
+       eor     w16,w16,w8,ror#11       // Sigma1(e)
+       ror     w8,w23,#2
+       add     w22,w22,w17                     // h+=Ch(e,f,g)
+       eor     w17,w23,w23,ror#9
+       add     w22,w22,w16                     // h+=Sigma1(e)
+       and     w19,w19,w28                     // (b^c)&=(a^b)
+       add     w26,w26,w22                     // d+=h
+       eor     w19,w19,w24                     // Maj(a,b,c)
+       eor     w17,w8,w17,ror#13       // Sigma0(a)
+       add     w22,w22,w19                     // h+=Maj(a,b,c)
+       ldr     w19,[x30],#4            // *K++, w28 in next round
+       //add   w22,w22,w17                     // h+=Sigma0(a)
+#ifndef        __AARCH64EB__
+       rev     w1,w1                   // 14
+#endif
+       ldr     w6,[sp,#12]
+       add     w22,w22,w17                     // h+=Sigma0(a)
+       str     w9,[sp,#8]
+       ror     w16,w26,#6
+       add     w21,w21,w19                     // h+=K[i]
+       eor     w9,w26,w26,ror#14
+       and     w17,w27,w26
+       bic     w19,w20,w26
+       add     w21,w21,w1                      // h+=X[i]
+       orr     w17,w17,w19                     // Ch(e,f,g)
+       eor     w19,w22,w23                     // a^b, b^c in next round
+       eor     w16,w16,w9,ror#11       // Sigma1(e)
+       ror     w9,w22,#2
+       add     w21,w21,w17                     // h+=Ch(e,f,g)
+       eor     w17,w22,w22,ror#9
+       add     w21,w21,w16                     // h+=Sigma1(e)
+       and     w28,w28,w19                     // (b^c)&=(a^b)
+       add     w25,w25,w21                     // d+=h
+       eor     w28,w28,w23                     // Maj(a,b,c)
+       eor     w17,w9,w17,ror#13       // Sigma0(a)
+       add     w21,w21,w28                     // h+=Maj(a,b,c)
+       ldr     w28,[x30],#4            // *K++, w19 in next round
+       //add   w21,w21,w17                     // h+=Sigma0(a)
+#ifndef        __AARCH64EB__
+       rev     w2,w2                   // 15
+#endif
+       ldr     w7,[sp,#0]
+       add     w21,w21,w17                     // h+=Sigma0(a)
+       str     w10,[sp,#12]
+       ror     w16,w25,#6
+       add     w20,w20,w28                     // h+=K[i]
+       ror     w9,w4,#7
+       and     w17,w26,w25
+       ror     w8,w1,#17
+       bic     w28,w27,w25
+       ror     w10,w21,#2
+       add     w20,w20,w2                      // h+=X[i]
+       eor     w16,w16,w25,ror#11
+       eor     w9,w9,w4,ror#18
+       orr     w17,w17,w28                     // Ch(e,f,g)
+       eor     w28,w21,w22                     // a^b, b^c in next round
+       eor     w16,w16,w25,ror#25      // Sigma1(e)
+       eor     w10,w10,w21,ror#13
+       add     w20,w20,w17                     // h+=Ch(e,f,g)
+       and     w19,w19,w28                     // (b^c)&=(a^b)
+       eor     w8,w8,w1,ror#19
+       eor     w9,w9,w4,lsr#3  // sigma0(X[i+1])
+       add     w20,w20,w16                     // h+=Sigma1(e)
+       eor     w19,w19,w22                     // Maj(a,b,c)
+       eor     w17,w10,w21,ror#22      // Sigma0(a)
+       eor     w8,w8,w1,lsr#10 // sigma1(X[i+14])
+       add     w3,w3,w12
+       add     w24,w24,w20                     // d+=h
+       add     w20,w20,w19                     // h+=Maj(a,b,c)
+       ldr     w19,[x30],#4            // *K++, w28 in next round
+       add     w3,w3,w9
+       add     w20,w20,w17                     // h+=Sigma0(a)
+       add     w3,w3,w8
+.Loop_16_xx:
+       ldr     w8,[sp,#4]
+       str     w11,[sp,#0]
+       ror     w16,w24,#6
+       add     w27,w27,w19                     // h+=K[i]
+       ror     w10,w5,#7
+       and     w17,w25,w24
+       ror     w9,w2,#17
+       bic     w19,w26,w24
+       ror     w11,w20,#2
+       add     w27,w27,w3                      // h+=X[i]
+       eor     w16,w16,w24,ror#11
+       eor     w10,w10,w5,ror#18
+       orr     w17,w17,w19                     // Ch(e,f,g)
+       eor     w19,w20,w21                     // a^b, b^c in next round
+       eor     w16,w16,w24,ror#25      // Sigma1(e)
+       eor     w11,w11,w20,ror#13
+       add     w27,w27,w17                     // h+=Ch(e,f,g)
+       and     w28,w28,w19                     // (b^c)&=(a^b)
+       eor     w9,w9,w2,ror#19
+       eor     w10,w10,w5,lsr#3        // sigma0(X[i+1])
+       add     w27,w27,w16                     // h+=Sigma1(e)
+       eor     w28,w28,w21                     // Maj(a,b,c)
+       eor     w17,w11,w20,ror#22      // Sigma0(a)
+       eor     w9,w9,w2,lsr#10 // sigma1(X[i+14])
+       add     w4,w4,w13
+       add     w23,w23,w27                     // d+=h
+       add     w27,w27,w28                     // h+=Maj(a,b,c)
+       ldr     w28,[x30],#4            // *K++, w19 in next round
+       add     w4,w4,w10
+       add     w27,w27,w17                     // h+=Sigma0(a)
+       add     w4,w4,w9
+       ldr     w9,[sp,#8]
+       str     w12,[sp,#4]
+       ror     w16,w23,#6
+       add     w26,w26,w28                     // h+=K[i]
+       ror     w11,w6,#7
+       and     w17,w24,w23
+       ror     w10,w3,#17
+       bic     w28,w25,w23
+       ror     w12,w27,#2
+       add     w26,w26,w4                      // h+=X[i]
+       eor     w16,w16,w23,ror#11
+       eor     w11,w11,w6,ror#18
+       orr     w17,w17,w28                     // Ch(e,f,g)
+       eor     w28,w27,w20                     // a^b, b^c in next round
+       eor     w16,w16,w23,ror#25      // Sigma1(e)
+       eor     w12,w12,w27,ror#13
+       add     w26,w26,w17                     // h+=Ch(e,f,g)
+       and     w19,w19,w28                     // (b^c)&=(a^b)
+       eor     w10,w10,w3,ror#19
+       eor     w11,w11,w6,lsr#3        // sigma0(X[i+1])
+       add     w26,w26,w16                     // h+=Sigma1(e)
+       eor     w19,w19,w20                     // Maj(a,b,c)
+       eor     w17,w12,w27,ror#22      // Sigma0(a)
+       eor     w10,w10,w3,lsr#10       // sigma1(X[i+14])
+       add     w5,w5,w14
+       add     w22,w22,w26                     // d+=h
+       add     w26,w26,w19                     // h+=Maj(a,b,c)
+       ldr     w19,[x30],#4            // *K++, w28 in next round
+       add     w5,w5,w11
+       add     w26,w26,w17                     // h+=Sigma0(a)
+       add     w5,w5,w10
+       ldr     w10,[sp,#12]
+       str     w13,[sp,#8]
+       ror     w16,w22,#6
+       add     w25,w25,w19                     // h+=K[i]
+       ror     w12,w7,#7
+       and     w17,w23,w22
+       ror     w11,w4,#17
+       bic     w19,w24,w22
+       ror     w13,w26,#2
+       add     w25,w25,w5                      // h+=X[i]
+       eor     w16,w16,w22,ror#11
+       eor     w12,w12,w7,ror#18
+       orr     w17,w17,w19                     // Ch(e,f,g)
+       eor     w19,w26,w27                     // a^b, b^c in next round
+       eor     w16,w16,w22,ror#25      // Sigma1(e)
+       eor     w13,w13,w26,ror#13
+       add     w25,w25,w17                     // h+=Ch(e,f,g)
+       and     w28,w28,w19                     // (b^c)&=(a^b)
+       eor     w11,w11,w4,ror#19
+       eor     w12,w12,w7,lsr#3        // sigma0(X[i+1])
+       add     w25,w25,w16                     // h+=Sigma1(e)
+       eor     w28,w28,w27                     // Maj(a,b,c)
+       eor     w17,w13,w26,ror#22      // Sigma0(a)
+       eor     w11,w11,w4,lsr#10       // sigma1(X[i+14])
+       add     w6,w6,w15
+       add     w21,w21,w25                     // d+=h
+       add     w25,w25,w28                     // h+=Maj(a,b,c)
+       ldr     w28,[x30],#4            // *K++, w19 in next round
+       add     w6,w6,w12
+       add     w25,w25,w17                     // h+=Sigma0(a)
+       add     w6,w6,w11
+       ldr     w11,[sp,#0]
+       str     w14,[sp,#12]
+       ror     w16,w21,#6
+       add     w24,w24,w28                     // h+=K[i]
+       ror     w13,w8,#7
+       and     w17,w22,w21
+       ror     w12,w5,#17
+       bic     w28,w23,w21
+       ror     w14,w25,#2
+       add     w24,w24,w6                      // h+=X[i]
+       eor     w16,w16,w21,ror#11
+       eor     w13,w13,w8,ror#18
+       orr     w17,w17,w28                     // Ch(e,f,g)
+       eor     w28,w25,w26                     // a^b, b^c in next round
+       eor     w16,w16,w21,ror#25      // Sigma1(e)
+       eor     w14,w14,w25,ror#13
+       add     w24,w24,w17                     // h+=Ch(e,f,g)
+       and     w19,w19,w28                     // (b^c)&=(a^b)
+       eor     w12,w12,w5,ror#19
+       eor     w13,w13,w8,lsr#3        // sigma0(X[i+1])
+       add     w24,w24,w16                     // h+=Sigma1(e)
+       eor     w19,w19,w26                     // Maj(a,b,c)
+       eor     w17,w14,w25,ror#22      // Sigma0(a)
+       eor     w12,w12,w5,lsr#10       // sigma1(X[i+14])
+       add     w7,w7,w0
+       add     w20,w20,w24                     // d+=h
+       add     w24,w24,w19                     // h+=Maj(a,b,c)
+       ldr     w19,[x30],#4            // *K++, w28 in next round
+       add     w7,w7,w13
+       add     w24,w24,w17                     // h+=Sigma0(a)
+       add     w7,w7,w12
+       ldr     w12,[sp,#4]
+       str     w15,[sp,#0]
+       ror     w16,w20,#6
+       add     w23,w23,w19                     // h+=K[i]
+       ror     w14,w9,#7
+       and     w17,w21,w20
+       ror     w13,w6,#17
+       bic     w19,w22,w20
+       ror     w15,w24,#2
+       add     w23,w23,w7                      // h+=X[i]
+       eor     w16,w16,w20,ror#11
+       eor     w14,w14,w9,ror#18
+       orr     w17,w17,w19                     // Ch(e,f,g)
+       eor     w19,w24,w25                     // a^b, b^c in next round
+       eor     w16,w16,w20,ror#25      // Sigma1(e)
+       eor     w15,w15,w24,ror#13
+       add     w23,w23,w17                     // h+=Ch(e,f,g)
+       and     w28,w28,w19                     // (b^c)&=(a^b)
+       eor     w13,w13,w6,ror#19
+       eor     w14,w14,w9,lsr#3        // sigma0(X[i+1])
+       add     w23,w23,w16                     // h+=Sigma1(e)
+       eor     w28,w28,w25                     // Maj(a,b,c)
+       eor     w17,w15,w24,ror#22      // Sigma0(a)
+       eor     w13,w13,w6,lsr#10       // sigma1(X[i+14])
+       add     w8,w8,w1
+       add     w27,w27,w23                     // d+=h
+       add     w23,w23,w28                     // h+=Maj(a,b,c)
+       ldr     w28,[x30],#4            // *K++, w19 in next round
+       add     w8,w8,w14
+       add     w23,w23,w17                     // h+=Sigma0(a)
+       add     w8,w8,w13
+       ldr     w13,[sp,#8]
+       str     w0,[sp,#4]
+       ror     w16,w27,#6
+       add     w22,w22,w28                     // h+=K[i]
+       ror     w15,w10,#7
+       and     w17,w20,w27
+       ror     w14,w7,#17
+       bic     w28,w21,w27
+       ror     w0,w23,#2
+       add     w22,w22,w8                      // h+=X[i]
+       eor     w16,w16,w27,ror#11
+       eor     w15,w15,w10,ror#18
+       orr     w17,w17,w28                     // Ch(e,f,g)
+       eor     w28,w23,w24                     // a^b, b^c in next round
+       eor     w16,w16,w27,ror#25      // Sigma1(e)
+       eor     w0,w0,w23,ror#13
+       add     w22,w22,w17                     // h+=Ch(e,f,g)
+       and     w19,w19,w28                     // (b^c)&=(a^b)
+       eor     w14,w14,w7,ror#19
+       eor     w15,w15,w10,lsr#3       // sigma0(X[i+1])
+       add     w22,w22,w16                     // h+=Sigma1(e)
+       eor     w19,w19,w24                     // Maj(a,b,c)
+       eor     w17,w0,w23,ror#22       // Sigma0(a)
+       eor     w14,w14,w7,lsr#10       // sigma1(X[i+14])
+       add     w9,w9,w2
+       add     w26,w26,w22                     // d+=h
+       add     w22,w22,w19                     // h+=Maj(a,b,c)
+       ldr     w19,[x30],#4            // *K++, w28 in next round
+       add     w9,w9,w15
+       add     w22,w22,w17                     // h+=Sigma0(a)
+       add     w9,w9,w14
+       ldr     w14,[sp,#12]
+       str     w1,[sp,#8]
+       ror     w16,w26,#6
+       add     w21,w21,w19                     // h+=K[i]
+       ror     w0,w11,#7
+       and     w17,w27,w26
+       ror     w15,w8,#17
+       bic     w19,w20,w26
+       ror     w1,w22,#2
+       add     w21,w21,w9                      // h+=X[i]
+       eor     w16,w16,w26,ror#11
+       eor     w0,w0,w11,ror#18
+       orr     w17,w17,w19                     // Ch(e,f,g)
+       eor     w19,w22,w23                     // a^b, b^c in next round
+       eor     w16,w16,w26,ror#25      // Sigma1(e)
+       eor     w1,w1,w22,ror#13
+       add     w21,w21,w17                     // h+=Ch(e,f,g)
+       and     w28,w28,w19                     // (b^c)&=(a^b)
+       eor     w15,w15,w8,ror#19
+       eor     w0,w0,w11,lsr#3 // sigma0(X[i+1])
+       add     w21,w21,w16                     // h+=Sigma1(e)
+       eor     w28,w28,w23                     // Maj(a,b,c)
+       eor     w17,w1,w22,ror#22       // Sigma0(a)
+       eor     w15,w15,w8,lsr#10       // sigma1(X[i+14])
+       add     w10,w10,w3
+       add     w25,w25,w21                     // d+=h
+       add     w21,w21,w28                     // h+=Maj(a,b,c)
+       ldr     w28,[x30],#4            // *K++, w19 in next round
+       add     w10,w10,w0
+       add     w21,w21,w17                     // h+=Sigma0(a)
+       add     w10,w10,w15
+       ldr     w15,[sp,#0]
+       str     w2,[sp,#12]
+       ror     w16,w25,#6
+       add     w20,w20,w28                     // h+=K[i]
+       ror     w1,w12,#7
+       and     w17,w26,w25
+       ror     w0,w9,#17
+       bic     w28,w27,w25
+       ror     w2,w21,#2
+       add     w20,w20,w10                     // h+=X[i]
+       eor     w16,w16,w25,ror#11
+       eor     w1,w1,w12,ror#18
+       orr     w17,w17,w28                     // Ch(e,f,g)
+       eor     w28,w21,w22                     // a^b, b^c in next round
+       eor     w16,w16,w25,ror#25      // Sigma1(e)
+       eor     w2,w2,w21,ror#13
+       add     w20,w20,w17                     // h+=Ch(e,f,g)
+       and     w19,w19,w28                     // (b^c)&=(a^b)
+       eor     w0,w0,w9,ror#19
+       eor     w1,w1,w12,lsr#3 // sigma0(X[i+1])
+       add     w20,w20,w16                     // h+=Sigma1(e)
+       eor     w19,w19,w22                     // Maj(a,b,c)
+       eor     w17,w2,w21,ror#22       // Sigma0(a)
+       eor     w0,w0,w9,lsr#10 // sigma1(X[i+14])
+       add     w11,w11,w4
+       add     w24,w24,w20                     // d+=h
+       add     w20,w20,w19                     // h+=Maj(a,b,c)
+       ldr     w19,[x30],#4            // *K++, w28 in next round
+       add     w11,w11,w1
+       add     w20,w20,w17                     // h+=Sigma0(a)
+       add     w11,w11,w0
+       ldr     w0,[sp,#4]
+       str     w3,[sp,#0]
+       ror     w16,w24,#6
+       add     w27,w27,w19                     // h+=K[i]
+       ror     w2,w13,#7
+       and     w17,w25,w24
+       ror     w1,w10,#17
+       bic     w19,w26,w24
+       ror     w3,w20,#2
+       add     w27,w27,w11                     // h+=X[i]
+       eor     w16,w16,w24,ror#11
+       eor     w2,w2,w13,ror#18
+       orr     w17,w17,w19                     // Ch(e,f,g)
+       eor     w19,w20,w21                     // a^b, b^c in next round
+       eor     w16,w16,w24,ror#25      // Sigma1(e)
+       eor     w3,w3,w20,ror#13
+       add     w27,w27,w17                     // h+=Ch(e,f,g)
+       and     w28,w28,w19                     // (b^c)&=(a^b)
+       eor     w1,w1,w10,ror#19
+       eor     w2,w2,w13,lsr#3 // sigma0(X[i+1])
+       add     w27,w27,w16                     // h+=Sigma1(e)
+       eor     w28,w28,w21                     // Maj(a,b,c)
+       eor     w17,w3,w20,ror#22       // Sigma0(a)
+       eor     w1,w1,w10,lsr#10        // sigma1(X[i+14])
+       add     w12,w12,w5
+       add     w23,w23,w27                     // d+=h
+       add     w27,w27,w28                     // h+=Maj(a,b,c)
+       ldr     w28,[x30],#4            // *K++, w19 in next round
+       add     w12,w12,w2
+       add     w27,w27,w17                     // h+=Sigma0(a)
+       add     w12,w12,w1
+       ldr     w1,[sp,#8]
+       str     w4,[sp,#4]
+       ror     w16,w23,#6
+       add     w26,w26,w28                     // h+=K[i]
+       ror     w3,w14,#7
+       and     w17,w24,w23
+       ror     w2,w11,#17
+       bic     w28,w25,w23
+       ror     w4,w27,#2
+       add     w26,w26,w12                     // h+=X[i]
+       eor     w16,w16,w23,ror#11
+       eor     w3,w3,w14,ror#18
+       orr     w17,w17,w28                     // Ch(e,f,g)
+       eor     w28,w27,w20                     // a^b, b^c in next round
+       eor     w16,w16,w23,ror#25      // Sigma1(e)
+       eor     w4,w4,w27,ror#13
+       add     w26,w26,w17                     // h+=Ch(e,f,g)
+       and     w19,w19,w28                     // (b^c)&=(a^b)
+       eor     w2,w2,w11,ror#19
+       eor     w3,w3,w14,lsr#3 // sigma0(X[i+1])
+       add     w26,w26,w16                     // h+=Sigma1(e)
+       eor     w19,w19,w20                     // Maj(a,b,c)
+       eor     w17,w4,w27,ror#22       // Sigma0(a)
+       eor     w2,w2,w11,lsr#10        // sigma1(X[i+14])
+       add     w13,w13,w6
+       add     w22,w22,w26                     // d+=h
+       add     w26,w26,w19                     // h+=Maj(a,b,c)
+       ldr     w19,[x30],#4            // *K++, w28 in next round
+       add     w13,w13,w3
+       add     w26,w26,w17                     // h+=Sigma0(a)
+       add     w13,w13,w2
+       ldr     w2,[sp,#12]
+       str     w5,[sp,#8]
+       ror     w16,w22,#6
+       add     w25,w25,w19                     // h+=K[i]
+       ror     w4,w15,#7
+       and     w17,w23,w22
+       ror     w3,w12,#17
+       bic     w19,w24,w22
+       ror     w5,w26,#2
+       add     w25,w25,w13                     // h+=X[i]
+       eor     w16,w16,w22,ror#11
+       eor     w4,w4,w15,ror#18
+       orr     w17,w17,w19                     // Ch(e,f,g)
+       eor     w19,w26,w27                     // a^b, b^c in next round
+       eor     w16,w16,w22,ror#25      // Sigma1(e)
+       eor     w5,w5,w26,ror#13
+       add     w25,w25,w17                     // h+=Ch(e,f,g)
+       and     w28,w28,w19                     // (b^c)&=(a^b)
+       eor     w3,w3,w12,ror#19
+       eor     w4,w4,w15,lsr#3 // sigma0(X[i+1])
+       add     w25,w25,w16                     // h+=Sigma1(e)
+       eor     w28,w28,w27                     // Maj(a,b,c)
+       eor     w17,w5,w26,ror#22       // Sigma0(a)
+       eor     w3,w3,w12,lsr#10        // sigma1(X[i+14])
+       add     w14,w14,w7
+       add     w21,w21,w25                     // d+=h
+       add     w25,w25,w28                     // h+=Maj(a,b,c)
+       ldr     w28,[x30],#4            // *K++, w19 in next round
+       add     w14,w14,w4
+       add     w25,w25,w17                     // h+=Sigma0(a)
+       add     w14,w14,w3
+       ldr     w3,[sp,#0]
+       str     w6,[sp,#12]
+       ror     w16,w21,#6
+       add     w24,w24,w28                     // h+=K[i]
+       ror     w5,w0,#7
+       and     w17,w22,w21
+       ror     w4,w13,#17
+       bic     w28,w23,w21
+       ror     w6,w25,#2
+       add     w24,w24,w14                     // h+=X[i]
+       eor     w16,w16,w21,ror#11
+       eor     w5,w5,w0,ror#18
+       orr     w17,w17,w28                     // Ch(e,f,g)
+       eor     w28,w25,w26                     // a^b, b^c in next round
+       eor     w16,w16,w21,ror#25      // Sigma1(e)
+       eor     w6,w6,w25,ror#13
+       add     w24,w24,w17                     // h+=Ch(e,f,g)
+       and     w19,w19,w28                     // (b^c)&=(a^b)
+       eor     w4,w4,w13,ror#19
+       eor     w5,w5,w0,lsr#3  // sigma0(X[i+1])
+       add     w24,w24,w16                     // h+=Sigma1(e)
+       eor     w19,w19,w26                     // Maj(a,b,c)
+       eor     w17,w6,w25,ror#22       // Sigma0(a)
+       eor     w4,w4,w13,lsr#10        // sigma1(X[i+14])
+       add     w15,w15,w8
+       add     w20,w20,w24                     // d+=h
+       add     w24,w24,w19                     // h+=Maj(a,b,c)
+       ldr     w19,[x30],#4            // *K++, w28 in next round
+       add     w15,w15,w5
+       add     w24,w24,w17                     // h+=Sigma0(a)
+       add     w15,w15,w4
+       ldr     w4,[sp,#4]
+       str     w7,[sp,#0]
+       ror     w16,w20,#6
+       add     w23,w23,w19                     // h+=K[i]
+       ror     w6,w1,#7
+       and     w17,w21,w20
+       ror     w5,w14,#17
+       bic     w19,w22,w20
+       ror     w7,w24,#2
+       add     w23,w23,w15                     // h+=X[i]
+       eor     w16,w16,w20,ror#11
+       eor     w6,w6,w1,ror#18
+       orr     w17,w17,w19                     // Ch(e,f,g)
+       eor     w19,w24,w25                     // a^b, b^c in next round
+       eor     w16,w16,w20,ror#25      // Sigma1(e)
+       eor     w7,w7,w24,ror#13
+       add     w23,w23,w17                     // h+=Ch(e,f,g)
+       and     w28,w28,w19                     // (b^c)&=(a^b)
+       eor     w5,w5,w14,ror#19
+       eor     w6,w6,w1,lsr#3  // sigma0(X[i+1])
+       add     w23,w23,w16                     // h+=Sigma1(e)
+       eor     w28,w28,w25                     // Maj(a,b,c)
+       eor     w17,w7,w24,ror#22       // Sigma0(a)
+       eor     w5,w5,w14,lsr#10        // sigma1(X[i+14])
+       add     w0,w0,w9
+       add     w27,w27,w23                     // d+=h
+       add     w23,w23,w28                     // h+=Maj(a,b,c)
+       ldr     w28,[x30],#4            // *K++, w19 in next round
+       add     w0,w0,w6
+       add     w23,w23,w17                     // h+=Sigma0(a)
+       add     w0,w0,w5
+       ldr     w5,[sp,#8]
+       str     w8,[sp,#4]
+       ror     w16,w27,#6
+       add     w22,w22,w28                     // h+=K[i]
+       ror     w7,w2,#7
+       and     w17,w20,w27
+       ror     w6,w15,#17
+       bic     w28,w21,w27
+       ror     w8,w23,#2
+       add     w22,w22,w0                      // h+=X[i]
+       eor     w16,w16,w27,ror#11
+       eor     w7,w7,w2,ror#18
+       orr     w17,w17,w28                     // Ch(e,f,g)
+       eor     w28,w23,w24                     // a^b, b^c in next round
+       eor     w16,w16,w27,ror#25      // Sigma1(e)
+       eor     w8,w8,w23,ror#13
+       add     w22,w22,w17                     // h+=Ch(e,f,g)
+       and     w19,w19,w28                     // (b^c)&=(a^b)
+       eor     w6,w6,w15,ror#19
+       eor     w7,w7,w2,lsr#3  // sigma0(X[i+1])
+       add     w22,w22,w16                     // h+=Sigma1(e)
+       eor     w19,w19,w24                     // Maj(a,b,c)
+       eor     w17,w8,w23,ror#22       // Sigma0(a)
+       eor     w6,w6,w15,lsr#10        // sigma1(X[i+14])
+       add     w1,w1,w10
+       add     w26,w26,w22                     // d+=h
+       add     w22,w22,w19                     // h+=Maj(a,b,c)
+       ldr     w19,[x30],#4            // *K++, w28 in next round
+       add     w1,w1,w7
+       add     w22,w22,w17                     // h+=Sigma0(a)
+       add     w1,w1,w6
+       ldr     w6,[sp,#12]
+       str     w9,[sp,#8]
+       ror     w16,w26,#6
+       add     w21,w21,w19                     // h+=K[i]
+       ror     w8,w3,#7
+       and     w17,w27,w26
+       ror     w7,w0,#17
+       bic     w19,w20,w26
+       ror     w9,w22,#2
+       add     w21,w21,w1                      // h+=X[i]
+       eor     w16,w16,w26,ror#11
+       eor     w8,w8,w3,ror#18
+       orr     w17,w17,w19                     // Ch(e,f,g)
+       eor     w19,w22,w23                     // a^b, b^c in next round
+       eor     w16,w16,w26,ror#25      // Sigma1(e)
+       eor     w9,w9,w22,ror#13
+       add     w21,w21,w17                     // h+=Ch(e,f,g)
+       and     w28,w28,w19                     // (b^c)&=(a^b)
+       eor     w7,w7,w0,ror#19
+       eor     w8,w8,w3,lsr#3  // sigma0(X[i+1])
+       add     w21,w21,w16                     // h+=Sigma1(e)
+       eor     w28,w28,w23                     // Maj(a,b,c)
+       eor     w17,w9,w22,ror#22       // Sigma0(a)
+       eor     w7,w7,w0,lsr#10 // sigma1(X[i+14])
+       add     w2,w2,w11
+       add     w25,w25,w21                     // d+=h
+       add     w21,w21,w28                     // h+=Maj(a,b,c)
+       ldr     w28,[x30],#4            // *K++, w19 in next round
+       add     w2,w2,w8
+       add     w21,w21,w17                     // h+=Sigma0(a)
+       add     w2,w2,w7
+       ldr     w7,[sp,#0]
+       str     w10,[sp,#12]
+       ror     w16,w25,#6
+       add     w20,w20,w28                     // h+=K[i]
+       ror     w9,w4,#7
+       and     w17,w26,w25
+       ror     w8,w1,#17
+       bic     w28,w27,w25
+       ror     w10,w21,#2
+       add     w20,w20,w2                      // h+=X[i]
+       eor     w16,w16,w25,ror#11
+       eor     w9,w9,w4,ror#18
+       orr     w17,w17,w28                     // Ch(e,f,g)
+       eor     w28,w21,w22                     // a^b, b^c in next round
+       eor     w16,w16,w25,ror#25      // Sigma1(e)
+       eor     w10,w10,w21,ror#13
+       add     w20,w20,w17                     // h+=Ch(e,f,g)
+       and     w19,w19,w28                     // (b^c)&=(a^b)
+       eor     w8,w8,w1,ror#19
+       eor     w9,w9,w4,lsr#3  // sigma0(X[i+1])
+       add     w20,w20,w16                     // h+=Sigma1(e)
+       eor     w19,w19,w22                     // Maj(a,b,c)
+       eor     w17,w10,w21,ror#22      // Sigma0(a)
+       eor     w8,w8,w1,lsr#10 // sigma1(X[i+14])
+       add     w3,w3,w12
+       add     w24,w24,w20                     // d+=h
+       add     w20,w20,w19                     // h+=Maj(a,b,c)
+       ldr     w19,[x30],#4            // *K++, w28 in next round
+       add     w3,w3,w9
+       add     w20,w20,w17                     // h+=Sigma0(a)
+       add     w3,w3,w8
+       cbnz    w19,.Loop_16_xx
+
+       ldp     x0,x2,[x29,#96]
+       ldr     x1,[x29,#112]
+       sub     x30,x30,#260            // rewind
+
+       ldp     w3,w4,[x0]
+       ldp     w5,w6,[x0,#2*4]
+       add     x1,x1,#14*4                     // advance input pointer
+       ldp     w7,w8,[x0,#4*4]
+       add     w20,w20,w3
+       ldp     w9,w10,[x0,#6*4]
+       add     w21,w21,w4
+       add     w22,w22,w5
+       add     w23,w23,w6
+       stp     w20,w21,[x0]
+       add     w24,w24,w7
+       add     w25,w25,w8
+       stp     w22,w23,[x0,#2*4]
+       add     w26,w26,w9
+       add     w27,w27,w10
+       cmp     x1,x2
+       stp     w24,w25,[x0,#4*4]
+       stp     w26,w27,[x0,#6*4]
+       b.ne    .Loop
+
+       ldp     x19,x20,[x29,#16]
+       add     sp,sp,#4*4
+       ldp     x21,x22,[x29,#32]
+       ldp     x23,x24,[x29,#48]
+       ldp     x25,x26,[x29,#64]
+       ldp     x27,x28,[x29,#80]
+       ldp     x29,x30,[sp],#128
+       ret
+.size  sha256_block_data_order,.-sha256_block_data_order
+
+.align 6
+.type  .LK256,%object
+.LK256:
+       .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+       .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+       .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+       .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+       .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+       .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+       .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+       .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+       .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+       .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+       .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+       .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+       .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+       .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+       .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+       .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+       .long   0       //terminator
+.size  .LK256,.-.LK256
+#ifndef        __KERNEL__
+.align 3
+.LOPENSSL_armcap_P:
+# ifdef        __ILP32__
+       .long   OPENSSL_armcap_P-.
+# else
+       .quad   OPENSSL_armcap_P-.
+# endif
+#endif
+.asciz "SHA256 block transform for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
+.align 2
+#ifndef        __KERNEL__
+.type  sha256_block_armv8,%function
+.align 6
+sha256_block_armv8:
+.Lv8_entry:
+       stp             x29,x30,[sp,#-16]!
+       add             x29,sp,#0
+
+       ld1             {v0.4s,v1.4s},[x0]
+       adr             x3,.LK256
+
+.Loop_hw:
+       ld1             {v4.16b-v7.16b},[x1],#64
+       sub             x2,x2,#1
+       ld1             {v16.4s},[x3],#16
+       rev32           v4.16b,v4.16b
+       rev32           v5.16b,v5.16b
+       rev32           v6.16b,v6.16b
+       rev32           v7.16b,v7.16b
+       orr             v18.16b,v0.16b,v0.16b           // offload
+       orr             v19.16b,v1.16b,v1.16b
+       ld1             {v17.4s},[x3],#16
+       add             v16.4s,v16.4s,v4.4s
+       .inst   0x5e2828a4      //sha256su0 v4.16b,v5.16b
+       orr             v2.16b,v0.16b,v0.16b
+       .inst   0x5e104020      //sha256h v0.16b,v1.16b,v16.4s
+       .inst   0x5e105041      //sha256h2 v1.16b,v2.16b,v16.4s
+       .inst   0x5e0760c4      //sha256su1 v4.16b,v6.16b,v7.16b
+       ld1             {v16.4s},[x3],#16
+       add             v17.4s,v17.4s,v5.4s
+       .inst   0x5e2828c5      //sha256su0 v5.16b,v6.16b
+       orr             v2.16b,v0.16b,v0.16b
+       .inst   0x5e114020      //sha256h v0.16b,v1.16b,v17.4s
+       .inst   0x5e115041      //sha256h2 v1.16b,v2.16b,v17.4s
+       .inst   0x5e0460e5      //sha256su1 v5.16b,v7.16b,v4.16b
+       ld1             {v17.4s},[x3],#16
+       add             v16.4s,v16.4s,v6.4s
+       .inst   0x5e2828e6      //sha256su0 v6.16b,v7.16b
+       orr             v2.16b,v0.16b,v0.16b
+       .inst   0x5e104020      //sha256h v0.16b,v1.16b,v16.4s
+       .inst   0x5e105041      //sha256h2 v1.16b,v2.16b,v16.4s
+       .inst   0x5e056086      //sha256su1 v6.16b,v4.16b,v5.16b
+       ld1             {v16.4s},[x3],#16
+       add             v17.4s,v17.4s,v7.4s
+       .inst   0x5e282887      //sha256su0 v7.16b,v4.16b
+       orr             v2.16b,v0.16b,v0.16b
+       .inst   0x5e114020      //sha256h v0.16b,v1.16b,v17.4s
+       .inst   0x5e115041      //sha256h2 v1.16b,v2.16b,v17.4s
+       .inst   0x5e0660a7      //sha256su1 v7.16b,v5.16b,v6.16b
+       ld1             {v17.4s},[x3],#16
+       add             v16.4s,v16.4s,v4.4s
+       .inst   0x5e2828a4      //sha256su0 v4.16b,v5.16b
+       orr             v2.16b,v0.16b,v0.16b
+       .inst   0x5e104020      //sha256h v0.16b,v1.16b,v16.4s
+       .inst   0x5e105041      //sha256h2 v1.16b,v2.16b,v16.4s
+       .inst   0x5e0760c4      //sha256su1 v4.16b,v6.16b,v7.16b
+       ld1             {v16.4s},[x3],#16
+       add             v17.4s,v17.4s,v5.4s
+       .inst   0x5e2828c5      //sha256su0 v5.16b,v6.16b
+       orr             v2.16b,v0.16b,v0.16b
+       .inst   0x5e114020      //sha256h v0.16b,v1.16b,v17.4s
+       .inst   0x5e115041      //sha256h2 v1.16b,v2.16b,v17.4s
+       .inst   0x5e0460e5      //sha256su1 v5.16b,v7.16b,v4.16b
+       ld1             {v17.4s},[x3],#16
+       add             v16.4s,v16.4s,v6.4s
+       .inst   0x5e2828e6      //sha256su0 v6.16b,v7.16b
+       orr             v2.16b,v0.16b,v0.16b
+       .inst   0x5e104020      //sha256h v0.16b,v1.16b,v16.4s
+       .inst   0x5e105041      //sha256h2 v1.16b,v2.16b,v16.4s
+       .inst   0x5e056086      //sha256su1 v6.16b,v4.16b,v5.16b
+       ld1             {v16.4s},[x3],#16
+       add             v17.4s,v17.4s,v7.4s
+       .inst   0x5e282887      //sha256su0 v7.16b,v4.16b
+       orr             v2.16b,v0.16b,v0.16b
+       .inst   0x5e114020      //sha256h v0.16b,v1.16b,v17.4s
+       .inst   0x5e115041      //sha256h2 v1.16b,v2.16b,v17.4s
+       .inst   0x5e0660a7      //sha256su1 v7.16b,v5.16b,v6.16b
+       ld1             {v17.4s},[x3],#16
+       add             v16.4s,v16.4s,v4.4s
+       .inst   0x5e2828a4      //sha256su0 v4.16b,v5.16b
+       orr             v2.16b,v0.16b,v0.16b
+       .inst   0x5e104020      //sha256h v0.16b,v1.16b,v16.4s
+       .inst   0x5e105041      //sha256h2 v1.16b,v2.16b,v16.4s
+       .inst   0x5e0760c4      //sha256su1 v4.16b,v6.16b,v7.16b
+       ld1             {v16.4s},[x3],#16
+       add             v17.4s,v17.4s,v5.4s
+       .inst   0x5e2828c5      //sha256su0 v5.16b,v6.16b
+       orr             v2.16b,v0.16b,v0.16b
+       .inst   0x5e114020      //sha256h v0.16b,v1.16b,v17.4s
+       .inst   0x5e115041      //sha256h2 v1.16b,v2.16b,v17.4s
+       .inst   0x5e0460e5      //sha256su1 v5.16b,v7.16b,v4.16b
+       ld1             {v17.4s},[x3],#16
+       add             v16.4s,v16.4s,v6.4s
+       .inst   0x5e2828e6      //sha256su0 v6.16b,v7.16b
+       orr             v2.16b,v0.16b,v0.16b
+       .inst   0x5e104020      //sha256h v0.16b,v1.16b,v16.4s
+       .inst   0x5e105041      //sha256h2 v1.16b,v2.16b,v16.4s
+       .inst   0x5e056086      //sha256su1 v6.16b,v4.16b,v5.16b
+       ld1             {v16.4s},[x3],#16
+       add             v17.4s,v17.4s,v7.4s
+       .inst   0x5e282887      //sha256su0 v7.16b,v4.16b
+       orr             v2.16b,v0.16b,v0.16b
+       .inst   0x5e114020      //sha256h v0.16b,v1.16b,v17.4s
+       .inst   0x5e115041      //sha256h2 v1.16b,v2.16b,v17.4s
+       .inst   0x5e0660a7      //sha256su1 v7.16b,v5.16b,v6.16b
+       ld1             {v17.4s},[x3],#16
+       add             v16.4s,v16.4s,v4.4s
+       orr             v2.16b,v0.16b,v0.16b
+       .inst   0x5e104020      //sha256h v0.16b,v1.16b,v16.4s
+       .inst   0x5e105041      //sha256h2 v1.16b,v2.16b,v16.4s
+
+       ld1             {v16.4s},[x3],#16
+       add             v17.4s,v17.4s,v5.4s
+       orr             v2.16b,v0.16b,v0.16b
+       .inst   0x5e114020      //sha256h v0.16b,v1.16b,v17.4s
+       .inst   0x5e115041      //sha256h2 v1.16b,v2.16b,v17.4s
+
+       ld1             {v17.4s},[x3]
+       add             v16.4s,v16.4s,v6.4s
+       sub             x3,x3,#64*4-16  // rewind
+       orr             v2.16b,v0.16b,v0.16b
+       .inst   0x5e104020      //sha256h v0.16b,v1.16b,v16.4s
+       .inst   0x5e105041      //sha256h2 v1.16b,v2.16b,v16.4s
+
+       add             v17.4s,v17.4s,v7.4s
+       orr             v2.16b,v0.16b,v0.16b
+       .inst   0x5e114020      //sha256h v0.16b,v1.16b,v17.4s
+       .inst   0x5e115041      //sha256h2 v1.16b,v2.16b,v17.4s
+
+       add             v0.4s,v0.4s,v18.4s
+       add             v1.4s,v1.4s,v19.4s
+
+       cbnz            x2,.Loop_hw
+
+       st1             {v0.4s,v1.4s},[x0]
+
+       ldr             x29,[sp],#16
+       ret
+.size  sha256_block_armv8,.-sha256_block_armv8
+#endif
+#ifdef __KERNEL__
+.globl sha256_block_neon
+#endif
+.type  sha256_block_neon,%function
+.align 4
+sha256_block_neon:
+.Lneon_entry:
+       stp     x29, x30, [sp, #-16]!
+       mov     x29, sp
+       sub     sp,sp,#16*4
+
+       adr     x16,.LK256
+       add     x2,x1,x2,lsl#6  // len to point at the end of inp
+
+       ld1     {v0.16b},[x1], #16
+       ld1     {v1.16b},[x1], #16
+       ld1     {v2.16b},[x1], #16
+       ld1     {v3.16b},[x1], #16
+       ld1     {v4.4s},[x16], #16
+       ld1     {v5.4s},[x16], #16
+       ld1     {v6.4s},[x16], #16
+       ld1     {v7.4s},[x16], #16
+       rev32   v0.16b,v0.16b           // yes, even on
+       rev32   v1.16b,v1.16b           // big-endian
+       rev32   v2.16b,v2.16b
+       rev32   v3.16b,v3.16b
+       mov     x17,sp
+       add     v4.4s,v4.4s,v0.4s
+       add     v5.4s,v5.4s,v1.4s
+       add     v6.4s,v6.4s,v2.4s
+       st1     {v4.4s-v5.4s},[x17], #32
+       add     v7.4s,v7.4s,v3.4s
+       st1     {v6.4s-v7.4s},[x17]
+       sub     x17,x17,#32
+
+       ldp     w3,w4,[x0]
+       ldp     w5,w6,[x0,#8]
+       ldp     w7,w8,[x0,#16]
+       ldp     w9,w10,[x0,#24]
+       ldr     w12,[sp,#0]
+       mov     w13,wzr
+       eor     w14,w4,w5
+       mov     w15,wzr
+       b       .L_00_48
+
+.align 4
+.L_00_48:
+       ext     v4.16b,v0.16b,v1.16b,#4
+       add     w10,w10,w12
+       add     w3,w3,w15
+       and     w12,w8,w7
+       bic     w15,w9,w7
+       ext     v7.16b,v2.16b,v3.16b,#4
+       eor     w11,w7,w7,ror#5
+       add     w3,w3,w13
+       mov     d19,v3.d[1]
+       orr     w12,w12,w15
+       eor     w11,w11,w7,ror#19
+       ushr    v6.4s,v4.4s,#7
+       eor     w15,w3,w3,ror#11
+       ushr    v5.4s,v4.4s,#3
+       add     w10,w10,w12
+       add     v0.4s,v0.4s,v7.4s
+       ror     w11,w11,#6
+       sli     v6.4s,v4.4s,#25
+       eor     w13,w3,w4
+       eor     w15,w15,w3,ror#20
+       ushr    v7.4s,v4.4s,#18
+       add     w10,w10,w11
+       ldr     w12,[sp,#4]
+       and     w14,w14,w13
+       eor     v5.16b,v5.16b,v6.16b
+       ror     w15,w15,#2
+       add     w6,w6,w10
+       sli     v7.4s,v4.4s,#14
+       eor     w14,w14,w4
+       ushr    v16.4s,v19.4s,#17
+       add     w9,w9,w12
+       add     w10,w10,w15
+       and     w12,w7,w6
+       eor     v5.16b,v5.16b,v7.16b
+       bic     w15,w8,w6
+       eor     w11,w6,w6,ror#5
+       sli     v16.4s,v19.4s,#15
+       add     w10,w10,w14
+       orr     w12,w12,w15
+       ushr    v17.4s,v19.4s,#10
+       eor     w11,w11,w6,ror#19
+       eor     w15,w10,w10,ror#11
+       ushr    v7.4s,v19.4s,#19
+       add     w9,w9,w12
+       ror     w11,w11,#6
+       add     v0.4s,v0.4s,v5.4s
+       eor     w14,w10,w3
+       eor     w15,w15,w10,ror#20
+       sli     v7.4s,v19.4s,#13
+       add     w9,w9,w11
+       ldr     w12,[sp,#8]
+       and     w13,w13,w14
+       eor     v17.16b,v17.16b,v16.16b
+       ror     w15,w15,#2
+       add     w5,w5,w9
+       eor     w13,w13,w3
+       eor     v17.16b,v17.16b,v7.16b
+       add     w8,w8,w12
+       add     w9,w9,w15
+       and     w12,w6,w5
+       add     v0.4s,v0.4s,v17.4s
+       bic     w15,w7,w5
+       eor     w11,w5,w5,ror#5
+       add     w9,w9,w13
+       ushr    v18.4s,v0.4s,#17
+       orr     w12,w12,w15
+       ushr    v19.4s,v0.4s,#10
+       eor     w11,w11,w5,ror#19
+       eor     w15,w9,w9,ror#11
+       sli     v18.4s,v0.4s,#15
+       add     w8,w8,w12
+       ushr    v17.4s,v0.4s,#19
+       ror     w11,w11,#6
+       eor     w13,w9,w10
+       eor     v19.16b,v19.16b,v18.16b
+       eor     w15,w15,w9,ror#20
+       add     w8,w8,w11
+       sli     v17.4s,v0.4s,#13
+       ldr     w12,[sp,#12]
+       and     w14,w14,w13
+       ror     w15,w15,#2
+       ld1     {v4.4s},[x16], #16
+       add     w4,w4,w8
+       eor     v19.16b,v19.16b,v17.16b
+       eor     w14,w14,w10
+       eor     v17.16b,v17.16b,v17.16b
+       add     w7,w7,w12
+       add     w8,w8,w15
+       and     w12,w5,w4
+       mov     v17.d[1],v19.d[0]
+       bic     w15,w6,w4
+       eor     w11,w4,w4,ror#5
+       add     w8,w8,w14
+       add     v0.4s,v0.4s,v17.4s
+       orr     w12,w12,w15
+       eor     w11,w11,w4,ror#19
+       eor     w15,w8,w8,ror#11
+       add     v4.4s,v4.4s,v0.4s
+       add     w7,w7,w12
+       ror     w11,w11,#6
+       eor     w14,w8,w9
+       eor     w15,w15,w8,ror#20
+       add     w7,w7,w11
+       ldr     w12,[sp,#16]
+       and     w13,w13,w14
+       ror     w15,w15,#2
+       add     w3,w3,w7
+       eor     w13,w13,w9
+       st1     {v4.4s},[x17], #16
+       ext     v4.16b,v1.16b,v2.16b,#4
+       add     w6,w6,w12
+       add     w7,w7,w15
+       and     w12,w4,w3
+       bic     w15,w5,w3
+       ext     v7.16b,v3.16b,v0.16b,#4
+       eor     w11,w3,w3,ror#5
+       add     w7,w7,w13
+       mov     d19,v0.d[1]
+       orr     w12,w12,w15
+       eor     w11,w11,w3,ror#19
+       ushr    v6.4s,v4.4s,#7
+       eor     w15,w7,w7,ror#11
+       ushr    v5.4s,v4.4s,#3
+       add     w6,w6,w12
+       add     v1.4s,v1.4s,v7.4s
+       ror     w11,w11,#6
+       sli     v6.4s,v4.4s,#25
+       eor     w13,w7,w8
+       eor     w15,w15,w7,ror#20
+       ushr    v7.4s,v4.4s,#18
+       add     w6,w6,w11
+       ldr     w12,[sp,#20]
+       and     w14,w14,w13
+       eor     v5.16b,v5.16b,v6.16b
+       ror     w15,w15,#2
+       add     w10,w10,w6
+       sli     v7.4s,v4.4s,#14
+       eor     w14,w14,w8
+       ushr    v16.4s,v19.4s,#17
+       add     w5,w5,w12
+       add     w6,w6,w15
+       and     w12,w3,w10
+       eor     v5.16b,v5.16b,v7.16b
+       bic     w15,w4,w10
+       eor     w11,w10,w10,ror#5
+       sli     v16.4s,v19.4s,#15
+       add     w6,w6,w14
+       orr     w12,w12,w15
+       ushr    v17.4s,v19.4s,#10
+       eor     w11,w11,w10,ror#19
+       eor     w15,w6,w6,ror#11
+       ushr    v7.4s,v19.4s,#19
+       add     w5,w5,w12
+       ror     w11,w11,#6
+       add     v1.4s,v1.4s,v5.4s
+       eor     w14,w6,w7
+       eor     w15,w15,w6,ror#20
+       sli     v7.4s,v19.4s,#13
+       add     w5,w5,w11
+       ldr     w12,[sp,#24]
+       and     w13,w13,w14
+       eor     v17.16b,v17.16b,v16.16b
+       ror     w15,w15,#2
+       add     w9,w9,w5
+       eor     w13,w13,w7
+       eor     v17.16b,v17.16b,v7.16b
+       add     w4,w4,w12
+       add     w5,w5,w15
+       and     w12,w10,w9
+       add     v1.4s,v1.4s,v17.4s
+       bic     w15,w3,w9
+       eor     w11,w9,w9,ror#5
+       add     w5,w5,w13
+       ushr    v18.4s,v1.4s,#17
+       orr     w12,w12,w15
+       ushr    v19.4s,v1.4s,#10
+       eor     w11,w11,w9,ror#19
+       eor     w15,w5,w5,ror#11
+       sli     v18.4s,v1.4s,#15
+       add     w4,w4,w12
+       ushr    v17.4s,v1.4s,#19
+       ror     w11,w11,#6
+       eor     w13,w5,w6
+       eor     v19.16b,v19.16b,v18.16b
+       eor     w15,w15,w5,ror#20
+       add     w4,w4,w11
+       sli     v17.4s,v1.4s,#13
+       ldr     w12,[sp,#28]
+       and     w14,w14,w13
+       ror     w15,w15,#2
+       ld1     {v4.4s},[x16], #16
+       add     w8,w8,w4
+       eor     v19.16b,v19.16b,v17.16b
+       eor     w14,w14,w6
+       eor     v17.16b,v17.16b,v17.16b
+       add     w3,w3,w12
+       add     w4,w4,w15
+       and     w12,w9,w8
+       mov     v17.d[1],v19.d[0]
+       bic     w15,w10,w8
+       eor     w11,w8,w8,ror#5
+       add     w4,w4,w14
+       add     v1.4s,v1.4s,v17.4s
+       orr     w12,w12,w15
+       eor     w11,w11,w8,ror#19
+       eor     w15,w4,w4,ror#11
+       add     v4.4s,v4.4s,v1.4s
+       add     w3,w3,w12
+       ror     w11,w11,#6
+       eor     w14,w4,w5
+       eor     w15,w15,w4,ror#20
+       add     w3,w3,w11
+       ldr     w12,[sp,#32]
+       and     w13,w13,w14
+       ror     w15,w15,#2
+       add     w7,w7,w3
+       eor     w13,w13,w5
+       st1     {v4.4s},[x17], #16
+       ext     v4.16b,v2.16b,v3.16b,#4
+       add     w10,w10,w12
+       add     w3,w3,w15
+       and     w12,w8,w7
+       bic     w15,w9,w7
+       ext     v7.16b,v0.16b,v1.16b,#4
+       eor     w11,w7,w7,ror#5
+       add     w3,w3,w13
+       mov     d19,v1.d[1]
+       orr     w12,w12,w15
+       eor     w11,w11,w7,ror#19
+       ushr    v6.4s,v4.4s,#7
+       eor     w15,w3,w3,ror#11
+       ushr    v5.4s,v4.4s,#3
+       add     w10,w10,w12
+       add     v2.4s,v2.4s,v7.4s
+       ror     w11,w11,#6
+       sli     v6.4s,v4.4s,#25
+       eor     w13,w3,w4
+       eor     w15,w15,w3,ror#20
+       ushr    v7.4s,v4.4s,#18
+       add     w10,w10,w11
+       ldr     w12,[sp,#36]
+       and     w14,w14,w13
+       eor     v5.16b,v5.16b,v6.16b
+       ror     w15,w15,#2
+       add     w6,w6,w10
+       sli     v7.4s,v4.4s,#14
+       eor     w14,w14,w4
+       ushr    v16.4s,v19.4s,#17
+       add     w9,w9,w12
+       add     w10,w10,w15
+       and     w12,w7,w6
+       eor     v5.16b,v5.16b,v7.16b
+       bic     w15,w8,w6
+       eor     w11,w6,w6,ror#5
+       sli     v16.4s,v19.4s,#15
+       add     w10,w10,w14
+       orr     w12,w12,w15
+       ushr    v17.4s,v19.4s,#10
+       eor     w11,w11,w6,ror#19
+       eor     w15,w10,w10,ror#11
+       ushr    v7.4s,v19.4s,#19
+       add     w9,w9,w12
+       ror     w11,w11,#6
+       add     v2.4s,v2.4s,v5.4s
+       eor     w14,w10,w3
+       eor     w15,w15,w10,ror#20
+       sli     v7.4s,v19.4s,#13
+       add     w9,w9,w11
+       ldr     w12,[sp,#40]
+       and     w13,w13,w14
+       eor     v17.16b,v17.16b,v16.16b
+       ror     w15,w15,#2
+       add     w5,w5,w9
+       eor     w13,w13,w3
+       eor     v17.16b,v17.16b,v7.16b
+       add     w8,w8,w12
+       add     w9,w9,w15
+       and     w12,w6,w5
+       add     v2.4s,v2.4s,v17.4s
+       bic     w15,w7,w5
+       eor     w11,w5,w5,ror#5
+       add     w9,w9,w13
+       ushr    v18.4s,v2.4s,#17
+       orr     w12,w12,w15
+       ushr    v19.4s,v2.4s,#10
+       eor     w11,w11,w5,ror#19
+       eor     w15,w9,w9,ror#11
+       sli     v18.4s,v2.4s,#15
+       add     w8,w8,w12
+       ushr    v17.4s,v2.4s,#19
+       ror     w11,w11,#6
+       eor     w13,w9,w10
+       eor     v19.16b,v19.16b,v18.16b
+       eor     w15,w15,w9,ror#20
+       add     w8,w8,w11
+       sli     v17.4s,v2.4s,#13
+       ldr     w12,[sp,#44]
+       and     w14,w14,w13
+       ror     w15,w15,#2
+       ld1     {v4.4s},[x16], #16
+       add     w4,w4,w8
+       eor     v19.16b,v19.16b,v17.16b
+       eor     w14,w14,w10
+       eor     v17.16b,v17.16b,v17.16b
+       add     w7,w7,w12
+       add     w8,w8,w15
+       and     w12,w5,w4
+       mov     v17.d[1],v19.d[0]
+       bic     w15,w6,w4
+       eor     w11,w4,w4,ror#5
+       add     w8,w8,w14
+       add     v2.4s,v2.4s,v17.4s
+       orr     w12,w12,w15
+       eor     w11,w11,w4,ror#19
+       eor     w15,w8,w8,ror#11
+       add     v4.4s,v4.4s,v2.4s
+       add     w7,w7,w12
+       ror     w11,w11,#6
+       eor     w14,w8,w9
+       eor     w15,w15,w8,ror#20
+       add     w7,w7,w11
+       ldr     w12,[sp,#48]
+       and     w13,w13,w14
+       ror     w15,w15,#2
+       add     w3,w3,w7
+       eor     w13,w13,w9
+       st1     {v4.4s},[x17], #16
+       ext     v4.16b,v3.16b,v0.16b,#4
+       add     w6,w6,w12
+       add     w7,w7,w15
+       and     w12,w4,w3
+       bic     w15,w5,w3
+       ext     v7.16b,v1.16b,v2.16b,#4
+       eor     w11,w3,w3,ror#5
+       add     w7,w7,w13
+       mov     d19,v2.d[1]
+       orr     w12,w12,w15
+       eor     w11,w11,w3,ror#19
+       ushr    v6.4s,v4.4s,#7
+       eor     w15,w7,w7,ror#11
+       ushr    v5.4s,v4.4s,#3
+       add     w6,w6,w12
+       add     v3.4s,v3.4s,v7.4s
+       ror     w11,w11,#6
+       sli     v6.4s,v4.4s,#25
+       eor     w13,w7,w8
+       eor     w15,w15,w7,ror#20
+       ushr    v7.4s,v4.4s,#18
+       add     w6,w6,w11
+       ldr     w12,[sp,#52]
+       and     w14,w14,w13
+       eor     v5.16b,v5.16b,v6.16b
+       ror     w15,w15,#2
+       add     w10,w10,w6
+       sli     v7.4s,v4.4s,#14
+       eor     w14,w14,w8
+       ushr    v16.4s,v19.4s,#17
+       add     w5,w5,w12
+       add     w6,w6,w15
+       and     w12,w3,w10
+       eor     v5.16b,v5.16b,v7.16b
+       bic     w15,w4,w10
+       eor     w11,w10,w10,ror#5
+       sli     v16.4s,v19.4s,#15
+       add     w6,w6,w14
+       orr     w12,w12,w15
+       ushr    v17.4s,v19.4s,#10
+       eor     w11,w11,w10,ror#19
+       eor     w15,w6,w6,ror#11
+       ushr    v7.4s,v19.4s,#19
+       add     w5,w5,w12
+       ror     w11,w11,#6
+       add     v3.4s,v3.4s,v5.4s
+       eor     w14,w6,w7
+       eor     w15,w15,w6,ror#20
+       sli     v7.4s,v19.4s,#13
+       add     w5,w5,w11
+       ldr     w12,[sp,#56]
+       and     w13,w13,w14
+       eor     v17.16b,v17.16b,v16.16b
+       ror     w15,w15,#2
+       add     w9,w9,w5
+       eor     w13,w13,w7
+       eor     v17.16b,v17.16b,v7.16b
+       add     w4,w4,w12
+       add     w5,w5,w15
+       and     w12,w10,w9
+       add     v3.4s,v3.4s,v17.4s
+       bic     w15,w3,w9
+       eor     w11,w9,w9,ror#5
+       add     w5,w5,w13
+       ushr    v18.4s,v3.4s,#17
+       orr     w12,w12,w15
+       ushr    v19.4s,v3.4s,#10
+       eor     w11,w11,w9,ror#19
+       eor     w15,w5,w5,ror#11
+       sli     v18.4s,v3.4s,#15
+       add     w4,w4,w12
+       ushr    v17.4s,v3.4s,#19
+       ror     w11,w11,#6
+       eor     w13,w5,w6
+       eor     v19.16b,v19.16b,v18.16b
+       eor     w15,w15,w5,ror#20
+       add     w4,w4,w11
+       sli     v17.4s,v3.4s,#13
+       ldr     w12,[sp,#60]
+       and     w14,w14,w13
+       ror     w15,w15,#2
+       ld1     {v4.4s},[x16], #16
+       add     w8,w8,w4
+       eor     v19.16b,v19.16b,v17.16b
+       eor     w14,w14,w6
+       eor     v17.16b,v17.16b,v17.16b
+       add     w3,w3,w12
+       add     w4,w4,w15
+       and     w12,w9,w8
+       mov     v17.d[1],v19.d[0]
+       bic     w15,w10,w8
+       eor     w11,w8,w8,ror#5
+       add     w4,w4,w14
+       add     v3.4s,v3.4s,v17.4s
+       orr     w12,w12,w15
+       eor     w11,w11,w8,ror#19
+       eor     w15,w4,w4,ror#11
+       add     v4.4s,v4.4s,v3.4s
+       add     w3,w3,w12
+       ror     w11,w11,#6
+       eor     w14,w4,w5
+       eor     w15,w15,w4,ror#20
+       add     w3,w3,w11
+       ldr     w12,[x16]
+       and     w13,w13,w14
+       ror     w15,w15,#2
+       add     w7,w7,w3
+       eor     w13,w13,w5
+       st1     {v4.4s},[x17], #16
+       cmp     w12,#0                          // check for K256 terminator
+       ldr     w12,[sp,#0]
+       sub     x17,x17,#64
+       bne     .L_00_48
+
+       sub     x16,x16,#256            // rewind x16
+       cmp     x1,x2
+       mov     x17, #64
+       csel    x17, x17, xzr, eq
+       sub     x1,x1,x17                       // avoid SEGV
+       mov     x17,sp
+       add     w10,w10,w12
+       add     w3,w3,w15
+       and     w12,w8,w7
+       ld1     {v0.16b},[x1],#16
+       bic     w15,w9,w7
+       eor     w11,w7,w7,ror#5
+       ld1     {v4.4s},[x16],#16
+       add     w3,w3,w13
+       orr     w12,w12,w15
+       eor     w11,w11,w7,ror#19
+       eor     w15,w3,w3,ror#11
+       rev32   v0.16b,v0.16b
+       add     w10,w10,w12
+       ror     w11,w11,#6
+       eor     w13,w3,w4
+       eor     w15,w15,w3,ror#20
+       add     v4.4s,v4.4s,v0.4s
+       add     w10,w10,w11
+       ldr     w12,[sp,#4]
+       and     w14,w14,w13
+       ror     w15,w15,#2
+       add     w6,w6,w10
+       eor     w14,w14,w4
+       add     w9,w9,w12
+       add     w10,w10,w15
+       and     w12,w7,w6
+       bic     w15,w8,w6
+       eor     w11,w6,w6,ror#5
+       add     w10,w10,w14
+       orr     w12,w12,w15
+       eor     w11,w11,w6,ror#19
+       eor     w15,w10,w10,ror#11
+       add     w9,w9,w12
+       ror     w11,w11,#6
+       eor     w14,w10,w3
+       eor     w15,w15,w10,ror#20
+       add     w9,w9,w11
+       ldr     w12,[sp,#8]
+       and     w13,w13,w14
+       ror     w15,w15,#2
+       add     w5,w5,w9
+       eor     w13,w13,w3
+       add     w8,w8,w12
+       add     w9,w9,w15
+       and     w12,w6,w5
+       bic     w15,w7,w5
+       eor     w11,w5,w5,ror#5
+       add     w9,w9,w13
+       orr     w12,w12,w15
+       eor     w11,w11,w5,ror#19
+       eor     w15,w9,w9,ror#11
+       add     w8,w8,w12
+       ror     w11,w11,#6
+       eor     w13,w9,w10
+       eor     w15,w15,w9,ror#20
+       add     w8,w8,w11
+       ldr     w12,[sp,#12]
+       and     w14,w14,w13
+       ror     w15,w15,#2
+       add     w4,w4,w8
+       eor     w14,w14,w10
+       add     w7,w7,w12
+       add     w8,w8,w15
+       and     w12,w5,w4
+       bic     w15,w6,w4
+       eor     w11,w4,w4,ror#5
+       add     w8,w8,w14
+       orr     w12,w12,w15
+       eor     w11,w11,w4,ror#19
+       eor     w15,w8,w8,ror#11
+       add     w7,w7,w12
+       ror     w11,w11,#6
+       eor     w14,w8,w9
+       eor     w15,w15,w8,ror#20
+       add     w7,w7,w11
+       ldr     w12,[sp,#16]
+       and     w13,w13,w14
+       ror     w15,w15,#2
+       add     w3,w3,w7
+       eor     w13,w13,w9
+       st1     {v4.4s},[x17], #16
+       add     w6,w6,w12
+       add     w7,w7,w15
+       and     w12,w4,w3
+       ld1     {v1.16b},[x1],#16
+       bic     w15,w5,w3
+       eor     w11,w3,w3,ror#5
+       ld1     {v4.4s},[x16],#16
+       add     w7,w7,w13
+       orr     w12,w12,w15
+       eor     w11,w11,w3,ror#19
+       eor     w15,w7,w7,ror#11
+       rev32   v1.16b,v1.16b
+       add     w6,w6,w12
+       ror     w11,w11,#6
+       eor     w13,w7,w8
+       eor     w15,w15,w7,ror#20
+       add     v4.4s,v4.4s,v1.4s
+       add     w6,w6,w11
+       ldr     w12,[sp,#20]
+       and     w14,w14,w13
+       ror     w15,w15,#2
+       add     w10,w10,w6
+       eor     w14,w14,w8
+       add     w5,w5,w12
+       add     w6,w6,w15
+       and     w12,w3,w10
+       bic     w15,w4,w10
+       eor     w11,w10,w10,ror#5
+       add     w6,w6,w14
+       orr     w12,w12,w15
+       eor     w11,w11,w10,ror#19
+       eor     w15,w6,w6,ror#11
+       add     w5,w5,w12
+       ror     w11,w11,#6
+       eor     w14,w6,w7
+       eor     w15,w15,w6,ror#20
+       add     w5,w5,w11
+       ldr     w12,[sp,#24]
+       and     w13,w13,w14
+       ror     w15,w15,#2
+       add     w9,w9,w5
+       eor     w13,w13,w7
+       add     w4,w4,w12
+       add     w5,w5,w15
+       and     w12,w10,w9
+       bic     w15,w3,w9
+       eor     w11,w9,w9,ror#5
+       add     w5,w5,w13
+       orr     w12,w12,w15
+       eor     w11,w11,w9,ror#19
+       eor     w15,w5,w5,ror#11
+       add     w4,w4,w12
+       ror     w11,w11,#6
+       eor     w13,w5,w6
+       eor     w15,w15,w5,ror#20
+       add     w4,w4,w11
+       ldr     w12,[sp,#28]
+       and     w14,w14,w13
+       ror     w15,w15,#2
+       add     w8,w8,w4
+       eor     w14,w14,w6
+       add     w3,w3,w12
+       add     w4,w4,w15
+       and     w12,w9,w8
+       bic     w15,w10,w8
+       eor     w11,w8,w8,ror#5
+       add     w4,w4,w14
+       orr     w12,w12,w15
+       eor     w11,w11,w8,ror#19
+       eor     w15,w4,w4,ror#11
+       add     w3,w3,w12
+       ror     w11,w11,#6
+       eor     w14,w4,w5
+       eor     w15,w15,w4,ror#20
+       add     w3,w3,w11
+       ldr     w12,[sp,#32]
+       and     w13,w13,w14
+       ror     w15,w15,#2
+       add     w7,w7,w3
+       eor     w13,w13,w5
+       st1     {v4.4s},[x17], #16
+       add     w10,w10,w12
+       add     w3,w3,w15
+       and     w12,w8,w7
+       ld1     {v2.16b},[x1],#16
+       bic     w15,w9,w7
+       eor     w11,w7,w7,ror#5
+       ld1     {v4.4s},[x16],#16
+       add     w3,w3,w13
+       orr     w12,w12,w15
+       eor     w11,w11,w7,ror#19
+       eor     w15,w3,w3,ror#11
+       rev32   v2.16b,v2.16b
+       add     w10,w10,w12
+       ror     w11,w11,#6
+       eor     w13,w3,w4
+       eor     w15,w15,w3,ror#20
+       add     v4.4s,v4.4s,v2.4s
+       add     w10,w10,w11
+       ldr     w12,[sp,#36]
+       and     w14,w14,w13
+       ror     w15,w15,#2
+       add     w6,w6,w10
+       eor     w14,w14,w4
+       add     w9,w9,w12
+       add     w10,w10,w15
+       and     w12,w7,w6
+       bic     w15,w8,w6
+       eor     w11,w6,w6,ror#5
+       add     w10,w10,w14
+       orr     w12,w12,w15
+       eor     w11,w11,w6,ror#19
+       eor     w15,w10,w10,ror#11
+       add     w9,w9,w12
+       ror     w11,w11,#6
+       eor     w14,w10,w3
+       eor     w15,w15,w10,ror#20
+       add     w9,w9,w11
+       ldr     w12,[sp,#40]
+       and     w13,w13,w14
+       ror     w15,w15,#2
+       add     w5,w5,w9
+       eor     w13,w13,w3
+       add     w8,w8,w12
+       add     w9,w9,w15
+       and     w12,w6,w5
+       bic     w15,w7,w5
+       eor     w11,w5,w5,ror#5
+       add     w9,w9,w13
+       orr     w12,w12,w15
+       eor     w11,w11,w5,ror#19
+       eor     w15,w9,w9,ror#11
+       add     w8,w8,w12
+       ror     w11,w11,#6
+       eor     w13,w9,w10
+       eor     w15,w15,w9,ror#20
+       add     w8,w8,w11
+       ldr     w12,[sp,#44]
+       and     w14,w14,w13
+       ror     w15,w15,#2
+       add     w4,w4,w8
+       eor     w14,w14,w10
+       add     w7,w7,w12
+       add     w8,w8,w15
+       and     w12,w5,w4
+       bic     w15,w6,w4
+       eor     w11,w4,w4,ror#5
+       add     w8,w8,w14
+       orr     w12,w12,w15
+       eor     w11,w11,w4,ror#19
+       eor     w15,w8,w8,ror#11
+       add     w7,w7,w12
+       ror     w11,w11,#6
+       eor     w14,w8,w9
+       eor     w15,w15,w8,ror#20
+       add     w7,w7,w11
+       ldr     w12,[sp,#48]
+       and     w13,w13,w14
+       ror     w15,w15,#2
+       add     w3,w3,w7
+       eor     w13,w13,w9
+       st1     {v4.4s},[x17], #16
+       add     w6,w6,w12
+       add     w7,w7,w15
+       and     w12,w4,w3
+       ld1     {v3.16b},[x1],#16
+       bic     w15,w5,w3
+       eor     w11,w3,w3,ror#5
+       ld1     {v4.4s},[x16],#16
+       add     w7,w7,w13
+       orr     w12,w12,w15
+       eor     w11,w11,w3,ror#19
+       eor     w15,w7,w7,ror#11
+       rev32   v3.16b,v3.16b
+       add     w6,w6,w12
+       ror     w11,w11,#6
+       eor     w13,w7,w8
+       eor     w15,w15,w7,ror#20
+       add     v4.4s,v4.4s,v3.4s
+       add     w6,w6,w11
+       ldr     w12,[sp,#52]
+       and     w14,w14,w13
+       ror     w15,w15,#2
+       add     w10,w10,w6
+       eor     w14,w14,w8
+       add     w5,w5,w12
+       add     w6,w6,w15
+       and     w12,w3,w10
+       bic     w15,w4,w10
+       eor     w11,w10,w10,ror#5
+       add     w6,w6,w14
+       orr     w12,w12,w15
+       eor     w11,w11,w10,ror#19
+       eor     w15,w6,w6,ror#11
+       add     w5,w5,w12
+       ror     w11,w11,#6
+       eor     w14,w6,w7
+       eor     w15,w15,w6,ror#20
+       add     w5,w5,w11
+       ldr     w12,[sp,#56]
+       and     w13,w13,w14
+       ror     w15,w15,#2
+       add     w9,w9,w5
+       eor     w13,w13,w7
+       add     w4,w4,w12
+       add     w5,w5,w15
+       and     w12,w10,w9
+       bic     w15,w3,w9
+       eor     w11,w9,w9,ror#5
+       add     w5,w5,w13
+       orr     w12,w12,w15
+       eor     w11,w11,w9,ror#19
+       eor     w15,w5,w5,ror#11
+       add     w4,w4,w12
+       ror     w11,w11,#6
+       eor     w13,w5,w6
+       eor     w15,w15,w5,ror#20
+       add     w4,w4,w11
+       ldr     w12,[sp,#60]
+       and     w14,w14,w13
+       ror     w15,w15,#2
+       add     w8,w8,w4
+       eor     w14,w14,w6
+       add     w3,w3,w12
+       add     w4,w4,w15
+       and     w12,w9,w8
+       bic     w15,w10,w8
+       eor     w11,w8,w8,ror#5
+       add     w4,w4,w14
+       orr     w12,w12,w15
+       eor     w11,w11,w8,ror#19
+       eor     w15,w4,w4,ror#11
+       add     w3,w3,w12
+       ror     w11,w11,#6
+       eor     w14,w4,w5
+       eor     w15,w15,w4,ror#20
+       add     w3,w3,w11
+       and     w13,w13,w14
+       ror     w15,w15,#2
+       add     w7,w7,w3
+       eor     w13,w13,w5
+       st1     {v4.4s},[x17], #16
+       add     w3,w3,w15                       // h+=Sigma0(a) from the past
+       ldp     w11,w12,[x0,#0]
+       add     w3,w3,w13                       // h+=Maj(a,b,c) from the past
+       ldp     w13,w14,[x0,#8]
+       add     w3,w3,w11                       // accumulate
+       add     w4,w4,w12
+       ldp     w11,w12,[x0,#16]
+       add     w5,w5,w13
+       add     w6,w6,w14
+       ldp     w13,w14,[x0,#24]
+       add     w7,w7,w11
+       add     w8,w8,w12
+        ldr    w12,[sp,#0]
+       stp     w3,w4,[x0,#0]
+       add     w9,w9,w13
+        mov    w13,wzr
+       stp     w5,w6,[x0,#8]
+       add     w10,w10,w14
+       stp     w7,w8,[x0,#16]
+        eor    w14,w4,w5
+       stp     w9,w10,[x0,#24]
+        mov    w15,wzr
+        mov    x17,sp
+       b.ne    .L_00_48
+
+       ldr     x29,[x29]
+       add     sp,sp,#16*4+16
+       ret
+.size  sha256_block_neon,.-sha256_block_neon
+#ifndef        __KERNEL__
+.comm  OPENSSL_armcap_P,4,4
+#endif
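The Crypto Extensions rounds above are emitted as raw ".inst" words with the intended mnemonic in a trailing comment, presumably so that the shipped file still assembles with toolchains whose assemblers do not know the SHA-2 mnemonics. As a quick cross-check (a standalone sketch by the editor, not part of the patch), the words can be reconstructed from the ARMv8-A encodings: the three-register SHA group is 0x5e000000 | Rm<<16 | opcode<<12 | Rn<<5 | Rd, with opcode 4/5/6 for sha256h/sha256h2/sha256su1, and sha256su0 uses the two-register form 0x5e282800 | Rn<<5 | Rd.

    #include <stdint.h>
    #include <stdio.h>

    /* Three-register SHA group: opcode 4 = sha256h, 5 = sha256h2, 6 = sha256su1 */
    static uint32_t sha_3reg(uint32_t opcode, uint32_t rd, uint32_t rn, uint32_t rm)
    {
            return 0x5e000000u | (rm << 16) | (opcode << 12) | (rn << 5) | rd;
    }

    /* Two-register form used by sha256su0 */
    static uint32_t sha256su0_enc(uint32_t rd, uint32_t rn)
    {
            return 0x5e282800u | (rn << 5) | rd;
    }

    int main(void)
    {
            printf("%08x\n", sha_3reg(4, 0, 1, 16));  /* sha256h   v0,v1,v16 -> 5e104020 */
            printf("%08x\n", sha_3reg(5, 1, 2, 16));  /* sha256h2  v1,v2,v16 -> 5e105041 */
            printf("%08x\n", sha_3reg(6, 5, 7, 4));   /* sha256su1 v5,v7,v4  -> 5e0460e5 */
            printf("%08x\n", sha256su0_enc(5, 6));    /* sha256su0 v5,v6     -> 5e2828c5 */
            return 0;
    }

The printed values match the ".inst" words used in the hardware-accelerated loop above.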
diff --git a/arch/arm64/crypto/sha256-glue.c b/arch/arm64/crypto/sha256-glue.c
new file mode 100644 (file)
index 0000000..a2226f8
--- /dev/null
@@ -0,0 +1,185 @@
+/*
+ * Linux/arm64 port of the OpenSSL SHA256 implementation for AArch64
+ *
+ * Copyright (c) 2016 Linaro Ltd. <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#include <asm/hwcap.h>
+#include <asm/neon.h>
+#include <asm/simd.h>
+#include <crypto/internal/hash.h>
+#include <crypto/sha.h>
+#include <crypto/sha256_base.h>
+#include <linux/cryptohash.h>
+#include <linux/types.h>
+#include <linux/string.h>
+
+MODULE_DESCRIPTION("SHA-224/SHA-256 secure hash for arm64");
+MODULE_AUTHOR("Andy Polyakov <appro@openssl.org>");
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_CRYPTO("sha224");
+MODULE_ALIAS_CRYPTO("sha256");
+
+asmlinkage void sha256_block_data_order(u32 *digest, const void *data,
+                                       unsigned int num_blks);
+
+asmlinkage void sha256_block_neon(u32 *digest, const void *data,
+                                 unsigned int num_blks);
+
+static int sha256_update(struct shash_desc *desc, const u8 *data,
+                        unsigned int len)
+{
+       return sha256_base_do_update(desc, data, len,
+                               (sha256_block_fn *)sha256_block_data_order);
+}
+
+static int sha256_finup(struct shash_desc *desc, const u8 *data,
+                       unsigned int len, u8 *out)
+{
+       if (len)
+               sha256_base_do_update(desc, data, len,
+                               (sha256_block_fn *)sha256_block_data_order);
+       sha256_base_do_finalize(desc,
+                               (sha256_block_fn *)sha256_block_data_order);
+
+       return sha256_base_finish(desc, out);
+}
+
+static int sha256_final(struct shash_desc *desc, u8 *out)
+{
+       return sha256_finup(desc, NULL, 0, out);
+}
+
+static struct shash_alg algs[] = { {
+       .digestsize             = SHA256_DIGEST_SIZE,
+       .init                   = sha256_base_init,
+       .update                 = sha256_update,
+       .final                  = sha256_final,
+       .finup                  = sha256_finup,
+       .descsize               = sizeof(struct sha256_state),
+       .base.cra_name          = "sha256",
+       .base.cra_driver_name   = "sha256-arm64",
+       .base.cra_priority      = 100,
+       .base.cra_flags         = CRYPTO_ALG_TYPE_SHASH,
+       .base.cra_blocksize     = SHA256_BLOCK_SIZE,
+       .base.cra_module        = THIS_MODULE,
+}, {
+       .digestsize             = SHA224_DIGEST_SIZE,
+       .init                   = sha224_base_init,
+       .update                 = sha256_update,
+       .final                  = sha256_final,
+       .finup                  = sha256_finup,
+       .descsize               = sizeof(struct sha256_state),
+       .base.cra_name          = "sha224",
+       .base.cra_driver_name   = "sha224-arm64",
+       .base.cra_priority      = 100,
+       .base.cra_flags         = CRYPTO_ALG_TYPE_SHASH,
+       .base.cra_blocksize     = SHA224_BLOCK_SIZE,
+       .base.cra_module        = THIS_MODULE,
+} };
+
+static int sha256_update_neon(struct shash_desc *desc, const u8 *data,
+                             unsigned int len)
+{
+       /*
+        * Stacking and unstacking a substantial slice of the NEON register
+        * file may significantly affect performance for small updates when
+        * executing in interrupt context, so fall back to the scalar code
+        * in that case.
+        */
+       if (!may_use_simd())
+               return sha256_base_do_update(desc, data, len,
+                               (sha256_block_fn *)sha256_block_data_order);
+
+       kernel_neon_begin();
+       sha256_base_do_update(desc, data, len,
+                               (sha256_block_fn *)sha256_block_neon);
+       kernel_neon_end();
+
+       return 0;
+}
+
+static int sha256_finup_neon(struct shash_desc *desc, const u8 *data,
+                            unsigned int len, u8 *out)
+{
+       if (!may_use_simd()) {
+               if (len)
+                       sha256_base_do_update(desc, data, len,
+                               (sha256_block_fn *)sha256_block_data_order);
+               sha256_base_do_finalize(desc,
+                               (sha256_block_fn *)sha256_block_data_order);
+       } else {
+               kernel_neon_begin();
+               if (len)
+                       sha256_base_do_update(desc, data, len,
+                               (sha256_block_fn *)sha256_block_neon);
+               sha256_base_do_finalize(desc,
+                               (sha256_block_fn *)sha256_block_neon);
+               kernel_neon_end();
+       }
+       return sha256_base_finish(desc, out);
+}
+
+static int sha256_final_neon(struct shash_desc *desc, u8 *out)
+{
+       return sha256_finup_neon(desc, NULL, 0, out);
+}
+
+static struct shash_alg neon_algs[] = { {
+       .digestsize             = SHA256_DIGEST_SIZE,
+       .init                   = sha256_base_init,
+       .update                 = sha256_update_neon,
+       .final                  = sha256_final_neon,
+       .finup                  = sha256_finup_neon,
+       .descsize               = sizeof(struct sha256_state),
+       .base.cra_name          = "sha256",
+       .base.cra_driver_name   = "sha256-arm64-neon",
+       .base.cra_priority      = 150,
+       .base.cra_flags         = CRYPTO_ALG_TYPE_SHASH,
+       .base.cra_blocksize     = SHA256_BLOCK_SIZE,
+       .base.cra_module        = THIS_MODULE,
+}, {
+       .digestsize             = SHA224_DIGEST_SIZE,
+       .init                   = sha224_base_init,
+       .update                 = sha256_update_neon,
+       .final                  = sha256_final_neon,
+       .finup                  = sha256_finup_neon,
+       .descsize               = sizeof(struct sha256_state),
+       .base.cra_name          = "sha224",
+       .base.cra_driver_name   = "sha224-arm64-neon",
+       .base.cra_priority      = 150,
+       .base.cra_flags         = CRYPTO_ALG_TYPE_SHASH,
+       .base.cra_blocksize     = SHA224_BLOCK_SIZE,
+       .base.cra_module        = THIS_MODULE,
+} };
+
+static int __init sha256_mod_init(void)
+{
+       int ret = crypto_register_shashes(algs, ARRAY_SIZE(algs));
+       if (ret)
+               return ret;
+
+       if (elf_hwcap & HWCAP_ASIMD) {
+               ret = crypto_register_shashes(neon_algs, ARRAY_SIZE(neon_algs));
+               if (ret)
+                       crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
+       }
+       return ret;
+}
+
+static void __exit sha256_mod_fini(void)
+{
+       if (elf_hwcap & HWCAP_ASIMD)
+               crypto_unregister_shashes(neon_algs, ARRAY_SIZE(neon_algs));
+       crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
+}
+
+module_init(sha256_mod_init);
+module_exit(sha256_mod_fini);
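For context on how the algorithms registered above are consumed: the crypto core picks the highest-priority provider of a given cra_name, so the NEON glue (cra_priority 150) is preferred over the scalar one (priority 100) whenever both are registered. A minimal, hypothetical in-kernel caller might look like the sketch below; it assumes the shash API of this kernel generation (crypto_alloc_shash(), SHASH_DESC_ON_STACK(), a shash_desc with a flags field, crypto_shash_digest()) and is illustrative only, not part of this patch.

    #include <crypto/hash.h>
    #include <crypto/sha.h>
    #include <linux/err.h>
    #include <linux/types.h>

    static int sha256_digest_demo(const u8 *data, unsigned int len,
                                  u8 out[SHA256_DIGEST_SIZE])
    {
            struct crypto_shash *tfm;
            int ret;

            /* The core resolves this to the highest-priority "sha256" provider */
            tfm = crypto_alloc_shash("sha256", 0, 0);
            if (IS_ERR(tfm))
                    return PTR_ERR(tfm);

            {
                    SHASH_DESC_ON_STACK(desc, tfm);

                    desc->tfm = tfm;
                    desc->flags = 0;
                    ret = crypto_shash_digest(desc, data, len, out);
            }

            crypto_free_shash(tfm);
            return ret;
    }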
diff --git a/arch/arm64/crypto/sha512-armv8.pl b/arch/arm64/crypto/sha512-armv8.pl
new file mode 100644 (file)
index 0000000..c55efb3
--- /dev/null
@@ -0,0 +1,778 @@
+#! /usr/bin/env perl
+# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+#
+# Permission to use under GPLv2 terms is granted.
+# ====================================================================
+#
+# SHA256/512 for ARMv8.
+#
+# Performance in cycles per processed byte and improvement coefficient
+# over code generated with "default" compiler:
+#
+#              SHA256-hw       SHA256(*)       SHA512
+# Apple A7     1.97            10.5 (+33%)     6.73 (-1%(**))
+# Cortex-A53   2.38            15.5 (+115%)    10.0 (+150%(***))
+# Cortex-A57   2.31            11.6 (+86%)     7.51 (+260%(***))
+# Denver       2.01            10.5 (+26%)     6.70 (+8%)
+# X-Gene                       20.0 (+100%)    12.8 (+300%(***))
+# Mongoose     2.36            13.0 (+50%)     8.36 (+33%)
+#
+# (*)  Software SHA256 results are of lesser relevance, presented
+#      mostly for informational purposes.
+# (**) The result is a trade-off: it's possible to improve it by
+#      10% (or by 1 cycle per round), but at the cost of 20% loss
+#      on Cortex-A53 (or by 4 cycles per round).
+# (***)        Super-impressive coefficients over gcc-generated code are an
+#      indication of some compiler "pathology"; most notably, code
+#      generated with -mgeneral-regs-only is significantly faster
+#      and the gap is only 40-90%.
+#
+# October 2016.
+#
+# Originally it was reckoned that it made no sense to implement a NEON
+# version of SHA256 for 64-bit processors, because the performance
+# improvement on the most widespread Cortex-A5x processors was observed
+# to be marginal: no gain on Cortex-A53 and only ~10% on A57. But it was
+# then observed that 32-bit NEON SHA256 performs significantly better
+# than the 64-bit scalar version on *some* of the more recent processors.
+# As a result, a 64-bit NEON version of SHA256 was added to provide the
+# best all-round performance. For example, it executes ~30% faster on
+# X-Gene and Mongoose. [For reference, a NEON version of SHA512 is bound
+# to deliver much less improvement, and likely a *negative* one on
+# Cortex-A5x, which is why NEON support is limited to SHA256.]
+
+$output=pop;
+$flavour=pop;
+
+if ($flavour && $flavour ne "void") {
+    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+    die "can't locate arm-xlate.pl";
+
+    open OUT,"| \"$^X\" $xlate $flavour $output";
+    *STDOUT=*OUT;
+} else {
+    open STDOUT,">$output";
+}
+
+if ($output =~ /512/) {
+       $BITS=512;
+       $SZ=8;
+       @Sigma0=(28,34,39);
+       @Sigma1=(14,18,41);
+       @sigma0=(1,  8, 7);
+       @sigma1=(19,61, 6);
+       $rounds=80;
+       $reg_t="x";
+} else {
+       $BITS=256;
+       $SZ=4;
+       @Sigma0=( 2,13,22);
+       @Sigma1=( 6,11,25);
+       @sigma0=( 7,18, 3);
+       @sigma1=(17,19,10);
+       $rounds=64;
+       $reg_t="w";
+}
+
+$func="sha${BITS}_block_data_order";
+
+($ctx,$inp,$num,$Ktbl)=map("x$_",(0..2,30));
+
+@X=map("$reg_t$_",(3..15,0..2));
+@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("$reg_t$_",(20..27));
+($t0,$t1,$t2,$t3)=map("$reg_t$_",(16,17,19,28));
+
+sub BODY_00_xx {
+my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
+my $j=($i+1)&15;
+my ($T0,$T1,$T2)=(@X[($i-8)&15],@X[($i-9)&15],@X[($i-10)&15]);
+   $T0=@X[$i+3] if ($i<11);
+
+$code.=<<___   if ($i<16);
+#ifndef        __AARCH64EB__
+       rev     @X[$i],@X[$i]                   // $i
+#endif
+___
+$code.=<<___   if ($i<13 && ($i&1));
+       ldp     @X[$i+1],@X[$i+2],[$inp],#2*$SZ
+___
+$code.=<<___   if ($i==13);
+       ldp     @X[14],@X[15],[$inp]
+___
+$code.=<<___   if ($i>=14);
+       ldr     @X[($i-11)&15],[sp,#`$SZ*(($i-11)%4)`]
+___
+$code.=<<___   if ($i>0 && $i<16);
+       add     $a,$a,$t1                       // h+=Sigma0(a)
+___
+$code.=<<___   if ($i>=11);
+       str     @X[($i-8)&15],[sp,#`$SZ*(($i-8)%4)`]
+___
+# While ARMv8 provides merged rotate-and-logical operations such as
+# 'eor x,y,z,ror#n', they were found to hurt performance on Apple A7.
+# The reason seems to be that the merged form requires even 'y' to be
+# available earlier, so such an instruction is not necessarily the best
+# choice on the critical path... On the other hand, Cortex-A5x handles
+# merged instructions much better than a disjoint rotate followed by a
+# logical operation... See the (**) footnote above.
+$code.=<<___   if ($i<15);
+       ror     $t0,$e,#$Sigma1[0]
+       add     $h,$h,$t2                       // h+=K[i]
+       eor     $T0,$e,$e,ror#`$Sigma1[2]-$Sigma1[1]`
+       and     $t1,$f,$e
+       bic     $t2,$g,$e
+       add     $h,$h,@X[$i&15]                 // h+=X[i]
+       orr     $t1,$t1,$t2                     // Ch(e,f,g)
+       eor     $t2,$a,$b                       // a^b, b^c in next round
+       eor     $t0,$t0,$T0,ror#$Sigma1[1]      // Sigma1(e)
+       ror     $T0,$a,#$Sigma0[0]
+       add     $h,$h,$t1                       // h+=Ch(e,f,g)
+       eor     $t1,$a,$a,ror#`$Sigma0[2]-$Sigma0[1]`
+       add     $h,$h,$t0                       // h+=Sigma1(e)
+       and     $t3,$t3,$t2                     // (b^c)&=(a^b)
+       add     $d,$d,$h                        // d+=h
+       eor     $t3,$t3,$b                      // Maj(a,b,c)
+       eor     $t1,$T0,$t1,ror#$Sigma0[1]      // Sigma0(a)
+       add     $h,$h,$t3                       // h+=Maj(a,b,c)
+       ldr     $t3,[$Ktbl],#$SZ                // *K++, $t2 in next round
+       //add   $h,$h,$t1                       // h+=Sigma0(a)
+___
+$code.=<<___   if ($i>=15);
+       ror     $t0,$e,#$Sigma1[0]
+       add     $h,$h,$t2                       // h+=K[i]
+       ror     $T1,@X[($j+1)&15],#$sigma0[0]
+       and     $t1,$f,$e
+       ror     $T2,@X[($j+14)&15],#$sigma1[0]
+       bic     $t2,$g,$e
+       ror     $T0,$a,#$Sigma0[0]
+       add     $h,$h,@X[$i&15]                 // h+=X[i]
+       eor     $t0,$t0,$e,ror#$Sigma1[1]
+       eor     $T1,$T1,@X[($j+1)&15],ror#$sigma0[1]
+       orr     $t1,$t1,$t2                     // Ch(e,f,g)
+       eor     $t2,$a,$b                       // a^b, b^c in next round
+       eor     $t0,$t0,$e,ror#$Sigma1[2]       // Sigma1(e)
+       eor     $T0,$T0,$a,ror#$Sigma0[1]
+       add     $h,$h,$t1                       // h+=Ch(e,f,g)
+       and     $t3,$t3,$t2                     // (b^c)&=(a^b)
+       eor     $T2,$T2,@X[($j+14)&15],ror#$sigma1[1]
+       eor     $T1,$T1,@X[($j+1)&15],lsr#$sigma0[2]    // sigma0(X[i+1])
+       add     $h,$h,$t0                       // h+=Sigma1(e)
+       eor     $t3,$t3,$b                      // Maj(a,b,c)
+       eor     $t1,$T0,$a,ror#$Sigma0[2]       // Sigma0(a)
+       eor     $T2,$T2,@X[($j+14)&15],lsr#$sigma1[2]   // sigma1(X[i+14])
+       add     @X[$j],@X[$j],@X[($j+9)&15]
+       add     $d,$d,$h                        // d+=h
+       add     $h,$h,$t3                       // h+=Maj(a,b,c)
+       ldr     $t3,[$Ktbl],#$SZ                // *K++, $t2 in next round
+       add     @X[$j],@X[$j],$T1
+       add     $h,$h,$t1                       // h+=Sigma0(a)
+       add     @X[$j],@X[$j],$T2
+___
+       ($t2,$t3)=($t3,$t2);
+}
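For readers trying to map the heavily interleaved output of BODY_00_xx back to the algorithm, here is a plain C model of a single round for the $SZ==4 (SHA-256) case, using the Sigma rotate amounts defined earlier in this script; it is a reference sketch only, not generated code. Note that the emitted assembly defers some additions (e.g. "h+=Sigma0(a)") into the following round, which is why they appear "from the past" in the comments.

    #include <stdint.h>

    static inline uint32_t ror32(uint32_t x, unsigned int n)
    {
            return (x >> n) | (x << (32 - n));
    }

    /* One SHA-256 round: v[] holds the working variables a..h */
    static void round_00_xx(uint32_t v[8], uint32_t Ki, uint32_t Wi)
    {
            uint32_t a = v[0], b = v[1], c = v[2], d = v[3];
            uint32_t e = v[4], f = v[5], g = v[6], h = v[7];

            uint32_t Sigma1 = ror32(e, 6) ^ ror32(e, 11) ^ ror32(e, 25);
            uint32_t Ch     = (e & f) ^ (~e & g);           /* and/bic/orr in the asm */
            uint32_t Sigma0 = ror32(a, 2) ^ ror32(a, 13) ^ ror32(a, 22);
            uint32_t Maj    = ((b ^ c) & (a ^ b)) ^ b;      /* "(b^c)&=(a^b)" trick */

            uint32_t T1 = h + Sigma1 + Ch + Ki + Wi;

            /* Rotate the working variables, then fold in T1 and T2 */
            v[7] = g; v[6] = f; v[5] = e; v[4] = d + T1;
            v[3] = c; v[2] = b; v[1] = a; v[0] = T1 + Sigma0 + Maj;
    }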
+
+$code.=<<___;
+#ifndef        __KERNEL__
+# include "arm_arch.h"
+#endif
+
+.text
+
+.extern        OPENSSL_armcap_P
+.globl $func
+.type  $func,%function
+.align 6
+$func:
+___
+$code.=<<___   if ($SZ==4);
+#ifndef        __KERNEL__
+# ifdef        __ILP32__
+       ldrsw   x16,.LOPENSSL_armcap_P
+# else
+       ldr     x16,.LOPENSSL_armcap_P
+# endif
+       adr     x17,.LOPENSSL_armcap_P
+       add     x16,x16,x17
+       ldr     w16,[x16]
+       tst     w16,#ARMV8_SHA256
+       b.ne    .Lv8_entry
+       tst     w16,#ARMV7_NEON
+       b.ne    .Lneon_entry
+#endif
+___
+$code.=<<___;
+       stp     x29,x30,[sp,#-128]!
+       add     x29,sp,#0
+
+       stp     x19,x20,[sp,#16]
+       stp     x21,x22,[sp,#32]
+       stp     x23,x24,[sp,#48]
+       stp     x25,x26,[sp,#64]
+       stp     x27,x28,[sp,#80]
+       sub     sp,sp,#4*$SZ
+
+       ldp     $A,$B,[$ctx]                            // load context
+       ldp     $C,$D,[$ctx,#2*$SZ]
+       ldp     $E,$F,[$ctx,#4*$SZ]
+       add     $num,$inp,$num,lsl#`log(16*$SZ)/log(2)` // end of input
+       ldp     $G,$H,[$ctx,#6*$SZ]
+       adr     $Ktbl,.LK$BITS
+       stp     $ctx,$num,[x29,#96]
+
+.Loop:
+       ldp     @X[0],@X[1],[$inp],#2*$SZ
+       ldr     $t2,[$Ktbl],#$SZ                        // *K++
+       eor     $t3,$B,$C                               // magic seed
+       str     $inp,[x29,#112]
+___
+for ($i=0;$i<16;$i++)  { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); }
+$code.=".Loop_16_xx:\n";
+for (;$i<32;$i++)      { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+       cbnz    $t2,.Loop_16_xx
+
+       ldp     $ctx,$num,[x29,#96]
+       ldr     $inp,[x29,#112]
+       sub     $Ktbl,$Ktbl,#`$SZ*($rounds+1)`          // rewind
+
+       ldp     @X[0],@X[1],[$ctx]
+       ldp     @X[2],@X[3],[$ctx,#2*$SZ]
+       add     $inp,$inp,#14*$SZ                       // advance input pointer
+       ldp     @X[4],@X[5],[$ctx,#4*$SZ]
+       add     $A,$A,@X[0]
+       ldp     @X[6],@X[7],[$ctx,#6*$SZ]
+       add     $B,$B,@X[1]
+       add     $C,$C,@X[2]
+       add     $D,$D,@X[3]
+       stp     $A,$B,[$ctx]
+       add     $E,$E,@X[4]
+       add     $F,$F,@X[5]
+       stp     $C,$D,[$ctx,#2*$SZ]
+       add     $G,$G,@X[6]
+       add     $H,$H,@X[7]
+       cmp     $inp,$num
+       stp     $E,$F,[$ctx,#4*$SZ]
+       stp     $G,$H,[$ctx,#6*$SZ]
+       b.ne    .Loop
+
+       ldp     x19,x20,[x29,#16]
+       add     sp,sp,#4*$SZ
+       ldp     x21,x22,[x29,#32]
+       ldp     x23,x24,[x29,#48]
+       ldp     x25,x26,[x29,#64]
+       ldp     x27,x28,[x29,#80]
+       ldp     x29,x30,[sp],#128
+       ret
+.size  $func,.-$func
+
+.align 6
+.type  .LK$BITS,%object
+.LK$BITS:
+___
+$code.=<<___ if ($SZ==8);
+       .quad   0x428a2f98d728ae22,0x7137449123ef65cd
+       .quad   0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+       .quad   0x3956c25bf348b538,0x59f111f1b605d019
+       .quad   0x923f82a4af194f9b,0xab1c5ed5da6d8118
+       .quad   0xd807aa98a3030242,0x12835b0145706fbe
+       .quad   0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+       .quad   0x72be5d74f27b896f,0x80deb1fe3b1696b1
+       .quad   0x9bdc06a725c71235,0xc19bf174cf692694
+       .quad   0xe49b69c19ef14ad2,0xefbe4786384f25e3
+       .quad   0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+       .quad   0x2de92c6f592b0275,0x4a7484aa6ea6e483
+       .quad   0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+       .quad   0x983e5152ee66dfab,0xa831c66d2db43210
+       .quad   0xb00327c898fb213f,0xbf597fc7beef0ee4
+       .quad   0xc6e00bf33da88fc2,0xd5a79147930aa725
+       .quad   0x06ca6351e003826f,0x142929670a0e6e70
+       .quad   0x27b70a8546d22ffc,0x2e1b21385c26c926
+       .quad   0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+       .quad   0x650a73548baf63de,0x766a0abb3c77b2a8
+       .quad   0x81c2c92e47edaee6,0x92722c851482353b
+       .quad   0xa2bfe8a14cf10364,0xa81a664bbc423001
+       .quad   0xc24b8b70d0f89791,0xc76c51a30654be30
+       .quad   0xd192e819d6ef5218,0xd69906245565a910
+       .quad   0xf40e35855771202a,0x106aa07032bbd1b8
+       .quad   0x19a4c116b8d2d0c8,0x1e376c085141ab53
+       .quad   0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+       .quad   0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+       .quad   0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+       .quad   0x748f82ee5defb2fc,0x78a5636f43172f60
+       .quad   0x84c87814a1f0ab72,0x8cc702081a6439ec
+       .quad   0x90befffa23631e28,0xa4506cebde82bde9
+       .quad   0xbef9a3f7b2c67915,0xc67178f2e372532b
+       .quad   0xca273eceea26619c,0xd186b8c721c0c207
+       .quad   0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+       .quad   0x06f067aa72176fba,0x0a637dc5a2c898a6
+       .quad   0x113f9804bef90dae,0x1b710b35131c471b
+       .quad   0x28db77f523047d84,0x32caab7b40c72493
+       .quad   0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+       .quad   0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+       .quad   0x5fcb6fab3ad6faec,0x6c44198c4a475817
+       .quad   0       // terminator
+___
+$code.=<<___ if ($SZ==4);
+       .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+       .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+       .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+       .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+       .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+       .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+       .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+       .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+       .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+       .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+       .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+       .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+       .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+       .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+       .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+       .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+       .long   0       //terminator
+___
+$code.=<<___;
+.size  .LK$BITS,.-.LK$BITS
+#ifndef        __KERNEL__
+.align 3
+.LOPENSSL_armcap_P:
+# ifdef        __ILP32__
+       .long   OPENSSL_armcap_P-.
+# else
+       .quad   OPENSSL_armcap_P-.
+# endif
+#endif
+.asciz "SHA$BITS block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
+.align 2
+___
+
+if ($SZ==4) {
+my $Ktbl="x3";
+
+my ($ABCD,$EFGH,$abcd)=map("v$_.16b",(0..2));
+my @MSG=map("v$_.16b",(4..7));
+my ($W0,$W1)=("v16.4s","v17.4s");
+my ($ABCD_SAVE,$EFGH_SAVE)=("v18.16b","v19.16b");
+
+$code.=<<___;
+#ifndef        __KERNEL__
+.type  sha256_block_armv8,%function
+.align 6
+sha256_block_armv8:
+.Lv8_entry:
+       stp             x29,x30,[sp,#-16]!
+       add             x29,sp,#0
+
+       ld1.32          {$ABCD,$EFGH},[$ctx]
+       adr             $Ktbl,.LK256
+
+.Loop_hw:
+       ld1             {@MSG[0]-@MSG[3]},[$inp],#64
+       sub             $num,$num,#1
+       ld1.32          {$W0},[$Ktbl],#16
+       rev32           @MSG[0],@MSG[0]
+       rev32           @MSG[1],@MSG[1]
+       rev32           @MSG[2],@MSG[2]
+       rev32           @MSG[3],@MSG[3]
+       orr             $ABCD_SAVE,$ABCD,$ABCD          // offload
+       orr             $EFGH_SAVE,$EFGH,$EFGH
+___
+for($i=0;$i<12;$i++) {
+$code.=<<___;
+       ld1.32          {$W1},[$Ktbl],#16
+       add.i32         $W0,$W0,@MSG[0]
+       sha256su0       @MSG[0],@MSG[1]
+       orr             $abcd,$ABCD,$ABCD
+       sha256h         $ABCD,$EFGH,$W0
+       sha256h2        $EFGH,$abcd,$W0
+       sha256su1       @MSG[0],@MSG[2],@MSG[3]
+___
+       ($W0,$W1)=($W1,$W0);    push(@MSG,shift(@MSG));
+}
+$code.=<<___;
+       ld1.32          {$W1},[$Ktbl],#16
+       add.i32         $W0,$W0,@MSG[0]
+       orr             $abcd,$ABCD,$ABCD
+       sha256h         $ABCD,$EFGH,$W0
+       sha256h2        $EFGH,$abcd,$W0
+
+       ld1.32          {$W0},[$Ktbl],#16
+       add.i32         $W1,$W1,@MSG[1]
+       orr             $abcd,$ABCD,$ABCD
+       sha256h         $ABCD,$EFGH,$W1
+       sha256h2        $EFGH,$abcd,$W1
+
+       ld1.32          {$W1},[$Ktbl]
+       add.i32         $W0,$W0,@MSG[2]
+       sub             $Ktbl,$Ktbl,#$rounds*$SZ-16     // rewind
+       orr             $abcd,$ABCD,$ABCD
+       sha256h         $ABCD,$EFGH,$W0
+       sha256h2        $EFGH,$abcd,$W0
+
+       add.i32         $W1,$W1,@MSG[3]
+       orr             $abcd,$ABCD,$ABCD
+       sha256h         $ABCD,$EFGH,$W1
+       sha256h2        $EFGH,$abcd,$W1
+
+       add.i32         $ABCD,$ABCD,$ABCD_SAVE
+       add.i32         $EFGH,$EFGH,$EFGH_SAVE
+
+       cbnz            $num,.Loop_hw
+
+       st1.32          {$ABCD,$EFGH},[$ctx]
+
+       ldr             x29,[sp],#16
+       ret
+.size  sha256_block_armv8,.-sha256_block_armv8
+#endif
+___
+}
+
+if ($SZ==4) {  ######################################### NEON stuff #
+# You'll surely note a lot of similarities with the sha256-armv4 module,
+# and of course that's no coincidence. sha256-armv4 was used as the
+# initial template, but it was adapted to the ARMv8 instruction set and
+# extensively re-tuned for all-round performance.
+
+my @V = ($A,$B,$C,$D,$E,$F,$G,$H) = map("w$_",(3..10));
+my ($t0,$t1,$t2,$t3,$t4) = map("w$_",(11..15));
+my $Ktbl="x16";
+my $Xfer="x17";
+my @X = map("q$_",(0..3));
+my ($T0,$T1,$T2,$T3,$T4,$T5,$T6,$T7) = map("q$_",(4..7,16..19));
+my $j=0;
+
+sub AUTOLOAD()          # thunk [simplified] x86-style perlasm
+{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
+  my $arg = pop;
+    $arg = "#$arg" if ($arg*1 eq $arg);
+    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
+}
+
+sub Dscalar { shift =~ m|[qv]([0-9]+)|?"d$1":""; }
+sub Dlo     { shift =~ m|[qv]([0-9]+)|?"v$1.d[0]":""; }
+sub Dhi     { shift =~ m|[qv]([0-9]+)|?"v$1.d[1]":""; }
+
+sub Xupdate()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);
+  my ($a,$b,$c,$d,$e,$f,$g,$h);
+
+       &ext_8          ($T0,@X[0],@X[1],4);    # X[1..4]
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &ext_8          ($T3,@X[2],@X[3],4);    # X[9..12]
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &mov            (&Dscalar($T7),&Dhi(@X[3]));    # X[14..15]
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &ushr_32        ($T2,$T0,$sigma0[0]);
+        eval(shift(@insns));
+       &ushr_32        ($T1,$T0,$sigma0[2]);
+        eval(shift(@insns));
+       &add_32         (@X[0],@X[0],$T3);      # X[0..3] += X[9..12]
+        eval(shift(@insns));
+       &sli_32         ($T2,$T0,32-$sigma0[0]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &ushr_32        ($T3,$T0,$sigma0[1]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &eor_8          ($T1,$T1,$T2);
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &sli_32         ($T3,$T0,32-$sigma0[1]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &ushr_32      ($T4,$T7,$sigma1[0]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &eor_8          ($T1,$T1,$T3);          # sigma0(X[1..4])
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &sli_32       ($T4,$T7,32-$sigma1[0]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &ushr_32      ($T5,$T7,$sigma1[2]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &ushr_32      ($T3,$T7,$sigma1[1]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &add_32         (@X[0],@X[0],$T1);      # X[0..3] += sigma0(X[1..4])
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &sli_u32      ($T3,$T7,32-$sigma1[1]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &eor_8        ($T5,$T5,$T4);
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &eor_8        ($T5,$T5,$T3);          # sigma1(X[14..15])
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &add_32         (@X[0],@X[0],$T5);      # X[0..1] += sigma1(X[14..15])
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &ushr_32      ($T6,@X[0],$sigma1[0]);
+        eval(shift(@insns));
+         &ushr_32      ($T7,@X[0],$sigma1[2]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &sli_32       ($T6,@X[0],32-$sigma1[0]);
+        eval(shift(@insns));
+         &ushr_32      ($T5,@X[0],$sigma1[1]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &eor_8        ($T7,$T7,$T6);
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &sli_32       ($T5,@X[0],32-$sigma1[1]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &ld1_32         ("{$T0}","[$Ktbl], #16");
+        eval(shift(@insns));
+         &eor_8        ($T7,$T7,$T5);          # sigma1(X[16..17])
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &eor_8          ($T5,$T5,$T5);
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &mov            (&Dhi($T5), &Dlo($T7));
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &add_32         (@X[0],@X[0],$T5);      # X[2..3] += sigma1(X[16..17])
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &add_32         ($T0,$T0,@X[0]);
+        while($#insns>=1) { eval(shift(@insns)); }
+       &st1_32         ("{$T0}","[$Xfer], #16");
+        eval(shift(@insns));
+
+       push(@X,shift(@X));             # "rotate" X[]
+}
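Since AArch64 Advanced SIMD has no vector rotate, Xupdate() builds each 32-bit rotate from a ushr/sli pair; the rest is the standard SHA-256 message-schedule recurrence with the (7,18,3)/(17,19,10) shift amounts set near the top of this script. A scalar C rendering of what one full pass over the 16-word window computes is sketched below (editor's reference only, not part of the generated code).

    #include <stdint.h>

    static inline uint32_t ror32(uint32_t x, unsigned int n)
    {
            return (x >> n) | (x << (32 - n));
    }

    static inline uint32_t sigma0(uint32_t x)
    {
            return ror32(x, 7) ^ ror32(x, 18) ^ (x >> 3);
    }

    static inline uint32_t sigma1(uint32_t x)
    {
            return ror32(x, 17) ^ ror32(x, 19) ^ (x >> 10);
    }

    /* Expand one 16-entry window of the SHA-256 message schedule in place:
     * W[t] = W[t-16] + sigma0(W[t-15]) + W[t-7] + sigma1(W[t-2])            */
    static void schedule_update(uint32_t W[16])
    {
            for (int t = 0; t < 16; t++)
                    W[t] += sigma0(W[(t + 1) & 15]) + W[(t + 9) & 15] +
                            sigma1(W[(t + 14) & 15]);
    }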
+
+sub Xpreload()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);
+  my ($a,$b,$c,$d,$e,$f,$g,$h);
+
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &ld1_8          ("{@X[0]}","[$inp],#16");
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &ld1_32         ("{$T0}","[$Ktbl],#16");
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &rev32          (@X[0],@X[0]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &add_32         ($T0,$T0,@X[0]);
+        foreach (@insns) { eval; }     # remaining instructions
+       &st1_32         ("{$T0}","[$Xfer], #16");
+
+       push(@X,shift(@X));             # "rotate" X[]
+}
+
+sub body_00_15 () {
+       (
+       '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
+       '&add   ($h,$h,$t1)',                   # h+=X[i]+K[i]
+       '&add   ($a,$a,$t4);'.                  # h+=Sigma0(a) from the past
+       '&and   ($t1,$f,$e)',
+       '&bic   ($t4,$g,$e)',
+       '&eor   ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
+       '&add   ($a,$a,$t2)',                   # h+=Maj(a,b,c) from the past
+       '&orr   ($t1,$t1,$t4)',                 # Ch(e,f,g)
+       '&eor   ($t0,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',   # Sigma1(e)
+       '&eor   ($t4,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
+       '&add   ($h,$h,$t1)',                   # h+=Ch(e,f,g)
+       '&ror   ($t0,$t0,"#$Sigma1[0]")',
+       '&eor   ($t2,$a,$b)',                   # a^b, b^c in next round
+       '&eor   ($t4,$t4,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',   # Sigma0(a)
+       '&add   ($h,$h,$t0)',                   # h+=Sigma1(e)
+       '&ldr   ($t1,sprintf "[sp,#%d]",4*(($j+1)&15))  if (($j&15)!=15);'.
+       '&ldr   ($t1,"[$Ktbl]")                         if ($j==15);'.
+       '&and   ($t3,$t3,$t2)',                 # (b^c)&=(a^b)
+       '&ror   ($t4,$t4,"#$Sigma0[0]")',
+       '&add   ($d,$d,$h)',                    # d+=h
+       '&eor   ($t3,$t3,$b)',                  # Maj(a,b,c)
+       '$j++;  unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
+       )
+}
+
+$code.=<<___;
+#ifdef __KERNEL__
+.globl sha256_block_neon
+#endif
+.type  sha256_block_neon,%function
+.align 4
+sha256_block_neon:
+.Lneon_entry:
+       stp     x29, x30, [sp, #-16]!
+       mov     x29, sp
+       sub     sp,sp,#16*4
+
+       adr     $Ktbl,.LK256
+       add     $num,$inp,$num,lsl#6    // len to point at the end of inp
+
+       ld1.8   {@X[0]},[$inp], #16
+       ld1.8   {@X[1]},[$inp], #16
+       ld1.8   {@X[2]},[$inp], #16
+       ld1.8   {@X[3]},[$inp], #16
+       ld1.32  {$T0},[$Ktbl], #16
+       ld1.32  {$T1},[$Ktbl], #16
+       ld1.32  {$T2},[$Ktbl], #16
+       ld1.32  {$T3},[$Ktbl], #16
+       rev32   @X[0],@X[0]             // yes, even on
+       rev32   @X[1],@X[1]             // big-endian
+       rev32   @X[2],@X[2]
+       rev32   @X[3],@X[3]
+       mov     $Xfer,sp
+       add.32  $T0,$T0,@X[0]
+       add.32  $T1,$T1,@X[1]
+       add.32  $T2,$T2,@X[2]
+       st1.32  {$T0-$T1},[$Xfer], #32
+       add.32  $T3,$T3,@X[3]
+       st1.32  {$T2-$T3},[$Xfer]
+       sub     $Xfer,$Xfer,#32
+
+       ldp     $A,$B,[$ctx]
+       ldp     $C,$D,[$ctx,#8]
+       ldp     $E,$F,[$ctx,#16]
+       ldp     $G,$H,[$ctx,#24]
+       ldr     $t1,[sp,#0]
+       mov     $t2,wzr
+       eor     $t3,$B,$C
+       mov     $t4,wzr
+       b       .L_00_48
+
+.align 4
+.L_00_48:
+___
+       &Xupdate(\&body_00_15);
+       &Xupdate(\&body_00_15);
+       &Xupdate(\&body_00_15);
+       &Xupdate(\&body_00_15);
+$code.=<<___;
+       cmp     $t1,#0                          // check for K256 terminator
+       ldr     $t1,[sp,#0]
+       sub     $Xfer,$Xfer,#64
+       bne     .L_00_48
+
+       sub     $Ktbl,$Ktbl,#256                // rewind $Ktbl
+       cmp     $inp,$num
+       mov     $Xfer, #64
+       csel    $Xfer, $Xfer, xzr, eq
+       sub     $inp,$inp,$Xfer                 // avoid SEGV
+       mov     $Xfer,sp
+___
+       &Xpreload(\&body_00_15);
+       &Xpreload(\&body_00_15);
+       &Xpreload(\&body_00_15);
+       &Xpreload(\&body_00_15);
+$code.=<<___;
+       add     $A,$A,$t4                       // h+=Sigma0(a) from the past
+       ldp     $t0,$t1,[$ctx,#0]
+       add     $A,$A,$t2                       // h+=Maj(a,b,c) from the past
+       ldp     $t2,$t3,[$ctx,#8]
+       add     $A,$A,$t0                       // accumulate
+       add     $B,$B,$t1
+       ldp     $t0,$t1,[$ctx,#16]
+       add     $C,$C,$t2
+       add     $D,$D,$t3
+       ldp     $t2,$t3,[$ctx,#24]
+       add     $E,$E,$t0
+       add     $F,$F,$t1
+        ldr    $t1,[sp,#0]
+       stp     $A,$B,[$ctx,#0]
+       add     $G,$G,$t2
+        mov    $t2,wzr
+       stp     $C,$D,[$ctx,#8]
+       add     $H,$H,$t3
+       stp     $E,$F,[$ctx,#16]
+        eor    $t3,$B,$C
+       stp     $G,$H,[$ctx,#24]
+        mov    $t4,wzr
+        mov    $Xfer,sp
+       b.ne    .L_00_48
+
+       ldr     x29,[x29]
+       add     sp,sp,#16*4+16
+       ret
+.size  sha256_block_neon,.-sha256_block_neon
+___
+}
+
+$code.=<<___;
+#ifndef        __KERNEL__
+.comm  OPENSSL_armcap_P,4,4
+#endif
+___
+
+{   my  %opcode = (
+       "sha256h"       => 0x5e004000,  "sha256h2"      => 0x5e005000,
+       "sha256su0"     => 0x5e282800,  "sha256su1"     => 0x5e006000   );
+
+    sub unsha256 {
+       my ($mnemonic,$arg)=@_;
+
+       $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o
+       &&
+       sprintf ".inst\t0x%08x\t//%s %s",
+                       $opcode{$mnemonic}|$1|($2<<5)|($3<<16),
+                       $mnemonic,$arg;
+    }
+}
+
+open SELF,$0;
+while(<SELF>) {
+        next if (/^#!/);
+        last if (!s/^#/\/\// and !/^$/);
+        print;
+}
+close SELF;
+
+foreach(split("\n",$code)) {
+
+       s/\`([^\`]*)\`/eval($1)/ge;
+
+       s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/ge;
+
+       s/\bq([0-9]+)\b/v$1.16b/g;              # old->new registers
+
+       s/\.[ui]?8(\s)/$1/;
+       s/\.\w?32\b//           and s/\.16b/\.4s/g;
+       m/(ld|st)1[^\[]+\[0\]/  and s/\.4s/\.s/g;
+
+       print $_,"\n";
+}
+
+close STDOUT;
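
For readers following the interleaved perlasm above: body_00_15() emits one scalar SHA-256 round, while Xupdate()/Xpreload() keep the NEON unit busy with the message schedule (the generated code defers each round's Sigma0/Maj additions into the next round, "from the past", purely for instruction scheduling). As a reading aid, here is a minimal C sketch of the arithmetic those fragments implement -- the standard FIPS 180-4 SHA-256 round and schedule step; it is an illustrative reference model, not code produced by this script:

#include <stdint.h>

static inline uint32_t ror32(uint32_t x, int n)
{
	return (x >> n) | (x << (32 - n));
}

/* One SHA-256 round; s[] holds the working variables a..h, k is K256[i]. */
static void sha256_round(uint32_t s[8], uint32_t w, uint32_t k)
{
	uint32_t a = s[0], b = s[1], c = s[2], d = s[3];
	uint32_t e = s[4], f = s[5], g = s[6], h = s[7];

	uint32_t S1  = ror32(e, 6) ^ ror32(e, 11) ^ ror32(e, 25);	/* Sigma1(e) */
	uint32_t ch  = (e & f) ^ (~e & g);				/* Ch(e,f,g) */
	uint32_t t1  = h + S1 + ch + k + w;
	uint32_t S0  = ror32(a, 2) ^ ror32(a, 13) ^ ror32(a, 22);	/* Sigma0(a) */
	uint32_t maj = (a & b) ^ (a & c) ^ (b & c);			/* Maj(a,b,c) */
	uint32_t t2  = S0 + maj;

	s[7] = g; s[6] = f; s[5] = e; s[4] = d + t1;
	s[3] = c; s[2] = b; s[1] = a; s[0] = t1 + t2;
}

/* Schedule step for rounds 16..63: W[i] over a rolling 16-word window w[]. */
static uint32_t sha256_schedule(const uint32_t w[16], int i)
{
	uint32_t w15 = w[(i - 15) & 15], w2 = w[(i - 2) & 15];
	uint32_t s0 = ror32(w15, 7) ^ ror32(w15, 18) ^ (w15 >> 3);	/* sigma0 */
	uint32_t s1 = ror32(w2, 17) ^ ror32(w2, 19) ^ (w2 >> 10);	/* sigma1 */

	return w[(i - 16) & 15] + s0 + w[(i - 7) & 15] + s1;
}

The schedule step is what Xupdate() computes four lanes at a time; since generic NEON has no vector rotate, each rotation is built from an unsigned shift right (ushr) combined with a shift-left-and-insert (sli) by the complementary amount.
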
diff --git a/arch/arm64/crypto/sha512-core.S_shipped b/arch/arm64/crypto/sha512-core.S_shipped
new file mode 100644 (file)
index 0000000..bd0f59f
--- /dev/null
@@ -0,0 +1,1085 @@
+// Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
+//
+// Licensed under the OpenSSL license (the "License").  You may not use
+// this file except in compliance with the License.  You can obtain a copy
+// in the file LICENSE in the source distribution or at
+// https://www.openssl.org/source/license.html
+
+// ====================================================================
+// Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+// project. The module is, however, dual licensed under OpenSSL and
+// CRYPTOGAMS licenses depending on where you obtain it. For further
+// details see http://www.openssl.org/~appro/cryptogams/.
+//
+// Permission to use under GPLv2 terms is granted.
+// ====================================================================
+//
+// SHA256/512 for ARMv8.
+//
+// Performance in cycles per processed byte and improvement coefficient
+// over code generated with "default" compiler:
+//
+//             SHA256-hw       SHA256(*)       SHA512
+// Apple A7    1.97            10.5 (+33%)     6.73 (-1%(**))
+// Cortex-A53  2.38            15.5 (+115%)    10.0 (+150%(***))
+// Cortex-A57  2.31            11.6 (+86%)     7.51 (+260%(***))
+// Denver      2.01            10.5 (+26%)     6.70 (+8%)
+// X-Gene                      20.0 (+100%)    12.8 (+300%(***))
+// Mongoose    2.36            13.0 (+50%)     8.36 (+33%)
+//
+// (*) Software SHA256 results are of lesser relevance, presented
+//     mostly for informational purposes.
+// (**)        The result is a trade-off: it's possible to improve it by
+//     10% (or by 1 cycle per round), but at the cost of 20% loss
+//     on Cortex-A53 (or by 4 cycles per round).
+// (***)	Super-impressive coefficients over gcc-generated code are
+//	an indication of some compiler "pathology", most notably code
+//	generated with -mgeneral-regs-only is significantly faster
+//	and the gap is only 40-90%.
+//
+// October 2016.
+//
+// Originally it was reckoned that it made no sense to implement a NEON
+// version of SHA256 for 64-bit processors, because the performance
+// improvement on the most widespread Cortex-A5x processors was observed
+// to be marginal: none on Cortex-A53 and only ~10% on A57. But it was
+// then observed that 32-bit NEON SHA256 performs significantly better
+// than the 64-bit scalar version on *some* of the more recent processors.
+// As a result, a 64-bit NEON version of SHA256 was added to provide the
+// best all-round performance. For example, it executes ~30% faster on
+// X-Gene and Mongoose. [For reference, a NEON version of SHA512 is bound
+// to deliver much less improvement, likely *negative* on Cortex-A5x,
+// which is why NEON support is limited to SHA256.]
+
+#ifndef        __KERNEL__
+# include "arm_arch.h"
+#endif
+
+.text
+
+.extern        OPENSSL_armcap_P
+.globl sha512_block_data_order
+.type  sha512_block_data_order,%function
+.align 6
+sha512_block_data_order:
+       stp     x29,x30,[sp,#-128]!
+       add     x29,sp,#0
+
+       stp     x19,x20,[sp,#16]
+       stp     x21,x22,[sp,#32]
+       stp     x23,x24,[sp,#48]
+       stp     x25,x26,[sp,#64]
+       stp     x27,x28,[sp,#80]
+       sub     sp,sp,#4*8
+
+       ldp     x20,x21,[x0]                            // load context
+       ldp     x22,x23,[x0,#2*8]
+       ldp     x24,x25,[x0,#4*8]
+       add     x2,x1,x2,lsl#7  // end of input
+       ldp     x26,x27,[x0,#6*8]
+       adr     x30,.LK512
+       stp     x0,x2,[x29,#96]
+
+.Loop:
+       ldp     x3,x4,[x1],#2*8
+       ldr     x19,[x30],#8                    // *K++
+       eor     x28,x21,x22                             // magic seed
+       str     x1,[x29,#112]
+#ifndef        __AARCH64EB__
+       rev     x3,x3                   // 0
+#endif
+       ror     x16,x24,#14
+       add     x27,x27,x19                     // h+=K[i]
+       eor     x6,x24,x24,ror#23
+       and     x17,x25,x24
+       bic     x19,x26,x24
+       add     x27,x27,x3                      // h+=X[i]
+       orr     x17,x17,x19                     // Ch(e,f,g)
+       eor     x19,x20,x21                     // a^b, b^c in next round
+       eor     x16,x16,x6,ror#18       // Sigma1(e)
+       ror     x6,x20,#28
+       add     x27,x27,x17                     // h+=Ch(e,f,g)
+       eor     x17,x20,x20,ror#5
+       add     x27,x27,x16                     // h+=Sigma1(e)
+       and     x28,x28,x19                     // (b^c)&=(a^b)
+       add     x23,x23,x27                     // d+=h
+       eor     x28,x28,x21                     // Maj(a,b,c)
+       eor     x17,x6,x17,ror#34       // Sigma0(a)
+       add     x27,x27,x28                     // h+=Maj(a,b,c)
+       ldr     x28,[x30],#8            // *K++, x19 in next round
+       //add   x27,x27,x17                     // h+=Sigma0(a)
+#ifndef        __AARCH64EB__
+       rev     x4,x4                   // 1
+#endif
+       ldp     x5,x6,[x1],#2*8
+       add     x27,x27,x17                     // h+=Sigma0(a)
+       ror     x16,x23,#14
+       add     x26,x26,x28                     // h+=K[i]
+       eor     x7,x23,x23,ror#23
+       and     x17,x24,x23
+       bic     x28,x25,x23
+       add     x26,x26,x4                      // h+=X[i]
+       orr     x17,x17,x28                     // Ch(e,f,g)
+       eor     x28,x27,x20                     // a^b, b^c in next round
+       eor     x16,x16,x7,ror#18       // Sigma1(e)
+       ror     x7,x27,#28
+       add     x26,x26,x17                     // h+=Ch(e,f,g)
+       eor     x17,x27,x27,ror#5
+       add     x26,x26,x16                     // h+=Sigma1(e)
+       and     x19,x19,x28                     // (b^c)&=(a^b)
+       add     x22,x22,x26                     // d+=h
+       eor     x19,x19,x20                     // Maj(a,b,c)
+       eor     x17,x7,x17,ror#34       // Sigma0(a)
+       add     x26,x26,x19                     // h+=Maj(a,b,c)
+       ldr     x19,[x30],#8            // *K++, x28 in next round
+       //add   x26,x26,x17                     // h+=Sigma0(a)
+#ifndef        __AARCH64EB__
+       rev     x5,x5                   // 2
+#endif
+       add     x26,x26,x17                     // h+=Sigma0(a)
+       ror     x16,x22,#14
+       add     x25,x25,x19                     // h+=K[i]
+       eor     x8,x22,x22,ror#23
+       and     x17,x23,x22
+       bic     x19,x24,x22
+       add     x25,x25,x5                      // h+=X[i]
+       orr     x17,x17,x19                     // Ch(e,f,g)
+       eor     x19,x26,x27                     // a^b, b^c in next round
+       eor     x16,x16,x8,ror#18       // Sigma1(e)
+       ror     x8,x26,#28
+       add     x25,x25,x17                     // h+=Ch(e,f,g)
+       eor     x17,x26,x26,ror#5
+       add     x25,x25,x16                     // h+=Sigma1(e)
+       and     x28,x28,x19                     // (b^c)&=(a^b)
+       add     x21,x21,x25                     // d+=h
+       eor     x28,x28,x27                     // Maj(a,b,c)
+       eor     x17,x8,x17,ror#34       // Sigma0(a)
+       add     x25,x25,x28                     // h+=Maj(a,b,c)
+       ldr     x28,[x30],#8            // *K++, x19 in next round
+       //add   x25,x25,x17                     // h+=Sigma0(a)
+#ifndef        __AARCH64EB__
+       rev     x6,x6                   // 3
+#endif
+       ldp     x7,x8,[x1],#2*8
+       add     x25,x25,x17                     // h+=Sigma0(a)
+       ror     x16,x21,#14
+       add     x24,x24,x28                     // h+=K[i]
+       eor     x9,x21,x21,ror#23
+       and     x17,x22,x21
+       bic     x28,x23,x21
+       add     x24,x24,x6                      // h+=X[i]
+       orr     x17,x17,x28                     // Ch(e,f,g)
+       eor     x28,x25,x26                     // a^b, b^c in next round
+       eor     x16,x16,x9,ror#18       // Sigma1(e)
+       ror     x9,x25,#28
+       add     x24,x24,x17                     // h+=Ch(e,f,g)
+       eor     x17,x25,x25,ror#5
+       add     x24,x24,x16                     // h+=Sigma1(e)
+       and     x19,x19,x28                     // (b^c)&=(a^b)
+       add     x20,x20,x24                     // d+=h
+       eor     x19,x19,x26                     // Maj(a,b,c)
+       eor     x17,x9,x17,ror#34       // Sigma0(a)
+       add     x24,x24,x19                     // h+=Maj(a,b,c)
+       ldr     x19,[x30],#8            // *K++, x28 in next round
+       //add   x24,x24,x17                     // h+=Sigma0(a)
+#ifndef        __AARCH64EB__
+       rev     x7,x7                   // 4
+#endif
+       add     x24,x24,x17                     // h+=Sigma0(a)
+       ror     x16,x20,#14
+       add     x23,x23,x19                     // h+=K[i]
+       eor     x10,x20,x20,ror#23
+       and     x17,x21,x20
+       bic     x19,x22,x20
+       add     x23,x23,x7                      // h+=X[i]
+       orr     x17,x17,x19                     // Ch(e,f,g)
+       eor     x19,x24,x25                     // a^b, b^c in next round
+       eor     x16,x16,x10,ror#18      // Sigma1(e)
+       ror     x10,x24,#28
+       add     x23,x23,x17                     // h+=Ch(e,f,g)
+       eor     x17,x24,x24,ror#5
+       add     x23,x23,x16                     // h+=Sigma1(e)
+       and     x28,x28,x19                     // (b^c)&=(a^b)
+       add     x27,x27,x23                     // d+=h
+       eor     x28,x28,x25                     // Maj(a,b,c)
+       eor     x17,x10,x17,ror#34      // Sigma0(a)
+       add     x23,x23,x28                     // h+=Maj(a,b,c)
+       ldr     x28,[x30],#8            // *K++, x19 in next round
+       //add   x23,x23,x17                     // h+=Sigma0(a)
+#ifndef        __AARCH64EB__
+       rev     x8,x8                   // 5
+#endif
+       ldp     x9,x10,[x1],#2*8
+       add     x23,x23,x17                     // h+=Sigma0(a)
+       ror     x16,x27,#14
+       add     x22,x22,x28                     // h+=K[i]
+       eor     x11,x27,x27,ror#23
+       and     x17,x20,x27
+       bic     x28,x21,x27
+       add     x22,x22,x8                      // h+=X[i]
+       orr     x17,x17,x28                     // Ch(e,f,g)
+       eor     x28,x23,x24                     // a^b, b^c in next round
+       eor     x16,x16,x11,ror#18      // Sigma1(e)
+       ror     x11,x23,#28
+       add     x22,x22,x17                     // h+=Ch(e,f,g)
+       eor     x17,x23,x23,ror#5
+       add     x22,x22,x16                     // h+=Sigma1(e)
+       and     x19,x19,x28                     // (b^c)&=(a^b)
+       add     x26,x26,x22                     // d+=h
+       eor     x19,x19,x24                     // Maj(a,b,c)
+       eor     x17,x11,x17,ror#34      // Sigma0(a)
+       add     x22,x22,x19                     // h+=Maj(a,b,c)
+       ldr     x19,[x30],#8            // *K++, x28 in next round
+       //add   x22,x22,x17                     // h+=Sigma0(a)
+#ifndef        __AARCH64EB__
+       rev     x9,x9                   // 6
+#endif
+       add     x22,x22,x17                     // h+=Sigma0(a)
+       ror     x16,x26,#14
+       add     x21,x21,x19                     // h+=K[i]
+       eor     x12,x26,x26,ror#23
+       and     x17,x27,x26
+       bic     x19,x20,x26
+       add     x21,x21,x9                      // h+=X[i]
+       orr     x17,x17,x19                     // Ch(e,f,g)
+       eor     x19,x22,x23                     // a^b, b^c in next round
+       eor     x16,x16,x12,ror#18      // Sigma1(e)
+       ror     x12,x22,#28
+       add     x21,x21,x17                     // h+=Ch(e,f,g)
+       eor     x17,x22,x22,ror#5
+       add     x21,x21,x16                     // h+=Sigma1(e)
+       and     x28,x28,x19                     // (b^c)&=(a^b)
+       add     x25,x25,x21                     // d+=h
+       eor     x28,x28,x23                     // Maj(a,b,c)
+       eor     x17,x12,x17,ror#34      // Sigma0(a)
+       add     x21,x21,x28                     // h+=Maj(a,b,c)
+       ldr     x28,[x30],#8            // *K++, x19 in next round
+       //add   x21,x21,x17                     // h+=Sigma0(a)
+#ifndef        __AARCH64EB__
+       rev     x10,x10                 // 7
+#endif
+       ldp     x11,x12,[x1],#2*8
+       add     x21,x21,x17                     // h+=Sigma0(a)
+       ror     x16,x25,#14
+       add     x20,x20,x28                     // h+=K[i]
+       eor     x13,x25,x25,ror#23
+       and     x17,x26,x25
+       bic     x28,x27,x25
+       add     x20,x20,x10                     // h+=X[i]
+       orr     x17,x17,x28                     // Ch(e,f,g)
+       eor     x28,x21,x22                     // a^b, b^c in next round
+       eor     x16,x16,x13,ror#18      // Sigma1(e)
+       ror     x13,x21,#28
+       add     x20,x20,x17                     // h+=Ch(e,f,g)
+       eor     x17,x21,x21,ror#5
+       add     x20,x20,x16                     // h+=Sigma1(e)
+       and     x19,x19,x28                     // (b^c)&=(a^b)
+       add     x24,x24,x20                     // d+=h
+       eor     x19,x19,x22                     // Maj(a,b,c)
+       eor     x17,x13,x17,ror#34      // Sigma0(a)
+       add     x20,x20,x19                     // h+=Maj(a,b,c)
+       ldr     x19,[x30],#8            // *K++, x28 in next round
+       //add   x20,x20,x17                     // h+=Sigma0(a)
+#ifndef        __AARCH64EB__
+       rev     x11,x11                 // 8
+#endif
+       add     x20,x20,x17                     // h+=Sigma0(a)
+       ror     x16,x24,#14
+       add     x27,x27,x19                     // h+=K[i]
+       eor     x14,x24,x24,ror#23
+       and     x17,x25,x24
+       bic     x19,x26,x24
+       add     x27,x27,x11                     // h+=X[i]
+       orr     x17,x17,x19                     // Ch(e,f,g)
+       eor     x19,x20,x21                     // a^b, b^c in next round
+       eor     x16,x16,x14,ror#18      // Sigma1(e)
+       ror     x14,x20,#28
+       add     x27,x27,x17                     // h+=Ch(e,f,g)
+       eor     x17,x20,x20,ror#5
+       add     x27,x27,x16                     // h+=Sigma1(e)
+       and     x28,x28,x19                     // (b^c)&=(a^b)
+       add     x23,x23,x27                     // d+=h
+       eor     x28,x28,x21                     // Maj(a,b,c)
+       eor     x17,x14,x17,ror#34      // Sigma0(a)
+       add     x27,x27,x28                     // h+=Maj(a,b,c)
+       ldr     x28,[x30],#8            // *K++, x19 in next round
+       //add   x27,x27,x17                     // h+=Sigma0(a)
+#ifndef        __AARCH64EB__
+       rev     x12,x12                 // 9
+#endif
+       ldp     x13,x14,[x1],#2*8
+       add     x27,x27,x17                     // h+=Sigma0(a)
+       ror     x16,x23,#14
+       add     x26,x26,x28                     // h+=K[i]
+       eor     x15,x23,x23,ror#23
+       and     x17,x24,x23
+       bic     x28,x25,x23
+       add     x26,x26,x12                     // h+=X[i]
+       orr     x17,x17,x28                     // Ch(e,f,g)
+       eor     x28,x27,x20                     // a^b, b^c in next round
+       eor     x16,x16,x15,ror#18      // Sigma1(e)
+       ror     x15,x27,#28
+       add     x26,x26,x17                     // h+=Ch(e,f,g)
+       eor     x17,x27,x27,ror#5
+       add     x26,x26,x16                     // h+=Sigma1(e)
+       and     x19,x19,x28                     // (b^c)&=(a^b)
+       add     x22,x22,x26                     // d+=h
+       eor     x19,x19,x20                     // Maj(a,b,c)
+       eor     x17,x15,x17,ror#34      // Sigma0(a)
+       add     x26,x26,x19                     // h+=Maj(a,b,c)
+       ldr     x19,[x30],#8            // *K++, x28 in next round
+       //add   x26,x26,x17                     // h+=Sigma0(a)
+#ifndef        __AARCH64EB__
+       rev     x13,x13                 // 10
+#endif
+       add     x26,x26,x17                     // h+=Sigma0(a)
+       ror     x16,x22,#14
+       add     x25,x25,x19                     // h+=K[i]
+       eor     x0,x22,x22,ror#23
+       and     x17,x23,x22
+       bic     x19,x24,x22
+       add     x25,x25,x13                     // h+=X[i]
+       orr     x17,x17,x19                     // Ch(e,f,g)
+       eor     x19,x26,x27                     // a^b, b^c in next round
+       eor     x16,x16,x0,ror#18       // Sigma1(e)
+       ror     x0,x26,#28
+       add     x25,x25,x17                     // h+=Ch(e,f,g)
+       eor     x17,x26,x26,ror#5
+       add     x25,x25,x16                     // h+=Sigma1(e)
+       and     x28,x28,x19                     // (b^c)&=(a^b)
+       add     x21,x21,x25                     // d+=h
+       eor     x28,x28,x27                     // Maj(a,b,c)
+       eor     x17,x0,x17,ror#34       // Sigma0(a)
+       add     x25,x25,x28                     // h+=Maj(a,b,c)
+       ldr     x28,[x30],#8            // *K++, x19 in next round
+       //add   x25,x25,x17                     // h+=Sigma0(a)
+#ifndef        __AARCH64EB__
+       rev     x14,x14                 // 11
+#endif
+       ldp     x15,x0,[x1],#2*8
+       add     x25,x25,x17                     // h+=Sigma0(a)
+       str     x6,[sp,#24]
+       ror     x16,x21,#14
+       add     x24,x24,x28                     // h+=K[i]
+       eor     x6,x21,x21,ror#23
+       and     x17,x22,x21
+       bic     x28,x23,x21
+       add     x24,x24,x14                     // h+=X[i]
+       orr     x17,x17,x28                     // Ch(e,f,g)
+       eor     x28,x25,x26                     // a^b, b^c in next round
+       eor     x16,x16,x6,ror#18       // Sigma1(e)
+       ror     x6,x25,#28
+       add     x24,x24,x17                     // h+=Ch(e,f,g)
+       eor     x17,x25,x25,ror#5
+       add     x24,x24,x16                     // h+=Sigma1(e)
+       and     x19,x19,x28                     // (b^c)&=(a^b)
+       add     x20,x20,x24                     // d+=h
+       eor     x19,x19,x26                     // Maj(a,b,c)
+       eor     x17,x6,x17,ror#34       // Sigma0(a)
+       add     x24,x24,x19                     // h+=Maj(a,b,c)
+       ldr     x19,[x30],#8            // *K++, x28 in next round
+       //add   x24,x24,x17                     // h+=Sigma0(a)
+#ifndef        __AARCH64EB__
+       rev     x15,x15                 // 12
+#endif
+       add     x24,x24,x17                     // h+=Sigma0(a)
+       str     x7,[sp,#0]
+       ror     x16,x20,#14
+       add     x23,x23,x19                     // h+=K[i]
+       eor     x7,x20,x20,ror#23
+       and     x17,x21,x20
+       bic     x19,x22,x20
+       add     x23,x23,x15                     // h+=X[i]
+       orr     x17,x17,x19                     // Ch(e,f,g)
+       eor     x19,x24,x25                     // a^b, b^c in next round
+       eor     x16,x16,x7,ror#18       // Sigma1(e)
+       ror     x7,x24,#28
+       add     x23,x23,x17                     // h+=Ch(e,f,g)
+       eor     x17,x24,x24,ror#5
+       add     x23,x23,x16                     // h+=Sigma1(e)
+       and     x28,x28,x19                     // (b^c)&=(a^b)
+       add     x27,x27,x23                     // d+=h
+       eor     x28,x28,x25                     // Maj(a,b,c)
+       eor     x17,x7,x17,ror#34       // Sigma0(a)
+       add     x23,x23,x28                     // h+=Maj(a,b,c)
+       ldr     x28,[x30],#8            // *K++, x19 in next round
+       //add   x23,x23,x17                     // h+=Sigma0(a)
+#ifndef        __AARCH64EB__
+       rev     x0,x0                   // 13
+#endif
+       ldp     x1,x2,[x1]
+       add     x23,x23,x17                     // h+=Sigma0(a)
+       str     x8,[sp,#8]
+       ror     x16,x27,#14
+       add     x22,x22,x28                     // h+=K[i]
+       eor     x8,x27,x27,ror#23
+       and     x17,x20,x27
+       bic     x28,x21,x27
+       add     x22,x22,x0                      // h+=X[i]
+       orr     x17,x17,x28                     // Ch(e,f,g)
+       eor     x28,x23,x24                     // a^b, b^c in next round
+       eor     x16,x16,x8,ror#18       // Sigma1(e)
+       ror     x8,x23,#28
+       add     x22,x22,x17                     // h+=Ch(e,f,g)
+       eor     x17,x23,x23,ror#5
+       add     x22,x22,x16                     // h+=Sigma1(e)
+       and     x19,x19,x28                     // (b^c)&=(a^b)
+       add     x26,x26,x22                     // d+=h
+       eor     x19,x19,x24                     // Maj(a,b,c)
+       eor     x17,x8,x17,ror#34       // Sigma0(a)
+       add     x22,x22,x19                     // h+=Maj(a,b,c)
+       ldr     x19,[x30],#8            // *K++, x28 in next round
+       //add   x22,x22,x17                     // h+=Sigma0(a)
+#ifndef        __AARCH64EB__
+       rev     x1,x1                   // 14
+#endif
+       ldr     x6,[sp,#24]
+       add     x22,x22,x17                     // h+=Sigma0(a)
+       str     x9,[sp,#16]
+       ror     x16,x26,#14
+       add     x21,x21,x19                     // h+=K[i]
+       eor     x9,x26,x26,ror#23
+       and     x17,x27,x26
+       bic     x19,x20,x26
+       add     x21,x21,x1                      // h+=X[i]
+       orr     x17,x17,x19                     // Ch(e,f,g)
+       eor     x19,x22,x23                     // a^b, b^c in next round
+       eor     x16,x16,x9,ror#18       // Sigma1(e)
+       ror     x9,x22,#28
+       add     x21,x21,x17                     // h+=Ch(e,f,g)
+       eor     x17,x22,x22,ror#5
+       add     x21,x21,x16                     // h+=Sigma1(e)
+       and     x28,x28,x19                     // (b^c)&=(a^b)
+       add     x25,x25,x21                     // d+=h
+       eor     x28,x28,x23                     // Maj(a,b,c)
+       eor     x17,x9,x17,ror#34       // Sigma0(a)
+       add     x21,x21,x28                     // h+=Maj(a,b,c)
+       ldr     x28,[x30],#8            // *K++, x19 in next round
+       //add   x21,x21,x17                     // h+=Sigma0(a)
+#ifndef        __AARCH64EB__
+       rev     x2,x2                   // 15
+#endif
+       ldr     x7,[sp,#0]
+       add     x21,x21,x17                     // h+=Sigma0(a)
+       str     x10,[sp,#24]
+       ror     x16,x25,#14
+       add     x20,x20,x28                     // h+=K[i]
+       ror     x9,x4,#1
+       and     x17,x26,x25
+       ror     x8,x1,#19
+       bic     x28,x27,x25
+       ror     x10,x21,#28
+       add     x20,x20,x2                      // h+=X[i]
+       eor     x16,x16,x25,ror#18
+       eor     x9,x9,x4,ror#8
+       orr     x17,x17,x28                     // Ch(e,f,g)
+       eor     x28,x21,x22                     // a^b, b^c in next round
+       eor     x16,x16,x25,ror#41      // Sigma1(e)
+       eor     x10,x10,x21,ror#34
+       add     x20,x20,x17                     // h+=Ch(e,f,g)
+       and     x19,x19,x28                     // (b^c)&=(a^b)
+       eor     x8,x8,x1,ror#61
+       eor     x9,x9,x4,lsr#7  // sigma0(X[i+1])
+       add     x20,x20,x16                     // h+=Sigma1(e)
+       eor     x19,x19,x22                     // Maj(a,b,c)
+       eor     x17,x10,x21,ror#39      // Sigma0(a)
+       eor     x8,x8,x1,lsr#6  // sigma1(X[i+14])
+       add     x3,x3,x12
+       add     x24,x24,x20                     // d+=h
+       add     x20,x20,x19                     // h+=Maj(a,b,c)
+       ldr     x19,[x30],#8            // *K++, x28 in next round
+       add     x3,x3,x9
+       add     x20,x20,x17                     // h+=Sigma0(a)
+       add     x3,x3,x8
+.Loop_16_xx:
+       ldr     x8,[sp,#8]
+       str     x11,[sp,#0]
+       ror     x16,x24,#14
+       add     x27,x27,x19                     // h+=K[i]
+       ror     x10,x5,#1
+       and     x17,x25,x24
+       ror     x9,x2,#19
+       bic     x19,x26,x24
+       ror     x11,x20,#28
+       add     x27,x27,x3                      // h+=X[i]
+       eor     x16,x16,x24,ror#18
+       eor     x10,x10,x5,ror#8
+       orr     x17,x17,x19                     // Ch(e,f,g)
+       eor     x19,x20,x21                     // a^b, b^c in next round
+       eor     x16,x16,x24,ror#41      // Sigma1(e)
+       eor     x11,x11,x20,ror#34
+       add     x27,x27,x17                     // h+=Ch(e,f,g)
+       and     x28,x28,x19                     // (b^c)&=(a^b)
+       eor     x9,x9,x2,ror#61
+       eor     x10,x10,x5,lsr#7        // sigma0(X[i+1])
+       add     x27,x27,x16                     // h+=Sigma1(e)
+       eor     x28,x28,x21                     // Maj(a,b,c)
+       eor     x17,x11,x20,ror#39      // Sigma0(a)
+       eor     x9,x9,x2,lsr#6  // sigma1(X[i+14])
+       add     x4,x4,x13
+       add     x23,x23,x27                     // d+=h
+       add     x27,x27,x28                     // h+=Maj(a,b,c)
+       ldr     x28,[x30],#8            // *K++, x19 in next round
+       add     x4,x4,x10
+       add     x27,x27,x17                     // h+=Sigma0(a)
+       add     x4,x4,x9
+       ldr     x9,[sp,#16]
+       str     x12,[sp,#8]
+       ror     x16,x23,#14
+       add     x26,x26,x28                     // h+=K[i]
+       ror     x11,x6,#1
+       and     x17,x24,x23
+       ror     x10,x3,#19
+       bic     x28,x25,x23
+       ror     x12,x27,#28
+       add     x26,x26,x4                      // h+=X[i]
+       eor     x16,x16,x23,ror#18
+       eor     x11,x11,x6,ror#8
+       orr     x17,x17,x28                     // Ch(e,f,g)
+       eor     x28,x27,x20                     // a^b, b^c in next round
+       eor     x16,x16,x23,ror#41      // Sigma1(e)
+       eor     x12,x12,x27,ror#34
+       add     x26,x26,x17                     // h+=Ch(e,f,g)
+       and     x19,x19,x28                     // (b^c)&=(a^b)
+       eor     x10,x10,x3,ror#61
+       eor     x11,x11,x6,lsr#7        // sigma0(X[i+1])
+       add     x26,x26,x16                     // h+=Sigma1(e)
+       eor     x19,x19,x20                     // Maj(a,b,c)
+       eor     x17,x12,x27,ror#39      // Sigma0(a)
+       eor     x10,x10,x3,lsr#6        // sigma1(X[i+14])
+       add     x5,x5,x14
+       add     x22,x22,x26                     // d+=h
+       add     x26,x26,x19                     // h+=Maj(a,b,c)
+       ldr     x19,[x30],#8            // *K++, x28 in next round
+       add     x5,x5,x11
+       add     x26,x26,x17                     // h+=Sigma0(a)
+       add     x5,x5,x10
+       ldr     x10,[sp,#24]
+       str     x13,[sp,#16]
+       ror     x16,x22,#14
+       add     x25,x25,x19                     // h+=K[i]
+       ror     x12,x7,#1
+       and     x17,x23,x22
+       ror     x11,x4,#19
+       bic     x19,x24,x22
+       ror     x13,x26,#28
+       add     x25,x25,x5                      // h+=X[i]
+       eor     x16,x16,x22,ror#18
+       eor     x12,x12,x7,ror#8
+       orr     x17,x17,x19                     // Ch(e,f,g)
+       eor     x19,x26,x27                     // a^b, b^c in next round
+       eor     x16,x16,x22,ror#41      // Sigma1(e)
+       eor     x13,x13,x26,ror#34
+       add     x25,x25,x17                     // h+=Ch(e,f,g)
+       and     x28,x28,x19                     // (b^c)&=(a^b)
+       eor     x11,x11,x4,ror#61
+       eor     x12,x12,x7,lsr#7        // sigma0(X[i+1])
+       add     x25,x25,x16                     // h+=Sigma1(e)
+       eor     x28,x28,x27                     // Maj(a,b,c)
+       eor     x17,x13,x26,ror#39      // Sigma0(a)
+       eor     x11,x11,x4,lsr#6        // sigma1(X[i+14])
+       add     x6,x6,x15
+       add     x21,x21,x25                     // d+=h
+       add     x25,x25,x28                     // h+=Maj(a,b,c)
+       ldr     x28,[x30],#8            // *K++, x19 in next round
+       add     x6,x6,x12
+       add     x25,x25,x17                     // h+=Sigma0(a)
+       add     x6,x6,x11
+       ldr     x11,[sp,#0]
+       str     x14,[sp,#24]
+       ror     x16,x21,#14
+       add     x24,x24,x28                     // h+=K[i]
+       ror     x13,x8,#1
+       and     x17,x22,x21
+       ror     x12,x5,#19
+       bic     x28,x23,x21
+       ror     x14,x25,#28
+       add     x24,x24,x6                      // h+=X[i]
+       eor     x16,x16,x21,ror#18
+       eor     x13,x13,x8,ror#8
+       orr     x17,x17,x28                     // Ch(e,f,g)
+       eor     x28,x25,x26                     // a^b, b^c in next round
+       eor     x16,x16,x21,ror#41      // Sigma1(e)
+       eor     x14,x14,x25,ror#34
+       add     x24,x24,x17                     // h+=Ch(e,f,g)
+       and     x19,x19,x28                     // (b^c)&=(a^b)
+       eor     x12,x12,x5,ror#61
+       eor     x13,x13,x8,lsr#7        // sigma0(X[i+1])
+       add     x24,x24,x16                     // h+=Sigma1(e)
+       eor     x19,x19,x26                     // Maj(a,b,c)
+       eor     x17,x14,x25,ror#39      // Sigma0(a)
+       eor     x12,x12,x5,lsr#6        // sigma1(X[i+14])
+       add     x7,x7,x0
+       add     x20,x20,x24                     // d+=h
+       add     x24,x24,x19                     // h+=Maj(a,b,c)
+       ldr     x19,[x30],#8            // *K++, x28 in next round
+       add     x7,x7,x13
+       add     x24,x24,x17                     // h+=Sigma0(a)
+       add     x7,x7,x12
+       ldr     x12,[sp,#8]
+       str     x15,[sp,#0]
+       ror     x16,x20,#14
+       add     x23,x23,x19                     // h+=K[i]
+       ror     x14,x9,#1
+       and     x17,x21,x20
+       ror     x13,x6,#19
+       bic     x19,x22,x20
+       ror     x15,x24,#28
+       add     x23,x23,x7                      // h+=X[i]
+       eor     x16,x16,x20,ror#18
+       eor     x14,x14,x9,ror#8
+       orr     x17,x17,x19                     // Ch(e,f,g)
+       eor     x19,x24,x25                     // a^b, b^c in next round
+       eor     x16,x16,x20,ror#41      // Sigma1(e)
+       eor     x15,x15,x24,ror#34
+       add     x23,x23,x17                     // h+=Ch(e,f,g)
+       and     x28,x28,x19                     // (b^c)&=(a^b)
+       eor     x13,x13,x6,ror#61
+       eor     x14,x14,x9,lsr#7        // sigma0(X[i+1])
+       add     x23,x23,x16                     // h+=Sigma1(e)
+       eor     x28,x28,x25                     // Maj(a,b,c)
+       eor     x17,x15,x24,ror#39      // Sigma0(a)
+       eor     x13,x13,x6,lsr#6        // sigma1(X[i+14])
+       add     x8,x8,x1
+       add     x27,x27,x23                     // d+=h
+       add     x23,x23,x28                     // h+=Maj(a,b,c)
+       ldr     x28,[x30],#8            // *K++, x19 in next round
+       add     x8,x8,x14
+       add     x23,x23,x17                     // h+=Sigma0(a)
+       add     x8,x8,x13
+       ldr     x13,[sp,#16]
+       str     x0,[sp,#8]
+       ror     x16,x27,#14
+       add     x22,x22,x28                     // h+=K[i]
+       ror     x15,x10,#1
+       and     x17,x20,x27
+       ror     x14,x7,#19
+       bic     x28,x21,x27
+       ror     x0,x23,#28
+       add     x22,x22,x8                      // h+=X[i]
+       eor     x16,x16,x27,ror#18
+       eor     x15,x15,x10,ror#8
+       orr     x17,x17,x28                     // Ch(e,f,g)
+       eor     x28,x23,x24                     // a^b, b^c in next round
+       eor     x16,x16,x27,ror#41      // Sigma1(e)
+       eor     x0,x0,x23,ror#34
+       add     x22,x22,x17                     // h+=Ch(e,f,g)
+       and     x19,x19,x28                     // (b^c)&=(a^b)
+       eor     x14,x14,x7,ror#61
+       eor     x15,x15,x10,lsr#7       // sigma0(X[i+1])
+       add     x22,x22,x16                     // h+=Sigma1(e)
+       eor     x19,x19,x24                     // Maj(a,b,c)
+       eor     x17,x0,x23,ror#39       // Sigma0(a)
+       eor     x14,x14,x7,lsr#6        // sigma1(X[i+14])
+       add     x9,x9,x2
+       add     x26,x26,x22                     // d+=h
+       add     x22,x22,x19                     // h+=Maj(a,b,c)
+       ldr     x19,[x30],#8            // *K++, x28 in next round
+       add     x9,x9,x15
+       add     x22,x22,x17                     // h+=Sigma0(a)
+       add     x9,x9,x14
+       ldr     x14,[sp,#24]
+       str     x1,[sp,#16]
+       ror     x16,x26,#14
+       add     x21,x21,x19                     // h+=K[i]
+       ror     x0,x11,#1
+       and     x17,x27,x26
+       ror     x15,x8,#19
+       bic     x19,x20,x26
+       ror     x1,x22,#28
+       add     x21,x21,x9                      // h+=X[i]
+       eor     x16,x16,x26,ror#18
+       eor     x0,x0,x11,ror#8
+       orr     x17,x17,x19                     // Ch(e,f,g)
+       eor     x19,x22,x23                     // a^b, b^c in next round
+       eor     x16,x16,x26,ror#41      // Sigma1(e)
+       eor     x1,x1,x22,ror#34
+       add     x21,x21,x17                     // h+=Ch(e,f,g)
+       and     x28,x28,x19                     // (b^c)&=(a^b)
+       eor     x15,x15,x8,ror#61
+       eor     x0,x0,x11,lsr#7 // sigma0(X[i+1])
+       add     x21,x21,x16                     // h+=Sigma1(e)
+       eor     x28,x28,x23                     // Maj(a,b,c)
+       eor     x17,x1,x22,ror#39       // Sigma0(a)
+       eor     x15,x15,x8,lsr#6        // sigma1(X[i+14])
+       add     x10,x10,x3
+       add     x25,x25,x21                     // d+=h
+       add     x21,x21,x28                     // h+=Maj(a,b,c)
+       ldr     x28,[x30],#8            // *K++, x19 in next round
+       add     x10,x10,x0
+       add     x21,x21,x17                     // h+=Sigma0(a)
+       add     x10,x10,x15
+       ldr     x15,[sp,#0]
+       str     x2,[sp,#24]
+       ror     x16,x25,#14
+       add     x20,x20,x28                     // h+=K[i]
+       ror     x1,x12,#1
+       and     x17,x26,x25
+       ror     x0,x9,#19
+       bic     x28,x27,x25
+       ror     x2,x21,#28
+       add     x20,x20,x10                     // h+=X[i]
+       eor     x16,x16,x25,ror#18
+       eor     x1,x1,x12,ror#8
+       orr     x17,x17,x28                     // Ch(e,f,g)
+       eor     x28,x21,x22                     // a^b, b^c in next round
+       eor     x16,x16,x25,ror#41      // Sigma1(e)
+       eor     x2,x2,x21,ror#34
+       add     x20,x20,x17                     // h+=Ch(e,f,g)
+       and     x19,x19,x28                     // (b^c)&=(a^b)
+       eor     x0,x0,x9,ror#61
+       eor     x1,x1,x12,lsr#7 // sigma0(X[i+1])
+       add     x20,x20,x16                     // h+=Sigma1(e)
+       eor     x19,x19,x22                     // Maj(a,b,c)
+       eor     x17,x2,x21,ror#39       // Sigma0(a)
+       eor     x0,x0,x9,lsr#6  // sigma1(X[i+14])
+       add     x11,x11,x4
+       add     x24,x24,x20                     // d+=h
+       add     x20,x20,x19                     // h+=Maj(a,b,c)
+       ldr     x19,[x30],#8            // *K++, x28 in next round
+       add     x11,x11,x1
+       add     x20,x20,x17                     // h+=Sigma0(a)
+       add     x11,x11,x0
+       ldr     x0,[sp,#8]
+       str     x3,[sp,#0]
+       ror     x16,x24,#14
+       add     x27,x27,x19                     // h+=K[i]
+       ror     x2,x13,#1
+       and     x17,x25,x24
+       ror     x1,x10,#19
+       bic     x19,x26,x24
+       ror     x3,x20,#28
+       add     x27,x27,x11                     // h+=X[i]
+       eor     x16,x16,x24,ror#18
+       eor     x2,x2,x13,ror#8
+       orr     x17,x17,x19                     // Ch(e,f,g)
+       eor     x19,x20,x21                     // a^b, b^c in next round
+       eor     x16,x16,x24,ror#41      // Sigma1(e)
+       eor     x3,x3,x20,ror#34
+       add     x27,x27,x17                     // h+=Ch(e,f,g)
+       and     x28,x28,x19                     // (b^c)&=(a^b)
+       eor     x1,x1,x10,ror#61
+       eor     x2,x2,x13,lsr#7 // sigma0(X[i+1])
+       add     x27,x27,x16                     // h+=Sigma1(e)
+       eor     x28,x28,x21                     // Maj(a,b,c)
+       eor     x17,x3,x20,ror#39       // Sigma0(a)
+       eor     x1,x1,x10,lsr#6 // sigma1(X[i+14])
+       add     x12,x12,x5
+       add     x23,x23,x27                     // d+=h
+       add     x27,x27,x28                     // h+=Maj(a,b,c)
+       ldr     x28,[x30],#8            // *K++, x19 in next round
+       add     x12,x12,x2
+       add     x27,x27,x17                     // h+=Sigma0(a)
+       add     x12,x12,x1
+       ldr     x1,[sp,#16]
+       str     x4,[sp,#8]
+       ror     x16,x23,#14
+       add     x26,x26,x28                     // h+=K[i]
+       ror     x3,x14,#1
+       and     x17,x24,x23
+       ror     x2,x11,#19
+       bic     x28,x25,x23
+       ror     x4,x27,#28
+       add     x26,x26,x12                     // h+=X[i]
+       eor     x16,x16,x23,ror#18
+       eor     x3,x3,x14,ror#8
+       orr     x17,x17,x28                     // Ch(e,f,g)
+       eor     x28,x27,x20                     // a^b, b^c in next round
+       eor     x16,x16,x23,ror#41      // Sigma1(e)
+       eor     x4,x4,x27,ror#34
+       add     x26,x26,x17                     // h+=Ch(e,f,g)
+       and     x19,x19,x28                     // (b^c)&=(a^b)
+       eor     x2,x2,x11,ror#61
+       eor     x3,x3,x14,lsr#7 // sigma0(X[i+1])
+       add     x26,x26,x16                     // h+=Sigma1(e)
+       eor     x19,x19,x20                     // Maj(a,b,c)
+       eor     x17,x4,x27,ror#39       // Sigma0(a)
+       eor     x2,x2,x11,lsr#6 // sigma1(X[i+14])
+       add     x13,x13,x6
+       add     x22,x22,x26                     // d+=h
+       add     x26,x26,x19                     // h+=Maj(a,b,c)
+       ldr     x19,[x30],#8            // *K++, x28 in next round
+       add     x13,x13,x3
+       add     x26,x26,x17                     // h+=Sigma0(a)
+       add     x13,x13,x2
+       ldr     x2,[sp,#24]
+       str     x5,[sp,#16]
+       ror     x16,x22,#14
+       add     x25,x25,x19                     // h+=K[i]
+       ror     x4,x15,#1
+       and     x17,x23,x22
+       ror     x3,x12,#19
+       bic     x19,x24,x22
+       ror     x5,x26,#28
+       add     x25,x25,x13                     // h+=X[i]
+       eor     x16,x16,x22,ror#18
+       eor     x4,x4,x15,ror#8
+       orr     x17,x17,x19                     // Ch(e,f,g)
+       eor     x19,x26,x27                     // a^b, b^c in next round
+       eor     x16,x16,x22,ror#41      // Sigma1(e)
+       eor     x5,x5,x26,ror#34
+       add     x25,x25,x17                     // h+=Ch(e,f,g)
+       and     x28,x28,x19                     // (b^c)&=(a^b)
+       eor     x3,x3,x12,ror#61
+       eor     x4,x4,x15,lsr#7 // sigma0(X[i+1])
+       add     x25,x25,x16                     // h+=Sigma1(e)
+       eor     x28,x28,x27                     // Maj(a,b,c)
+       eor     x17,x5,x26,ror#39       // Sigma0(a)
+       eor     x3,x3,x12,lsr#6 // sigma1(X[i+14])
+       add     x14,x14,x7
+       add     x21,x21,x25                     // d+=h
+       add     x25,x25,x28                     // h+=Maj(a,b,c)
+       ldr     x28,[x30],#8            // *K++, x19 in next round
+       add     x14,x14,x4
+       add     x25,x25,x17                     // h+=Sigma0(a)
+       add     x14,x14,x3
+       ldr     x3,[sp,#0]
+       str     x6,[sp,#24]
+       ror     x16,x21,#14
+       add     x24,x24,x28                     // h+=K[i]
+       ror     x5,x0,#1
+       and     x17,x22,x21
+       ror     x4,x13,#19
+       bic     x28,x23,x21
+       ror     x6,x25,#28
+       add     x24,x24,x14                     // h+=X[i]
+       eor     x16,x16,x21,ror#18
+       eor     x5,x5,x0,ror#8
+       orr     x17,x17,x28                     // Ch(e,f,g)
+       eor     x28,x25,x26                     // a^b, b^c in next round
+       eor     x16,x16,x21,ror#41      // Sigma1(e)
+       eor     x6,x6,x25,ror#34
+       add     x24,x24,x17                     // h+=Ch(e,f,g)
+       and     x19,x19,x28                     // (b^c)&=(a^b)
+       eor     x4,x4,x13,ror#61
+       eor     x5,x5,x0,lsr#7  // sigma0(X[i+1])
+       add     x24,x24,x16                     // h+=Sigma1(e)
+       eor     x19,x19,x26                     // Maj(a,b,c)
+       eor     x17,x6,x25,ror#39       // Sigma0(a)
+       eor     x4,x4,x13,lsr#6 // sigma1(X[i+14])
+       add     x15,x15,x8
+       add     x20,x20,x24                     // d+=h
+       add     x24,x24,x19                     // h+=Maj(a,b,c)
+       ldr     x19,[x30],#8            // *K++, x28 in next round
+       add     x15,x15,x5
+       add     x24,x24,x17                     // h+=Sigma0(a)
+       add     x15,x15,x4
+       ldr     x4,[sp,#8]
+       str     x7,[sp,#0]
+       ror     x16,x20,#14
+       add     x23,x23,x19                     // h+=K[i]
+       ror     x6,x1,#1
+       and     x17,x21,x20
+       ror     x5,x14,#19
+       bic     x19,x22,x20
+       ror     x7,x24,#28
+       add     x23,x23,x15                     // h+=X[i]
+       eor     x16,x16,x20,ror#18
+       eor     x6,x6,x1,ror#8
+       orr     x17,x17,x19                     // Ch(e,f,g)
+       eor     x19,x24,x25                     // a^b, b^c in next round
+       eor     x16,x16,x20,ror#41      // Sigma1(e)
+       eor     x7,x7,x24,ror#34
+       add     x23,x23,x17                     // h+=Ch(e,f,g)
+       and     x28,x28,x19                     // (b^c)&=(a^b)
+       eor     x5,x5,x14,ror#61
+       eor     x6,x6,x1,lsr#7  // sigma0(X[i+1])
+       add     x23,x23,x16                     // h+=Sigma1(e)
+       eor     x28,x28,x25                     // Maj(a,b,c)
+       eor     x17,x7,x24,ror#39       // Sigma0(a)
+       eor     x5,x5,x14,lsr#6 // sigma1(X[i+14])
+       add     x0,x0,x9
+       add     x27,x27,x23                     // d+=h
+       add     x23,x23,x28                     // h+=Maj(a,b,c)
+       ldr     x28,[x30],#8            // *K++, x19 in next round
+       add     x0,x0,x6
+       add     x23,x23,x17                     // h+=Sigma0(a)
+       add     x0,x0,x5
+       ldr     x5,[sp,#16]
+       str     x8,[sp,#8]
+       ror     x16,x27,#14
+       add     x22,x22,x28                     // h+=K[i]
+       ror     x7,x2,#1
+       and     x17,x20,x27
+       ror     x6,x15,#19
+       bic     x28,x21,x27
+       ror     x8,x23,#28
+       add     x22,x22,x0                      // h+=X[i]
+       eor     x16,x16,x27,ror#18
+       eor     x7,x7,x2,ror#8
+       orr     x17,x17,x28                     // Ch(e,f,g)
+       eor     x28,x23,x24                     // a^b, b^c in next round
+       eor     x16,x16,x27,ror#41      // Sigma1(e)
+       eor     x8,x8,x23,ror#34
+       add     x22,x22,x17                     // h+=Ch(e,f,g)
+       and     x19,x19,x28                     // (b^c)&=(a^b)
+       eor     x6,x6,x15,ror#61
+       eor     x7,x7,x2,lsr#7  // sigma0(X[i+1])
+       add     x22,x22,x16                     // h+=Sigma1(e)
+       eor     x19,x19,x24                     // Maj(a,b,c)
+       eor     x17,x8,x23,ror#39       // Sigma0(a)
+       eor     x6,x6,x15,lsr#6 // sigma1(X[i+14])
+       add     x1,x1,x10
+       add     x26,x26,x22                     // d+=h
+       add     x22,x22,x19                     // h+=Maj(a,b,c)
+       ldr     x19,[x30],#8            // *K++, x28 in next round
+       add     x1,x1,x7
+       add     x22,x22,x17                     // h+=Sigma0(a)
+       add     x1,x1,x6
+       ldr     x6,[sp,#24]
+       str     x9,[sp,#16]
+       ror     x16,x26,#14
+       add     x21,x21,x19                     // h+=K[i]
+       ror     x8,x3,#1
+       and     x17,x27,x26
+       ror     x7,x0,#19
+       bic     x19,x20,x26
+       ror     x9,x22,#28
+       add     x21,x21,x1                      // h+=X[i]
+       eor     x16,x16,x26,ror#18
+       eor     x8,x8,x3,ror#8
+       orr     x17,x17,x19                     // Ch(e,f,g)
+       eor     x19,x22,x23                     // a^b, b^c in next round
+       eor     x16,x16,x26,ror#41      // Sigma1(e)
+       eor     x9,x9,x22,ror#34
+       add     x21,x21,x17                     // h+=Ch(e,f,g)
+       and     x28,x28,x19                     // (b^c)&=(a^b)
+       eor     x7,x7,x0,ror#61
+       eor     x8,x8,x3,lsr#7  // sigma0(X[i+1])
+       add     x21,x21,x16                     // h+=Sigma1(e)
+       eor     x28,x28,x23                     // Maj(a,b,c)
+       eor     x17,x9,x22,ror#39       // Sigma0(a)
+       eor     x7,x7,x0,lsr#6  // sigma1(X[i+14])
+       add     x2,x2,x11
+       add     x25,x25,x21                     // d+=h
+       add     x21,x21,x28                     // h+=Maj(a,b,c)
+       ldr     x28,[x30],#8            // *K++, x19 in next round
+       add     x2,x2,x8
+       add     x21,x21,x17                     // h+=Sigma0(a)
+       add     x2,x2,x7
+       ldr     x7,[sp,#0]
+       str     x10,[sp,#24]
+       ror     x16,x25,#14
+       add     x20,x20,x28                     // h+=K[i]
+       ror     x9,x4,#1
+       and     x17,x26,x25
+       ror     x8,x1,#19
+       bic     x28,x27,x25
+       ror     x10,x21,#28
+       add     x20,x20,x2                      // h+=X[i]
+       eor     x16,x16,x25,ror#18
+       eor     x9,x9,x4,ror#8
+       orr     x17,x17,x28                     // Ch(e,f,g)
+       eor     x28,x21,x22                     // a^b, b^c in next round
+       eor     x16,x16,x25,ror#41      // Sigma1(e)
+       eor     x10,x10,x21,ror#34
+       add     x20,x20,x17                     // h+=Ch(e,f,g)
+       and     x19,x19,x28                     // (b^c)&=(a^b)
+       eor     x8,x8,x1,ror#61
+       eor     x9,x9,x4,lsr#7  // sigma0(X[i+1])
+       add     x20,x20,x16                     // h+=Sigma1(e)
+       eor     x19,x19,x22                     // Maj(a,b,c)
+       eor     x17,x10,x21,ror#39      // Sigma0(a)
+       eor     x8,x8,x1,lsr#6  // sigma1(X[i+14])
+       add     x3,x3,x12
+       add     x24,x24,x20                     // d+=h
+       add     x20,x20,x19                     // h+=Maj(a,b,c)
+       ldr     x19,[x30],#8            // *K++, x28 in next round
+       add     x3,x3,x9
+       add     x20,x20,x17                     // h+=Sigma0(a)
+       add     x3,x3,x8
+       cbnz    x19,.Loop_16_xx
+
+       ldp     x0,x2,[x29,#96]
+       ldr     x1,[x29,#112]
+       sub     x30,x30,#648            // rewind
+
+       ldp     x3,x4,[x0]
+       ldp     x5,x6,[x0,#2*8]
+       add     x1,x1,#14*8                     // advance input pointer
+       ldp     x7,x8,[x0,#4*8]
+       add     x20,x20,x3
+       ldp     x9,x10,[x0,#6*8]
+       add     x21,x21,x4
+       add     x22,x22,x5
+       add     x23,x23,x6
+       stp     x20,x21,[x0]
+       add     x24,x24,x7
+       add     x25,x25,x8
+       stp     x22,x23,[x0,#2*8]
+       add     x26,x26,x9
+       add     x27,x27,x10
+       cmp     x1,x2
+       stp     x24,x25,[x0,#4*8]
+       stp     x26,x27,[x0,#6*8]
+       b.ne    .Loop
+
+       ldp     x19,x20,[x29,#16]
+       add     sp,sp,#4*8
+       ldp     x21,x22,[x29,#32]
+       ldp     x23,x24,[x29,#48]
+       ldp     x25,x26,[x29,#64]
+       ldp     x27,x28,[x29,#80]
+       ldp     x29,x30,[sp],#128
+       ret
+.size  sha512_block_data_order,.-sha512_block_data_order
+
+.align 6
+.type  .LK512,%object
+.LK512:
+       .quad   0x428a2f98d728ae22,0x7137449123ef65cd
+       .quad   0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+       .quad   0x3956c25bf348b538,0x59f111f1b605d019
+       .quad   0x923f82a4af194f9b,0xab1c5ed5da6d8118
+       .quad   0xd807aa98a3030242,0x12835b0145706fbe
+       .quad   0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+       .quad   0x72be5d74f27b896f,0x80deb1fe3b1696b1
+       .quad   0x9bdc06a725c71235,0xc19bf174cf692694
+       .quad   0xe49b69c19ef14ad2,0xefbe4786384f25e3
+       .quad   0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+       .quad   0x2de92c6f592b0275,0x4a7484aa6ea6e483
+       .quad   0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+       .quad   0x983e5152ee66dfab,0xa831c66d2db43210
+       .quad   0xb00327c898fb213f,0xbf597fc7beef0ee4
+       .quad   0xc6e00bf33da88fc2,0xd5a79147930aa725
+       .quad   0x06ca6351e003826f,0x142929670a0e6e70
+       .quad   0x27b70a8546d22ffc,0x2e1b21385c26c926
+       .quad   0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+       .quad   0x650a73548baf63de,0x766a0abb3c77b2a8
+       .quad   0x81c2c92e47edaee6,0x92722c851482353b
+       .quad   0xa2bfe8a14cf10364,0xa81a664bbc423001
+       .quad   0xc24b8b70d0f89791,0xc76c51a30654be30
+       .quad   0xd192e819d6ef5218,0xd69906245565a910
+       .quad   0xf40e35855771202a,0x106aa07032bbd1b8
+       .quad   0x19a4c116b8d2d0c8,0x1e376c085141ab53
+       .quad   0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+       .quad   0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+       .quad   0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+       .quad   0x748f82ee5defb2fc,0x78a5636f43172f60
+       .quad   0x84c87814a1f0ab72,0x8cc702081a6439ec
+       .quad   0x90befffa23631e28,0xa4506cebde82bde9
+       .quad   0xbef9a3f7b2c67915,0xc67178f2e372532b
+       .quad   0xca273eceea26619c,0xd186b8c721c0c207
+       .quad   0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+       .quad   0x06f067aa72176fba,0x0a637dc5a2c898a6
+       .quad   0x113f9804bef90dae,0x1b710b35131c471b
+       .quad   0x28db77f523047d84,0x32caab7b40c72493
+       .quad   0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+       .quad   0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+       .quad   0x5fcb6fab3ad6faec,0x6c44198c4a475817
+       .quad   0       // terminator
+.size  .LK512,.-.LK512
+#ifndef        __KERNEL__
+.align 3
+.LOPENSSL_armcap_P:
+# ifdef        __ILP32__
+       .long   OPENSSL_armcap_P-.
+# else
+       .quad   OPENSSL_armcap_P-.
+# endif
+#endif
+.asciz "SHA512 block transform for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
+.align 2
+#ifndef        __KERNEL__
+.comm  OPENSSL_armcap_P,4,4
+#endif
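
The unrolled scalar code above is the plain SHA-512 compression function; the inline comments name the pieces (Ch, Maj, Sigma0/Sigma1 for the round, sigma0/sigma1 for the schedule), and Maj(a,b,c) is folded into ((a^b) & (b^c)) ^ b so it costs one AND and one EOR per round, with the a^b term reused as the next round's b^c. A minimal C sketch of what one round and one schedule step compute -- illustrative only, with rotate amounts matching the ror#14/18/41, ror#28/34/39, ror#1/8 + lsr#7 and ror#19/61 + lsr#6 patterns in the assembly:

#include <stdint.h>

static inline uint64_t ror64(uint64_t x, int n)
{
	return (x >> n) | (x << (64 - n));
}

/* One SHA-512 round; s[] holds the working variables a..h, k is K512[i]. */
static void sha512_round(uint64_t s[8], uint64_t w, uint64_t k)
{
	uint64_t a = s[0], b = s[1], c = s[2], d = s[3];
	uint64_t e = s[4], f = s[5], g = s[6], h = s[7];

	uint64_t S1  = ror64(e, 14) ^ ror64(e, 18) ^ ror64(e, 41);	/* Sigma1(e) */
	uint64_t ch  = (e & f) ^ (~e & g);				/* Ch(e,f,g) */
	uint64_t t1  = h + S1 + ch + k + w;
	uint64_t S0  = ror64(a, 28) ^ ror64(a, 34) ^ ror64(a, 39);	/* Sigma0(a) */
	uint64_t maj = (a & b) ^ (a & c) ^ (b & c);			/* Maj(a,b,c) */
	uint64_t t2  = S0 + maj;

	s[7] = g; s[6] = f; s[5] = e; s[4] = d + t1;
	s[3] = c; s[2] = b; s[1] = a; s[0] = t1 + t2;
}

/* Schedule step for rounds 16..79, as in the .Loop_16_xx block above. */
static uint64_t sha512_schedule(const uint64_t w[16], int i)
{
	uint64_t w15 = w[(i - 15) & 15], w2 = w[(i - 2) & 15];
	uint64_t s0 = ror64(w15, 1) ^ ror64(w15, 8) ^ (w15 >> 7);	/* sigma0 */
	uint64_t s1 = ror64(w2, 19) ^ ror64(w2, 61) ^ (w2 >> 6);	/* sigma1 */

	return w[(i - 16) & 15] + s0 + w[(i - 7) & 15] + s1;
}

After 80 rounds the eight working variables are added back into the context loaded from x0, which is the block of ldp/add/stp just before the b.ne .Loop above.
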
diff --git a/arch/arm64/crypto/sha512-glue.c b/arch/arm64/crypto/sha512-glue.c
new file mode 100644 (file)
index 0000000..aff35c9
--- /dev/null
@@ -0,0 +1,94 @@
+/*
+ * Linux/arm64 port of the OpenSSL SHA512 implementation for AArch64
+ *
+ * Copyright (c) 2016 Linaro Ltd. <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#include <crypto/internal/hash.h>
+#include <linux/cryptohash.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <crypto/sha.h>
+#include <crypto/sha512_base.h>
+#include <asm/neon.h>
+
+MODULE_DESCRIPTION("SHA-384/SHA-512 secure hash for arm64");
+MODULE_AUTHOR("Andy Polyakov <appro@openssl.org>");
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_CRYPTO("sha384");
+MODULE_ALIAS_CRYPTO("sha512");
+
+asmlinkage void sha512_block_data_order(u32 *digest, const void *data,
+                                       unsigned int num_blks);
+
+static int sha512_update(struct shash_desc *desc, const u8 *data,
+                        unsigned int len)
+{
+       return sha512_base_do_update(desc, data, len,
+                       (sha512_block_fn *)sha512_block_data_order);
+}
+
+static int sha512_finup(struct shash_desc *desc, const u8 *data,
+                       unsigned int len, u8 *out)
+{
+       if (len)
+               sha512_base_do_update(desc, data, len,
+                       (sha512_block_fn *)sha512_block_data_order);
+       sha512_base_do_finalize(desc,
+                       (sha512_block_fn *)sha512_block_data_order);
+
+       return sha512_base_finish(desc, out);
+}
+
+static int sha512_final(struct shash_desc *desc, u8 *out)
+{
+       return sha512_finup(desc, NULL, 0, out);
+}
+
+static struct shash_alg algs[] = { {
+       .digestsize             = SHA512_DIGEST_SIZE,
+       .init                   = sha512_base_init,
+       .update                 = sha512_update,
+       .final                  = sha512_final,
+       .finup                  = sha512_finup,
+       .descsize               = sizeof(struct sha512_state),
+       .base.cra_name          = "sha512",
+       .base.cra_driver_name   = "sha512-arm64",
+       .base.cra_priority      = 150,
+       .base.cra_flags         = CRYPTO_ALG_TYPE_SHASH,
+       .base.cra_blocksize     = SHA512_BLOCK_SIZE,
+       .base.cra_module        = THIS_MODULE,
+}, {
+       .digestsize             = SHA384_DIGEST_SIZE,
+       .init                   = sha384_base_init,
+       .update                 = sha512_update,
+       .final                  = sha512_final,
+       .finup                  = sha512_finup,
+       .descsize               = sizeof(struct sha512_state),
+       .base.cra_name          = "sha384",
+       .base.cra_driver_name   = "sha384-arm64",
+       .base.cra_priority      = 150,
+       .base.cra_flags         = CRYPTO_ALG_TYPE_SHASH,
+       .base.cra_blocksize     = SHA384_BLOCK_SIZE,
+       .base.cra_module        = THIS_MODULE,
+} };
+
+static int __init sha512_mod_init(void)
+{
+       return crypto_register_shashes(algs, ARRAY_SIZE(algs));
+}
+
+static void __exit sha512_mod_fini(void)
+{
+       crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
+}
+
+module_init(sha512_mod_init);
+module_exit(sha512_mod_fini);
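
Because cra_priority is set to 150, the "sha512-arm64" and "sha384-arm64" drivers registered above take precedence over the generic C implementations whenever a caller asks for "sha512" or "sha384" through the crypto API. As a hedged illustration (standard shash API calls of this kernel generation, not part of this patch; the function name is made up), another kernel module could use it like this:

#include <crypto/hash.h>
#include <crypto/sha.h>
#include <linux/err.h>

/* Illustrative only: one-shot SHA-512 of a linear buffer via the shash API. */
static int example_sha512_digest(const u8 *data, unsigned int len,
				 u8 out[SHA512_DIGEST_SIZE])
{
	struct crypto_shash *tfm;
	int err;

	tfm = crypto_alloc_shash("sha512", 0, 0);	/* resolves to the highest-priority driver */
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	{
		SHASH_DESC_ON_STACK(desc, tfm);

		desc->tfm = tfm;
		desc->flags = 0;	/* shash_desc still carries a flags field in this kernel era */
		err = crypto_shash_digest(desc, data, len, out);
	}

	crypto_free_shash(tfm);
	return err;
}

Passing "sha512-arm64" instead of "sha512" to crypto_alloc_shash() would request this particular driver explicitly, which is handy when comparing it against the generic code.
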