crypto: x86/salsa20 - remove x86 salsa20 implementations
authorEric Biggers <ebiggers@google.com>
Sat, 26 May 2018 07:08:58 +0000 (00:08 -0700)
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Tue, 17 Jul 2018 09:39:31 +0000 (11:39 +0200)
commit b7b73cd5d74694ed59abcdb4974dacb4ff8b2a2a upstream.

The x86 assembly implementations of Salsa20 use the frame base pointer
register (%ebp or %rbp), which breaks frame pointer convention and
breaks stack traces when unwinding from an interrupt in the crypto code.
Recent (v4.10+) kernels will warn about this, e.g.

WARNING: kernel stack regs at 00000000a8291e69 in syzkaller047086:4677 has bad 'bp' value 000000001077994c
[...]

But after looking into it, I believe there's very little reason to still
retain the x86 Salsa20 code.  First, these are *not* vectorized
(SSE2/SSSE3/AVX2) implementations, which would be needed to get anywhere
close to the best Salsa20 performance on any remotely modern x86
processor; they're just regular x86 assembly.  Second, it's still
unclear that anyone is actually using the kernel's Salsa20 at all,
especially given that now ChaCha20 is supported too, and with much more
efficient SSSE3 and AVX2 implementations.  Finally, in benchmarks I did
on both Intel and AMD processors with both gcc 8.1.0 and gcc 4.9.4, the
x86_64 salsa20-asm is actually slightly *slower* than salsa20-generic
(~3% slower on Skylake, ~10% slower on Zen), while the i686 salsa20-asm
is only slightly faster than salsa20-generic (~15% faster on Skylake,
~20% faster on Zen).  The gcc version made little difference.

So, the x86_64 salsa20-asm is pretty clearly useless.  That leaves just
the i686 salsa20-asm, which based on my tests provides a 15-20% speed
boost.  But that's without updating the code to not use %ebp.  And given
the maintenance cost, the small speed difference vs. salsa20-generic,
the fact that few people still use i686 kernels, the doubt that anyone
is even using the kernel's Salsa20 at all, and the fact that a SSE2
implementation would almost certainly be much faster on any remotely
modern x86 processor yet no one has cared enough to add one yet, I don't
think it's worthwhile to keep.

Thus, just remove both the x86_64 and i686 salsa20-asm implementations.

Reported-by: syzbot+ffa3a158337bbc01ff09@syzkaller.appspotmail.com
Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
arch/x86/crypto/Makefile
arch/x86/crypto/salsa20-i586-asm_32.S [deleted file]
arch/x86/crypto/salsa20-x86_64-asm_64.S [deleted file]
arch/x86/crypto/salsa20_glue.c [deleted file]
crypto/Kconfig

index 5f07333bb224c12c8c8d23407faa1265db9afa7b..9c903a420cdab7e86f480f11335c10ceea75d512 100644 (file)
@@ -15,7 +15,6 @@ obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o
 
 obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o
 obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o
-obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o
 obj-$(CONFIG_CRYPTO_SERPENT_SSE2_586) += serpent-sse2-i586.o
 
 obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o
@@ -24,7 +23,6 @@ obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o
 obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o
 obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
 obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o
-obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o
 obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha20-x86_64.o
 obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o
 obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
@@ -59,7 +57,6 @@ endif
 
 aes-i586-y := aes-i586-asm_32.o aes_glue.o
 twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o
-salsa20-i586-y := salsa20-i586-asm_32.o salsa20_glue.o
 serpent-sse2-i586-y := serpent-sse2-i586-asm_32.o serpent_sse2_glue.o
 
 aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o
@@ -68,7 +65,6 @@ camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o
 blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o
 twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
 twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o
-salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o
 chacha20-x86_64-y := chacha20-ssse3-x86_64.o chacha20_glue.o
 serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o
 
diff --git a/arch/x86/crypto/salsa20-i586-asm_32.S b/arch/x86/crypto/salsa20-i586-asm_32.S
deleted file mode 100644 (file)
index 329452b..0000000
+++ /dev/null
@@ -1,1114 +0,0 @@
-# salsa20_pm.s version 20051229
-# D. J. Bernstein
-# Public domain.
-
-#include <linux/linkage.h>
-
-.text
-
-# enter salsa20_encrypt_bytes
-ENTRY(salsa20_encrypt_bytes)
-       mov     %esp,%eax
-       and     $31,%eax
-       add     $256,%eax
-       sub     %eax,%esp
-       # eax_stack = eax
-       movl    %eax,80(%esp)
-       # ebx_stack = ebx
-       movl    %ebx,84(%esp)
-       # esi_stack = esi
-       movl    %esi,88(%esp)
-       # edi_stack = edi
-       movl    %edi,92(%esp)
-       # ebp_stack = ebp
-       movl    %ebp,96(%esp)
-       # x = arg1
-       movl    4(%esp,%eax),%edx
-       # m = arg2
-       movl    8(%esp,%eax),%esi
-       # out = arg3
-       movl    12(%esp,%eax),%edi
-       # bytes = arg4
-       movl    16(%esp,%eax),%ebx
-       # bytes -= 0
-       sub     $0,%ebx
-       # goto done if unsigned<=
-       jbe     ._done
-._start:
-       # in0 = *(uint32 *) (x + 0)
-       movl    0(%edx),%eax
-       # in1 = *(uint32 *) (x + 4)
-       movl    4(%edx),%ecx
-       # in2 = *(uint32 *) (x + 8)
-       movl    8(%edx),%ebp
-       # j0 = in0
-       movl    %eax,164(%esp)
-       # in3 = *(uint32 *) (x + 12)
-       movl    12(%edx),%eax
-       # j1 = in1
-       movl    %ecx,168(%esp)
-       # in4 = *(uint32 *) (x + 16)
-       movl    16(%edx),%ecx
-       # j2 = in2
-       movl    %ebp,172(%esp)
-       # in5 = *(uint32 *) (x + 20)
-       movl    20(%edx),%ebp
-       # j3 = in3
-       movl    %eax,176(%esp)
-       # in6 = *(uint32 *) (x + 24)
-       movl    24(%edx),%eax
-       # j4 = in4
-       movl    %ecx,180(%esp)
-       # in7 = *(uint32 *) (x + 28)
-       movl    28(%edx),%ecx
-       # j5 = in5
-       movl    %ebp,184(%esp)
-       # in8 = *(uint32 *) (x + 32)
-       movl    32(%edx),%ebp
-       # j6 = in6
-       movl    %eax,188(%esp)
-       # in9 = *(uint32 *) (x + 36)
-       movl    36(%edx),%eax
-       # j7 = in7
-       movl    %ecx,192(%esp)
-       # in10 = *(uint32 *) (x + 40)
-       movl    40(%edx),%ecx
-       # j8 = in8
-       movl    %ebp,196(%esp)
-       # in11 = *(uint32 *) (x + 44)
-       movl    44(%edx),%ebp
-       # j9 = in9
-       movl    %eax,200(%esp)
-       # in12 = *(uint32 *) (x + 48)
-       movl    48(%edx),%eax
-       # j10 = in10
-       movl    %ecx,204(%esp)
-       # in13 = *(uint32 *) (x + 52)
-       movl    52(%edx),%ecx
-       # j11 = in11
-       movl    %ebp,208(%esp)
-       # in14 = *(uint32 *) (x + 56)
-       movl    56(%edx),%ebp
-       # j12 = in12
-       movl    %eax,212(%esp)
-       # in15 = *(uint32 *) (x + 60)
-       movl    60(%edx),%eax
-       # j13 = in13
-       movl    %ecx,216(%esp)
-       # j14 = in14
-       movl    %ebp,220(%esp)
-       # j15 = in15
-       movl    %eax,224(%esp)
-       # x_backup = x
-       movl    %edx,64(%esp)
-._bytesatleast1:
-       #   bytes - 64
-       cmp     $64,%ebx
-       #   goto nocopy if unsigned>=
-       jae     ._nocopy
-       #     ctarget = out
-       movl    %edi,228(%esp)
-       #     out = &tmp
-       leal    0(%esp),%edi
-       #     i = bytes
-       mov     %ebx,%ecx
-       #     while (i) { *out++ = *m++; --i }
-       rep     movsb
-       #     out = &tmp
-       leal    0(%esp),%edi
-       #     m = &tmp
-       leal    0(%esp),%esi
-._nocopy:
-       #   out_backup = out
-       movl    %edi,72(%esp)
-       #   m_backup = m
-       movl    %esi,68(%esp)
-       #   bytes_backup = bytes
-       movl    %ebx,76(%esp)
-       #   in0 = j0
-       movl    164(%esp),%eax
-       #   in1 = j1
-       movl    168(%esp),%ecx
-       #   in2 = j2
-       movl    172(%esp),%edx
-       #   in3 = j3
-       movl    176(%esp),%ebx
-       #   x0 = in0
-       movl    %eax,100(%esp)
-       #   x1 = in1
-       movl    %ecx,104(%esp)
-       #   x2 = in2
-       movl    %edx,108(%esp)
-       #   x3 = in3
-       movl    %ebx,112(%esp)
-       #   in4 = j4
-       movl    180(%esp),%eax
-       #   in5 = j5
-       movl    184(%esp),%ecx
-       #   in6 = j6
-       movl    188(%esp),%edx
-       #   in7 = j7
-       movl    192(%esp),%ebx
-       #   x4 = in4
-       movl    %eax,116(%esp)
-       #   x5 = in5
-       movl    %ecx,120(%esp)
-       #   x6 = in6
-       movl    %edx,124(%esp)
-       #   x7 = in7
-       movl    %ebx,128(%esp)
-       #   in8 = j8
-       movl    196(%esp),%eax
-       #   in9 = j9
-       movl    200(%esp),%ecx
-       #   in10 = j10
-       movl    204(%esp),%edx
-       #   in11 = j11
-       movl    208(%esp),%ebx
-       #   x8 = in8
-       movl    %eax,132(%esp)
-       #   x9 = in9
-       movl    %ecx,136(%esp)
-       #   x10 = in10
-       movl    %edx,140(%esp)
-       #   x11 = in11
-       movl    %ebx,144(%esp)
-       #   in12 = j12
-       movl    212(%esp),%eax
-       #   in13 = j13
-       movl    216(%esp),%ecx
-       #   in14 = j14
-       movl    220(%esp),%edx
-       #   in15 = j15
-       movl    224(%esp),%ebx
-       #   x12 = in12
-       movl    %eax,148(%esp)
-       #   x13 = in13
-       movl    %ecx,152(%esp)
-       #   x14 = in14
-       movl    %edx,156(%esp)
-       #   x15 = in15
-       movl    %ebx,160(%esp)
-       #   i = 20
-       mov     $20,%ebp
-       # p = x0
-       movl    100(%esp),%eax
-       # s = x5
-       movl    120(%esp),%ecx
-       # t = x10
-       movl    140(%esp),%edx
-       # w = x15
-       movl    160(%esp),%ebx
-._mainloop:
-       # x0 = p
-       movl    %eax,100(%esp)
-       #                               x10 = t
-       movl    %edx,140(%esp)
-       # p += x12
-       addl    148(%esp),%eax
-       #               x5 = s
-       movl    %ecx,120(%esp)
-       #                               t += x6
-       addl    124(%esp),%edx
-       #                                               x15 = w
-       movl    %ebx,160(%esp)
-       #               r = x1
-       movl    104(%esp),%esi
-       #               r += s
-       add     %ecx,%esi
-       #                                               v = x11
-       movl    144(%esp),%edi
-       #                                               v += w
-       add     %ebx,%edi
-       # p <<<= 7
-       rol     $7,%eax
-       # p ^= x4
-       xorl    116(%esp),%eax
-       #                               t <<<= 7
-       rol     $7,%edx
-       #                               t ^= x14
-       xorl    156(%esp),%edx
-       #               r <<<= 7
-       rol     $7,%esi
-       #               r ^= x9
-       xorl    136(%esp),%esi
-       #                                               v <<<= 7
-       rol     $7,%edi
-       #                                               v ^= x3
-       xorl    112(%esp),%edi
-       # x4 = p
-       movl    %eax,116(%esp)
-       #                               x14 = t
-       movl    %edx,156(%esp)
-       # p += x0
-       addl    100(%esp),%eax
-       #               x9 = r
-       movl    %esi,136(%esp)
-       #                               t += x10
-       addl    140(%esp),%edx
-       #                                               x3 = v
-       movl    %edi,112(%esp)
-       # p <<<= 9
-       rol     $9,%eax
-       # p ^= x8
-       xorl    132(%esp),%eax
-       #                               t <<<= 9
-       rol     $9,%edx
-       #                               t ^= x2
-       xorl    108(%esp),%edx
-       #               s += r
-       add     %esi,%ecx
-       #               s <<<= 9
-       rol     $9,%ecx
-       #               s ^= x13
-       xorl    152(%esp),%ecx
-       #                                               w += v
-       add     %edi,%ebx
-       #                                               w <<<= 9
-       rol     $9,%ebx
-       #                                               w ^= x7
-       xorl    128(%esp),%ebx
-       # x8 = p
-       movl    %eax,132(%esp)
-       #                               x2 = t
-       movl    %edx,108(%esp)
-       # p += x4
-       addl    116(%esp),%eax
-       #               x13 = s
-       movl    %ecx,152(%esp)
-       #                               t += x14
-       addl    156(%esp),%edx
-       #                                               x7 = w
-       movl    %ebx,128(%esp)
-       # p <<<= 13
-       rol     $13,%eax
-       # p ^= x12
-       xorl    148(%esp),%eax
-       #                               t <<<= 13
-       rol     $13,%edx
-       #                               t ^= x6
-       xorl    124(%esp),%edx
-       #               r += s
-       add     %ecx,%esi
-       #               r <<<= 13
-       rol     $13,%esi
-       #               r ^= x1
-       xorl    104(%esp),%esi
-       #                                               v += w
-       add     %ebx,%edi
-       #                                               v <<<= 13
-       rol     $13,%edi
-       #                                               v ^= x11
-       xorl    144(%esp),%edi
-       # x12 = p
-       movl    %eax,148(%esp)
-       #                               x6 = t
-       movl    %edx,124(%esp)
-       # p += x8
-       addl    132(%esp),%eax
-       #               x1 = r
-       movl    %esi,104(%esp)
-       #                               t += x2
-       addl    108(%esp),%edx
-       #                                               x11 = v
-       movl    %edi,144(%esp)
-       # p <<<= 18
-       rol     $18,%eax
-       # p ^= x0
-       xorl    100(%esp),%eax
-       #                               t <<<= 18
-       rol     $18,%edx
-       #                               t ^= x10
-       xorl    140(%esp),%edx
-       #               s += r
-       add     %esi,%ecx
-       #               s <<<= 18
-       rol     $18,%ecx
-       #               s ^= x5
-       xorl    120(%esp),%ecx
-       #                                               w += v
-       add     %edi,%ebx
-       #                                               w <<<= 18
-       rol     $18,%ebx
-       #                                               w ^= x15
-       xorl    160(%esp),%ebx
-       # x0 = p
-       movl    %eax,100(%esp)
-       #                               x10 = t
-       movl    %edx,140(%esp)
-       # p += x3
-       addl    112(%esp),%eax
-       # p <<<= 7
-       rol     $7,%eax
-       #               x5 = s
-       movl    %ecx,120(%esp)
-       #                               t += x9
-       addl    136(%esp),%edx
-       #                                               x15 = w
-       movl    %ebx,160(%esp)
-       #               r = x4
-       movl    116(%esp),%esi
-       #               r += s
-       add     %ecx,%esi
-       #                                               v = x14
-       movl    156(%esp),%edi
-       #                                               v += w
-       add     %ebx,%edi
-       # p ^= x1
-       xorl    104(%esp),%eax
-       #                               t <<<= 7
-       rol     $7,%edx
-       #                               t ^= x11
-       xorl    144(%esp),%edx
-       #               r <<<= 7
-       rol     $7,%esi
-       #               r ^= x6
-       xorl    124(%esp),%esi
-       #                                               v <<<= 7
-       rol     $7,%edi
-       #                                               v ^= x12
-       xorl    148(%esp),%edi
-       # x1 = p
-       movl    %eax,104(%esp)
-       #                               x11 = t
-       movl    %edx,144(%esp)
-       # p += x0
-       addl    100(%esp),%eax
-       #               x6 = r
-       movl    %esi,124(%esp)
-       #                               t += x10
-       addl    140(%esp),%edx
-       #                                               x12 = v
-       movl    %edi,148(%esp)
-       # p <<<= 9
-       rol     $9,%eax
-       # p ^= x2
-       xorl    108(%esp),%eax
-       #                               t <<<= 9
-       rol     $9,%edx
-       #                               t ^= x8
-       xorl    132(%esp),%edx
-       #               s += r
-       add     %esi,%ecx
-       #               s <<<= 9
-       rol     $9,%ecx
-       #               s ^= x7
-       xorl    128(%esp),%ecx
-       #                                               w += v
-       add     %edi,%ebx
-       #                                               w <<<= 9
-       rol     $9,%ebx
-       #                                               w ^= x13
-       xorl    152(%esp),%ebx
-       # x2 = p
-       movl    %eax,108(%esp)
-       #                               x8 = t
-       movl    %edx,132(%esp)
-       # p += x1
-       addl    104(%esp),%eax
-       #               x7 = s
-       movl    %ecx,128(%esp)
-       #                               t += x11
-       addl    144(%esp),%edx
-       #                                               x13 = w
-       movl    %ebx,152(%esp)
-       # p <<<= 13
-       rol     $13,%eax
-       # p ^= x3
-       xorl    112(%esp),%eax
-       #                               t <<<= 13
-       rol     $13,%edx
-       #                               t ^= x9
-       xorl    136(%esp),%edx
-       #               r += s
-       add     %ecx,%esi
-       #               r <<<= 13
-       rol     $13,%esi
-       #               r ^= x4
-       xorl    116(%esp),%esi
-       #                                               v += w
-       add     %ebx,%edi
-       #                                               v <<<= 13
-       rol     $13,%edi
-       #                                               v ^= x14
-       xorl    156(%esp),%edi
-       # x3 = p
-       movl    %eax,112(%esp)
-       #                               x9 = t
-       movl    %edx,136(%esp)
-       # p += x2
-       addl    108(%esp),%eax
-       #               x4 = r
-       movl    %esi,116(%esp)
-       #                               t += x8
-       addl    132(%esp),%edx
-       #                                               x14 = v
-       movl    %edi,156(%esp)
-       # p <<<= 18
-       rol     $18,%eax
-       # p ^= x0
-       xorl    100(%esp),%eax
-       #                               t <<<= 18
-       rol     $18,%edx
-       #                               t ^= x10
-       xorl    140(%esp),%edx
-       #               s += r
-       add     %esi,%ecx
-       #               s <<<= 18
-       rol     $18,%ecx
-       #               s ^= x5
-       xorl    120(%esp),%ecx
-       #                                               w += v
-       add     %edi,%ebx
-       #                                               w <<<= 18
-       rol     $18,%ebx
-       #                                               w ^= x15
-       xorl    160(%esp),%ebx
-       # x0 = p
-       movl    %eax,100(%esp)
-       #                               x10 = t
-       movl    %edx,140(%esp)
-       # p += x12
-       addl    148(%esp),%eax
-       #               x5 = s
-       movl    %ecx,120(%esp)
-       #                               t += x6
-       addl    124(%esp),%edx
-       #                                               x15 = w
-       movl    %ebx,160(%esp)
-       #               r = x1
-       movl    104(%esp),%esi
-       #               r += s
-       add     %ecx,%esi
-       #                                               v = x11
-       movl    144(%esp),%edi
-       #                                               v += w
-       add     %ebx,%edi
-       # p <<<= 7
-       rol     $7,%eax
-       # p ^= x4
-       xorl    116(%esp),%eax
-       #                               t <<<= 7
-       rol     $7,%edx
-       #                               t ^= x14
-       xorl    156(%esp),%edx
-       #               r <<<= 7
-       rol     $7,%esi
-       #               r ^= x9
-       xorl    136(%esp),%esi
-       #                                               v <<<= 7
-       rol     $7,%edi
-       #                                               v ^= x3
-       xorl    112(%esp),%edi
-       # x4 = p
-       movl    %eax,116(%esp)
-       #                               x14 = t
-       movl    %edx,156(%esp)
-       # p += x0
-       addl    100(%esp),%eax
-       #               x9 = r
-       movl    %esi,136(%esp)
-       #                               t += x10
-       addl    140(%esp),%edx
-       #                                               x3 = v
-       movl    %edi,112(%esp)
-       # p <<<= 9
-       rol     $9,%eax
-       # p ^= x8
-       xorl    132(%esp),%eax
-       #                               t <<<= 9
-       rol     $9,%edx
-       #                               t ^= x2
-       xorl    108(%esp),%edx
-       #               s += r
-       add     %esi,%ecx
-       #               s <<<= 9
-       rol     $9,%ecx
-       #               s ^= x13
-       xorl    152(%esp),%ecx
-       #                                               w += v
-       add     %edi,%ebx
-       #                                               w <<<= 9
-       rol     $9,%ebx
-       #                                               w ^= x7
-       xorl    128(%esp),%ebx
-       # x8 = p
-       movl    %eax,132(%esp)
-       #                               x2 = t
-       movl    %edx,108(%esp)
-       # p += x4
-       addl    116(%esp),%eax
-       #               x13 = s
-       movl    %ecx,152(%esp)
-       #                               t += x14
-       addl    156(%esp),%edx
-       #                                               x7 = w
-       movl    %ebx,128(%esp)
-       # p <<<= 13
-       rol     $13,%eax
-       # p ^= x12
-       xorl    148(%esp),%eax
-       #                               t <<<= 13
-       rol     $13,%edx
-       #                               t ^= x6
-       xorl    124(%esp),%edx
-       #               r += s
-       add     %ecx,%esi
-       #               r <<<= 13
-       rol     $13,%esi
-       #               r ^= x1
-       xorl    104(%esp),%esi
-       #                                               v += w
-       add     %ebx,%edi
-       #                                               v <<<= 13
-       rol     $13,%edi
-       #                                               v ^= x11
-       xorl    144(%esp),%edi
-       # x12 = p
-       movl    %eax,148(%esp)
-       #                               x6 = t
-       movl    %edx,124(%esp)
-       # p += x8
-       addl    132(%esp),%eax
-       #               x1 = r
-       movl    %esi,104(%esp)
-       #                               t += x2
-       addl    108(%esp),%edx
-       #                                               x11 = v
-       movl    %edi,144(%esp)
-       # p <<<= 18
-       rol     $18,%eax
-       # p ^= x0
-       xorl    100(%esp),%eax
-       #                               t <<<= 18
-       rol     $18,%edx
-       #                               t ^= x10
-       xorl    140(%esp),%edx
-       #               s += r
-       add     %esi,%ecx
-       #               s <<<= 18
-       rol     $18,%ecx
-       #               s ^= x5
-       xorl    120(%esp),%ecx
-       #                                               w += v
-       add     %edi,%ebx
-       #                                               w <<<= 18
-       rol     $18,%ebx
-       #                                               w ^= x15
-       xorl    160(%esp),%ebx
-       # x0 = p
-       movl    %eax,100(%esp)
-       #                               x10 = t
-       movl    %edx,140(%esp)
-       # p += x3
-       addl    112(%esp),%eax
-       # p <<<= 7
-       rol     $7,%eax
-       #               x5 = s
-       movl    %ecx,120(%esp)
-       #                               t += x9
-       addl    136(%esp),%edx
-       #                                               x15 = w
-       movl    %ebx,160(%esp)
-       #               r = x4
-       movl    116(%esp),%esi
-       #               r += s
-       add     %ecx,%esi
-       #                                               v = x14
-       movl    156(%esp),%edi
-       #                                               v += w
-       add     %ebx,%edi
-       # p ^= x1
-       xorl    104(%esp),%eax
-       #                               t <<<= 7
-       rol     $7,%edx
-       #                               t ^= x11
-       xorl    144(%esp),%edx
-       #               r <<<= 7
-       rol     $7,%esi
-       #               r ^= x6
-       xorl    124(%esp),%esi
-       #                                               v <<<= 7
-       rol     $7,%edi
-       #                                               v ^= x12
-       xorl    148(%esp),%edi
-       # x1 = p
-       movl    %eax,104(%esp)
-       #                               x11 = t
-       movl    %edx,144(%esp)
-       # p += x0
-       addl    100(%esp),%eax
-       #               x6 = r
-       movl    %esi,124(%esp)
-       #                               t += x10
-       addl    140(%esp),%edx
-       #                                               x12 = v
-       movl    %edi,148(%esp)
-       # p <<<= 9
-       rol     $9,%eax
-       # p ^= x2
-       xorl    108(%esp),%eax
-       #                               t <<<= 9
-       rol     $9,%edx
-       #                               t ^= x8
-       xorl    132(%esp),%edx
-       #               s += r
-       add     %esi,%ecx
-       #               s <<<= 9
-       rol     $9,%ecx
-       #               s ^= x7
-       xorl    128(%esp),%ecx
-       #                                               w += v
-       add     %edi,%ebx
-       #                                               w <<<= 9
-       rol     $9,%ebx
-       #                                               w ^= x13
-       xorl    152(%esp),%ebx
-       # x2 = p
-       movl    %eax,108(%esp)
-       #                               x8 = t
-       movl    %edx,132(%esp)
-       # p += x1
-       addl    104(%esp),%eax
-       #               x7 = s
-       movl    %ecx,128(%esp)
-       #                               t += x11
-       addl    144(%esp),%edx
-       #                                               x13 = w
-       movl    %ebx,152(%esp)
-       # p <<<= 13
-       rol     $13,%eax
-       # p ^= x3
-       xorl    112(%esp),%eax
-       #                               t <<<= 13
-       rol     $13,%edx
-       #                               t ^= x9
-       xorl    136(%esp),%edx
-       #               r += s
-       add     %ecx,%esi
-       #               r <<<= 13
-       rol     $13,%esi
-       #               r ^= x4
-       xorl    116(%esp),%esi
-       #                                               v += w
-       add     %ebx,%edi
-       #                                               v <<<= 13
-       rol     $13,%edi
-       #                                               v ^= x14
-       xorl    156(%esp),%edi
-       # x3 = p
-       movl    %eax,112(%esp)
-       #                               x9 = t
-       movl    %edx,136(%esp)
-       # p += x2
-       addl    108(%esp),%eax
-       #               x4 = r
-       movl    %esi,116(%esp)
-       #                               t += x8
-       addl    132(%esp),%edx
-       #                                               x14 = v
-       movl    %edi,156(%esp)
-       # p <<<= 18
-       rol     $18,%eax
-       # p ^= x0
-       xorl    100(%esp),%eax
-       #                               t <<<= 18
-       rol     $18,%edx
-       #                               t ^= x10
-       xorl    140(%esp),%edx
-       #               s += r
-       add     %esi,%ecx
-       #               s <<<= 18
-       rol     $18,%ecx
-       #               s ^= x5
-       xorl    120(%esp),%ecx
-       #                                               w += v
-       add     %edi,%ebx
-       #                                               w <<<= 18
-       rol     $18,%ebx
-       #                                               w ^= x15
-       xorl    160(%esp),%ebx
-       # i -= 4
-       sub     $4,%ebp
-       # goto mainloop if unsigned >
-       ja      ._mainloop
-       # x0 = p
-       movl    %eax,100(%esp)
-       # x5 = s
-       movl    %ecx,120(%esp)
-       # x10 = t
-       movl    %edx,140(%esp)
-       # x15 = w
-       movl    %ebx,160(%esp)
-       #   out = out_backup
-       movl    72(%esp),%edi
-       #   m = m_backup
-       movl    68(%esp),%esi
-       #   in0 = x0
-       movl    100(%esp),%eax
-       #   in1 = x1
-       movl    104(%esp),%ecx
-       #   in0 += j0
-       addl    164(%esp),%eax
-       #   in1 += j1
-       addl    168(%esp),%ecx
-       #   in0 ^= *(uint32 *) (m + 0)
-       xorl    0(%esi),%eax
-       #   in1 ^= *(uint32 *) (m + 4)
-       xorl    4(%esi),%ecx
-       #   *(uint32 *) (out + 0) = in0
-       movl    %eax,0(%edi)
-       #   *(uint32 *) (out + 4) = in1
-       movl    %ecx,4(%edi)
-       #   in2 = x2
-       movl    108(%esp),%eax
-       #   in3 = x3
-       movl    112(%esp),%ecx
-       #   in2 += j2
-       addl    172(%esp),%eax
-       #   in3 += j3
-       addl    176(%esp),%ecx
-       #   in2 ^= *(uint32 *) (m + 8)
-       xorl    8(%esi),%eax
-       #   in3 ^= *(uint32 *) (m + 12)
-       xorl    12(%esi),%ecx
-       #   *(uint32 *) (out + 8) = in2
-       movl    %eax,8(%edi)
-       #   *(uint32 *) (out + 12) = in3
-       movl    %ecx,12(%edi)
-       #   in4 = x4
-       movl    116(%esp),%eax
-       #   in5 = x5
-       movl    120(%esp),%ecx
-       #   in4 += j4
-       addl    180(%esp),%eax
-       #   in5 += j5
-       addl    184(%esp),%ecx
-       #   in4 ^= *(uint32 *) (m + 16)
-       xorl    16(%esi),%eax
-       #   in5 ^= *(uint32 *) (m + 20)
-       xorl    20(%esi),%ecx
-       #   *(uint32 *) (out + 16) = in4
-       movl    %eax,16(%edi)
-       #   *(uint32 *) (out + 20) = in5
-       movl    %ecx,20(%edi)
-       #   in6 = x6
-       movl    124(%esp),%eax
-       #   in7 = x7
-       movl    128(%esp),%ecx
-       #   in6 += j6
-       addl    188(%esp),%eax
-       #   in7 += j7
-       addl    192(%esp),%ecx
-       #   in6 ^= *(uint32 *) (m + 24)
-       xorl    24(%esi),%eax
-       #   in7 ^= *(uint32 *) (m + 28)
-       xorl    28(%esi),%ecx
-       #   *(uint32 *) (out + 24) = in6
-       movl    %eax,24(%edi)
-       #   *(uint32 *) (out + 28) = in7
-       movl    %ecx,28(%edi)
-       #   in8 = x8
-       movl    132(%esp),%eax
-       #   in9 = x9
-       movl    136(%esp),%ecx
-       #   in8 += j8
-       addl    196(%esp),%eax
-       #   in9 += j9
-       addl    200(%esp),%ecx
-       #   in8 ^= *(uint32 *) (m + 32)
-       xorl    32(%esi),%eax
-       #   in9 ^= *(uint32 *) (m + 36)
-       xorl    36(%esi),%ecx
-       #   *(uint32 *) (out + 32) = in8
-       movl    %eax,32(%edi)
-       #   *(uint32 *) (out + 36) = in9
-       movl    %ecx,36(%edi)
-       #   in10 = x10
-       movl    140(%esp),%eax
-       #   in11 = x11
-       movl    144(%esp),%ecx
-       #   in10 += j10
-       addl    204(%esp),%eax
-       #   in11 += j11
-       addl    208(%esp),%ecx
-       #   in10 ^= *(uint32 *) (m + 40)
-       xorl    40(%esi),%eax
-       #   in11 ^= *(uint32 *) (m + 44)
-       xorl    44(%esi),%ecx
-       #   *(uint32 *) (out + 40) = in10
-       movl    %eax,40(%edi)
-       #   *(uint32 *) (out + 44) = in11
-       movl    %ecx,44(%edi)
-       #   in12 = x12
-       movl    148(%esp),%eax
-       #   in13 = x13
-       movl    152(%esp),%ecx
-       #   in12 += j12
-       addl    212(%esp),%eax
-       #   in13 += j13
-       addl    216(%esp),%ecx
-       #   in12 ^= *(uint32 *) (m + 48)
-       xorl    48(%esi),%eax
-       #   in13 ^= *(uint32 *) (m + 52)
-       xorl    52(%esi),%ecx
-       #   *(uint32 *) (out + 48) = in12
-       movl    %eax,48(%edi)
-       #   *(uint32 *) (out + 52) = in13
-       movl    %ecx,52(%edi)
-       #   in14 = x14
-       movl    156(%esp),%eax
-       #   in15 = x15
-       movl    160(%esp),%ecx
-       #   in14 += j14
-       addl    220(%esp),%eax
-       #   in15 += j15
-       addl    224(%esp),%ecx
-       #   in14 ^= *(uint32 *) (m + 56)
-       xorl    56(%esi),%eax
-       #   in15 ^= *(uint32 *) (m + 60)
-       xorl    60(%esi),%ecx
-       #   *(uint32 *) (out + 56) = in14
-       movl    %eax,56(%edi)
-       #   *(uint32 *) (out + 60) = in15
-       movl    %ecx,60(%edi)
-       #   bytes = bytes_backup
-       movl    76(%esp),%ebx
-       #   in8 = j8
-       movl    196(%esp),%eax
-       #   in9 = j9
-       movl    200(%esp),%ecx
-       #   in8 += 1
-       add     $1,%eax
-       #   in9 += 0 + carry
-       adc     $0,%ecx
-       #   j8 = in8
-       movl    %eax,196(%esp)
-       #   j9 = in9
-       movl    %ecx,200(%esp)
-       #   bytes - 64
-       cmp     $64,%ebx
-       #   goto bytesatleast65 if unsigned>
-       ja      ._bytesatleast65
-       #     goto bytesatleast64 if unsigned>=
-       jae     ._bytesatleast64
-       #       m = out
-       mov     %edi,%esi
-       #       out = ctarget
-       movl    228(%esp),%edi
-       #       i = bytes
-       mov     %ebx,%ecx
-       #       while (i) { *out++ = *m++; --i }
-       rep     movsb
-._bytesatleast64:
-       #     x = x_backup
-       movl    64(%esp),%eax
-       #     in8 = j8
-       movl    196(%esp),%ecx
-       #     in9 = j9
-       movl    200(%esp),%edx
-       #     *(uint32 *) (x + 32) = in8
-       movl    %ecx,32(%eax)
-       #     *(uint32 *) (x + 36) = in9
-       movl    %edx,36(%eax)
-._done:
-       #     eax = eax_stack
-       movl    80(%esp),%eax
-       #     ebx = ebx_stack
-       movl    84(%esp),%ebx
-       #     esi = esi_stack
-       movl    88(%esp),%esi
-       #     edi = edi_stack
-       movl    92(%esp),%edi
-       #     ebp = ebp_stack
-       movl    96(%esp),%ebp
-       #     leave
-       add     %eax,%esp
-       ret
-._bytesatleast65:
-       #   bytes -= 64
-       sub     $64,%ebx
-       #   out += 64
-       add     $64,%edi
-       #   m += 64
-       add     $64,%esi
-       # goto bytesatleast1
-       jmp     ._bytesatleast1
-ENDPROC(salsa20_encrypt_bytes)
-
-# enter salsa20_keysetup
-ENTRY(salsa20_keysetup)
-       mov     %esp,%eax
-       and     $31,%eax
-       add     $256,%eax
-       sub     %eax,%esp
-       #   eax_stack = eax
-       movl    %eax,64(%esp)
-       #   ebx_stack = ebx
-       movl    %ebx,68(%esp)
-       #   esi_stack = esi
-       movl    %esi,72(%esp)
-       #   edi_stack = edi
-       movl    %edi,76(%esp)
-       #   ebp_stack = ebp
-       movl    %ebp,80(%esp)
-       #   k = arg2
-       movl    8(%esp,%eax),%ecx
-       #   kbits = arg3
-       movl    12(%esp,%eax),%edx
-       #   x = arg1
-       movl    4(%esp,%eax),%eax
-       #   in1 = *(uint32 *) (k + 0)
-       movl    0(%ecx),%ebx
-       #   in2 = *(uint32 *) (k + 4)
-       movl    4(%ecx),%esi
-       #   in3 = *(uint32 *) (k + 8)
-       movl    8(%ecx),%edi
-       #   in4 = *(uint32 *) (k + 12)
-       movl    12(%ecx),%ebp
-       #   *(uint32 *) (x + 4) = in1
-       movl    %ebx,4(%eax)
-       #   *(uint32 *) (x + 8) = in2
-       movl    %esi,8(%eax)
-       #   *(uint32 *) (x + 12) = in3
-       movl    %edi,12(%eax)
-       #   *(uint32 *) (x + 16) = in4
-       movl    %ebp,16(%eax)
-       #   kbits - 256
-       cmp     $256,%edx
-       #   goto kbits128 if unsigned<
-       jb      ._kbits128
-._kbits256:
-       #     in11 = *(uint32 *) (k + 16)
-       movl    16(%ecx),%edx
-       #     in12 = *(uint32 *) (k + 20)
-       movl    20(%ecx),%ebx
-       #     in13 = *(uint32 *) (k + 24)
-       movl    24(%ecx),%esi
-       #     in14 = *(uint32 *) (k + 28)
-       movl    28(%ecx),%ecx
-       #     *(uint32 *) (x + 44) = in11
-       movl    %edx,44(%eax)
-       #     *(uint32 *) (x + 48) = in12
-       movl    %ebx,48(%eax)
-       #     *(uint32 *) (x + 52) = in13
-       movl    %esi,52(%eax)
-       #     *(uint32 *) (x + 56) = in14
-       movl    %ecx,56(%eax)
-       #     in0 = 1634760805
-       mov     $1634760805,%ecx
-       #     in5 = 857760878
-       mov     $857760878,%edx
-       #     in10 = 2036477234
-       mov     $2036477234,%ebx
-       #     in15 = 1797285236
-       mov     $1797285236,%esi
-       #     *(uint32 *) (x + 0) = in0
-       movl    %ecx,0(%eax)
-       #     *(uint32 *) (x + 20) = in5
-       movl    %edx,20(%eax)
-       #     *(uint32 *) (x + 40) = in10
-       movl    %ebx,40(%eax)
-       #     *(uint32 *) (x + 60) = in15
-       movl    %esi,60(%eax)
-       #   goto keysetupdone
-       jmp     ._keysetupdone
-._kbits128:
-       #     in11 = *(uint32 *) (k + 0)
-       movl    0(%ecx),%edx
-       #     in12 = *(uint32 *) (k + 4)
-       movl    4(%ecx),%ebx
-       #     in13 = *(uint32 *) (k + 8)
-       movl    8(%ecx),%esi
-       #     in14 = *(uint32 *) (k + 12)
-       movl    12(%ecx),%ecx
-       #     *(uint32 *) (x + 44) = in11
-       movl    %edx,44(%eax)
-       #     *(uint32 *) (x + 48) = in12
-       movl    %ebx,48(%eax)
-       #     *(uint32 *) (x + 52) = in13
-       movl    %esi,52(%eax)
-       #     *(uint32 *) (x + 56) = in14
-       movl    %ecx,56(%eax)
-       #     in0 = 1634760805
-       mov     $1634760805,%ecx
-       #     in5 = 824206446
-       mov     $824206446,%edx
-       #     in10 = 2036477238
-       mov     $2036477238,%ebx
-       #     in15 = 1797285236
-       mov     $1797285236,%esi
-       #     *(uint32 *) (x + 0) = in0
-       movl    %ecx,0(%eax)
-       #     *(uint32 *) (x + 20) = in5
-       movl    %edx,20(%eax)
-       #     *(uint32 *) (x + 40) = in10
-       movl    %ebx,40(%eax)
-       #     *(uint32 *) (x + 60) = in15
-       movl    %esi,60(%eax)
-._keysetupdone:
-       #   eax = eax_stack
-       movl    64(%esp),%eax
-       #   ebx = ebx_stack
-       movl    68(%esp),%ebx
-       #   esi = esi_stack
-       movl    72(%esp),%esi
-       #   edi = edi_stack
-       movl    76(%esp),%edi
-       #   ebp = ebp_stack
-       movl    80(%esp),%ebp
-       # leave
-       add     %eax,%esp
-       ret
-ENDPROC(salsa20_keysetup)
-
-# enter salsa20_ivsetup
-ENTRY(salsa20_ivsetup)
-       mov     %esp,%eax
-       and     $31,%eax
-       add     $256,%eax
-       sub     %eax,%esp
-       #   eax_stack = eax
-       movl    %eax,64(%esp)
-       #   ebx_stack = ebx
-       movl    %ebx,68(%esp)
-       #   esi_stack = esi
-       movl    %esi,72(%esp)
-       #   edi_stack = edi
-       movl    %edi,76(%esp)
-       #   ebp_stack = ebp
-       movl    %ebp,80(%esp)
-       #   iv = arg2
-       movl    8(%esp,%eax),%ecx
-       #   x = arg1
-       movl    4(%esp,%eax),%eax
-       #   in6 = *(uint32 *) (iv + 0)
-       movl    0(%ecx),%edx
-       #   in7 = *(uint32 *) (iv + 4)
-       movl    4(%ecx),%ecx
-       #   in8 = 0
-       mov     $0,%ebx
-       #   in9 = 0
-       mov     $0,%esi
-       #   *(uint32 *) (x + 24) = in6
-       movl    %edx,24(%eax)
-       #   *(uint32 *) (x + 28) = in7
-       movl    %ecx,28(%eax)
-       #   *(uint32 *) (x + 32) = in8
-       movl    %ebx,32(%eax)
-       #   *(uint32 *) (x + 36) = in9
-       movl    %esi,36(%eax)
-       #   eax = eax_stack
-       movl    64(%esp),%eax
-       #   ebx = ebx_stack
-       movl    68(%esp),%ebx
-       #   esi = esi_stack
-       movl    72(%esp),%esi
-       #   edi = edi_stack
-       movl    76(%esp),%edi
-       #   ebp = ebp_stack
-       movl    80(%esp),%ebp
-       # leave
-       add     %eax,%esp
-       ret
-ENDPROC(salsa20_ivsetup)
diff --git a/arch/x86/crypto/salsa20-x86_64-asm_64.S b/arch/x86/crypto/salsa20-x86_64-asm_64.S
deleted file mode 100644 (file)
index 10db30d..0000000
+++ /dev/null
@@ -1,919 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#include <linux/linkage.h>
-
-# enter salsa20_encrypt_bytes
-ENTRY(salsa20_encrypt_bytes)
-       mov     %rsp,%r11
-       and     $31,%r11
-       add     $256,%r11
-       sub     %r11,%rsp
-       # x = arg1
-       mov     %rdi,%r8
-       # m = arg2
-       mov     %rsi,%rsi
-       # out = arg3
-       mov     %rdx,%rdi
-       # bytes = arg4
-       mov     %rcx,%rdx
-       #               unsigned>? bytes - 0
-       cmp     $0,%rdx
-       # comment:fp stack unchanged by jump
-       # goto done if !unsigned>
-       jbe     ._done
-       # comment:fp stack unchanged by fallthrough
-# start:
-._start:
-       # r11_stack = r11
-       movq    %r11,0(%rsp)
-       # r12_stack = r12
-       movq    %r12,8(%rsp)
-       # r13_stack = r13
-       movq    %r13,16(%rsp)
-       # r14_stack = r14
-       movq    %r14,24(%rsp)
-       # r15_stack = r15
-       movq    %r15,32(%rsp)
-       # rbx_stack = rbx
-       movq    %rbx,40(%rsp)
-       # rbp_stack = rbp
-       movq    %rbp,48(%rsp)
-       # in0 = *(uint64 *) (x + 0)
-       movq    0(%r8),%rcx
-       # in2 = *(uint64 *) (x + 8)
-       movq    8(%r8),%r9
-       # in4 = *(uint64 *) (x + 16)
-       movq    16(%r8),%rax
-       # in6 = *(uint64 *) (x + 24)
-       movq    24(%r8),%r10
-       # in8 = *(uint64 *) (x + 32)
-       movq    32(%r8),%r11
-       # in10 = *(uint64 *) (x + 40)
-       movq    40(%r8),%r12
-       # in12 = *(uint64 *) (x + 48)
-       movq    48(%r8),%r13
-       # in14 = *(uint64 *) (x + 56)
-       movq    56(%r8),%r14
-       # j0 = in0
-       movq    %rcx,56(%rsp)
-       # j2 = in2
-       movq    %r9,64(%rsp)
-       # j4 = in4
-       movq    %rax,72(%rsp)
-       # j6 = in6
-       movq    %r10,80(%rsp)
-       # j8 = in8
-       movq    %r11,88(%rsp)
-       # j10 = in10
-       movq    %r12,96(%rsp)
-       # j12 = in12
-       movq    %r13,104(%rsp)
-       # j14 = in14
-       movq    %r14,112(%rsp)
-       # x_backup = x
-       movq    %r8,120(%rsp)
-# bytesatleast1:
-._bytesatleast1:
-       #                   unsigned<? bytes - 64
-       cmp     $64,%rdx
-       # comment:fp stack unchanged by jump
-       #   goto nocopy if !unsigned<
-       jae     ._nocopy
-       #     ctarget = out
-       movq    %rdi,128(%rsp)
-       #     out = &tmp
-       leaq    192(%rsp),%rdi
-       #     i = bytes
-       mov     %rdx,%rcx
-       #     while (i) { *out++ = *m++; --i }
-       rep     movsb
-       #     out = &tmp
-       leaq    192(%rsp),%rdi
-       #     m = &tmp
-       leaq    192(%rsp),%rsi
-       # comment:fp stack unchanged by fallthrough
-#   nocopy:
-._nocopy:
-       #   out_backup = out
-       movq    %rdi,136(%rsp)
-       #   m_backup = m
-       movq    %rsi,144(%rsp)
-       #   bytes_backup = bytes
-       movq    %rdx,152(%rsp)
-       #   x1 = j0
-       movq    56(%rsp),%rdi
-       #   x0 = x1
-       mov     %rdi,%rdx
-       #   (uint64) x1 >>= 32
-       shr     $32,%rdi
-       #               x3 = j2
-       movq    64(%rsp),%rsi
-       #               x2 = x3
-       mov     %rsi,%rcx
-       #               (uint64) x3 >>= 32
-       shr     $32,%rsi
-       #   x5 = j4
-       movq    72(%rsp),%r8
-       #   x4 = x5
-       mov     %r8,%r9
-       #   (uint64) x5 >>= 32
-       shr     $32,%r8
-       #   x5_stack = x5
-       movq    %r8,160(%rsp)
-       #               x7 = j6
-       movq    80(%rsp),%r8
-       #               x6 = x7
-       mov     %r8,%rax
-       #               (uint64) x7 >>= 32
-       shr     $32,%r8
-       #   x9 = j8
-       movq    88(%rsp),%r10
-       #   x8 = x9
-       mov     %r10,%r11
-       #   (uint64) x9 >>= 32
-       shr     $32,%r10
-       #               x11 = j10
-       movq    96(%rsp),%r12
-       #               x10 = x11
-       mov     %r12,%r13
-       #               x10_stack = x10
-       movq    %r13,168(%rsp)
-       #               (uint64) x11 >>= 32
-       shr     $32,%r12
-       #   x13 = j12
-       movq    104(%rsp),%r13
-       #   x12 = x13
-       mov     %r13,%r14
-       #   (uint64) x13 >>= 32
-       shr     $32,%r13
-       #               x15 = j14
-       movq    112(%rsp),%r15
-       #               x14 = x15
-       mov     %r15,%rbx
-       #               (uint64) x15 >>= 32
-       shr     $32,%r15
-       #               x15_stack = x15
-       movq    %r15,176(%rsp)
-       #   i = 20
-       mov     $20,%r15
-#   mainloop:
-._mainloop:
-       #   i_backup = i
-       movq    %r15,184(%rsp)
-       #               x5 = x5_stack
-       movq    160(%rsp),%r15
-       # a = x12 + x0
-       lea     (%r14,%rdx),%rbp
-       # (uint32) a <<<= 7
-       rol     $7,%ebp
-       # x4 ^= a
-       xor     %rbp,%r9
-       #               b = x1 + x5
-       lea     (%rdi,%r15),%rbp
-       #               (uint32) b <<<= 7
-       rol     $7,%ebp
-       #               x9 ^= b
-       xor     %rbp,%r10
-       # a = x0 + x4
-       lea     (%rdx,%r9),%rbp
-       # (uint32) a <<<= 9
-       rol     $9,%ebp
-       # x8 ^= a
-       xor     %rbp,%r11
-       #               b = x5 + x9
-       lea     (%r15,%r10),%rbp
-       #               (uint32) b <<<= 9
-       rol     $9,%ebp
-       #               x13 ^= b
-       xor     %rbp,%r13
-       # a = x4 + x8
-       lea     (%r9,%r11),%rbp
-       # (uint32) a <<<= 13
-       rol     $13,%ebp
-       # x12 ^= a
-       xor     %rbp,%r14
-       #               b = x9 + x13
-       lea     (%r10,%r13),%rbp
-       #               (uint32) b <<<= 13
-       rol     $13,%ebp
-       #               x1 ^= b
-       xor     %rbp,%rdi
-       # a = x8 + x12
-       lea     (%r11,%r14),%rbp
-       # (uint32) a <<<= 18
-       rol     $18,%ebp
-       # x0 ^= a
-       xor     %rbp,%rdx
-       #               b = x13 + x1
-       lea     (%r13,%rdi),%rbp
-       #               (uint32) b <<<= 18
-       rol     $18,%ebp
-       #               x5 ^= b
-       xor     %rbp,%r15
-       #                               x10 = x10_stack
-       movq    168(%rsp),%rbp
-       #               x5_stack = x5
-       movq    %r15,160(%rsp)
-       #                               c = x6 + x10
-       lea     (%rax,%rbp),%r15
-       #                               (uint32) c <<<= 7
-       rol     $7,%r15d
-       #                               x14 ^= c
-       xor     %r15,%rbx
-       #                               c = x10 + x14
-       lea     (%rbp,%rbx),%r15
-       #                               (uint32) c <<<= 9
-       rol     $9,%r15d
-       #                               x2 ^= c
-       xor     %r15,%rcx
-       #                               c = x14 + x2
-       lea     (%rbx,%rcx),%r15
-       #                               (uint32) c <<<= 13
-       rol     $13,%r15d
-       #                               x6 ^= c
-       xor     %r15,%rax
-       #                               c = x2 + x6
-       lea     (%rcx,%rax),%r15
-       #                               (uint32) c <<<= 18
-       rol     $18,%r15d
-       #                               x10 ^= c
-       xor     %r15,%rbp
-       #                                               x15 = x15_stack
-       movq    176(%rsp),%r15
-       #                               x10_stack = x10
-       movq    %rbp,168(%rsp)
-       #                                               d = x11 + x15
-       lea     (%r12,%r15),%rbp
-       #                                               (uint32) d <<<= 7
-       rol     $7,%ebp
-       #                                               x3 ^= d
-       xor     %rbp,%rsi
-       #                                               d = x15 + x3
-       lea     (%r15,%rsi),%rbp
-       #                                               (uint32) d <<<= 9
-       rol     $9,%ebp
-       #                                               x7 ^= d
-       xor     %rbp,%r8
-       #                                               d = x3 + x7
-       lea     (%rsi,%r8),%rbp
-       #                                               (uint32) d <<<= 13
-       rol     $13,%ebp
-       #                                               x11 ^= d
-       xor     %rbp,%r12
-       #                                               d = x7 + x11
-       lea     (%r8,%r12),%rbp
-       #                                               (uint32) d <<<= 18
-       rol     $18,%ebp
-       #                                               x15 ^= d
-       xor     %rbp,%r15
-       #                                               x15_stack = x15
-       movq    %r15,176(%rsp)
-       #               x5 = x5_stack
-       movq    160(%rsp),%r15
-       # a = x3 + x0
-       lea     (%rsi,%rdx),%rbp
-       # (uint32) a <<<= 7
-       rol     $7,%ebp
-       # x1 ^= a
-       xor     %rbp,%rdi
-       #               b = x4 + x5
-       lea     (%r9,%r15),%rbp
-       #               (uint32) b <<<= 7
-       rol     $7,%ebp
-       #               x6 ^= b
-       xor     %rbp,%rax
-       # a = x0 + x1
-       lea     (%rdx,%rdi),%rbp
-       # (uint32) a <<<= 9
-       rol     $9,%ebp
-       # x2 ^= a
-       xor     %rbp,%rcx
-       #               b = x5 + x6
-       lea     (%r15,%rax),%rbp
-       #               (uint32) b <<<= 9
-       rol     $9,%ebp
-       #               x7 ^= b
-       xor     %rbp,%r8
-       # a = x1 + x2
-       lea     (%rdi,%rcx),%rbp
-       # (uint32) a <<<= 13
-       rol     $13,%ebp
-       # x3 ^= a
-       xor     %rbp,%rsi
-       #               b = x6 + x7
-       lea     (%rax,%r8),%rbp
-       #               (uint32) b <<<= 13
-       rol     $13,%ebp
-       #               x4 ^= b
-       xor     %rbp,%r9
-       # a = x2 + x3
-       lea     (%rcx,%rsi),%rbp
-       # (uint32) a <<<= 18
-       rol     $18,%ebp
-       # x0 ^= a
-       xor     %rbp,%rdx
-       #               b = x7 + x4
-       lea     (%r8,%r9),%rbp
-       #               (uint32) b <<<= 18
-       rol     $18,%ebp
-       #               x5 ^= b
-       xor     %rbp,%r15
-       #                               x10 = x10_stack
-       movq    168(%rsp),%rbp
-       #               x5_stack = x5
-       movq    %r15,160(%rsp)
-       #                               c = x9 + x10
-       lea     (%r10,%rbp),%r15
-       #                               (uint32) c <<<= 7
-       rol     $7,%r15d
-       #                               x11 ^= c
-       xor     %r15,%r12
-       #                               c = x10 + x11
-       lea     (%rbp,%r12),%r15
-       #                               (uint32) c <<<= 9
-       rol     $9,%r15d
-       #                               x8 ^= c
-       xor     %r15,%r11
-       #                               c = x11 + x8
-       lea     (%r12,%r11),%r15
-       #                               (uint32) c <<<= 13
-       rol     $13,%r15d
-       #                               x9 ^= c
-       xor     %r15,%r10
-       #                               c = x8 + x9
-       lea     (%r11,%r10),%r15
-       #                               (uint32) c <<<= 18
-       rol     $18,%r15d
-       #                               x10 ^= c
-       xor     %r15,%rbp
-       #                                               x15 = x15_stack
-       movq    176(%rsp),%r15
-       #                               x10_stack = x10
-       movq    %rbp,168(%rsp)
-       #                                               d = x14 + x15
-       lea     (%rbx,%r15),%rbp
-       #                                               (uint32) d <<<= 7
-       rol     $7,%ebp
-       #                                               x12 ^= d
-       xor     %rbp,%r14
-       #                                               d = x15 + x12
-       lea     (%r15,%r14),%rbp
-       #                                               (uint32) d <<<= 9
-       rol     $9,%ebp
-       #                                               x13 ^= d
-       xor     %rbp,%r13
-       #                                               d = x12 + x13
-       lea     (%r14,%r13),%rbp
-       #                                               (uint32) d <<<= 13
-       rol     $13,%ebp
-       #                                               x14 ^= d
-       xor     %rbp,%rbx
-       #                                               d = x13 + x14
-       lea     (%r13,%rbx),%rbp
-       #                                               (uint32) d <<<= 18
-       rol     $18,%ebp
-       #                                               x15 ^= d
-       xor     %rbp,%r15
-       #                                               x15_stack = x15
-       movq    %r15,176(%rsp)
-       #               x5 = x5_stack
-       movq    160(%rsp),%r15
-       # a = x12 + x0
-       lea     (%r14,%rdx),%rbp
-       # (uint32) a <<<= 7
-       rol     $7,%ebp
-       # x4 ^= a
-       xor     %rbp,%r9
-       #               b = x1 + x5
-       lea     (%rdi,%r15),%rbp
-       #               (uint32) b <<<= 7
-       rol     $7,%ebp
-       #               x9 ^= b
-       xor     %rbp,%r10
-       # a = x0 + x4
-       lea     (%rdx,%r9),%rbp
-       # (uint32) a <<<= 9
-       rol     $9,%ebp
-       # x8 ^= a
-       xor     %rbp,%r11
-       #               b = x5 + x9
-       lea     (%r15,%r10),%rbp
-       #               (uint32) b <<<= 9
-       rol     $9,%ebp
-       #               x13 ^= b
-       xor     %rbp,%r13
-       # a = x4 + x8
-       lea     (%r9,%r11),%rbp
-       # (uint32) a <<<= 13
-       rol     $13,%ebp
-       # x12 ^= a
-       xor     %rbp,%r14
-       #               b = x9 + x13
-       lea     (%r10,%r13),%rbp
-       #               (uint32) b <<<= 13
-       rol     $13,%ebp
-       #               x1 ^= b
-       xor     %rbp,%rdi
-       # a = x8 + x12
-       lea     (%r11,%r14),%rbp
-       # (uint32) a <<<= 18
-       rol     $18,%ebp
-       # x0 ^= a
-       xor     %rbp,%rdx
-       #               b = x13 + x1
-       lea     (%r13,%rdi),%rbp
-       #               (uint32) b <<<= 18
-       rol     $18,%ebp
-       #               x5 ^= b
-       xor     %rbp,%r15
-       #                               x10 = x10_stack
-       movq    168(%rsp),%rbp
-       #               x5_stack = x5
-       movq    %r15,160(%rsp)
-       #                               c = x6 + x10
-       lea     (%rax,%rbp),%r15
-       #                               (uint32) c <<<= 7
-       rol     $7,%r15d
-       #                               x14 ^= c
-       xor     %r15,%rbx
-       #                               c = x10 + x14
-       lea     (%rbp,%rbx),%r15
-       #                               (uint32) c <<<= 9
-       rol     $9,%r15d
-       #                               x2 ^= c
-       xor     %r15,%rcx
-       #                               c = x14 + x2
-       lea     (%rbx,%rcx),%r15
-       #                               (uint32) c <<<= 13
-       rol     $13,%r15d
-       #                               x6 ^= c
-       xor     %r15,%rax
-       #                               c = x2 + x6
-       lea     (%rcx,%rax),%r15
-       #                               (uint32) c <<<= 18
-       rol     $18,%r15d
-       #                               x10 ^= c
-       xor     %r15,%rbp
-       #                                               x15 = x15_stack
-       movq    176(%rsp),%r15
-       #                               x10_stack = x10
-       movq    %rbp,168(%rsp)
-       #                                               d = x11 + x15
-       lea     (%r12,%r15),%rbp
-       #                                               (uint32) d <<<= 7
-       rol     $7,%ebp
-       #                                               x3 ^= d
-       xor     %rbp,%rsi
-       #                                               d = x15 + x3
-       lea     (%r15,%rsi),%rbp
-       #                                               (uint32) d <<<= 9
-       rol     $9,%ebp
-       #                                               x7 ^= d
-       xor     %rbp,%r8
-       #                                               d = x3 + x7
-       lea     (%rsi,%r8),%rbp
-       #                                               (uint32) d <<<= 13
-       rol     $13,%ebp
-       #                                               x11 ^= d
-       xor     %rbp,%r12
-       #                                               d = x7 + x11
-       lea     (%r8,%r12),%rbp
-       #                                               (uint32) d <<<= 18
-       rol     $18,%ebp
-       #                                               x15 ^= d
-       xor     %rbp,%r15
-       #                                               x15_stack = x15
-       movq    %r15,176(%rsp)
-       #               x5 = x5_stack
-       movq    160(%rsp),%r15
-       # a = x3 + x0
-       lea     (%rsi,%rdx),%rbp
-       # (uint32) a <<<= 7
-       rol     $7,%ebp
-       # x1 ^= a
-       xor     %rbp,%rdi
-       #               b = x4 + x5
-       lea     (%r9,%r15),%rbp
-       #               (uint32) b <<<= 7
-       rol     $7,%ebp
-       #               x6 ^= b
-       xor     %rbp,%rax
-       # a = x0 + x1
-       lea     (%rdx,%rdi),%rbp
-       # (uint32) a <<<= 9
-       rol     $9,%ebp
-       # x2 ^= a
-       xor     %rbp,%rcx
-       #               b = x5 + x6
-       lea     (%r15,%rax),%rbp
-       #               (uint32) b <<<= 9
-       rol     $9,%ebp
-       #               x7 ^= b
-       xor     %rbp,%r8
-       # a = x1 + x2
-       lea     (%rdi,%rcx),%rbp
-       # (uint32) a <<<= 13
-       rol     $13,%ebp
-       # x3 ^= a
-       xor     %rbp,%rsi
-       #               b = x6 + x7
-       lea     (%rax,%r8),%rbp
-       #               (uint32) b <<<= 13
-       rol     $13,%ebp
-       #               x4 ^= b
-       xor     %rbp,%r9
-       # a = x2 + x3
-       lea     (%rcx,%rsi),%rbp
-       # (uint32) a <<<= 18
-       rol     $18,%ebp
-       # x0 ^= a
-       xor     %rbp,%rdx
-       #               b = x7 + x4
-       lea     (%r8,%r9),%rbp
-       #               (uint32) b <<<= 18
-       rol     $18,%ebp
-       #               x5 ^= b
-       xor     %rbp,%r15
-       #                               x10 = x10_stack
-       movq    168(%rsp),%rbp
-       #               x5_stack = x5
-       movq    %r15,160(%rsp)
-       #                               c = x9 + x10
-       lea     (%r10,%rbp),%r15
-       #                               (uint32) c <<<= 7
-       rol     $7,%r15d
-       #                               x11 ^= c
-       xor     %r15,%r12
-       #                               c = x10 + x11
-       lea     (%rbp,%r12),%r15
-       #                               (uint32) c <<<= 9
-       rol     $9,%r15d
-       #                               x8 ^= c
-       xor     %r15,%r11
-       #                               c = x11 + x8
-       lea     (%r12,%r11),%r15
-       #                               (uint32) c <<<= 13
-       rol     $13,%r15d
-       #                               x9 ^= c
-       xor     %r15,%r10
-       #                               c = x8 + x9
-       lea     (%r11,%r10),%r15
-       #                               (uint32) c <<<= 18
-       rol     $18,%r15d
-       #                               x10 ^= c
-       xor     %r15,%rbp
-       #                                               x15 = x15_stack
-       movq    176(%rsp),%r15
-       #                               x10_stack = x10
-       movq    %rbp,168(%rsp)
-       #                                               d = x14 + x15
-       lea     (%rbx,%r15),%rbp
-       #                                               (uint32) d <<<= 7
-       rol     $7,%ebp
-       #                                               x12 ^= d
-       xor     %rbp,%r14
-       #                                               d = x15 + x12
-       lea     (%r15,%r14),%rbp
-       #                                               (uint32) d <<<= 9
-       rol     $9,%ebp
-       #                                               x13 ^= d
-       xor     %rbp,%r13
-       #                                               d = x12 + x13
-       lea     (%r14,%r13),%rbp
-       #                                               (uint32) d <<<= 13
-       rol     $13,%ebp
-       #                                               x14 ^= d
-       xor     %rbp,%rbx
-       #                                               d = x13 + x14
-       lea     (%r13,%rbx),%rbp
-       #                                               (uint32) d <<<= 18
-       rol     $18,%ebp
-       #                                               x15 ^= d
-       xor     %rbp,%r15
-       #                                               x15_stack = x15
-       movq    %r15,176(%rsp)
-       #   i = i_backup
-       movq    184(%rsp),%r15
-       #                  unsigned>? i -= 4
-       sub     $4,%r15
-       # comment:fp stack unchanged by jump
-       # goto mainloop if unsigned>
-       ja      ._mainloop
-       #   (uint32) x2 += j2
-       addl    64(%rsp),%ecx
-       #   x3 <<= 32
-       shl     $32,%rsi
-       #   x3 += j2
-       addq    64(%rsp),%rsi
-       #   (uint64) x3 >>= 32
-       shr     $32,%rsi
-       #   x3 <<= 32
-       shl     $32,%rsi
-       #   x2 += x3
-       add     %rsi,%rcx
-       #   (uint32) x6 += j6
-       addl    80(%rsp),%eax
-       #   x7 <<= 32
-       shl     $32,%r8
-       #   x7 += j6
-       addq    80(%rsp),%r8
-       #   (uint64) x7 >>= 32
-       shr     $32,%r8
-       #   x7 <<= 32
-       shl     $32,%r8
-       #   x6 += x7
-       add     %r8,%rax
-       #   (uint32) x8 += j8
-       addl    88(%rsp),%r11d
-       #   x9 <<= 32
-       shl     $32,%r10
-       #   x9 += j8
-       addq    88(%rsp),%r10
-       #   (uint64) x9 >>= 32
-       shr     $32,%r10
-       #   x9 <<= 32
-       shl     $32,%r10
-       #   x8 += x9
-       add     %r10,%r11
-       #   (uint32) x12 += j12
-       addl    104(%rsp),%r14d
-       #   x13 <<= 32
-       shl     $32,%r13
-       #   x13 += j12
-       addq    104(%rsp),%r13
-       #   (uint64) x13 >>= 32
-       shr     $32,%r13
-       #   x13 <<= 32
-       shl     $32,%r13
-       #   x12 += x13
-       add     %r13,%r14
-       #   (uint32) x0 += j0
-       addl    56(%rsp),%edx
-       #   x1 <<= 32
-       shl     $32,%rdi
-       #   x1 += j0
-       addq    56(%rsp),%rdi
-       #   (uint64) x1 >>= 32
-       shr     $32,%rdi
-       #   x1 <<= 32
-       shl     $32,%rdi
-       #   x0 += x1
-       add     %rdi,%rdx
-       #   x5 = x5_stack
-       movq    160(%rsp),%rdi
-       #   (uint32) x4 += j4
-       addl    72(%rsp),%r9d
-       #   x5 <<= 32
-       shl     $32,%rdi
-       #   x5 += j4
-       addq    72(%rsp),%rdi
-       #   (uint64) x5 >>= 32
-       shr     $32,%rdi
-       #   x5 <<= 32
-       shl     $32,%rdi
-       #   x4 += x5
-       add     %rdi,%r9
-       #   x10 = x10_stack
-       movq    168(%rsp),%r8
-       #   (uint32) x10 += j10
-       addl    96(%rsp),%r8d
-       #   x11 <<= 32
-       shl     $32,%r12
-       #   x11 += j10
-       addq    96(%rsp),%r12
-       #   (uint64) x11 >>= 32
-       shr     $32,%r12
-       #   x11 <<= 32
-       shl     $32,%r12
-       #   x10 += x11
-       add     %r12,%r8
-       #   x15 = x15_stack
-       movq    176(%rsp),%rdi
-       #   (uint32) x14 += j14
-       addl    112(%rsp),%ebx
-       #   x15 <<= 32
-       shl     $32,%rdi
-       #   x15 += j14
-       addq    112(%rsp),%rdi
-       #   (uint64) x15 >>= 32
-       shr     $32,%rdi
-       #   x15 <<= 32
-       shl     $32,%rdi
-       #   x14 += x15
-       add     %rdi,%rbx
-       #   out = out_backup
-       movq    136(%rsp),%rdi
-       #   m = m_backup
-       movq    144(%rsp),%rsi
-       #   x0 ^= *(uint64 *) (m + 0)
-       xorq    0(%rsi),%rdx
-       #   *(uint64 *) (out + 0) = x0
-       movq    %rdx,0(%rdi)
-       #   x2 ^= *(uint64 *) (m + 8)
-       xorq    8(%rsi),%rcx
-       #   *(uint64 *) (out + 8) = x2
-       movq    %rcx,8(%rdi)
-       #   x4 ^= *(uint64 *) (m + 16)
-       xorq    16(%rsi),%r9
-       #   *(uint64 *) (out + 16) = x4
-       movq    %r9,16(%rdi)
-       #   x6 ^= *(uint64 *) (m + 24)
-       xorq    24(%rsi),%rax
-       #   *(uint64 *) (out + 24) = x6
-       movq    %rax,24(%rdi)
-       #   x8 ^= *(uint64 *) (m + 32)
-       xorq    32(%rsi),%r11
-       #   *(uint64 *) (out + 32) = x8
-       movq    %r11,32(%rdi)
-       #   x10 ^= *(uint64 *) (m + 40)
-       xorq    40(%rsi),%r8
-       #   *(uint64 *) (out + 40) = x10
-       movq    %r8,40(%rdi)
-       #   x12 ^= *(uint64 *) (m + 48)
-       xorq    48(%rsi),%r14
-       #   *(uint64 *) (out + 48) = x12
-       movq    %r14,48(%rdi)
-       #   x14 ^= *(uint64 *) (m + 56)
-       xorq    56(%rsi),%rbx
-       #   *(uint64 *) (out + 56) = x14
-       movq    %rbx,56(%rdi)
-       #   bytes = bytes_backup
-       movq    152(%rsp),%rdx
-       #   in8 = j8
-       movq    88(%rsp),%rcx
-       #   in8 += 1
-       add     $1,%rcx
-       #   j8 = in8
-       movq    %rcx,88(%rsp)
-       #                          unsigned>? unsigned<? bytes - 64
-       cmp     $64,%rdx
-       # comment:fp stack unchanged by jump
-       #   goto bytesatleast65 if unsigned>
-       ja      ._bytesatleast65
-       # comment:fp stack unchanged by jump
-       #     goto bytesatleast64 if !unsigned<
-       jae     ._bytesatleast64
-       #       m = out
-       mov     %rdi,%rsi
-       #       out = ctarget
-       movq    128(%rsp),%rdi
-       #       i = bytes
-       mov     %rdx,%rcx
-       #       while (i) { *out++ = *m++; --i }
-       rep     movsb
-       # comment:fp stack unchanged by fallthrough
-#     bytesatleast64:
-._bytesatleast64:
-       #     x = x_backup
-       movq    120(%rsp),%rdi
-       #     in8 = j8
-       movq    88(%rsp),%rsi
-       #     *(uint64 *) (x + 32) = in8
-       movq    %rsi,32(%rdi)
-       #     r11 = r11_stack
-       movq    0(%rsp),%r11
-       #     r12 = r12_stack
-       movq    8(%rsp),%r12
-       #     r13 = r13_stack
-       movq    16(%rsp),%r13
-       #     r14 = r14_stack
-       movq    24(%rsp),%r14
-       #     r15 = r15_stack
-       movq    32(%rsp),%r15
-       #     rbx = rbx_stack
-       movq    40(%rsp),%rbx
-       #     rbp = rbp_stack
-       movq    48(%rsp),%rbp
-       # comment:fp stack unchanged by fallthrough
-#     done:
-._done:
-       #     leave
-       add     %r11,%rsp
-       mov     %rdi,%rax
-       mov     %rsi,%rdx
-       ret
-#   bytesatleast65:
-._bytesatleast65:
-       #   bytes -= 64
-       sub     $64,%rdx
-       #   out += 64
-       add     $64,%rdi
-       #   m += 64
-       add     $64,%rsi
-       # comment:fp stack unchanged by jump
-       # goto bytesatleast1
-       jmp     ._bytesatleast1
-ENDPROC(salsa20_encrypt_bytes)
-
-# enter salsa20_keysetup
-ENTRY(salsa20_keysetup)
-       mov     %rsp,%r11
-       and     $31,%r11
-       add     $256,%r11
-       sub     %r11,%rsp
-       #   k = arg2
-       mov     %rsi,%rsi
-       #   kbits = arg3
-       mov     %rdx,%rdx
-       #   x = arg1
-       mov     %rdi,%rdi
-       #   in0 = *(uint64 *) (k + 0)
-       movq    0(%rsi),%r8
-       #   in2 = *(uint64 *) (k + 8)
-       movq    8(%rsi),%r9
-       #   *(uint64 *) (x + 4) = in0
-       movq    %r8,4(%rdi)
-       #   *(uint64 *) (x + 12) = in2
-       movq    %r9,12(%rdi)
-       #                    unsigned<? kbits - 256
-       cmp     $256,%rdx
-       # comment:fp stack unchanged by jump
-       #   goto kbits128 if unsigned<
-       jb      ._kbits128
-#   kbits256:
-._kbits256:
-       #     in10 = *(uint64 *) (k + 16)
-       movq    16(%rsi),%rdx
-       #     in12 = *(uint64 *) (k + 24)
-       movq    24(%rsi),%rsi
-       #     *(uint64 *) (x + 44) = in10
-       movq    %rdx,44(%rdi)
-       #     *(uint64 *) (x + 52) = in12
-       movq    %rsi,52(%rdi)
-       #     in0 = 1634760805
-       mov     $1634760805,%rsi
-       #     in4 = 857760878
-       mov     $857760878,%rdx
-       #     in10 = 2036477234
-       mov     $2036477234,%rcx
-       #     in14 = 1797285236
-       mov     $1797285236,%r8
-       #     *(uint32 *) (x + 0) = in0
-       movl    %esi,0(%rdi)
-       #     *(uint32 *) (x + 20) = in4
-       movl    %edx,20(%rdi)
-       #     *(uint32 *) (x + 40) = in10
-       movl    %ecx,40(%rdi)
-       #     *(uint32 *) (x + 60) = in14
-       movl    %r8d,60(%rdi)
-       # comment:fp stack unchanged by jump
-       #   goto keysetupdone
-       jmp     ._keysetupdone
-#   kbits128:
-._kbits128:
-       #     in10 = *(uint64 *) (k + 0)
-       movq    0(%rsi),%rdx
-       #     in12 = *(uint64 *) (k + 8)
-       movq    8(%rsi),%rsi
-       #     *(uint64 *) (x + 44) = in10
-       movq    %rdx,44(%rdi)
-       #     *(uint64 *) (x + 52) = in12
-       movq    %rsi,52(%rdi)
-       #     in0 = 1634760805
-       mov     $1634760805,%rsi
-       #     in4 = 824206446
-       mov     $824206446,%rdx
-       #     in10 = 2036477238
-       mov     $2036477238,%rcx
-       #     in14 = 1797285236
-       mov     $1797285236,%r8
-       #     *(uint32 *) (x + 0) = in0
-       movl    %esi,0(%rdi)
-       #     *(uint32 *) (x + 20) = in4
-       movl    %edx,20(%rdi)
-       #     *(uint32 *) (x + 40) = in10
-       movl    %ecx,40(%rdi)
-       #     *(uint32 *) (x + 60) = in14
-       movl    %r8d,60(%rdi)
-#   keysetupdone:
-._keysetupdone:
-       # leave
-       add     %r11,%rsp
-       mov     %rdi,%rax
-       mov     %rsi,%rdx
-       ret
-ENDPROC(salsa20_keysetup)
-
-# enter salsa20_ivsetup
-ENTRY(salsa20_ivsetup)
-       mov     %rsp,%r11
-       and     $31,%r11
-       add     $256,%r11
-       sub     %r11,%rsp
-       #   iv = arg2
-       mov     %rsi,%rsi
-       #   x = arg1
-       mov     %rdi,%rdi
-       #   in6 = *(uint64 *) (iv + 0)
-       movq    0(%rsi),%rsi
-       #   in8 = 0
-       mov     $0,%r8
-       #   *(uint64 *) (x + 24) = in6
-       movq    %rsi,24(%rdi)
-       #   *(uint64 *) (x + 32) = in8
-       movq    %r8,32(%rdi)
-       # leave
-       add     %r11,%rsp
-       mov     %rdi,%rax
-       mov     %rsi,%rdx
-       ret
-ENDPROC(salsa20_ivsetup)
diff --git a/arch/x86/crypto/salsa20_glue.c b/arch/x86/crypto/salsa20_glue.c
deleted file mode 100644 (file)
index cb91a64..0000000
+++ /dev/null
@@ -1,116 +0,0 @@
-/*
- * Glue code for optimized assembly version of  Salsa20.
- *
- * Copyright (c) 2007 Tan Swee Heng <thesweeheng@gmail.com>
- *
- * The assembly codes are public domain assembly codes written by Daniel. J.
- * Bernstein <djb@cr.yp.to>. The codes are modified to include indentation
- * and to remove extraneous comments and functions that are not needed.
- * - i586 version, renamed as salsa20-i586-asm_32.S
- *   available from <http://cr.yp.to/snuffle/salsa20/x86-pm/salsa20.s>
- * - x86-64 version, renamed as salsa20-x86_64-asm_64.S
- *   available from <http://cr.yp.to/snuffle/salsa20/amd64-3/salsa20.s>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
- *
- */
-
-#include <crypto/algapi.h>
-#include <linux/module.h>
-#include <linux/crypto.h>
-
-#define SALSA20_IV_SIZE        8U
-#define SALSA20_MIN_KEY_SIZE  16U
-#define SALSA20_MAX_KEY_SIZE  32U
-
-struct salsa20_ctx
-{
-       u32 input[16];
-};
-
-asmlinkage void salsa20_keysetup(struct salsa20_ctx *ctx, const u8 *k,
-                                u32 keysize, u32 ivsize);
-asmlinkage void salsa20_ivsetup(struct salsa20_ctx *ctx, const u8 *iv);
-asmlinkage void salsa20_encrypt_bytes(struct salsa20_ctx *ctx,
-                                     const u8 *src, u8 *dst, u32 bytes);
-
-static int setkey(struct crypto_tfm *tfm, const u8 *key,
-                 unsigned int keysize)
-{
-       struct salsa20_ctx *ctx = crypto_tfm_ctx(tfm);
-       salsa20_keysetup(ctx, key, keysize*8, SALSA20_IV_SIZE*8);
-       return 0;
-}
-
-static int encrypt(struct blkcipher_desc *desc,
-                  struct scatterlist *dst, struct scatterlist *src,
-                  unsigned int nbytes)
-{
-       struct blkcipher_walk walk;
-       struct crypto_blkcipher *tfm = desc->tfm;
-       struct salsa20_ctx *ctx = crypto_blkcipher_ctx(tfm);
-       int err;
-
-       blkcipher_walk_init(&walk, dst, src, nbytes);
-       err = blkcipher_walk_virt_block(desc, &walk, 64);
-
-       salsa20_ivsetup(ctx, walk.iv);
-
-       while (walk.nbytes >= 64) {
-               salsa20_encrypt_bytes(ctx, walk.src.virt.addr,
-                                     walk.dst.virt.addr,
-                                     walk.nbytes - (walk.nbytes % 64));
-               err = blkcipher_walk_done(desc, &walk, walk.nbytes % 64);
-       }
-
-       if (walk.nbytes) {
-               salsa20_encrypt_bytes(ctx, walk.src.virt.addr,
-                                     walk.dst.virt.addr, walk.nbytes);
-               err = blkcipher_walk_done(desc, &walk, 0);
-       }
-
-       return err;
-}
-
-static struct crypto_alg alg = {
-       .cra_name           =   "salsa20",
-       .cra_driver_name    =   "salsa20-asm",
-       .cra_priority       =   200,
-       .cra_flags          =   CRYPTO_ALG_TYPE_BLKCIPHER,
-       .cra_type           =   &crypto_blkcipher_type,
-       .cra_blocksize      =   1,
-       .cra_ctxsize        =   sizeof(struct salsa20_ctx),
-       .cra_alignmask      =   3,
-       .cra_module         =   THIS_MODULE,
-       .cra_u              =   {
-               .blkcipher = {
-                       .setkey         =   setkey,
-                       .encrypt        =   encrypt,
-                       .decrypt        =   encrypt,
-                       .min_keysize    =   SALSA20_MIN_KEY_SIZE,
-                       .max_keysize    =   SALSA20_MAX_KEY_SIZE,
-                       .ivsize         =   SALSA20_IV_SIZE,
-               }
-       }
-};
-
-static int __init init(void)
-{
-       return crypto_register_alg(&alg);
-}
-
-static void __exit fini(void)
-{
-       crypto_unregister_alg(&alg);
-}
-
-module_init(init);
-module_exit(fini);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION ("Salsa20 stream cipher algorithm (optimized assembly version)");
-MODULE_ALIAS_CRYPTO("salsa20");
-MODULE_ALIAS_CRYPTO("salsa20-asm");
index 42212b60a0eee4c14b53b5dc049721b1d2a79a81..5579eb88d460a8e604eba5a8564d0126b933ab9c 100644 (file)
@@ -1324,32 +1324,6 @@ config CRYPTO_SALSA20
          The Salsa20 stream cipher algorithm is designed by Daniel J.
          Bernstein <djb@cr.yp.to>. See <http://cr.yp.to/snuffle.html>
 
-config CRYPTO_SALSA20_586
-       tristate "Salsa20 stream cipher algorithm (i586)"
-       depends on (X86 || UML_X86) && !64BIT
-       select CRYPTO_BLKCIPHER
-       help
-         Salsa20 stream cipher algorithm.
-
-         Salsa20 is a stream cipher submitted to eSTREAM, the ECRYPT
-         Stream Cipher Project. See <http://www.ecrypt.eu.org/stream/>
-
-         The Salsa20 stream cipher algorithm is designed by Daniel J.
-         Bernstein <djb@cr.yp.to>. See <http://cr.yp.to/snuffle.html>
-
-config CRYPTO_SALSA20_X86_64
-       tristate "Salsa20 stream cipher algorithm (x86_64)"
-       depends on (X86 || UML_X86) && 64BIT
-       select CRYPTO_BLKCIPHER
-       help
-         Salsa20 stream cipher algorithm.
-
-         Salsa20 is a stream cipher submitted to eSTREAM, the ECRYPT
-         Stream Cipher Project. See <http://www.ecrypt.eu.org/stream/>
-
-         The Salsa20 stream cipher algorithm is designed by Daniel J.
-         Bernstein <djb@cr.yp.to>. See <http://cr.yp.to/snuffle.html>
-
 config CRYPTO_CHACHA20
        tristate "ChaCha20 cipher algorithm"
        select CRYPTO_BLKCIPHER