#include <linux/linkage.h>
#include <asm/inst.h>
+#include <asm/frame.h>
/*
* The following macros are used to move an (un)aligned 16 byte value to/from
* unsigned int key_len)
*/
ENTRY(aesni_set_key)
+ FRAME_BEGIN
#ifndef __x86_64__
pushl KEYP
- movl 8(%esp), KEYP # ctx
- movl 12(%esp), UKEYP # in_key
- movl 16(%esp), %edx # key_len
+ movl (FRAME_OFFSET+8)(%esp), KEYP # ctx
+ movl (FRAME_OFFSET+12)(%esp), UKEYP # in_key
+ movl (FRAME_OFFSET+16)(%esp), %edx # key_len
#endif
movups (UKEYP), %xmm0 # user key (first 16 bytes)
movaps %xmm0, (KEYP)
#ifndef __x86_64__
popl KEYP
#endif
+ FRAME_END
ret
ENDPROC(aesni_set_key)
* void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
*/
ENTRY(aesni_enc)
+ FRAME_BEGIN
#ifndef __x86_64__
pushl KEYP
pushl KLEN
- movl 12(%esp), KEYP
- movl 16(%esp), OUTP
- movl 20(%esp), INP
+ movl (FRAME_OFFSET+12)(%esp), KEYP # ctx
+ movl (FRAME_OFFSET+16)(%esp), OUTP # dst
+ movl (FRAME_OFFSET+20)(%esp), INP # src
#endif
movl 480(KEYP), KLEN # key length
movups (INP), STATE # input
popl KLEN
popl KEYP
#endif
+ FRAME_END
ret
ENDPROC(aesni_enc)
* void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
*/
ENTRY(aesni_dec)
+ FRAME_BEGIN
#ifndef __x86_64__
pushl KEYP
pushl KLEN
- movl 12(%esp), KEYP
- movl 16(%esp), OUTP
- movl 20(%esp), INP
+ movl (FRAME_OFFSET+12)(%esp), KEYP # ctx
+ movl (FRAME_OFFSET+16)(%esp), OUTP # dst
+ movl (FRAME_OFFSET+20)(%esp), INP # src
#endif
mov 480(KEYP), KLEN # key length
add $240, KEYP
popl KLEN
popl KEYP
#endif
+ FRAME_END
ret
ENDPROC(aesni_dec)
* size_t len)
*/
ENTRY(aesni_ecb_enc)
+ FRAME_BEGIN
#ifndef __x86_64__
pushl LEN
pushl KEYP
pushl KLEN
- movl 16(%esp), KEYP
- movl 20(%esp), OUTP
- movl 24(%esp), INP
- movl 28(%esp), LEN
+ movl (FRAME_OFFSET+16)(%esp), KEYP # ctx
+ movl (FRAME_OFFSET+20)(%esp), OUTP # dst
+ movl (FRAME_OFFSET+24)(%esp), INP # src
+ movl (FRAME_OFFSET+28)(%esp), LEN # len
#endif
test LEN, LEN # check length
jz .Lecb_enc_ret
popl KEYP
popl LEN
#endif
+ FRAME_END
ret
ENDPROC(aesni_ecb_enc)
* size_t len);
*/
ENTRY(aesni_ecb_dec)
+ FRAME_BEGIN
#ifndef __x86_64__
pushl LEN
pushl KEYP
pushl KLEN
- movl 16(%esp), KEYP
- movl 20(%esp), OUTP
- movl 24(%esp), INP
- movl 28(%esp), LEN
+ movl (FRAME_OFFSET+16)(%esp), KEYP # ctx
+ movl (FRAME_OFFSET+20)(%esp), OUTP # dst
+ movl (FRAME_OFFSET+24)(%esp), INP # src
+ movl (FRAME_OFFSET+28)(%esp), LEN # len
#endif
test LEN, LEN
jz .Lecb_dec_ret
popl KEYP
popl LEN
#endif
+ FRAME_END
ret
ENDPROC(aesni_ecb_dec)
* size_t len, u8 *iv)
*/
ENTRY(aesni_cbc_enc)
+ FRAME_BEGIN
#ifndef __x86_64__
pushl IVP
pushl LEN
pushl KEYP
pushl KLEN
- movl 20(%esp), KEYP
- movl 24(%esp), OUTP
- movl 28(%esp), INP
- movl 32(%esp), LEN
- movl 36(%esp), IVP
+ movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
+ movl (FRAME_OFFSET+24)(%esp), OUTP # dst
+ movl (FRAME_OFFSET+28)(%esp), INP # src
+ movl (FRAME_OFFSET+32)(%esp), LEN # len
+ movl (FRAME_OFFSET+36)(%esp), IVP # iv
#endif
cmp $16, LEN
jb .Lcbc_enc_ret
popl LEN
popl IVP
#endif
+ FRAME_END
ret
ENDPROC(aesni_cbc_enc)
* size_t len, u8 *iv)
*/
ENTRY(aesni_cbc_dec)
+ FRAME_BEGIN
#ifndef __x86_64__
pushl IVP
pushl LEN
pushl KEYP
pushl KLEN
- movl 20(%esp), KEYP
- movl 24(%esp), OUTP
- movl 28(%esp), INP
- movl 32(%esp), LEN
- movl 36(%esp), IVP
+ movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
+ movl (FRAME_OFFSET+24)(%esp), OUTP # dst
+ movl (FRAME_OFFSET+28)(%esp), INP # src
+ movl (FRAME_OFFSET+32)(%esp), LEN # len
+ movl (FRAME_OFFSET+36)(%esp), IVP # iv
#endif
cmp $16, LEN
jb .Lcbc_dec_just_ret
popl LEN
popl IVP
#endif
+ FRAME_END
ret
ENDPROC(aesni_cbc_dec)
* size_t len, u8 *iv)
*/
ENTRY(aesni_ctr_enc)
+ FRAME_BEGIN
cmp $16, LEN
jb .Lctr_enc_just_ret
mov 480(KEYP), KLEN
.Lctr_enc_ret:
movups IV, (IVP)
.Lctr_enc_just_ret:
+ FRAME_END
ret
ENDPROC(aesni_ctr_enc)
* bool enc, u8 *iv)
*/
ENTRY(aesni_xts_crypt8)
+ FRAME_BEGIN
cmpb $0, %cl
movl $0, %ecx
movl $240, %r10d
pxor INC, STATE4
movdqu STATE4, 0x70(OUTP)
+ FRAME_END
ret
ENDPROC(aesni_xts_crypt8)
*/
#include <linux/linkage.h>
+#include <asm/frame.h>
#define CAMELLIA_TABLE_BYTE_LEN 272
* %xmm0..%xmm15: 16 encrypted blocks, order swapped:
* 7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
*/
+ FRAME_BEGIN
leaq 8 * 16(%rax), %rcx;
%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
%xmm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 16(%rax));
+ FRAME_END
ret;
.align 8
* %xmm0..%xmm15: 16 plaintext blocks, order swapped:
* 7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
*/
+ FRAME_BEGIN
leaq 8 * 16(%rax), %rcx;
%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
%xmm15, (key_table)(CTX), (%rax), 1 * 16(%rax));
+ FRAME_END
ret;
.align 8
* %rsi: dst (16 blocks)
* %rdx: src (16 blocks)
*/
+ FRAME_BEGIN
inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
%xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
%xmm8, %rsi);
+ FRAME_END
ret;
ENDPROC(camellia_ecb_enc_16way)
* %rsi: dst (16 blocks)
* %rdx: src (16 blocks)
*/
+ FRAME_BEGIN
cmpl $16, key_length(CTX);
movl $32, %r8d;
%xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
%xmm8, %rsi);
+ FRAME_END
ret;
ENDPROC(camellia_ecb_dec_16way)
* %rsi: dst (16 blocks)
* %rdx: src (16 blocks)
*/
+ FRAME_BEGIN
cmpl $16, key_length(CTX);
movl $32, %r8d;
%xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
%xmm8, %rsi);
+ FRAME_END
ret;
ENDPROC(camellia_cbc_dec_16way)
* %rdx: src (16 blocks)
* %rcx: iv (little endian, 128bit)
*/
+ FRAME_BEGIN
subq $(16 * 16), %rsp;
movq %rsp, %rax;
%xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
%xmm8, %rsi);
+ FRAME_END
ret;
ENDPROC(camellia_ctr_16way)
* %r8: index for input whitening key
* %r9: pointer to __camellia_enc_blk16 or __camellia_dec_blk16
*/
+ FRAME_BEGIN
subq $(16 * 16), %rsp;
movq %rsp, %rax;
%xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
%xmm8, %rsi);
+ FRAME_END
ret;
ENDPROC(camellia_xts_crypt_16way)
*/
#include <linux/linkage.h>
+#include <asm/frame.h>
#define CAMELLIA_TABLE_BYTE_LEN 272
* %ymm0..%ymm15: 32 encrypted blocks, order swapped:
* 7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
*/
+ FRAME_BEGIN
leaq 8 * 32(%rax), %rcx;
%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
%ymm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 32(%rax));
+ FRAME_END
ret;
.align 8
* %ymm0..%ymm15: 16 plaintext blocks, order swapped:
* 7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
*/
+ FRAME_BEGIN
leaq 8 * 32(%rax), %rcx;
%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
%ymm15, (key_table)(CTX), (%rax), 1 * 32(%rax));
+ FRAME_END
ret;
.align 8
* %rsi: dst (32 blocks)
* %rdx: src (32 blocks)
*/
+ FRAME_BEGIN
vzeroupper;
vzeroupper;
+ FRAME_END
ret;
ENDPROC(camellia_ecb_enc_32way)
* %rsi: dst (32 blocks)
* %rdx: src (32 blocks)
*/
+ FRAME_BEGIN
vzeroupper;
vzeroupper;
+ FRAME_END
ret;
ENDPROC(camellia_ecb_dec_32way)
* %rsi: dst (32 blocks)
* %rdx: src (32 blocks)
*/
+ FRAME_BEGIN
vzeroupper;
vzeroupper;
+ FRAME_END
ret;
ENDPROC(camellia_cbc_dec_32way)
* %rdx: src (32 blocks)
* %rcx: iv (little endian, 128bit)
*/
+ FRAME_BEGIN
vzeroupper;
vzeroupper;
+ FRAME_END
ret;
ENDPROC(camellia_ctr_32way)
* %r8: index for input whitening key
* %r9: pointer to __camellia_enc_blk32 or __camellia_dec_blk32
*/
+ FRAME_BEGIN
vzeroupper;
vzeroupper;
+ FRAME_END
ret;
ENDPROC(camellia_xts_crypt_32way)
*/
#include <linux/linkage.h>
+#include <asm/frame.h>
.file "cast5-avx-x86_64-asm_64.S"
* %rsi: dst
* %rdx: src
*/
+ FRAME_BEGIN
movq %rsi, %r11;
vmovdqu RR4, (6*4*4)(%r11);
vmovdqu RL4, (7*4*4)(%r11);
+ FRAME_END
ret;
ENDPROC(cast5_ecb_enc_16way)
* %rdx: src
*/
+ FRAME_BEGIN
movq %rsi, %r11;
vmovdqu (0*4*4)(%rdx), RL1;
vmovdqu RR4, (6*4*4)(%r11);
vmovdqu RL4, (7*4*4)(%r11);
+ FRAME_END
ret;
ENDPROC(cast5_ecb_dec_16way)
* %rsi: dst
* %rdx: src
*/
+ FRAME_BEGIN
pushq %r12;
popq %r12;
+ FRAME_END
ret;
ENDPROC(cast5_cbc_dec_16way)
* %rdx: src
* %rcx: iv (big endian, 64bit)
*/
+ FRAME_BEGIN
pushq %r12;
popq %r12;
+ FRAME_END
ret;
ENDPROC(cast5_ctr_16way)
*/
#include <linux/linkage.h>
+#include <asm/frame.h>
#include "glue_helper-asm-avx.S"
.file "cast6-avx-x86_64-asm_64.S"
* %rsi: dst
* %rdx: src
*/
+ FRAME_BEGIN
movq %rsi, %r11;
store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+ FRAME_END
ret;
ENDPROC(cast6_ecb_enc_8way)
* %rsi: dst
* %rdx: src
*/
+ FRAME_BEGIN
movq %rsi, %r11;
store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+ FRAME_END
ret;
ENDPROC(cast6_ecb_dec_8way)
* %rsi: dst
* %rdx: src
*/
+ FRAME_BEGIN
pushq %r12;
popq %r12;
+ FRAME_END
ret;
ENDPROC(cast6_cbc_dec_8way)
* %rdx: src
* %rcx: iv (little endian, 128bit)
*/
+ FRAME_BEGIN
pushq %r12;
popq %r12;
+ FRAME_END
ret;
ENDPROC(cast6_ctr_8way)
* %rdx: src
* %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
*/
+ FRAME_BEGIN
movq %rsi, %r11;
/* dst <= regs xor IVs(in dst) */
store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+ FRAME_END
ret;
ENDPROC(cast6_xts_enc_8way)
* %rdx: src
* %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
*/
+ FRAME_BEGIN
movq %rsi, %r11;
/* dst <= regs xor IVs(in dst) */
store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+ FRAME_END
ret;
ENDPROC(cast6_xts_dec_8way)
#include <linux/linkage.h>
#include <asm/inst.h>
+#include <asm/frame.h>
.data
/* void clmul_ghash_mul(char *dst, const u128 *shash) */
ENTRY(clmul_ghash_mul)
+ FRAME_BEGIN
movups (%rdi), DATA
movups (%rsi), SHASH
movaps .Lbswap_mask, BSWAP
call __clmul_gf128mul_ble
PSHUFB_XMM BSWAP DATA
movups DATA, (%rdi)
+ FRAME_END
ret
ENDPROC(clmul_ghash_mul)
* const u128 *shash);
*/
ENTRY(clmul_ghash_update)
+ FRAME_BEGIN
cmp $16, %rdx
jb .Lupdate_just_ret # check length
movaps .Lbswap_mask, BSWAP
PSHUFB_XMM BSWAP DATA
movups DATA, (%rdi)
.Lupdate_just_ret:
+ FRAME_END
ret
ENDPROC(clmul_ghash_update)
*/
#include <linux/linkage.h>
+#include <asm/frame.h>
#include "glue_helper-asm-avx.S"
.file "serpent-avx-x86_64-asm_64.S"
* %rsi: dst
* %rdx: src
*/
+ FRAME_BEGIN
load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
store_8way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+ FRAME_END
ret;
ENDPROC(serpent_ecb_enc_8way_avx)
* %rsi: dst
* %rdx: src
*/
+ FRAME_BEGIN
load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
store_8way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);
+ FRAME_END
ret;
ENDPROC(serpent_ecb_dec_8way_avx)
* %rsi: dst
* %rdx: src
*/
+ FRAME_BEGIN
load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
store_cbc_8way(%rdx, %rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);
+ FRAME_END
ret;
ENDPROC(serpent_cbc_dec_8way_avx)
* %rdx: src
* %rcx: iv (little endian, 128bit)
*/
+ FRAME_BEGIN
load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
RD2, RK0, RK1, RK2);
store_ctr_8way(%rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+ FRAME_END
ret;
ENDPROC(serpent_ctr_8way_avx)
* %rdx: src
* %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
*/
+ FRAME_BEGIN
/* regs <= src, dst <= IVs, regs <= regs xor IVs */
load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
/* dst <= regs xor IVs(in dst) */
store_xts_8way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+ FRAME_END
ret;
ENDPROC(serpent_xts_enc_8way_avx)
* %rdx: src
* %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
*/
+ FRAME_BEGIN
/* regs <= src, dst <= IVs, regs <= regs xor IVs */
load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
/* dst <= regs xor IVs(in dst) */
store_xts_8way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);
+ FRAME_END
ret;
ENDPROC(serpent_xts_dec_8way_avx)
*/
#include <linux/linkage.h>
+#include <asm/frame.h>
#include "glue_helper-asm-avx2.S"
.file "serpent-avx2-asm_64.S"
* %rsi: dst
* %rdx: src
*/
+ FRAME_BEGIN
vzeroupper;
vzeroupper;
+ FRAME_END
ret;
ENDPROC(serpent_ecb_enc_16way)
* %rsi: dst
* %rdx: src
*/
+ FRAME_BEGIN
vzeroupper;
vzeroupper;
+ FRAME_END
ret;
ENDPROC(serpent_ecb_dec_16way)
* %rsi: dst
* %rdx: src
*/
+ FRAME_BEGIN
vzeroupper;
vzeroupper;
+ FRAME_END
ret;
ENDPROC(serpent_cbc_dec_16way)
* %rdx: src (16 blocks)
* %rcx: iv (little endian, 128bit)
*/
+ FRAME_BEGIN
vzeroupper;
vzeroupper;
+ FRAME_END
ret;
ENDPROC(serpent_ctr_16way)
* %rdx: src (16 blocks)
* %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
*/
+ FRAME_BEGIN
vzeroupper;
vzeroupper;
+ FRAME_END
ret;
ENDPROC(serpent_xts_enc_16way)
* %rdx: src (16 blocks)
* %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
*/
+ FRAME_BEGIN
vzeroupper;
vzeroupper;
+ FRAME_END
ret;
ENDPROC(serpent_xts_dec_16way)
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <linux/linkage.h>
+#include <asm/frame.h>
#include "sha1_mb_mgr_datastruct.S"
# JOB* sha1_mb_mgr_flush_avx2(MB_MGR *state)
# arg 1 : rcx : state
ENTRY(sha1_mb_mgr_flush_avx2)
+ FRAME_BEGIN
push %rbx
# If bit (32+3) is set, then all lanes are empty
return:
pop %rbx
+ FRAME_END
ret
return_null:
*/
#include <linux/linkage.h>
+#include <asm/frame.h>
#include "sha1_mb_mgr_datastruct.S"
# arg 1 : rcx : state
# arg 2 : rdx : job
ENTRY(sha1_mb_mgr_submit_avx2)
+ FRAME_BEGIN
push %rbx
push %r12
return:
pop %r12
pop %rbx
+ FRAME_END
ret
return_null:
*/
#include <linux/linkage.h>
+#include <asm/frame.h>
#include "glue_helper-asm-avx.S"
.file "twofish-avx-x86_64-asm_64.S"
* %rsi: dst
* %rdx: src
*/
+ FRAME_BEGIN
movq %rsi, %r11;
store_8way(%r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);
+ FRAME_END
ret;
ENDPROC(twofish_ecb_enc_8way)
* %rsi: dst
* %rdx: src
*/
+ FRAME_BEGIN
movq %rsi, %r11;
store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+ FRAME_END
ret;
ENDPROC(twofish_ecb_dec_8way)
* %rsi: dst
* %rdx: src
*/
+ FRAME_BEGIN
pushq %r12;
popq %r12;
+ FRAME_END
ret;
ENDPROC(twofish_cbc_dec_8way)
* %rdx: src
* %rcx: iv (little endian, 128bit)
*/
+ FRAME_BEGIN
pushq %r12;
popq %r12;
+ FRAME_END
ret;
ENDPROC(twofish_ctr_8way)
* %rdx: src
* %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
*/
+ FRAME_BEGIN
movq %rsi, %r11;
/* dst <= regs xor IVs(in dst) */
store_xts_8way(%r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);
+ FRAME_END
ret;
ENDPROC(twofish_xts_enc_8way)
* %rdx: src
* %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
*/
+ FRAME_BEGIN
movq %rsi, %r11;
/* dst <= regs xor IVs(in dst) */
store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+ FRAME_END
ret;
ENDPROC(twofish_xts_dec_8way)