/* Copyright 2002 Andi Kleen */
#include <linux/linkage.h>
-#include <asm/dwarf2.h>
+
#include <asm/cpufeature.h>
+#include <asm/dwarf2.h>
/*
* memcpy - Copy a memory block.
*
- * Input:
- * rdi destination
- * rsi source
- * rdx count
- *
+ * Input:
+ * rdi destination
+ * rsi source
+ * rdx count
+ *
* Output:
* rax original destination
- */
+ */
+/*
+ * memcpy_c() - fast string ops (REP MOVSQ) based variant.
+ *
+ * Calls to this get patched into the kernel image via the
+ * alternative instructions framework.
+ *
+ * Takes the same arguments as memcpy (rdi destination, rsi source,
+ * rdx count) and returns the original destination in rax.
+ *
+ * The whole 64-bit count in %rdx is used to set up %rcx, so that
+ * copies of 4GB and above are not truncated to their low 32 bits:
+ */
ALIGN
memcpy_c:
CFI_STARTPROC
- movq %rdi,%rax
- movl %edx,%ecx
- shrl $3,%ecx
- andl $7,%edx
+ movq %rdi, %rax /* return value: original destination */
+
+ movq %rdx, %rcx /* full 64-bit byte count */
+ shrq $3, %rcx /* %rcx = number of 8-byte words */
+ andl $7, %edx /* %edx = trailing bytes, 0..7 */
rep movsq
- movl %edx,%ecx
+ movl %edx, %ecx /* copy the trailing bytes one by one */
rep movsb
ret
CFI_ENDPROC
ENTRY(__memcpy)
ENTRY(memcpy)
CFI_STARTPROC
- movq %rdi,%rax
- movl %edx,%ecx
- shrl $6,%ecx
+ /*
+ * Put the number of full 64-byte blocks into %rcx. The whole
+ * 64-bit count in %rdx is used, so that sizes of 4GB and above
+ * are not truncated. Tail portion is handled at the end:
+ */
+ movq %rdi, %rax /* return value: original destination */
+ movq %rdx, %rcx
+ shrq $6, %rcx
jz .Lhandle_tail
.p2align 4
.Lloop_64:
+ /*
+ * We decrement the loop index here - and the zero-flag is
+ * checked at the end of the loop (instructions in between do
+ * not change the zero flag):
+ */
- decl %ecx
+ decq %rcx
- movq (%rsi),%r11
- movq 8(%rsi),%r8
+ /*
+ * Move in blocks of 4x16 bytes:
+ */
+ movq 0*8(%rsi), %r11
+ movq 1*8(%rsi), %r8
+ movq %r11, 0*8(%rdi)
+ movq %r8, 1*8(%rdi)
- movq %r11,(%rdi)
- movq %r8,1*8(%rdi)
+ movq 2*8(%rsi), %r9
+ movq 3*8(%rsi), %r10
+ movq %r9, 2*8(%rdi)
+ movq %r10, 3*8(%rdi)
- movq 2*8(%rsi),%r9
- movq 3*8(%rsi),%r10
+ movq 4*8(%rsi), %r11
+ movq 5*8(%rsi), %r8
+ movq %r11, 4*8(%rdi)
+ movq %r8, 5*8(%rdi)
- movq %r9,2*8(%rdi)
- movq %r10,3*8(%rdi)
+ movq 6*8(%rsi), %r9
+ movq 7*8(%rsi), %r10
+ movq %r9, 6*8(%rdi)
+ movq %r10, 7*8(%rdi)
- movq 4*8(%rsi),%r11
- movq 5*8(%rsi),%r8
+ leaq 64(%rsi), %rsi /* leaq advances the pointer without touching flags */
+ leaq 64(%rdi), %rdi
- movq %r11,4*8(%rdi)
- movq %r8,5*8(%rdi)
-
- movq 6*8(%rsi),%r9
- movq 7*8(%rsi),%r10
-
- movq %r9,6*8(%rdi)
- movq %r10,7*8(%rdi)
-
- leaq 64(%rsi),%rsi
- leaq 64(%rdi),%rdi
jnz .Lloop_64
.Lhandle_tail:
- movl %edx,%ecx
- andl $63,%ecx
- shrl $3,%ecx
+ movl %edx, %ecx /* only the low 6 bits of the count matter here */
+ andl $63, %ecx /* bytes left after the 64-byte blocks */
+ shrl $3, %ecx /* %ecx = number of remaining 8-byte words */
jz .Lhandle_7
+
.p2align 4
.Lloop_8:
decl %ecx
- movq (%rsi),%r8
- movq %r8,(%rdi)
- leaq 8(%rdi),%rdi
- leaq 8(%rsi),%rsi
+ movq (%rsi), %r8
+ movq %r8, (%rdi)
+ leaq 8(%rdi), %rdi
+ leaq 8(%rsi), %rsi
jnz .Lloop_8
.Lhandle_7:
- movl %edx,%ecx
- andl $7,%ecx
- jz .Lende
+ movl %edx, %ecx
+ andl $7, %ecx /* %ecx = remaining single bytes, 0..7 */
+ jz .Lend
+
.p2align 4
.Lloop_1:
- movb (%rsi),%r8b
- movb %r8b,(%rdi)
+ movb (%rsi), %r8b
+ movb %r8b, (%rdi)
incq %rdi
incq %rsi
decl %ecx
jnz .Lloop_1
-.Lende:
+.Lend:
ret
CFI_ENDPROC
ENDPROC(memcpy)
ENDPROC(__memcpy)
- /* Some CPUs run faster using the string copy instructions.
- It is also a lot simpler. Use this when possible */
+ /*
+ * Some CPUs run faster using the string copy instructions.
+ * It is also a lot simpler. Use this when possible:
+ *
+ * The replacement emitted below is a two-byte short jump (0xeb
+ * followed by an 8-bit displacement). The displacement is the
+ * distance memcpy_c - memcpy less the length of the jump itself
+ * (2f - 1b), i.e. a jump from the start of memcpy to memcpy_c.
+ */
- .section .altinstr_replacement,"ax"
+ .section .altinstr_replacement, "ax"
1: .byte 0xeb /* jmp <disp8> */
.byte (memcpy_c - memcpy) - (2f - 1b) /* offset */
2:
.previous
- .section .altinstructions,"a"
+
+ .section .altinstructions, "a"
.align 8
.quad memcpy
.quad 1b
.byte X86_FEATURE_REP_GOOD
- /* Replace only beginning, memcpy is used to apply alternatives, so it
- * is silly to overwrite itself with nops - reboot is only outcome... */
+
+ /*
+ * Replace only beginning, memcpy is used to apply alternatives,
+ * so it is silly to overwrite itself with nops - reboot is the
+ * only outcome...
+ *
+ * Hence both length bytes below are 2b - 1b, the size of the
+ * replacement jump, rather than the size of the whole of memcpy.
+ */
.byte 2b - 1b
.byte 2b - 1b
.previous