From: Thomas Gleixner Date: Thu, 11 Oct 2007 09:15:39 +0000 (+0200) Subject: x86_64: prepare shared lib/memcpy.S X-Git-Url: https://git.stricted.de/?a=commitdiff_plain;h=ff8e90da7c3ccf2e74e7ab4a57a0ff6e0de027b0;p=GitHub%2FLineageOS%2FG12%2Fandroid_kernel_amlogic_linux-4.9.git x86_64: prepare shared lib/memcpy.S Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- diff --git a/arch/x86_64/lib/Makefile b/arch/x86_64/lib/Makefile index d9d715d3ec9c..dce49348cebe 100644 --- a/arch/x86_64/lib/Makefile +++ b/arch/x86_64/lib/Makefile @@ -10,4 +10,4 @@ obj-$(CONFIG_SMP) += msr-on-cpu.o lib-y := csum-partial.o csum-copy_64.o csum-wrappers.o delay.o \ usercopy.o getuser.o putuser.o \ thunk_64.o clear_page_64.o copy_page_64.o bitstr_64.o bitops.o -lib-y += memcpy.o memmove.o memset.o copy_user.o rwlock_64.o copy_user_nocache.o +lib-y += memcpy_64.o memmove.o memset.o copy_user.o rwlock_64.o copy_user_nocache.o diff --git a/arch/x86_64/lib/memcpy.S b/arch/x86_64/lib/memcpy.S deleted file mode 100644 index c22981fa2f3a..000000000000 --- a/arch/x86_64/lib/memcpy.S +++ /dev/null @@ -1,131 +0,0 @@ -/* Copyright 2002 Andi Kleen */ - -#include -#include -#include - -/* - * memcpy - Copy a memory block. - * - * Input: - * rdi destination - * rsi source - * rdx count - * - * Output: - * rax original destination - */ - - ALIGN -memcpy_c: - CFI_STARTPROC - movq %rdi,%rax - movl %edx,%ecx - shrl $3,%ecx - andl $7,%edx - rep movsq - movl %edx,%ecx - rep movsb - ret - CFI_ENDPROC -ENDPROC(memcpy_c) - -ENTRY(__memcpy) -ENTRY(memcpy) - CFI_STARTPROC - pushq %rbx - CFI_ADJUST_CFA_OFFSET 8 - CFI_REL_OFFSET rbx, 0 - movq %rdi,%rax - - movl %edx,%ecx - shrl $6,%ecx - jz .Lhandle_tail - - .p2align 4 -.Lloop_64: - decl %ecx - - movq (%rsi),%r11 - movq 8(%rsi),%r8 - - movq %r11,(%rdi) - movq %r8,1*8(%rdi) - - movq 2*8(%rsi),%r9 - movq 3*8(%rsi),%r10 - - movq %r9,2*8(%rdi) - movq %r10,3*8(%rdi) - - movq 4*8(%rsi),%r11 - movq 5*8(%rsi),%r8 - - movq %r11,4*8(%rdi) - movq %r8,5*8(%rdi) - - movq 6*8(%rsi),%r9 - movq 7*8(%rsi),%r10 - - movq %r9,6*8(%rdi) - movq %r10,7*8(%rdi) - - leaq 64(%rsi),%rsi - leaq 64(%rdi),%rdi - jnz .Lloop_64 - -.Lhandle_tail: - movl %edx,%ecx - andl $63,%ecx - shrl $3,%ecx - jz .Lhandle_7 - .p2align 4 -.Lloop_8: - decl %ecx - movq (%rsi),%r8 - movq %r8,(%rdi) - leaq 8(%rdi),%rdi - leaq 8(%rsi),%rsi - jnz .Lloop_8 - -.Lhandle_7: - movl %edx,%ecx - andl $7,%ecx - jz .Lende - .p2align 4 -.Lloop_1: - movb (%rsi),%r8b - movb %r8b,(%rdi) - incq %rdi - incq %rsi - decl %ecx - jnz .Lloop_1 - -.Lende: - popq %rbx - CFI_ADJUST_CFA_OFFSET -8 - CFI_RESTORE rbx - ret -.Lfinal: - CFI_ENDPROC -ENDPROC(memcpy) -ENDPROC(__memcpy) - - /* Some CPUs run faster using the string copy instructions. - It is also a lot simpler. Use this when possible */ - - .section .altinstr_replacement,"ax" -1: .byte 0xeb /* jmp */ - .byte (memcpy_c - memcpy) - (2f - 1b) /* offset */ -2: - .previous - .section .altinstructions,"a" - .align 8 - .quad memcpy - .quad 1b - .byte X86_FEATURE_REP_GOOD - /* Replace only beginning, memcpy is used to apply alternatives, so it - * is silly to overwrite itself with nops - reboot is only outcome... */ - .byte 2b - 1b - .byte 2b - 1b - .previous diff --git a/arch/x86_64/lib/memcpy_64.S b/arch/x86_64/lib/memcpy_64.S new file mode 100644 index 000000000000..c22981fa2f3a --- /dev/null +++ b/arch/x86_64/lib/memcpy_64.S @@ -0,0 +1,131 @@ +/* Copyright 2002 Andi Kleen */ + +#include +#include +#include + +/* + * memcpy - Copy a memory block. + * + * Input: + * rdi destination + * rsi source + * rdx count + * + * Output: + * rax original destination + */ + + ALIGN +memcpy_c: + CFI_STARTPROC + movq %rdi,%rax + movl %edx,%ecx + shrl $3,%ecx + andl $7,%edx + rep movsq + movl %edx,%ecx + rep movsb + ret + CFI_ENDPROC +ENDPROC(memcpy_c) + +ENTRY(__memcpy) +ENTRY(memcpy) + CFI_STARTPROC + pushq %rbx + CFI_ADJUST_CFA_OFFSET 8 + CFI_REL_OFFSET rbx, 0 + movq %rdi,%rax + + movl %edx,%ecx + shrl $6,%ecx + jz .Lhandle_tail + + .p2align 4 +.Lloop_64: + decl %ecx + + movq (%rsi),%r11 + movq 8(%rsi),%r8 + + movq %r11,(%rdi) + movq %r8,1*8(%rdi) + + movq 2*8(%rsi),%r9 + movq 3*8(%rsi),%r10 + + movq %r9,2*8(%rdi) + movq %r10,3*8(%rdi) + + movq 4*8(%rsi),%r11 + movq 5*8(%rsi),%r8 + + movq %r11,4*8(%rdi) + movq %r8,5*8(%rdi) + + movq 6*8(%rsi),%r9 + movq 7*8(%rsi),%r10 + + movq %r9,6*8(%rdi) + movq %r10,7*8(%rdi) + + leaq 64(%rsi),%rsi + leaq 64(%rdi),%rdi + jnz .Lloop_64 + +.Lhandle_tail: + movl %edx,%ecx + andl $63,%ecx + shrl $3,%ecx + jz .Lhandle_7 + .p2align 4 +.Lloop_8: + decl %ecx + movq (%rsi),%r8 + movq %r8,(%rdi) + leaq 8(%rdi),%rdi + leaq 8(%rsi),%rsi + jnz .Lloop_8 + +.Lhandle_7: + movl %edx,%ecx + andl $7,%ecx + jz .Lende + .p2align 4 +.Lloop_1: + movb (%rsi),%r8b + movb %r8b,(%rdi) + incq %rdi + incq %rsi + decl %ecx + jnz .Lloop_1 + +.Lende: + popq %rbx + CFI_ADJUST_CFA_OFFSET -8 + CFI_RESTORE rbx + ret +.Lfinal: + CFI_ENDPROC +ENDPROC(memcpy) +ENDPROC(__memcpy) + + /* Some CPUs run faster using the string copy instructions. + It is also a lot simpler. Use this when possible */ + + .section .altinstr_replacement,"ax" +1: .byte 0xeb /* jmp */ + .byte (memcpy_c - memcpy) - (2f - 1b) /* offset */ +2: + .previous + .section .altinstructions,"a" + .align 8 + .quad memcpy + .quad 1b + .byte X86_FEATURE_REP_GOOD + /* Replace only beginning, memcpy is used to apply alternatives, so it + * is silly to overwrite itself with nops - reboot is only outcome... */ + .byte 2b - 1b + .byte 2b - 1b + .previous