From: Thomas Gleixner Date: Thu, 11 Oct 2007 09:17:08 +0000 (+0200) Subject: x86_64: move lib X-Git-Url: https://git.stricted.de/?a=commitdiff_plain;h=185f3d38900f750a4566f87cde6a178f3595a115;p=GitHub%2FLineageOS%2FG12%2Fandroid_kernel_amlogic_linux-4.9.git x86_64: move lib Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 2d7d724a2a6a..329da276c6f1 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -1,5 +1,5 @@ ifeq ($(CONFIG_X86_32),y) include ${srctree}/arch/x86/lib/Makefile_32 else -include ${srctree}/arch/x86_64/lib/Makefile_64 +include ${srctree}/arch/x86/lib/Makefile_64 endif diff --git a/arch/x86/lib/Makefile_64 b/arch/x86/lib/Makefile_64 new file mode 100644 index 000000000000..bbabad3c9335 --- /dev/null +++ b/arch/x86/lib/Makefile_64 @@ -0,0 +1,13 @@ +# +# Makefile for x86_64-specific library files. +# + +CFLAGS_csum-partial_64.o := -funroll-loops + +obj-y := io_64.o iomap_copy_64.o +obj-$(CONFIG_SMP) += msr-on-cpu.o + +lib-y := csum-partial_64.o csum-copy_64.o csum-wrappers_64.o delay_64.o \ + usercopy_64.o getuser_64.o putuser_64.o \ + thunk_64.o clear_page_64.o copy_page_64.o bitstr_64.o bitops_64.o +lib-y += memcpy_64.o memmove_64.o memset_64.o copy_user_64.o rwlock_64.o copy_user_nocache_64.o diff --git a/arch/x86/lib/bitops_64.c b/arch/x86/lib/bitops_64.c new file mode 100644 index 000000000000..95b6d9639fba --- /dev/null +++ b/arch/x86/lib/bitops_64.c @@ -0,0 +1,175 @@ +#include + +#undef find_first_zero_bit +#undef find_next_zero_bit +#undef find_first_bit +#undef find_next_bit + +static inline long +__find_first_zero_bit(const unsigned long * addr, unsigned long size) +{ + long d0, d1, d2; + long res; + + /* + * We must test the size in words, not in bits, because + * otherwise incoming sizes in the range -63..-1 will not run + * any scasq instructions, and then the flags used by the je + * instruction will have whatever random value was in place + * before. Nobody should call us like that, but + * find_next_zero_bit() does when offset and size are at the + * same word and it fails to find a zero itself. + */ + size += 63; + size >>= 6; + if (!size) + return 0; + asm volatile( + " repe; scasq\n" + " je 1f\n" + " xorq -8(%%rdi),%%rax\n" + " subq $8,%%rdi\n" + " bsfq %%rax,%%rdx\n" + "1: subq %[addr],%%rdi\n" + " shlq $3,%%rdi\n" + " addq %%rdi,%%rdx" + :"=d" (res), "=&c" (d0), "=&D" (d1), "=&a" (d2) + :"0" (0ULL), "1" (size), "2" (addr), "3" (-1ULL), + [addr] "S" (addr) : "memory"); + /* + * Any register would do for [addr] above, but GCC tends to + * prefer rbx over rsi, even though rsi is readily available + * and doesn't have to be saved. + */ + return res; +} + +/** + * find_first_zero_bit - find the first zero bit in a memory region + * @addr: The address to start the search at + * @size: The maximum size to search + * + * Returns the bit-number of the first zero bit, not the number of the byte + * containing a bit. 
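+ *
+ * Illustrative example (editorial addition, not in the original source),
+ * assuming 64-bit longs:
+ *
+ *	unsigned long map[2] = { ~0UL, ~2UL };
+ *	long bit = find_first_zero_bit(map, 128);	returns 65
+ *
+ * because the first clear bit is bit 1 of the second word.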
+ */ +long find_first_zero_bit(const unsigned long * addr, unsigned long size) +{ + return __find_first_zero_bit (addr, size); +} + +/** + * find_next_zero_bit - find the first zero bit in a memory region + * @addr: The address to base the search on + * @offset: The bitnumber to start searching at + * @size: The maximum size to search + */ +long find_next_zero_bit (const unsigned long * addr, long size, long offset) +{ + const unsigned long * p = addr + (offset >> 6); + unsigned long set = 0; + unsigned long res, bit = offset&63; + + if (bit) { + /* + * Look for zero in first word + */ + asm("bsfq %1,%0\n\t" + "cmoveq %2,%0" + : "=r" (set) + : "r" (~(*p >> bit)), "r"(64L)); + if (set < (64 - bit)) + return set + offset; + set = 64 - bit; + p++; + } + /* + * No zero yet, search remaining full words for a zero + */ + res = __find_first_zero_bit (p, size - 64 * (p - addr)); + + return (offset + set + res); +} + +static inline long +__find_first_bit(const unsigned long * addr, unsigned long size) +{ + long d0, d1; + long res; + + /* + * We must test the size in words, not in bits, because + * otherwise incoming sizes in the range -63..-1 will not run + * any scasq instructions, and then the flags used by the jz + * instruction will have whatever random value was in place + * before. Nobody should call us like that, but + * find_next_bit() does when offset and size are at the same + * word and it fails to find a one itself. + */ + size += 63; + size >>= 6; + if (!size) + return 0; + asm volatile( + " repe; scasq\n" + " jz 1f\n" + " subq $8,%%rdi\n" + " bsfq (%%rdi),%%rax\n" + "1: subq %[addr],%%rdi\n" + " shlq $3,%%rdi\n" + " addq %%rdi,%%rax" + :"=a" (res), "=&c" (d0), "=&D" (d1) + :"0" (0ULL), "1" (size), "2" (addr), + [addr] "r" (addr) : "memory"); + return res; +} + +/** + * find_first_bit - find the first set bit in a memory region + * @addr: The address to start the search at + * @size: The maximum size to search + * + * Returns the bit-number of the first set bit, not the number of the byte + * containing a bit. 
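+ *
+ * Illustrative example (editorial addition, not in the original source):
+ *
+ *	unsigned long map[2] = { 0UL, 4UL };
+ *	long bit = find_first_bit(map, 128);	returns 66
+ *
+ * because the first set bit is bit 2 of the second word.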
+ */ +long find_first_bit(const unsigned long * addr, unsigned long size) +{ + return __find_first_bit(addr,size); +} + +/** + * find_next_bit - find the first set bit in a memory region + * @addr: The address to base the search on + * @offset: The bitnumber to start searching at + * @size: The maximum size to search + */ +long find_next_bit(const unsigned long * addr, long size, long offset) +{ + const unsigned long * p = addr + (offset >> 6); + unsigned long set = 0, bit = offset & 63, res; + + if (bit) { + /* + * Look for nonzero in the first 64 bits: + */ + asm("bsfq %1,%0\n\t" + "cmoveq %2,%0\n\t" + : "=r" (set) + : "r" (*p >> bit), "r" (64L)); + if (set < (64 - bit)) + return set + offset; + set = 64 - bit; + p++; + } + /* + * No set bit yet, search remaining full words for a bit + */ + res = __find_first_bit (p, size - 64 * (p - addr)); + return (offset + set + res); +} + +#include + +EXPORT_SYMBOL(find_next_bit); +EXPORT_SYMBOL(find_first_bit); +EXPORT_SYMBOL(find_first_zero_bit); +EXPORT_SYMBOL(find_next_zero_bit); diff --git a/arch/x86/lib/bitstr_64.c b/arch/x86/lib/bitstr_64.c new file mode 100644 index 000000000000..24676609a6ac --- /dev/null +++ b/arch/x86/lib/bitstr_64.c @@ -0,0 +1,28 @@ +#include +#include + +/* Find string of zero bits in a bitmap */ +unsigned long +find_next_zero_string(unsigned long *bitmap, long start, long nbits, int len) +{ + unsigned long n, end, i; + + again: + n = find_next_zero_bit(bitmap, nbits, start); + if (n == -1) + return -1; + + /* could test bitsliced, but it's hardly worth it */ + end = n+len; + if (end >= nbits) + return -1; + for (i = n+1; i < end; i++) { + if (test_bit(i, bitmap)) { + start = i+1; + goto again; + } + } + return n; +} + +EXPORT_SYMBOL(find_next_zero_string); diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S new file mode 100644 index 000000000000..9a10a78bb4a4 --- /dev/null +++ b/arch/x86/lib/clear_page_64.S @@ -0,0 +1,59 @@ +#include +#include + +/* + * Zero a page. + * rdi page + */ + ALIGN +clear_page_c: + CFI_STARTPROC + movl $4096/8,%ecx + xorl %eax,%eax + rep stosq + ret + CFI_ENDPROC +ENDPROC(clear_page) + +ENTRY(clear_page) + CFI_STARTPROC + xorl %eax,%eax + movl $4096/64,%ecx + .p2align 4 +.Lloop: + decl %ecx +#define PUT(x) movq %rax,x*8(%rdi) + movq %rax,(%rdi) + PUT(1) + PUT(2) + PUT(3) + PUT(4) + PUT(5) + PUT(6) + PUT(7) + leaq 64(%rdi),%rdi + jnz .Lloop + nop + ret + CFI_ENDPROC +.Lclear_page_end: +ENDPROC(clear_page) + + /* Some CPUs run faster using the string instructions. + It is also a lot simpler. Use this when possible */ + +#include + + .section .altinstr_replacement,"ax" +1: .byte 0xeb /* jmp */ + .byte (clear_page_c - clear_page) - (2f - 1b) /* offset */ +2: + .previous + .section .altinstructions,"a" + .align 8 + .quad clear_page + .quad 1b + .byte X86_FEATURE_REP_GOOD + .byte .Lclear_page_end - clear_page + .byte 2b - 1b + .previous diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S new file mode 100644 index 000000000000..727a5d46d2fc --- /dev/null +++ b/arch/x86/lib/copy_page_64.S @@ -0,0 +1,119 @@ +/* Written 2003 by Andi Kleen, based on a kernel by Evandro Menezes */ + +#include +#include + + ALIGN +copy_page_c: + CFI_STARTPROC + movl $4096/8,%ecx + rep movsq + ret + CFI_ENDPROC +ENDPROC(copy_page_c) + +/* Don't use streaming store because it's better when the target + ends up in cache. 
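+   (Editorial note, not in the original comment: a non-temporal movnti
+   store would bypass the cache entirely, which hurts the common case
+   where the freshly copied page is touched again right away, e.g. after
+   a fork()/copy-on-write fault.)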
*/ + +/* Could vary the prefetch distance based on SMP/UP */ + +ENTRY(copy_page) + CFI_STARTPROC + subq $3*8,%rsp + CFI_ADJUST_CFA_OFFSET 3*8 + movq %rbx,(%rsp) + CFI_REL_OFFSET rbx, 0 + movq %r12,1*8(%rsp) + CFI_REL_OFFSET r12, 1*8 + movq %r13,2*8(%rsp) + CFI_REL_OFFSET r13, 2*8 + + movl $(4096/64)-5,%ecx + .p2align 4 +.Loop64: + dec %rcx + + movq (%rsi), %rax + movq 8 (%rsi), %rbx + movq 16 (%rsi), %rdx + movq 24 (%rsi), %r8 + movq 32 (%rsi), %r9 + movq 40 (%rsi), %r10 + movq 48 (%rsi), %r11 + movq 56 (%rsi), %r12 + + prefetcht0 5*64(%rsi) + + movq %rax, (%rdi) + movq %rbx, 8 (%rdi) + movq %rdx, 16 (%rdi) + movq %r8, 24 (%rdi) + movq %r9, 32 (%rdi) + movq %r10, 40 (%rdi) + movq %r11, 48 (%rdi) + movq %r12, 56 (%rdi) + + leaq 64 (%rsi), %rsi + leaq 64 (%rdi), %rdi + + jnz .Loop64 + + movl $5,%ecx + .p2align 4 +.Loop2: + decl %ecx + + movq (%rsi), %rax + movq 8 (%rsi), %rbx + movq 16 (%rsi), %rdx + movq 24 (%rsi), %r8 + movq 32 (%rsi), %r9 + movq 40 (%rsi), %r10 + movq 48 (%rsi), %r11 + movq 56 (%rsi), %r12 + + movq %rax, (%rdi) + movq %rbx, 8 (%rdi) + movq %rdx, 16 (%rdi) + movq %r8, 24 (%rdi) + movq %r9, 32 (%rdi) + movq %r10, 40 (%rdi) + movq %r11, 48 (%rdi) + movq %r12, 56 (%rdi) + + leaq 64(%rdi),%rdi + leaq 64(%rsi),%rsi + + jnz .Loop2 + + movq (%rsp),%rbx + CFI_RESTORE rbx + movq 1*8(%rsp),%r12 + CFI_RESTORE r12 + movq 2*8(%rsp),%r13 + CFI_RESTORE r13 + addq $3*8,%rsp + CFI_ADJUST_CFA_OFFSET -3*8 + ret +.Lcopy_page_end: + CFI_ENDPROC +ENDPROC(copy_page) + + /* Some CPUs run faster using the string copy instructions. + It is also a lot simpler. Use this when possible */ + +#include + + .section .altinstr_replacement,"ax" +1: .byte 0xeb /* jmp */ + .byte (copy_page_c - copy_page) - (2f - 1b) /* offset */ +2: + .previous + .section .altinstructions,"a" + .align 8 + .quad copy_page + .quad 1b + .byte X86_FEATURE_REP_GOOD + .byte .Lcopy_page_end - copy_page + .byte 2b - 1b + .previous diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S new file mode 100644 index 000000000000..70bebd310408 --- /dev/null +++ b/arch/x86/lib/copy_user_64.S @@ -0,0 +1,354 @@ +/* Copyright 2002 Andi Kleen, SuSE Labs. + * Subject to the GNU Public License v2. + * + * Functions to copy from and to user space. 
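+ *
+ * (Editorial note: copy_to_user/copy_from_user below bound-check the user
+ * pointer against the thread's addr_limit and then jump, via the
+ * alternatives mechanism, either to the unrolled copy loop or to the
+ * rep-movs based copy_user_generic_string on X86_FEATURE_REP_GOOD CPUs.)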
+ */ + +#include +#include + +#define FIX_ALIGNMENT 1 + +#include +#include +#include +#include + + .macro ALTERNATIVE_JUMP feature,orig,alt +0: + .byte 0xe9 /* 32bit jump */ + .long \orig-1f /* by default jump to orig */ +1: + .section .altinstr_replacement,"ax" +2: .byte 0xe9 /* near jump with 32bit immediate */ + .long \alt-1b /* offset */ /* or alternatively to alt */ + .previous + .section .altinstructions,"a" + .align 8 + .quad 0b + .quad 2b + .byte \feature /* when feature is set */ + .byte 5 + .byte 5 + .previous + .endm + +/* Standard copy_to_user with segment limit checking */ +ENTRY(copy_to_user) + CFI_STARTPROC + GET_THREAD_INFO(%rax) + movq %rdi,%rcx + addq %rdx,%rcx + jc bad_to_user + cmpq threadinfo_addr_limit(%rax),%rcx + jae bad_to_user + xorl %eax,%eax /* clear zero flag */ + ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string + CFI_ENDPROC + +ENTRY(copy_user_generic) + CFI_STARTPROC + movl $1,%ecx /* set zero flag */ + ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string + CFI_ENDPROC + +ENTRY(__copy_from_user_inatomic) + CFI_STARTPROC + xorl %ecx,%ecx /* clear zero flag */ + ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string + CFI_ENDPROC + +/* Standard copy_from_user with segment limit checking */ +ENTRY(copy_from_user) + CFI_STARTPROC + GET_THREAD_INFO(%rax) + movq %rsi,%rcx + addq %rdx,%rcx + jc bad_from_user + cmpq threadinfo_addr_limit(%rax),%rcx + jae bad_from_user + movl $1,%ecx /* set zero flag */ + ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string + CFI_ENDPROC +ENDPROC(copy_from_user) + + .section .fixup,"ax" + /* must zero dest */ +bad_from_user: + CFI_STARTPROC + movl %edx,%ecx + xorl %eax,%eax + rep + stosb +bad_to_user: + movl %edx,%eax + ret + CFI_ENDPROC +END(bad_from_user) + .previous + + +/* + * copy_user_generic_unrolled - memory copy with exception handling. + * This version is for CPUs like P4 that don't have efficient micro code for rep movsq + * + * Input: + * rdi destination + * rsi source + * rdx count + * ecx zero flag -- if true zero destination on error + * + * Output: + * eax uncopied bytes or 0 if successful. 
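+ *
+ * (Editorial note: when the zero flag is set, as copy_from_user does, the
+ * fixup path below clears the not-yet-copied tail of the destination, so
+ * kernel buffers never hold stale data after a fault.  The returned byte
+ * count is only accurate to 8 bytes inside the unrolled loop, erring on
+ * the pessimistic side.)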
+ */ +ENTRY(copy_user_generic_unrolled) + CFI_STARTPROC + pushq %rbx + CFI_ADJUST_CFA_OFFSET 8 + CFI_REL_OFFSET rbx, 0 + pushq %rcx + CFI_ADJUST_CFA_OFFSET 8 + CFI_REL_OFFSET rcx, 0 + xorl %eax,%eax /*zero for the exception handler */ + +#ifdef FIX_ALIGNMENT + /* check for bad alignment of destination */ + movl %edi,%ecx + andl $7,%ecx + jnz .Lbad_alignment +.Lafter_bad_alignment: +#endif + + movq %rdx,%rcx + + movl $64,%ebx + shrq $6,%rdx + decq %rdx + js .Lhandle_tail + + .p2align 4 +.Lloop: +.Ls1: movq (%rsi),%r11 +.Ls2: movq 1*8(%rsi),%r8 +.Ls3: movq 2*8(%rsi),%r9 +.Ls4: movq 3*8(%rsi),%r10 +.Ld1: movq %r11,(%rdi) +.Ld2: movq %r8,1*8(%rdi) +.Ld3: movq %r9,2*8(%rdi) +.Ld4: movq %r10,3*8(%rdi) + +.Ls5: movq 4*8(%rsi),%r11 +.Ls6: movq 5*8(%rsi),%r8 +.Ls7: movq 6*8(%rsi),%r9 +.Ls8: movq 7*8(%rsi),%r10 +.Ld5: movq %r11,4*8(%rdi) +.Ld6: movq %r8,5*8(%rdi) +.Ld7: movq %r9,6*8(%rdi) +.Ld8: movq %r10,7*8(%rdi) + + decq %rdx + + leaq 64(%rsi),%rsi + leaq 64(%rdi),%rdi + + jns .Lloop + + .p2align 4 +.Lhandle_tail: + movl %ecx,%edx + andl $63,%ecx + shrl $3,%ecx + jz .Lhandle_7 + movl $8,%ebx + .p2align 4 +.Lloop_8: +.Ls9: movq (%rsi),%r8 +.Ld9: movq %r8,(%rdi) + decl %ecx + leaq 8(%rdi),%rdi + leaq 8(%rsi),%rsi + jnz .Lloop_8 + +.Lhandle_7: + movl %edx,%ecx + andl $7,%ecx + jz .Lende + .p2align 4 +.Lloop_1: +.Ls10: movb (%rsi),%bl +.Ld10: movb %bl,(%rdi) + incq %rdi + incq %rsi + decl %ecx + jnz .Lloop_1 + + CFI_REMEMBER_STATE +.Lende: + popq %rcx + CFI_ADJUST_CFA_OFFSET -8 + CFI_RESTORE rcx + popq %rbx + CFI_ADJUST_CFA_OFFSET -8 + CFI_RESTORE rbx + ret + CFI_RESTORE_STATE + +#ifdef FIX_ALIGNMENT + /* align destination */ + .p2align 4 +.Lbad_alignment: + movl $8,%r9d + subl %ecx,%r9d + movl %r9d,%ecx + cmpq %r9,%rdx + jz .Lhandle_7 + js .Lhandle_7 +.Lalign_1: +.Ls11: movb (%rsi),%bl +.Ld11: movb %bl,(%rdi) + incq %rsi + incq %rdi + decl %ecx + jnz .Lalign_1 + subq %r9,%rdx + jmp .Lafter_bad_alignment +#endif + + /* table sorted by exception address */ + .section __ex_table,"a" + .align 8 + .quad .Ls1,.Ls1e + .quad .Ls2,.Ls2e + .quad .Ls3,.Ls3e + .quad .Ls4,.Ls4e + .quad .Ld1,.Ls1e + .quad .Ld2,.Ls2e + .quad .Ld3,.Ls3e + .quad .Ld4,.Ls4e + .quad .Ls5,.Ls5e + .quad .Ls6,.Ls6e + .quad .Ls7,.Ls7e + .quad .Ls8,.Ls8e + .quad .Ld5,.Ls5e + .quad .Ld6,.Ls6e + .quad .Ld7,.Ls7e + .quad .Ld8,.Ls8e + .quad .Ls9,.Le_quad + .quad .Ld9,.Le_quad + .quad .Ls10,.Le_byte + .quad .Ld10,.Le_byte +#ifdef FIX_ALIGNMENT + .quad .Ls11,.Lzero_rest + .quad .Ld11,.Lzero_rest +#endif + .quad .Le5,.Le_zero + .previous + + /* compute 64-offset for main loop. 8 bytes accuracy with error on the + pessimistic side. this is gross. it would be better to fix the + interface. 
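+	   (editorial note: each .LsNe fixup below adds 8 to %eax per
+	   quadword of the faulting iteration that was not reached, then the
+	   remaining whole iterations and the tail bytes are added back in,
+	   so the reported count is only accurate to 8-byte granularity.)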
*/ + /* eax: zero, ebx: 64 */ +.Ls1e: addl $8,%eax +.Ls2e: addl $8,%eax +.Ls3e: addl $8,%eax +.Ls4e: addl $8,%eax +.Ls5e: addl $8,%eax +.Ls6e: addl $8,%eax +.Ls7e: addl $8,%eax +.Ls8e: addl $8,%eax + addq %rbx,%rdi /* +64 */ + subq %rax,%rdi /* correct destination with computed offset */ + + shlq $6,%rdx /* loop counter * 64 (stride length) */ + addq %rax,%rdx /* add offset to loopcnt */ + andl $63,%ecx /* remaining bytes */ + addq %rcx,%rdx /* add them */ + jmp .Lzero_rest + + /* exception on quad word loop in tail handling */ + /* ecx: loopcnt/8, %edx: length, rdi: correct */ +.Le_quad: + shll $3,%ecx + andl $7,%edx + addl %ecx,%edx + /* edx: bytes to zero, rdi: dest, eax:zero */ +.Lzero_rest: + cmpl $0,(%rsp) + jz .Le_zero + movq %rdx,%rcx +.Le_byte: + xorl %eax,%eax +.Le5: rep + stosb + /* when there is another exception while zeroing the rest just return */ +.Le_zero: + movq %rdx,%rax + jmp .Lende + CFI_ENDPROC +ENDPROC(copy_user_generic) + + + /* Some CPUs run faster using the string copy instructions. + This is also a lot simpler. Use them when possible. + Patch in jmps to this code instead of copying it fully + to avoid unwanted aliasing in the exception tables. */ + + /* rdi destination + * rsi source + * rdx count + * ecx zero flag + * + * Output: + * eax uncopied bytes or 0 if successfull. + * + * Only 4GB of copy is supported. This shouldn't be a problem + * because the kernel normally only writes from/to page sized chunks + * even if user space passed a longer buffer. + * And more would be dangerous because both Intel and AMD have + * errata with rep movsq > 4GB. If someone feels the need to fix + * this please consider this. + */ +ENTRY(copy_user_generic_string) + CFI_STARTPROC + movl %ecx,%r8d /* save zero flag */ + movl %edx,%ecx + shrl $3,%ecx + andl $7,%edx + jz 10f +1: rep + movsq + movl %edx,%ecx +2: rep + movsb +9: movl %ecx,%eax + ret + + /* multiple of 8 byte */ +10: rep + movsq + xor %eax,%eax + ret + + /* exception handling */ +3: lea (%rdx,%rcx,8),%rax /* exception on quad loop */ + jmp 6f +5: movl %ecx,%eax /* exception on byte loop */ + /* eax: left over bytes */ +6: testl %r8d,%r8d /* zero flag set? */ + jz 7f + movl %eax,%ecx /* initialize x86 loop counter */ + push %rax + xorl %eax,%eax +8: rep + stosb /* zero the rest */ +11: pop %rax +7: ret + CFI_ENDPROC +END(copy_user_generic_c) + + .section __ex_table,"a" + .quad 1b,3b + .quad 2b,5b + .quad 8b,11b + .quad 10b,3b + .previous diff --git a/arch/x86/lib/copy_user_nocache_64.S b/arch/x86/lib/copy_user_nocache_64.S new file mode 100644 index 000000000000..4620efb12f13 --- /dev/null +++ b/arch/x86/lib/copy_user_nocache_64.S @@ -0,0 +1,217 @@ +/* Copyright 2002 Andi Kleen, SuSE Labs. + * Subject to the GNU Public License v2. + * + * Functions to copy from and to user space. + */ + +#include +#include + +#define FIX_ALIGNMENT 1 + +#include +#include +#include +#include + +/* + * copy_user_nocache - Uncached memory copy with exception handling + * This will force destination/source out of cache for more performance. + * + * Input: + * rdi destination + * rsi source + * rdx count + * rcx zero flag when 1 zero on exception + * + * Output: + * eax uncopied bytes or 0 if successful. 
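+ *
+ * (Editorial note: the store side uses movnti, i.e. non-temporal stores
+ * that bypass the cache, which is why this variant is preferred for large
+ * copies whose destination will not be read back soon.  The structure
+ * otherwise mirrors copy_user_generic_unrolled in copy_user_64.S.)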
+ */ +ENTRY(__copy_user_nocache) + CFI_STARTPROC + pushq %rbx + CFI_ADJUST_CFA_OFFSET 8 + CFI_REL_OFFSET rbx, 0 + pushq %rcx /* save zero flag */ + CFI_ADJUST_CFA_OFFSET 8 + CFI_REL_OFFSET rcx, 0 + + xorl %eax,%eax /* zero for the exception handler */ + +#ifdef FIX_ALIGNMENT + /* check for bad alignment of destination */ + movl %edi,%ecx + andl $7,%ecx + jnz .Lbad_alignment +.Lafter_bad_alignment: +#endif + + movq %rdx,%rcx + + movl $64,%ebx + shrq $6,%rdx + decq %rdx + js .Lhandle_tail + + .p2align 4 +.Lloop: +.Ls1: movq (%rsi),%r11 +.Ls2: movq 1*8(%rsi),%r8 +.Ls3: movq 2*8(%rsi),%r9 +.Ls4: movq 3*8(%rsi),%r10 +.Ld1: movnti %r11,(%rdi) +.Ld2: movnti %r8,1*8(%rdi) +.Ld3: movnti %r9,2*8(%rdi) +.Ld4: movnti %r10,3*8(%rdi) + +.Ls5: movq 4*8(%rsi),%r11 +.Ls6: movq 5*8(%rsi),%r8 +.Ls7: movq 6*8(%rsi),%r9 +.Ls8: movq 7*8(%rsi),%r10 +.Ld5: movnti %r11,4*8(%rdi) +.Ld6: movnti %r8,5*8(%rdi) +.Ld7: movnti %r9,6*8(%rdi) +.Ld8: movnti %r10,7*8(%rdi) + + dec %rdx + + leaq 64(%rsi),%rsi + leaq 64(%rdi),%rdi + + jns .Lloop + + .p2align 4 +.Lhandle_tail: + movl %ecx,%edx + andl $63,%ecx + shrl $3,%ecx + jz .Lhandle_7 + movl $8,%ebx + .p2align 4 +.Lloop_8: +.Ls9: movq (%rsi),%r8 +.Ld9: movnti %r8,(%rdi) + decl %ecx + leaq 8(%rdi),%rdi + leaq 8(%rsi),%rsi + jnz .Lloop_8 + +.Lhandle_7: + movl %edx,%ecx + andl $7,%ecx + jz .Lende + .p2align 4 +.Lloop_1: +.Ls10: movb (%rsi),%bl +.Ld10: movb %bl,(%rdi) + incq %rdi + incq %rsi + decl %ecx + jnz .Lloop_1 + + CFI_REMEMBER_STATE +.Lende: + popq %rcx + CFI_ADJUST_CFA_OFFSET -8 + CFI_RESTORE %rcx + popq %rbx + CFI_ADJUST_CFA_OFFSET -8 + CFI_RESTORE rbx + ret + CFI_RESTORE_STATE + +#ifdef FIX_ALIGNMENT + /* align destination */ + .p2align 4 +.Lbad_alignment: + movl $8,%r9d + subl %ecx,%r9d + movl %r9d,%ecx + cmpq %r9,%rdx + jz .Lhandle_7 + js .Lhandle_7 +.Lalign_1: +.Ls11: movb (%rsi),%bl +.Ld11: movb %bl,(%rdi) + incq %rsi + incq %rdi + decl %ecx + jnz .Lalign_1 + subq %r9,%rdx + jmp .Lafter_bad_alignment +#endif + + /* table sorted by exception address */ + .section __ex_table,"a" + .align 8 + .quad .Ls1,.Ls1e + .quad .Ls2,.Ls2e + .quad .Ls3,.Ls3e + .quad .Ls4,.Ls4e + .quad .Ld1,.Ls1e + .quad .Ld2,.Ls2e + .quad .Ld3,.Ls3e + .quad .Ld4,.Ls4e + .quad .Ls5,.Ls5e + .quad .Ls6,.Ls6e + .quad .Ls7,.Ls7e + .quad .Ls8,.Ls8e + .quad .Ld5,.Ls5e + .quad .Ld6,.Ls6e + .quad .Ld7,.Ls7e + .quad .Ld8,.Ls8e + .quad .Ls9,.Le_quad + .quad .Ld9,.Le_quad + .quad .Ls10,.Le_byte + .quad .Ld10,.Le_byte +#ifdef FIX_ALIGNMENT + .quad .Ls11,.Lzero_rest + .quad .Ld11,.Lzero_rest +#endif + .quad .Le5,.Le_zero + .previous + + /* compute 64-offset for main loop. 8 bytes accuracy with error on the + pessimistic side. this is gross. it would be better to fix the + interface. */ + /* eax: zero, ebx: 64 */ +.Ls1e: addl $8,%eax +.Ls2e: addl $8,%eax +.Ls3e: addl $8,%eax +.Ls4e: addl $8,%eax +.Ls5e: addl $8,%eax +.Ls6e: addl $8,%eax +.Ls7e: addl $8,%eax +.Ls8e: addl $8,%eax + addq %rbx,%rdi /* +64 */ + subq %rax,%rdi /* correct destination with computed offset */ + + shlq $6,%rdx /* loop counter * 64 (stride length) */ + addq %rax,%rdx /* add offset to loopcnt */ + andl $63,%ecx /* remaining bytes */ + addq %rcx,%rdx /* add them */ + jmp .Lzero_rest + + /* exception on quad word loop in tail handling */ + /* ecx: loopcnt/8, %edx: length, rdi: correct */ +.Le_quad: + shll $3,%ecx + andl $7,%edx + addl %ecx,%edx + /* edx: bytes to zero, rdi: dest, eax:zero */ +.Lzero_rest: + cmpl $0,(%rsp) /* zero flag set? 
*/ + jz .Le_zero + movq %rdx,%rcx +.Le_byte: + xorl %eax,%eax +.Le5: rep + stosb + /* when there is another exception while zeroing the rest just return */ +.Le_zero: + movq %rdx,%rax + jmp .Lende + CFI_ENDPROC +ENDPROC(__copy_user_nocache) + + diff --git a/arch/x86/lib/csum-copy_64.S b/arch/x86/lib/csum-copy_64.S new file mode 100644 index 000000000000..f0dba36578ea --- /dev/null +++ b/arch/x86/lib/csum-copy_64.S @@ -0,0 +1,249 @@ +/* + * Copyright 2002,2003 Andi Kleen, SuSE Labs. + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file COPYING in the main directory of this archive + * for more details. No warranty for anything given at all. + */ +#include +#include +#include + +/* + * Checksum copy with exception handling. + * On exceptions src_err_ptr or dst_err_ptr is set to -EFAULT and the + * destination is zeroed. + * + * Input + * rdi source + * rsi destination + * edx len (32bit) + * ecx sum (32bit) + * r8 src_err_ptr (int) + * r9 dst_err_ptr (int) + * + * Output + * eax 64bit sum. undefined in case of exception. + * + * Wrappers need to take care of valid exception sum and zeroing. + * They also should align source or destination to 8 bytes. + */ + + .macro source +10: + .section __ex_table,"a" + .align 8 + .quad 10b,.Lbad_source + .previous + .endm + + .macro dest +20: + .section __ex_table,"a" + .align 8 + .quad 20b,.Lbad_dest + .previous + .endm + + .macro ignore L=.Lignore +30: + .section __ex_table,"a" + .align 8 + .quad 30b,\L + .previous + .endm + + +ENTRY(csum_partial_copy_generic) + CFI_STARTPROC + cmpl $3*64,%edx + jle .Lignore + +.Lignore: + subq $7*8,%rsp + CFI_ADJUST_CFA_OFFSET 7*8 + movq %rbx,2*8(%rsp) + CFI_REL_OFFSET rbx, 2*8 + movq %r12,3*8(%rsp) + CFI_REL_OFFSET r12, 3*8 + movq %r14,4*8(%rsp) + CFI_REL_OFFSET r14, 4*8 + movq %r13,5*8(%rsp) + CFI_REL_OFFSET r13, 5*8 + movq %rbp,6*8(%rsp) + CFI_REL_OFFSET rbp, 6*8 + + movq %r8,(%rsp) + movq %r9,1*8(%rsp) + + movl %ecx,%eax + movl %edx,%ecx + + xorl %r9d,%r9d + movq %rcx,%r12 + + shrq $6,%r12 + jz .Lhandle_tail /* < 64 */ + + clc + + /* main loop. 
clear in 64 byte blocks */ + /* r9: zero, r8: temp2, rbx: temp1, rax: sum, rcx: saved length */ + /* r11: temp3, rdx: temp4, r12 loopcnt */ + /* r10: temp5, rbp: temp6, r14 temp7, r13 temp8 */ + .p2align 4 +.Lloop: + source + movq (%rdi),%rbx + source + movq 8(%rdi),%r8 + source + movq 16(%rdi),%r11 + source + movq 24(%rdi),%rdx + + source + movq 32(%rdi),%r10 + source + movq 40(%rdi),%rbp + source + movq 48(%rdi),%r14 + source + movq 56(%rdi),%r13 + + ignore 2f + prefetcht0 5*64(%rdi) +2: + adcq %rbx,%rax + adcq %r8,%rax + adcq %r11,%rax + adcq %rdx,%rax + adcq %r10,%rax + adcq %rbp,%rax + adcq %r14,%rax + adcq %r13,%rax + + decl %r12d + + dest + movq %rbx,(%rsi) + dest + movq %r8,8(%rsi) + dest + movq %r11,16(%rsi) + dest + movq %rdx,24(%rsi) + + dest + movq %r10,32(%rsi) + dest + movq %rbp,40(%rsi) + dest + movq %r14,48(%rsi) + dest + movq %r13,56(%rsi) + +3: + + leaq 64(%rdi),%rdi + leaq 64(%rsi),%rsi + + jnz .Lloop + + adcq %r9,%rax + + /* do last upto 56 bytes */ +.Lhandle_tail: + /* ecx: count */ + movl %ecx,%r10d + andl $63,%ecx + shrl $3,%ecx + jz .Lfold + clc + .p2align 4 +.Lloop_8: + source + movq (%rdi),%rbx + adcq %rbx,%rax + decl %ecx + dest + movq %rbx,(%rsi) + leaq 8(%rsi),%rsi /* preserve carry */ + leaq 8(%rdi),%rdi + jnz .Lloop_8 + adcq %r9,%rax /* add in carry */ + +.Lfold: + /* reduce checksum to 32bits */ + movl %eax,%ebx + shrq $32,%rax + addl %ebx,%eax + adcl %r9d,%eax + + /* do last upto 6 bytes */ +.Lhandle_7: + movl %r10d,%ecx + andl $7,%ecx + shrl $1,%ecx + jz .Lhandle_1 + movl $2,%edx + xorl %ebx,%ebx + clc + .p2align 4 +.Lloop_1: + source + movw (%rdi),%bx + adcl %ebx,%eax + decl %ecx + dest + movw %bx,(%rsi) + leaq 2(%rdi),%rdi + leaq 2(%rsi),%rsi + jnz .Lloop_1 + adcl %r9d,%eax /* add in carry */ + + /* handle last odd byte */ +.Lhandle_1: + testl $1,%r10d + jz .Lende + xorl %ebx,%ebx + source + movb (%rdi),%bl + dest + movb %bl,(%rsi) + addl %ebx,%eax + adcl %r9d,%eax /* carry */ + + CFI_REMEMBER_STATE +.Lende: + movq 2*8(%rsp),%rbx + CFI_RESTORE rbx + movq 3*8(%rsp),%r12 + CFI_RESTORE r12 + movq 4*8(%rsp),%r14 + CFI_RESTORE r14 + movq 5*8(%rsp),%r13 + CFI_RESTORE r13 + movq 6*8(%rsp),%rbp + CFI_RESTORE rbp + addq $7*8,%rsp + CFI_ADJUST_CFA_OFFSET -7*8 + ret + CFI_RESTORE_STATE + + /* Exception handlers. Very simple, zeroing is done in the wrappers */ +.Lbad_source: + movq (%rsp),%rax + testq %rax,%rax + jz .Lende + movl $-EFAULT,(%rax) + jmp .Lende + +.Lbad_dest: + movq 8(%rsp),%rax + testq %rax,%rax + jz .Lende + movl $-EFAULT,(%rax) + jmp .Lende + CFI_ENDPROC +ENDPROC(csum_partial_copy_generic) diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c new file mode 100644 index 000000000000..bc503f506903 --- /dev/null +++ b/arch/x86/lib/csum-partial_64.c @@ -0,0 +1,150 @@ +/* + * arch/x86_64/lib/csum-partial.c + * + * This file contains network checksum routines that are better done + * in an architecture-specific manner due to speed. + */ + +#include +#include +#include + +static inline unsigned short from32to16(unsigned a) +{ + unsigned short b = a >> 16; + asm("addw %w2,%w0\n\t" + "adcw $0,%w0\n" + : "=r" (b) + : "0" (b), "r" (a)); + return b; +} + +/* + * Do a 64-bit checksum on an arbitrary memory area. + * Returns a 32bit checksum. + * + * This isn't as time critical as it used to be because many NICs + * do hardware checksumming these days. + * + * Things tried and found to not make it faster: + * Manual Prefetching + * Unrolling to an 128 bytes inner loop. + * Using interleaving with more registers to break the carry chains. 
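+ *
+ * (Editorial note: the main loop below sums eight quadwords per iteration
+ * with a single addq/adcq carry chain plus a trailing "adcq zero" to fold
+ * the final carry back in; the 64-bit accumulator is then reduced with
+ * add32_with_carry(result >> 32, result & 0xffffffff).)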
+ */ +static unsigned do_csum(const unsigned char *buff, unsigned len) +{ + unsigned odd, count; + unsigned long result = 0; + + if (unlikely(len == 0)) + return result; + odd = 1 & (unsigned long) buff; + if (unlikely(odd)) { + result = *buff << 8; + len--; + buff++; + } + count = len >> 1; /* nr of 16-bit words.. */ + if (count) { + if (2 & (unsigned long) buff) { + result += *(unsigned short *)buff; + count--; + len -= 2; + buff += 2; + } + count >>= 1; /* nr of 32-bit words.. */ + if (count) { + unsigned long zero; + unsigned count64; + if (4 & (unsigned long) buff) { + result += *(unsigned int *) buff; + count--; + len -= 4; + buff += 4; + } + count >>= 1; /* nr of 64-bit words.. */ + + /* main loop using 64byte blocks */ + zero = 0; + count64 = count >> 3; + while (count64) { + asm("addq 0*8(%[src]),%[res]\n\t" + "adcq 1*8(%[src]),%[res]\n\t" + "adcq 2*8(%[src]),%[res]\n\t" + "adcq 3*8(%[src]),%[res]\n\t" + "adcq 4*8(%[src]),%[res]\n\t" + "adcq 5*8(%[src]),%[res]\n\t" + "adcq 6*8(%[src]),%[res]\n\t" + "adcq 7*8(%[src]),%[res]\n\t" + "adcq %[zero],%[res]" + : [res] "=r" (result) + : [src] "r" (buff), [zero] "r" (zero), + "[res]" (result)); + buff += 64; + count64--; + } + + /* last upto 7 8byte blocks */ + count %= 8; + while (count) { + asm("addq %1,%0\n\t" + "adcq %2,%0\n" + : "=r" (result) + : "m" (*(unsigned long *)buff), + "r" (zero), "0" (result)); + --count; + buff += 8; + } + result = add32_with_carry(result>>32, + result&0xffffffff); + + if (len & 4) { + result += *(unsigned int *) buff; + buff += 4; + } + } + if (len & 2) { + result += *(unsigned short *) buff; + buff += 2; + } + } + if (len & 1) + result += *buff; + result = add32_with_carry(result>>32, result & 0xffffffff); + if (unlikely(odd)) { + result = from32to16(result); + result = ((result >> 8) & 0xff) | ((result & 0xff) << 8); + } + return result; +} + +/* + * computes the checksum of a memory block at buff, length len, + * and adds in "sum" (32-bit) + * + * returns a 32-bit number suitable for feeding into itself + * or csum_tcpudp_magic + * + * this function must be called with even lengths, except + * for the last fragment, which may be odd + * + * it's best to have buff aligned on a 64-bit boundary + */ +__wsum csum_partial(const void *buff, int len, __wsum sum) +{ + return (__force __wsum)add32_with_carry(do_csum(buff, len), + (__force u32)sum); +} + +EXPORT_SYMBOL(csum_partial); + +/* + * this routine is used for miscellaneous IP-like checksums, mainly + * in icmp.c + */ +__sum16 ip_compute_csum(const void *buff, int len) +{ + return csum_fold(csum_partial(buff,len,0)); +} +EXPORT_SYMBOL(ip_compute_csum); + diff --git a/arch/x86/lib/csum-wrappers_64.c b/arch/x86/lib/csum-wrappers_64.c new file mode 100644 index 000000000000..fd42a4a095fc --- /dev/null +++ b/arch/x86/lib/csum-wrappers_64.c @@ -0,0 +1,135 @@ +/* Copyright 2002,2003 Andi Kleen, SuSE Labs. + * Subject to the GNU Public License v.2 + * + * Wrappers of assembly checksum functions for x86-64. + */ + +#include +#include + +/** + * csum_partial_copy_from_user - Copy and checksum from user space. + * @src: source address (user space) + * @dst: destination address + * @len: number of bytes to be copied. + * @isum: initial sum that is added into the result (32bit unfolded) + * @errp: set to -EFAULT for an bad source address. + * + * Returns an 32bit unfolded checksum of the buffer. + * src and dst are best aligned to 64bits. 
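+ *
+ * (Editorial note: on a faulting copy *errp is set to -EFAULT and the
+ * whole destination buffer is zeroed with memset(), so a failed copy
+ * never leaves partially-initialized kernel memory behind.)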
+ */ +__wsum +csum_partial_copy_from_user(const void __user *src, void *dst, + int len, __wsum isum, int *errp) +{ + might_sleep(); + *errp = 0; + if (likely(access_ok(VERIFY_READ,src, len))) { + /* Why 6, not 7? To handle odd addresses aligned we + would need to do considerable complications to fix the + checksum which is defined as an 16bit accumulator. The + fix alignment code is primarily for performance + compatibility with 32bit and that will handle odd + addresses slowly too. */ + if (unlikely((unsigned long)src & 6)) { + while (((unsigned long)src & 6) && len >= 2) { + __u16 val16; + *errp = __get_user(val16, (const __u16 __user *)src); + if (*errp) + return isum; + *(__u16 *)dst = val16; + isum = (__force __wsum)add32_with_carry( + (__force unsigned)isum, val16); + src += 2; + dst += 2; + len -= 2; + } + } + isum = csum_partial_copy_generic((__force const void *)src, + dst, len, isum, errp, NULL); + if (likely(*errp == 0)) + return isum; + } + *errp = -EFAULT; + memset(dst,0,len); + return isum; +} + +EXPORT_SYMBOL(csum_partial_copy_from_user); + +/** + * csum_partial_copy_to_user - Copy and checksum to user space. + * @src: source address + * @dst: destination address (user space) + * @len: number of bytes to be copied. + * @isum: initial sum that is added into the result (32bit unfolded) + * @errp: set to -EFAULT for an bad destination address. + * + * Returns an 32bit unfolded checksum of the buffer. + * src and dst are best aligned to 64bits. + */ +__wsum +csum_partial_copy_to_user(const void *src, void __user *dst, + int len, __wsum isum, int *errp) +{ + might_sleep(); + if (unlikely(!access_ok(VERIFY_WRITE, dst, len))) { + *errp = -EFAULT; + return 0; + } + + if (unlikely((unsigned long)dst & 6)) { + while (((unsigned long)dst & 6) && len >= 2) { + __u16 val16 = *(__u16 *)src; + isum = (__force __wsum)add32_with_carry( + (__force unsigned)isum, val16); + *errp = __put_user(val16, (__u16 __user *)dst); + if (*errp) + return isum; + src += 2; + dst += 2; + len -= 2; + } + } + + *errp = 0; + return csum_partial_copy_generic(src, (void __force *)dst,len,isum,NULL,errp); +} + +EXPORT_SYMBOL(csum_partial_copy_to_user); + +/** + * csum_partial_copy_nocheck - Copy and checksum. + * @src: source address + * @dst: destination address + * @len: number of bytes to be copied. + * @isum: initial sum that is added into the result (32bit unfolded) + * + * Returns an 32bit unfolded checksum of the buffer. 
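+ *
+ * (Editorial note: unlike the _from_user/_to_user wrappers above, no
+ * access_ok() check and no exception error pointers are passed here, so
+ * both buffers must be valid kernel addresses.)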
+ */ +__wsum +csum_partial_copy_nocheck(const void *src, void *dst, int len, __wsum sum) +{ + return csum_partial_copy_generic(src,dst,len,sum,NULL,NULL); +} +EXPORT_SYMBOL(csum_partial_copy_nocheck); + +__sum16 csum_ipv6_magic(const struct in6_addr *saddr, + const struct in6_addr *daddr, + __u32 len, unsigned short proto, __wsum sum) +{ + __u64 rest, sum64; + + rest = (__force __u64)htonl(len) + (__force __u64)htons(proto) + + (__force __u64)sum; + asm(" addq (%[saddr]),%[sum]\n" + " adcq 8(%[saddr]),%[sum]\n" + " adcq (%[daddr]),%[sum]\n" + " adcq 8(%[daddr]),%[sum]\n" + " adcq $0,%[sum]\n" + : [sum] "=r" (sum64) + : "[sum]" (rest),[saddr] "r" (saddr), [daddr] "r" (daddr)); + return csum_fold((__force __wsum)add32_with_carry(sum64 & 0xffffffff, sum64>>32)); +} + +EXPORT_SYMBOL(csum_ipv6_magic); diff --git a/arch/x86/lib/delay_64.c b/arch/x86/lib/delay_64.c new file mode 100644 index 000000000000..2dbebd308347 --- /dev/null +++ b/arch/x86/lib/delay_64.c @@ -0,0 +1,57 @@ +/* + * Precise Delay Loops for x86-64 + * + * Copyright (C) 1993 Linus Torvalds + * Copyright (C) 1997 Martin Mares + * + * The __delay function must _NOT_ be inlined as its execution time + * depends wildly on alignment on many x86 processors. + */ + +#include +#include +#include +#include +#include + +#ifdef CONFIG_SMP +#include +#endif + +int read_current_timer(unsigned long *timer_value) +{ + rdtscll(*timer_value); + return 0; +} + +void __delay(unsigned long loops) +{ + unsigned bclock, now; + + rdtscl(bclock); + do + { + rep_nop(); + rdtscl(now); + } + while((now-bclock) < loops); +} +EXPORT_SYMBOL(__delay); + +inline void __const_udelay(unsigned long xloops) +{ + __delay(((xloops * HZ * cpu_data[raw_smp_processor_id()].loops_per_jiffy) >> 32) + 1); +} +EXPORT_SYMBOL(__const_udelay); + +void __udelay(unsigned long usecs) +{ + __const_udelay(usecs * 0x000010c7); /* 2**32 / 1000000 (rounded up) */ +} +EXPORT_SYMBOL(__udelay); + +void __ndelay(unsigned long nsecs) +{ + __const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */ +} +EXPORT_SYMBOL(__ndelay); diff --git a/arch/x86/lib/getuser_64.S b/arch/x86/lib/getuser_64.S new file mode 100644 index 000000000000..5448876261f8 --- /dev/null +++ b/arch/x86/lib/getuser_64.S @@ -0,0 +1,109 @@ +/* + * __get_user functions. + * + * (C) Copyright 1998 Linus Torvalds + * (C) Copyright 2005 Andi Kleen + * + * These functions have a non-standard call interface + * to make them more efficient, especially as they + * return an error value in addition to the "real" + * return value. + */ + +/* + * __get_user_X + * + * Inputs: %rcx contains the address. + * The register is modified, but all changes are undone + * before returning because the C code doesn't know about it. + * + * Outputs: %rax is error code (0 or -EFAULT) + * %rdx contains zero-extended value + * + * %r8 is destroyed. + * + * These functions should not modify any other registers, + * as they get called from within inline assembly. 
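+ *
+ * (Editorial illustration, not part of the original comment: a
+ * get_user(x, ptr) expansion is expected to load ptr into %rcx, issue
+ * "call __get_user_N" for the access size, then read the error code from
+ * %rax and the fetched value from %rdx.)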
+ */ + +#include +#include +#include +#include +#include +#include + + .text +ENTRY(__get_user_1) + CFI_STARTPROC + GET_THREAD_INFO(%r8) + cmpq threadinfo_addr_limit(%r8),%rcx + jae bad_get_user +1: movzb (%rcx),%edx + xorl %eax,%eax + ret + CFI_ENDPROC +ENDPROC(__get_user_1) + +ENTRY(__get_user_2) + CFI_STARTPROC + GET_THREAD_INFO(%r8) + addq $1,%rcx + jc 20f + cmpq threadinfo_addr_limit(%r8),%rcx + jae 20f + decq %rcx +2: movzwl (%rcx),%edx + xorl %eax,%eax + ret +20: decq %rcx + jmp bad_get_user + CFI_ENDPROC +ENDPROC(__get_user_2) + +ENTRY(__get_user_4) + CFI_STARTPROC + GET_THREAD_INFO(%r8) + addq $3,%rcx + jc 30f + cmpq threadinfo_addr_limit(%r8),%rcx + jae 30f + subq $3,%rcx +3: movl (%rcx),%edx + xorl %eax,%eax + ret +30: subq $3,%rcx + jmp bad_get_user + CFI_ENDPROC +ENDPROC(__get_user_4) + +ENTRY(__get_user_8) + CFI_STARTPROC + GET_THREAD_INFO(%r8) + addq $7,%rcx + jc 40f + cmpq threadinfo_addr_limit(%r8),%rcx + jae 40f + subq $7,%rcx +4: movq (%rcx),%rdx + xorl %eax,%eax + ret +40: subq $7,%rcx + jmp bad_get_user + CFI_ENDPROC +ENDPROC(__get_user_8) + +bad_get_user: + CFI_STARTPROC + xorl %edx,%edx + movq $(-EFAULT),%rax + ret + CFI_ENDPROC +END(bad_get_user) + +.section __ex_table,"a" + .quad 1b,bad_get_user + .quad 2b,bad_get_user + .quad 3b,bad_get_user + .quad 4b,bad_get_user +.previous diff --git a/arch/x86/lib/io_64.c b/arch/x86/lib/io_64.c new file mode 100644 index 000000000000..87b4a4e18039 --- /dev/null +++ b/arch/x86/lib/io_64.c @@ -0,0 +1,23 @@ +#include +#include +#include + +void __memcpy_toio(unsigned long dst,const void*src,unsigned len) +{ + __inline_memcpy((void *) dst,src,len); +} +EXPORT_SYMBOL(__memcpy_toio); + +void __memcpy_fromio(void *dst,unsigned long src,unsigned len) +{ + __inline_memcpy(dst,(const void *) src,len); +} +EXPORT_SYMBOL(__memcpy_fromio); + +void memset_io(volatile void __iomem *a, int b, size_t c) +{ + /* XXX: memset can mangle the IO patterns quite a bit. + perhaps it would be better to use a dumb one */ + memset((void *)a,b,c); +} +EXPORT_SYMBOL(memset_io); diff --git a/arch/x86/lib/iomap_copy_64.S b/arch/x86/lib/iomap_copy_64.S new file mode 100644 index 000000000000..05a95e713da8 --- /dev/null +++ b/arch/x86/lib/iomap_copy_64.S @@ -0,0 +1,30 @@ +/* + * Copyright 2006 PathScale, Inc. All Rights Reserved. + * + * This file is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#include +#include + +/* + * override generic version in lib/iomap_copy.c + */ +ENTRY(__iowrite32_copy) + CFI_STARTPROC + movl %edx,%ecx + rep movsd + ret + CFI_ENDPROC +ENDPROC(__iowrite32_copy) diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S new file mode 100644 index 000000000000..c22981fa2f3a --- /dev/null +++ b/arch/x86/lib/memcpy_64.S @@ -0,0 +1,131 @@ +/* Copyright 2002 Andi Kleen */ + +#include +#include +#include + +/* + * memcpy - Copy a memory block. 
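+ *
+ * (Editorial note: on CPUs advertising X86_FEATURE_REP_GOOD the
+ * alternatives patching at the end of this file redirects memcpy to
+ * memcpy_c, the simple rep movsq/rep movsb variant; otherwise the
+ * unrolled 64-byte loop below is used.)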
+ * + * Input: + * rdi destination + * rsi source + * rdx count + * + * Output: + * rax original destination + */ + + ALIGN +memcpy_c: + CFI_STARTPROC + movq %rdi,%rax + movl %edx,%ecx + shrl $3,%ecx + andl $7,%edx + rep movsq + movl %edx,%ecx + rep movsb + ret + CFI_ENDPROC +ENDPROC(memcpy_c) + +ENTRY(__memcpy) +ENTRY(memcpy) + CFI_STARTPROC + pushq %rbx + CFI_ADJUST_CFA_OFFSET 8 + CFI_REL_OFFSET rbx, 0 + movq %rdi,%rax + + movl %edx,%ecx + shrl $6,%ecx + jz .Lhandle_tail + + .p2align 4 +.Lloop_64: + decl %ecx + + movq (%rsi),%r11 + movq 8(%rsi),%r8 + + movq %r11,(%rdi) + movq %r8,1*8(%rdi) + + movq 2*8(%rsi),%r9 + movq 3*8(%rsi),%r10 + + movq %r9,2*8(%rdi) + movq %r10,3*8(%rdi) + + movq 4*8(%rsi),%r11 + movq 5*8(%rsi),%r8 + + movq %r11,4*8(%rdi) + movq %r8,5*8(%rdi) + + movq 6*8(%rsi),%r9 + movq 7*8(%rsi),%r10 + + movq %r9,6*8(%rdi) + movq %r10,7*8(%rdi) + + leaq 64(%rsi),%rsi + leaq 64(%rdi),%rdi + jnz .Lloop_64 + +.Lhandle_tail: + movl %edx,%ecx + andl $63,%ecx + shrl $3,%ecx + jz .Lhandle_7 + .p2align 4 +.Lloop_8: + decl %ecx + movq (%rsi),%r8 + movq %r8,(%rdi) + leaq 8(%rdi),%rdi + leaq 8(%rsi),%rsi + jnz .Lloop_8 + +.Lhandle_7: + movl %edx,%ecx + andl $7,%ecx + jz .Lende + .p2align 4 +.Lloop_1: + movb (%rsi),%r8b + movb %r8b,(%rdi) + incq %rdi + incq %rsi + decl %ecx + jnz .Lloop_1 + +.Lende: + popq %rbx + CFI_ADJUST_CFA_OFFSET -8 + CFI_RESTORE rbx + ret +.Lfinal: + CFI_ENDPROC +ENDPROC(memcpy) +ENDPROC(__memcpy) + + /* Some CPUs run faster using the string copy instructions. + It is also a lot simpler. Use this when possible */ + + .section .altinstr_replacement,"ax" +1: .byte 0xeb /* jmp */ + .byte (memcpy_c - memcpy) - (2f - 1b) /* offset */ +2: + .previous + .section .altinstructions,"a" + .align 8 + .quad memcpy + .quad 1b + .byte X86_FEATURE_REP_GOOD + /* Replace only beginning, memcpy is used to apply alternatives, so it + * is silly to overwrite itself with nops - reboot is only outcome... */ + .byte 2b - 1b + .byte 2b - 1b + .previous diff --git a/arch/x86/lib/memmove_64.c b/arch/x86/lib/memmove_64.c new file mode 100644 index 000000000000..751ebae8ec42 --- /dev/null +++ b/arch/x86/lib/memmove_64.c @@ -0,0 +1,21 @@ +/* Normally compiler builtins are used, but sometimes the compiler calls out + of line code. Based on asm-i386/string.h. + */ +#define _STRING_C +#include +#include + +#undef memmove +void *memmove(void * dest,const void *src,size_t count) +{ + if (dest < src) { + return memcpy(dest,src,count); + } else { + char *p = (char *) dest + count; + char *s = (char *) src + count; + while (count--) + *--p = *--s; + } + return dest; +} +EXPORT_SYMBOL(memmove); diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S new file mode 100644 index 000000000000..2c5948116bd2 --- /dev/null +++ b/arch/x86/lib/memset_64.S @@ -0,0 +1,133 @@ +/* Copyright 2002 Andi Kleen, SuSE Labs */ + +#include +#include + +/* + * ISO C memset - set a memory block to a byte value. 
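+ *
+ * (Editorial note: both variants below broadcast the fill byte into a
+ * full quadword by multiplying it with 0x0101010101010101, for example
+ * 0xab becomes 0xabababababababab, so the block can be written eight
+ * bytes at a time.)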
+ * + * rdi destination + * rsi value (char) + * rdx count (bytes) + * + * rax original destination + */ + ALIGN +memset_c: + CFI_STARTPROC + movq %rdi,%r9 + movl %edx,%r8d + andl $7,%r8d + movl %edx,%ecx + shrl $3,%ecx + /* expand byte value */ + movzbl %sil,%esi + movabs $0x0101010101010101,%rax + mulq %rsi /* with rax, clobbers rdx */ + rep stosq + movl %r8d,%ecx + rep stosb + movq %r9,%rax + ret + CFI_ENDPROC +ENDPROC(memset_c) + +ENTRY(memset) +ENTRY(__memset) + CFI_STARTPROC + movq %rdi,%r10 + movq %rdx,%r11 + + /* expand byte value */ + movzbl %sil,%ecx + movabs $0x0101010101010101,%rax + mul %rcx /* with rax, clobbers rdx */ + + /* align dst */ + movl %edi,%r9d + andl $7,%r9d + jnz .Lbad_alignment + CFI_REMEMBER_STATE +.Lafter_bad_alignment: + + movl %r11d,%ecx + shrl $6,%ecx + jz .Lhandle_tail + + .p2align 4 +.Lloop_64: + decl %ecx + movq %rax,(%rdi) + movq %rax,8(%rdi) + movq %rax,16(%rdi) + movq %rax,24(%rdi) + movq %rax,32(%rdi) + movq %rax,40(%rdi) + movq %rax,48(%rdi) + movq %rax,56(%rdi) + leaq 64(%rdi),%rdi + jnz .Lloop_64 + + /* Handle tail in loops. The loops should be faster than hard + to predict jump tables. */ + .p2align 4 +.Lhandle_tail: + movl %r11d,%ecx + andl $63&(~7),%ecx + jz .Lhandle_7 + shrl $3,%ecx + .p2align 4 +.Lloop_8: + decl %ecx + movq %rax,(%rdi) + leaq 8(%rdi),%rdi + jnz .Lloop_8 + +.Lhandle_7: + movl %r11d,%ecx + andl $7,%ecx + jz .Lende + .p2align 4 +.Lloop_1: + decl %ecx + movb %al,(%rdi) + leaq 1(%rdi),%rdi + jnz .Lloop_1 + +.Lende: + movq %r10,%rax + ret + + CFI_RESTORE_STATE +.Lbad_alignment: + cmpq $7,%r11 + jbe .Lhandle_7 + movq %rax,(%rdi) /* unaligned store */ + movq $8,%r8 + subq %r9,%r8 + addq %r8,%rdi + subq %r8,%r11 + jmp .Lafter_bad_alignment +.Lfinal: + CFI_ENDPROC +ENDPROC(memset) +ENDPROC(__memset) + + /* Some CPUs run faster using the string instructions. + It is also a lot simpler. Use this when possible */ + +#include + + .section .altinstr_replacement,"ax" +1: .byte 0xeb /* jmp */ + .byte (memset_c - memset) - (2f - 1b) /* offset */ +2: + .previous + .section .altinstructions,"a" + .align 8 + .quad memset + .quad 1b + .byte X86_FEATURE_REP_GOOD + .byte .Lfinal - memset + .byte 2b - 1b + .previous diff --git a/arch/x86/lib/putuser_64.S b/arch/x86/lib/putuser_64.S new file mode 100644 index 000000000000..4989f5a8fa9b --- /dev/null +++ b/arch/x86/lib/putuser_64.S @@ -0,0 +1,106 @@ +/* + * __put_user functions. + * + * (C) Copyright 1998 Linus Torvalds + * (C) Copyright 2005 Andi Kleen + * + * These functions have a non-standard call interface + * to make them more efficient, especially as they + * return an error value in addition to the "real" + * return value. + */ + +/* + * __put_user_X + * + * Inputs: %rcx contains the address + * %rdx contains new value + * + * Outputs: %rax is error code (0 or -EFAULT) + * + * %r8 is destroyed. + * + * These functions should not modify any other registers, + * as they get called from within inline assembly. 
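+ *
+ * (Editorial illustration, not part of the original comment: a
+ * put_user(x, ptr) expansion is expected to place ptr in %rcx and the
+ * value in %rdx, or the matching sub-register for smaller sizes, issue
+ * "call __put_user_N", and read the error code back from %rax.)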
+ */ + +#include +#include +#include +#include +#include +#include + + .text +ENTRY(__put_user_1) + CFI_STARTPROC + GET_THREAD_INFO(%r8) + cmpq threadinfo_addr_limit(%r8),%rcx + jae bad_put_user +1: movb %dl,(%rcx) + xorl %eax,%eax + ret + CFI_ENDPROC +ENDPROC(__put_user_1) + +ENTRY(__put_user_2) + CFI_STARTPROC + GET_THREAD_INFO(%r8) + addq $1,%rcx + jc 20f + cmpq threadinfo_addr_limit(%r8),%rcx + jae 20f + decq %rcx +2: movw %dx,(%rcx) + xorl %eax,%eax + ret +20: decq %rcx + jmp bad_put_user + CFI_ENDPROC +ENDPROC(__put_user_2) + +ENTRY(__put_user_4) + CFI_STARTPROC + GET_THREAD_INFO(%r8) + addq $3,%rcx + jc 30f + cmpq threadinfo_addr_limit(%r8),%rcx + jae 30f + subq $3,%rcx +3: movl %edx,(%rcx) + xorl %eax,%eax + ret +30: subq $3,%rcx + jmp bad_put_user + CFI_ENDPROC +ENDPROC(__put_user_4) + +ENTRY(__put_user_8) + CFI_STARTPROC + GET_THREAD_INFO(%r8) + addq $7,%rcx + jc 40f + cmpq threadinfo_addr_limit(%r8),%rcx + jae 40f + subq $7,%rcx +4: movq %rdx,(%rcx) + xorl %eax,%eax + ret +40: subq $7,%rcx + jmp bad_put_user + CFI_ENDPROC +ENDPROC(__put_user_8) + +bad_put_user: + CFI_STARTPROC + movq $(-EFAULT),%rax + ret + CFI_ENDPROC +END(bad_put_user) + +.section __ex_table,"a" + .quad 1b,bad_put_user + .quad 2b,bad_put_user + .quad 3b,bad_put_user + .quad 4b,bad_put_user +.previous diff --git a/arch/x86/lib/rwlock_64.S b/arch/x86/lib/rwlock_64.S new file mode 100644 index 000000000000..0cde1f807314 --- /dev/null +++ b/arch/x86/lib/rwlock_64.S @@ -0,0 +1,38 @@ +/* Slow paths of read/write spinlocks. */ + +#include +#include +#include +#include + +/* rdi: pointer to rwlock_t */ +ENTRY(__write_lock_failed) + CFI_STARTPROC + LOCK_PREFIX + addl $RW_LOCK_BIAS,(%rdi) +1: rep + nop + cmpl $RW_LOCK_BIAS,(%rdi) + jne 1b + LOCK_PREFIX + subl $RW_LOCK_BIAS,(%rdi) + jnz __write_lock_failed + ret + CFI_ENDPROC +END(__write_lock_failed) + +/* rdi: pointer to rwlock_t */ +ENTRY(__read_lock_failed) + CFI_STARTPROC + LOCK_PREFIX + incl (%rdi) +1: rep + nop + cmpl $1,(%rdi) + js 1b + LOCK_PREFIX + decl (%rdi) + js __read_lock_failed + ret + CFI_ENDPROC +END(__read_lock_failed) diff --git a/arch/x86/lib/thunk_64.S b/arch/x86/lib/thunk_64.S new file mode 100644 index 000000000000..55e586d352d3 --- /dev/null +++ b/arch/x86/lib/thunk_64.S @@ -0,0 +1,67 @@ +/* + * Save registers before calling assembly functions. This avoids + * disturbance of register allocation in some inline assembly constructs. + * Copyright 2001,2002 by Andi Kleen, SuSE Labs. + * Subject to the GNU public license, v.2. No warranty of any kind. + */ + + #include + #include + #include + #include + + /* rdi: arg1 ... normal C conventions. rax is saved/restored. */ + .macro thunk name,func + .globl \name +\name: + CFI_STARTPROC + SAVE_ARGS + call \func + jmp restore + CFI_ENDPROC + .endm + + /* rdi: arg1 ... normal C conventions. rax is passed from C. 
*/ + .macro thunk_retrax name,func + .globl \name +\name: + CFI_STARTPROC + SAVE_ARGS + call \func + jmp restore_norax + CFI_ENDPROC + .endm + + + .section .sched.text +#ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM + thunk rwsem_down_read_failed_thunk,rwsem_down_read_failed + thunk rwsem_down_write_failed_thunk,rwsem_down_write_failed + thunk rwsem_wake_thunk,rwsem_wake + thunk rwsem_downgrade_thunk,rwsem_downgrade_wake +#endif + + thunk __down_failed,__down + thunk_retrax __down_failed_interruptible,__down_interruptible + thunk_retrax __down_failed_trylock,__down_trylock + thunk __up_wakeup,__up + +#ifdef CONFIG_TRACE_IRQFLAGS + thunk trace_hardirqs_on_thunk,trace_hardirqs_on + thunk trace_hardirqs_off_thunk,trace_hardirqs_off +#endif + + /* SAVE_ARGS below is used only for the .cfi directives it contains. */ + CFI_STARTPROC + SAVE_ARGS +restore: + RESTORE_ARGS + ret + CFI_ENDPROC + + CFI_STARTPROC + SAVE_ARGS +restore_norax: + RESTORE_ARGS 1 + ret + CFI_ENDPROC diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c new file mode 100644 index 000000000000..893d43f838cc --- /dev/null +++ b/arch/x86/lib/usercopy_64.c @@ -0,0 +1,166 @@ +/* + * User address space access functions. + * + * Copyright 1997 Andi Kleen + * Copyright 1997 Linus Torvalds + * Copyright 2002 Andi Kleen + */ +#include +#include + +/* + * Copy a null terminated string from userspace. + */ + +#define __do_strncpy_from_user(dst,src,count,res) \ +do { \ + long __d0, __d1, __d2; \ + might_sleep(); \ + __asm__ __volatile__( \ + " testq %1,%1\n" \ + " jz 2f\n" \ + "0: lodsb\n" \ + " stosb\n" \ + " testb %%al,%%al\n" \ + " jz 1f\n" \ + " decq %1\n" \ + " jnz 0b\n" \ + "1: subq %1,%0\n" \ + "2:\n" \ + ".section .fixup,\"ax\"\n" \ + "3: movq %5,%0\n" \ + " jmp 2b\n" \ + ".previous\n" \ + ".section __ex_table,\"a\"\n" \ + " .align 8\n" \ + " .quad 0b,3b\n" \ + ".previous" \ + : "=r"(res), "=c"(count), "=&a" (__d0), "=&S" (__d1), \ + "=&D" (__d2) \ + : "i"(-EFAULT), "0"(count), "1"(count), "3"(src), "4"(dst) \ + : "memory"); \ +} while (0) + +long +__strncpy_from_user(char *dst, const char __user *src, long count) +{ + long res; + __do_strncpy_from_user(dst, src, count, res); + return res; +} +EXPORT_SYMBOL(__strncpy_from_user); + +long +strncpy_from_user(char *dst, const char __user *src, long count) +{ + long res = -EFAULT; + if (access_ok(VERIFY_READ, src, 1)) + return __strncpy_from_user(dst, src, count); + return res; +} +EXPORT_SYMBOL(strncpy_from_user); + +/* + * Zero Userspace + */ + +unsigned long __clear_user(void __user *addr, unsigned long size) +{ + long __d0; + might_sleep(); + /* no memory constraint because it doesn't change any memory gcc knows + about */ + asm volatile( + " testq %[size8],%[size8]\n" + " jz 4f\n" + "0: movq %[zero],(%[dst])\n" + " addq %[eight],%[dst]\n" + " decl %%ecx ; jnz 0b\n" + "4: movq %[size1],%%rcx\n" + " testl %%ecx,%%ecx\n" + " jz 2f\n" + "1: movb %b[zero],(%[dst])\n" + " incq %[dst]\n" + " decl %%ecx ; jnz 1b\n" + "2:\n" + ".section .fixup,\"ax\"\n" + "3: lea 0(%[size1],%[size8],8),%[size8]\n" + " jmp 2b\n" + ".previous\n" + ".section __ex_table,\"a\"\n" + " .align 8\n" + " .quad 0b,3b\n" + " .quad 1b,2b\n" + ".previous" + : [size8] "=c"(size), [dst] "=&D" (__d0) + : [size1] "r"(size & 7), "[size8]" (size / 8), "[dst]"(addr), + [zero] "r" (0UL), [eight] "r" (8UL)); + return size; +} +EXPORT_SYMBOL(__clear_user); + +unsigned long clear_user(void __user *to, unsigned long n) +{ + if (access_ok(VERIFY_WRITE, to, n)) + return __clear_user(to, n); + return n; +} 
+EXPORT_SYMBOL(clear_user); + +/* + * Return the size of a string (including the ending 0) + * + * Return 0 on exception, a value greater than N if too long + */ + +long __strnlen_user(const char __user *s, long n) +{ + long res = 0; + char c; + + while (1) { + if (res>n) + return n+1; + if (__get_user(c, s)) + return 0; + if (!c) + return res+1; + res++; + s++; + } +} +EXPORT_SYMBOL(__strnlen_user); + +long strnlen_user(const char __user *s, long n) +{ + if (!access_ok(VERIFY_READ, s, n)) + return 0; + return __strnlen_user(s, n); +} +EXPORT_SYMBOL(strnlen_user); + +long strlen_user(const char __user *s) +{ + long res = 0; + char c; + + for (;;) { + if (get_user(c, s)) + return 0; + if (!c) + return res+1; + res++; + s++; + } +} +EXPORT_SYMBOL(strlen_user); + +unsigned long copy_in_user(void __user *to, const void __user *from, unsigned len) +{ + if (access_ok(VERIFY_WRITE, to, len) && access_ok(VERIFY_READ, from, len)) { + return copy_user_generic((__force void *)to, (__force void *)from, len); + } + return len; +} +EXPORT_SYMBOL(copy_in_user); + diff --git a/arch/x86_64/Makefile b/arch/x86_64/Makefile index 265484c17234..ae48f179d719 100644 --- a/arch/x86_64/Makefile +++ b/arch/x86_64/Makefile @@ -73,7 +73,7 @@ AFLAGS += -m64 head-y := arch/x86_64/kernel/head_64.o arch/x86_64/kernel/head64.o arch/x86_64/kernel/init_task_64.o -libs-y += arch/x86_64/lib/ +libs-y += arch/x86/lib/ core-y += arch/x86_64/kernel/ \ arch/x86_64/mm/ \ arch/x86/crypto/ \ diff --git a/arch/x86_64/lib/Makefile b/arch/x86_64/lib/Makefile deleted file mode 100644 index 2d7d724a2a6a..000000000000 --- a/arch/x86_64/lib/Makefile +++ /dev/null @@ -1,5 +0,0 @@ -ifeq ($(CONFIG_X86_32),y) -include ${srctree}/arch/x86/lib/Makefile_32 -else -include ${srctree}/arch/x86_64/lib/Makefile_64 -endif diff --git a/arch/x86_64/lib/Makefile_64 b/arch/x86_64/lib/Makefile_64 deleted file mode 100644 index bbabad3c9335..000000000000 --- a/arch/x86_64/lib/Makefile_64 +++ /dev/null @@ -1,13 +0,0 @@ -# -# Makefile for x86_64-specific library files. -# - -CFLAGS_csum-partial_64.o := -funroll-loops - -obj-y := io_64.o iomap_copy_64.o -obj-$(CONFIG_SMP) += msr-on-cpu.o - -lib-y := csum-partial_64.o csum-copy_64.o csum-wrappers_64.o delay_64.o \ - usercopy_64.o getuser_64.o putuser_64.o \ - thunk_64.o clear_page_64.o copy_page_64.o bitstr_64.o bitops_64.o -lib-y += memcpy_64.o memmove_64.o memset_64.o copy_user_64.o rwlock_64.o copy_user_nocache_64.o diff --git a/arch/x86_64/lib/bitops_64.c b/arch/x86_64/lib/bitops_64.c deleted file mode 100644 index 95b6d9639fba..000000000000 --- a/arch/x86_64/lib/bitops_64.c +++ /dev/null @@ -1,175 +0,0 @@ -#include - -#undef find_first_zero_bit -#undef find_next_zero_bit -#undef find_first_bit -#undef find_next_bit - -static inline long -__find_first_zero_bit(const unsigned long * addr, unsigned long size) -{ - long d0, d1, d2; - long res; - - /* - * We must test the size in words, not in bits, because - * otherwise incoming sizes in the range -63..-1 will not run - * any scasq instructions, and then the flags used by the je - * instruction will have whatever random value was in place - * before. Nobody should call us like that, but - * find_next_zero_bit() does when offset and size are at the - * same word and it fails to find a zero itself. 
- */ - size += 63; - size >>= 6; - if (!size) - return 0; - asm volatile( - " repe; scasq\n" - " je 1f\n" - " xorq -8(%%rdi),%%rax\n" - " subq $8,%%rdi\n" - " bsfq %%rax,%%rdx\n" - "1: subq %[addr],%%rdi\n" - " shlq $3,%%rdi\n" - " addq %%rdi,%%rdx" - :"=d" (res), "=&c" (d0), "=&D" (d1), "=&a" (d2) - :"0" (0ULL), "1" (size), "2" (addr), "3" (-1ULL), - [addr] "S" (addr) : "memory"); - /* - * Any register would do for [addr] above, but GCC tends to - * prefer rbx over rsi, even though rsi is readily available - * and doesn't have to be saved. - */ - return res; -} - -/** - * find_first_zero_bit - find the first zero bit in a memory region - * @addr: The address to start the search at - * @size: The maximum size to search - * - * Returns the bit-number of the first zero bit, not the number of the byte - * containing a bit. - */ -long find_first_zero_bit(const unsigned long * addr, unsigned long size) -{ - return __find_first_zero_bit (addr, size); -} - -/** - * find_next_zero_bit - find the first zero bit in a memory region - * @addr: The address to base the search on - * @offset: The bitnumber to start searching at - * @size: The maximum size to search - */ -long find_next_zero_bit (const unsigned long * addr, long size, long offset) -{ - const unsigned long * p = addr + (offset >> 6); - unsigned long set = 0; - unsigned long res, bit = offset&63; - - if (bit) { - /* - * Look for zero in first word - */ - asm("bsfq %1,%0\n\t" - "cmoveq %2,%0" - : "=r" (set) - : "r" (~(*p >> bit)), "r"(64L)); - if (set < (64 - bit)) - return set + offset; - set = 64 - bit; - p++; - } - /* - * No zero yet, search remaining full words for a zero - */ - res = __find_first_zero_bit (p, size - 64 * (p - addr)); - - return (offset + set + res); -} - -static inline long -__find_first_bit(const unsigned long * addr, unsigned long size) -{ - long d0, d1; - long res; - - /* - * We must test the size in words, not in bits, because - * otherwise incoming sizes in the range -63..-1 will not run - * any scasq instructions, and then the flags used by the jz - * instruction will have whatever random value was in place - * before. Nobody should call us like that, but - * find_next_bit() does when offset and size are at the same - * word and it fails to find a one itself. - */ - size += 63; - size >>= 6; - if (!size) - return 0; - asm volatile( - " repe; scasq\n" - " jz 1f\n" - " subq $8,%%rdi\n" - " bsfq (%%rdi),%%rax\n" - "1: subq %[addr],%%rdi\n" - " shlq $3,%%rdi\n" - " addq %%rdi,%%rax" - :"=a" (res), "=&c" (d0), "=&D" (d1) - :"0" (0ULL), "1" (size), "2" (addr), - [addr] "r" (addr) : "memory"); - return res; -} - -/** - * find_first_bit - find the first set bit in a memory region - * @addr: The address to start the search at - * @size: The maximum size to search - * - * Returns the bit-number of the first set bit, not the number of the byte - * containing a bit. 
- */ -long find_first_bit(const unsigned long * addr, unsigned long size) -{ - return __find_first_bit(addr,size); -} - -/** - * find_next_bit - find the first set bit in a memory region - * @addr: The address to base the search on - * @offset: The bitnumber to start searching at - * @size: The maximum size to search - */ -long find_next_bit(const unsigned long * addr, long size, long offset) -{ - const unsigned long * p = addr + (offset >> 6); - unsigned long set = 0, bit = offset & 63, res; - - if (bit) { - /* - * Look for nonzero in the first 64 bits: - */ - asm("bsfq %1,%0\n\t" - "cmoveq %2,%0\n\t" - : "=r" (set) - : "r" (*p >> bit), "r" (64L)); - if (set < (64 - bit)) - return set + offset; - set = 64 - bit; - p++; - } - /* - * No set bit yet, search remaining full words for a bit - */ - res = __find_first_bit (p, size - 64 * (p - addr)); - return (offset + set + res); -} - -#include - -EXPORT_SYMBOL(find_next_bit); -EXPORT_SYMBOL(find_first_bit); -EXPORT_SYMBOL(find_first_zero_bit); -EXPORT_SYMBOL(find_next_zero_bit); diff --git a/arch/x86_64/lib/bitstr_64.c b/arch/x86_64/lib/bitstr_64.c deleted file mode 100644 index 24676609a6ac..000000000000 --- a/arch/x86_64/lib/bitstr_64.c +++ /dev/null @@ -1,28 +0,0 @@ -#include -#include - -/* Find string of zero bits in a bitmap */ -unsigned long -find_next_zero_string(unsigned long *bitmap, long start, long nbits, int len) -{ - unsigned long n, end, i; - - again: - n = find_next_zero_bit(bitmap, nbits, start); - if (n == -1) - return -1; - - /* could test bitsliced, but it's hardly worth it */ - end = n+len; - if (end >= nbits) - return -1; - for (i = n+1; i < end; i++) { - if (test_bit(i, bitmap)) { - start = i+1; - goto again; - } - } - return n; -} - -EXPORT_SYMBOL(find_next_zero_string); diff --git a/arch/x86_64/lib/clear_page_64.S b/arch/x86_64/lib/clear_page_64.S deleted file mode 100644 index 9a10a78bb4a4..000000000000 --- a/arch/x86_64/lib/clear_page_64.S +++ /dev/null @@ -1,59 +0,0 @@ -#include -#include - -/* - * Zero a page. - * rdi page - */ - ALIGN -clear_page_c: - CFI_STARTPROC - movl $4096/8,%ecx - xorl %eax,%eax - rep stosq - ret - CFI_ENDPROC -ENDPROC(clear_page) - -ENTRY(clear_page) - CFI_STARTPROC - xorl %eax,%eax - movl $4096/64,%ecx - .p2align 4 -.Lloop: - decl %ecx -#define PUT(x) movq %rax,x*8(%rdi) - movq %rax,(%rdi) - PUT(1) - PUT(2) - PUT(3) - PUT(4) - PUT(5) - PUT(6) - PUT(7) - leaq 64(%rdi),%rdi - jnz .Lloop - nop - ret - CFI_ENDPROC -.Lclear_page_end: -ENDPROC(clear_page) - - /* Some CPUs run faster using the string instructions. - It is also a lot simpler. Use this when possible */ - -#include - - .section .altinstr_replacement,"ax" -1: .byte 0xeb /* jmp */ - .byte (clear_page_c - clear_page) - (2f - 1b) /* offset */ -2: - .previous - .section .altinstructions,"a" - .align 8 - .quad clear_page - .quad 1b - .byte X86_FEATURE_REP_GOOD - .byte .Lclear_page_end - clear_page - .byte 2b - 1b - .previous diff --git a/arch/x86_64/lib/copy_page_64.S b/arch/x86_64/lib/copy_page_64.S deleted file mode 100644 index 727a5d46d2fc..000000000000 --- a/arch/x86_64/lib/copy_page_64.S +++ /dev/null @@ -1,119 +0,0 @@ -/* Written 2003 by Andi Kleen, based on a kernel by Evandro Menezes */ - -#include -#include - - ALIGN -copy_page_c: - CFI_STARTPROC - movl $4096/8,%ecx - rep movsq - ret - CFI_ENDPROC -ENDPROC(copy_page_c) - -/* Don't use streaming store because it's better when the target - ends up in cache. 
*/ - -/* Could vary the prefetch distance based on SMP/UP */ - -ENTRY(copy_page) - CFI_STARTPROC - subq $3*8,%rsp - CFI_ADJUST_CFA_OFFSET 3*8 - movq %rbx,(%rsp) - CFI_REL_OFFSET rbx, 0 - movq %r12,1*8(%rsp) - CFI_REL_OFFSET r12, 1*8 - movq %r13,2*8(%rsp) - CFI_REL_OFFSET r13, 2*8 - - movl $(4096/64)-5,%ecx - .p2align 4 -.Loop64: - dec %rcx - - movq (%rsi), %rax - movq 8 (%rsi), %rbx - movq 16 (%rsi), %rdx - movq 24 (%rsi), %r8 - movq 32 (%rsi), %r9 - movq 40 (%rsi), %r10 - movq 48 (%rsi), %r11 - movq 56 (%rsi), %r12 - - prefetcht0 5*64(%rsi) - - movq %rax, (%rdi) - movq %rbx, 8 (%rdi) - movq %rdx, 16 (%rdi) - movq %r8, 24 (%rdi) - movq %r9, 32 (%rdi) - movq %r10, 40 (%rdi) - movq %r11, 48 (%rdi) - movq %r12, 56 (%rdi) - - leaq 64 (%rsi), %rsi - leaq 64 (%rdi), %rdi - - jnz .Loop64 - - movl $5,%ecx - .p2align 4 -.Loop2: - decl %ecx - - movq (%rsi), %rax - movq 8 (%rsi), %rbx - movq 16 (%rsi), %rdx - movq 24 (%rsi), %r8 - movq 32 (%rsi), %r9 - movq 40 (%rsi), %r10 - movq 48 (%rsi), %r11 - movq 56 (%rsi), %r12 - - movq %rax, (%rdi) - movq %rbx, 8 (%rdi) - movq %rdx, 16 (%rdi) - movq %r8, 24 (%rdi) - movq %r9, 32 (%rdi) - movq %r10, 40 (%rdi) - movq %r11, 48 (%rdi) - movq %r12, 56 (%rdi) - - leaq 64(%rdi),%rdi - leaq 64(%rsi),%rsi - - jnz .Loop2 - - movq (%rsp),%rbx - CFI_RESTORE rbx - movq 1*8(%rsp),%r12 - CFI_RESTORE r12 - movq 2*8(%rsp),%r13 - CFI_RESTORE r13 - addq $3*8,%rsp - CFI_ADJUST_CFA_OFFSET -3*8 - ret -.Lcopy_page_end: - CFI_ENDPROC -ENDPROC(copy_page) - - /* Some CPUs run faster using the string copy instructions. - It is also a lot simpler. Use this when possible */ - -#include - - .section .altinstr_replacement,"ax" -1: .byte 0xeb /* jmp */ - .byte (copy_page_c - copy_page) - (2f - 1b) /* offset */ -2: - .previous - .section .altinstructions,"a" - .align 8 - .quad copy_page - .quad 1b - .byte X86_FEATURE_REP_GOOD - .byte .Lcopy_page_end - copy_page - .byte 2b - 1b - .previous diff --git a/arch/x86_64/lib/copy_user_64.S b/arch/x86_64/lib/copy_user_64.S deleted file mode 100644 index 70bebd310408..000000000000 --- a/arch/x86_64/lib/copy_user_64.S +++ /dev/null @@ -1,354 +0,0 @@ -/* Copyright 2002 Andi Kleen, SuSE Labs. - * Subject to the GNU Public License v2. - * - * Functions to copy from and to user space. 
- */ - -#include -#include - -#define FIX_ALIGNMENT 1 - -#include -#include -#include -#include - - .macro ALTERNATIVE_JUMP feature,orig,alt -0: - .byte 0xe9 /* 32bit jump */ - .long \orig-1f /* by default jump to orig */ -1: - .section .altinstr_replacement,"ax" -2: .byte 0xe9 /* near jump with 32bit immediate */ - .long \alt-1b /* offset */ /* or alternatively to alt */ - .previous - .section .altinstructions,"a" - .align 8 - .quad 0b - .quad 2b - .byte \feature /* when feature is set */ - .byte 5 - .byte 5 - .previous - .endm - -/* Standard copy_to_user with segment limit checking */ -ENTRY(copy_to_user) - CFI_STARTPROC - GET_THREAD_INFO(%rax) - movq %rdi,%rcx - addq %rdx,%rcx - jc bad_to_user - cmpq threadinfo_addr_limit(%rax),%rcx - jae bad_to_user - xorl %eax,%eax /* clear zero flag */ - ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string - CFI_ENDPROC - -ENTRY(copy_user_generic) - CFI_STARTPROC - movl $1,%ecx /* set zero flag */ - ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string - CFI_ENDPROC - -ENTRY(__copy_from_user_inatomic) - CFI_STARTPROC - xorl %ecx,%ecx /* clear zero flag */ - ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string - CFI_ENDPROC - -/* Standard copy_from_user with segment limit checking */ -ENTRY(copy_from_user) - CFI_STARTPROC - GET_THREAD_INFO(%rax) - movq %rsi,%rcx - addq %rdx,%rcx - jc bad_from_user - cmpq threadinfo_addr_limit(%rax),%rcx - jae bad_from_user - movl $1,%ecx /* set zero flag */ - ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string - CFI_ENDPROC -ENDPROC(copy_from_user) - - .section .fixup,"ax" - /* must zero dest */ -bad_from_user: - CFI_STARTPROC - movl %edx,%ecx - xorl %eax,%eax - rep - stosb -bad_to_user: - movl %edx,%eax - ret - CFI_ENDPROC -END(bad_from_user) - .previous - - -/* - * copy_user_generic_unrolled - memory copy with exception handling. - * This version is for CPUs like P4 that don't have efficient micro code for rep movsq - * - * Input: - * rdi destination - * rsi source - * rdx count - * ecx zero flag -- if true zero destination on error - * - * Output: - * eax uncopied bytes or 0 if successful. 
- */ -ENTRY(copy_user_generic_unrolled) - CFI_STARTPROC - pushq %rbx - CFI_ADJUST_CFA_OFFSET 8 - CFI_REL_OFFSET rbx, 0 - pushq %rcx - CFI_ADJUST_CFA_OFFSET 8 - CFI_REL_OFFSET rcx, 0 - xorl %eax,%eax /*zero for the exception handler */ - -#ifdef FIX_ALIGNMENT - /* check for bad alignment of destination */ - movl %edi,%ecx - andl $7,%ecx - jnz .Lbad_alignment -.Lafter_bad_alignment: -#endif - - movq %rdx,%rcx - - movl $64,%ebx - shrq $6,%rdx - decq %rdx - js .Lhandle_tail - - .p2align 4 -.Lloop: -.Ls1: movq (%rsi),%r11 -.Ls2: movq 1*8(%rsi),%r8 -.Ls3: movq 2*8(%rsi),%r9 -.Ls4: movq 3*8(%rsi),%r10 -.Ld1: movq %r11,(%rdi) -.Ld2: movq %r8,1*8(%rdi) -.Ld3: movq %r9,2*8(%rdi) -.Ld4: movq %r10,3*8(%rdi) - -.Ls5: movq 4*8(%rsi),%r11 -.Ls6: movq 5*8(%rsi),%r8 -.Ls7: movq 6*8(%rsi),%r9 -.Ls8: movq 7*8(%rsi),%r10 -.Ld5: movq %r11,4*8(%rdi) -.Ld6: movq %r8,5*8(%rdi) -.Ld7: movq %r9,6*8(%rdi) -.Ld8: movq %r10,7*8(%rdi) - - decq %rdx - - leaq 64(%rsi),%rsi - leaq 64(%rdi),%rdi - - jns .Lloop - - .p2align 4 -.Lhandle_tail: - movl %ecx,%edx - andl $63,%ecx - shrl $3,%ecx - jz .Lhandle_7 - movl $8,%ebx - .p2align 4 -.Lloop_8: -.Ls9: movq (%rsi),%r8 -.Ld9: movq %r8,(%rdi) - decl %ecx - leaq 8(%rdi),%rdi - leaq 8(%rsi),%rsi - jnz .Lloop_8 - -.Lhandle_7: - movl %edx,%ecx - andl $7,%ecx - jz .Lende - .p2align 4 -.Lloop_1: -.Ls10: movb (%rsi),%bl -.Ld10: movb %bl,(%rdi) - incq %rdi - incq %rsi - decl %ecx - jnz .Lloop_1 - - CFI_REMEMBER_STATE -.Lende: - popq %rcx - CFI_ADJUST_CFA_OFFSET -8 - CFI_RESTORE rcx - popq %rbx - CFI_ADJUST_CFA_OFFSET -8 - CFI_RESTORE rbx - ret - CFI_RESTORE_STATE - -#ifdef FIX_ALIGNMENT - /* align destination */ - .p2align 4 -.Lbad_alignment: - movl $8,%r9d - subl %ecx,%r9d - movl %r9d,%ecx - cmpq %r9,%rdx - jz .Lhandle_7 - js .Lhandle_7 -.Lalign_1: -.Ls11: movb (%rsi),%bl -.Ld11: movb %bl,(%rdi) - incq %rsi - incq %rdi - decl %ecx - jnz .Lalign_1 - subq %r9,%rdx - jmp .Lafter_bad_alignment -#endif - - /* table sorted by exception address */ - .section __ex_table,"a" - .align 8 - .quad .Ls1,.Ls1e - .quad .Ls2,.Ls2e - .quad .Ls3,.Ls3e - .quad .Ls4,.Ls4e - .quad .Ld1,.Ls1e - .quad .Ld2,.Ls2e - .quad .Ld3,.Ls3e - .quad .Ld4,.Ls4e - .quad .Ls5,.Ls5e - .quad .Ls6,.Ls6e - .quad .Ls7,.Ls7e - .quad .Ls8,.Ls8e - .quad .Ld5,.Ls5e - .quad .Ld6,.Ls6e - .quad .Ld7,.Ls7e - .quad .Ld8,.Ls8e - .quad .Ls9,.Le_quad - .quad .Ld9,.Le_quad - .quad .Ls10,.Le_byte - .quad .Ld10,.Le_byte -#ifdef FIX_ALIGNMENT - .quad .Ls11,.Lzero_rest - .quad .Ld11,.Lzero_rest -#endif - .quad .Le5,.Le_zero - .previous - - /* compute 64-offset for main loop. 8 bytes accuracy with error on the - pessimistic side. this is gross. it would be better to fix the - interface. 
*/ - /* eax: zero, ebx: 64 */ -.Ls1e: addl $8,%eax -.Ls2e: addl $8,%eax -.Ls3e: addl $8,%eax -.Ls4e: addl $8,%eax -.Ls5e: addl $8,%eax -.Ls6e: addl $8,%eax -.Ls7e: addl $8,%eax -.Ls8e: addl $8,%eax - addq %rbx,%rdi /* +64 */ - subq %rax,%rdi /* correct destination with computed offset */ - - shlq $6,%rdx /* loop counter * 64 (stride length) */ - addq %rax,%rdx /* add offset to loopcnt */ - andl $63,%ecx /* remaining bytes */ - addq %rcx,%rdx /* add them */ - jmp .Lzero_rest - - /* exception on quad word loop in tail handling */ - /* ecx: loopcnt/8, %edx: length, rdi: correct */ -.Le_quad: - shll $3,%ecx - andl $7,%edx - addl %ecx,%edx - /* edx: bytes to zero, rdi: dest, eax:zero */ -.Lzero_rest: - cmpl $0,(%rsp) - jz .Le_zero - movq %rdx,%rcx -.Le_byte: - xorl %eax,%eax -.Le5: rep - stosb - /* when there is another exception while zeroing the rest just return */ -.Le_zero: - movq %rdx,%rax - jmp .Lende - CFI_ENDPROC -ENDPROC(copy_user_generic) - - - /* Some CPUs run faster using the string copy instructions. - This is also a lot simpler. Use them when possible. - Patch in jmps to this code instead of copying it fully - to avoid unwanted aliasing in the exception tables. */ - - /* rdi destination - * rsi source - * rdx count - * ecx zero flag - * - * Output: - * eax uncopied bytes or 0 if successfull. - * - * Only 4GB of copy is supported. This shouldn't be a problem - * because the kernel normally only writes from/to page sized chunks - * even if user space passed a longer buffer. - * And more would be dangerous because both Intel and AMD have - * errata with rep movsq > 4GB. If someone feels the need to fix - * this please consider this. - */ -ENTRY(copy_user_generic_string) - CFI_STARTPROC - movl %ecx,%r8d /* save zero flag */ - movl %edx,%ecx - shrl $3,%ecx - andl $7,%edx - jz 10f -1: rep - movsq - movl %edx,%ecx -2: rep - movsb -9: movl %ecx,%eax - ret - - /* multiple of 8 byte */ -10: rep - movsq - xor %eax,%eax - ret - - /* exception handling */ -3: lea (%rdx,%rcx,8),%rax /* exception on quad loop */ - jmp 6f -5: movl %ecx,%eax /* exception on byte loop */ - /* eax: left over bytes */ -6: testl %r8d,%r8d /* zero flag set? */ - jz 7f - movl %eax,%ecx /* initialize x86 loop counter */ - push %rax - xorl %eax,%eax -8: rep - stosb /* zero the rest */ -11: pop %rax -7: ret - CFI_ENDPROC -END(copy_user_generic_c) - - .section __ex_table,"a" - .quad 1b,3b - .quad 2b,5b - .quad 8b,11b - .quad 10b,3b - .previous diff --git a/arch/x86_64/lib/copy_user_nocache_64.S b/arch/x86_64/lib/copy_user_nocache_64.S deleted file mode 100644 index 4620efb12f13..000000000000 --- a/arch/x86_64/lib/copy_user_nocache_64.S +++ /dev/null @@ -1,217 +0,0 @@ -/* Copyright 2002 Andi Kleen, SuSE Labs. - * Subject to the GNU Public License v2. - * - * Functions to copy from and to user space. - */ - -#include -#include - -#define FIX_ALIGNMENT 1 - -#include -#include -#include -#include - -/* - * copy_user_nocache - Uncached memory copy with exception handling - * This will force destination/source out of cache for more performance. - * - * Input: - * rdi destination - * rsi source - * rdx count - * rcx zero flag when 1 zero on exception - * - * Output: - * eax uncopied bytes or 0 if successful. 
- */ -ENTRY(__copy_user_nocache) - CFI_STARTPROC - pushq %rbx - CFI_ADJUST_CFA_OFFSET 8 - CFI_REL_OFFSET rbx, 0 - pushq %rcx /* save zero flag */ - CFI_ADJUST_CFA_OFFSET 8 - CFI_REL_OFFSET rcx, 0 - - xorl %eax,%eax /* zero for the exception handler */ - -#ifdef FIX_ALIGNMENT - /* check for bad alignment of destination */ - movl %edi,%ecx - andl $7,%ecx - jnz .Lbad_alignment -.Lafter_bad_alignment: -#endif - - movq %rdx,%rcx - - movl $64,%ebx - shrq $6,%rdx - decq %rdx - js .Lhandle_tail - - .p2align 4 -.Lloop: -.Ls1: movq (%rsi),%r11 -.Ls2: movq 1*8(%rsi),%r8 -.Ls3: movq 2*8(%rsi),%r9 -.Ls4: movq 3*8(%rsi),%r10 -.Ld1: movnti %r11,(%rdi) -.Ld2: movnti %r8,1*8(%rdi) -.Ld3: movnti %r9,2*8(%rdi) -.Ld4: movnti %r10,3*8(%rdi) - -.Ls5: movq 4*8(%rsi),%r11 -.Ls6: movq 5*8(%rsi),%r8 -.Ls7: movq 6*8(%rsi),%r9 -.Ls8: movq 7*8(%rsi),%r10 -.Ld5: movnti %r11,4*8(%rdi) -.Ld6: movnti %r8,5*8(%rdi) -.Ld7: movnti %r9,6*8(%rdi) -.Ld8: movnti %r10,7*8(%rdi) - - dec %rdx - - leaq 64(%rsi),%rsi - leaq 64(%rdi),%rdi - - jns .Lloop - - .p2align 4 -.Lhandle_tail: - movl %ecx,%edx - andl $63,%ecx - shrl $3,%ecx - jz .Lhandle_7 - movl $8,%ebx - .p2align 4 -.Lloop_8: -.Ls9: movq (%rsi),%r8 -.Ld9: movnti %r8,(%rdi) - decl %ecx - leaq 8(%rdi),%rdi - leaq 8(%rsi),%rsi - jnz .Lloop_8 - -.Lhandle_7: - movl %edx,%ecx - andl $7,%ecx - jz .Lende - .p2align 4 -.Lloop_1: -.Ls10: movb (%rsi),%bl -.Ld10: movb %bl,(%rdi) - incq %rdi - incq %rsi - decl %ecx - jnz .Lloop_1 - - CFI_REMEMBER_STATE -.Lende: - popq %rcx - CFI_ADJUST_CFA_OFFSET -8 - CFI_RESTORE %rcx - popq %rbx - CFI_ADJUST_CFA_OFFSET -8 - CFI_RESTORE rbx - ret - CFI_RESTORE_STATE - -#ifdef FIX_ALIGNMENT - /* align destination */ - .p2align 4 -.Lbad_alignment: - movl $8,%r9d - subl %ecx,%r9d - movl %r9d,%ecx - cmpq %r9,%rdx - jz .Lhandle_7 - js .Lhandle_7 -.Lalign_1: -.Ls11: movb (%rsi),%bl -.Ld11: movb %bl,(%rdi) - incq %rsi - incq %rdi - decl %ecx - jnz .Lalign_1 - subq %r9,%rdx - jmp .Lafter_bad_alignment -#endif - - /* table sorted by exception address */ - .section __ex_table,"a" - .align 8 - .quad .Ls1,.Ls1e - .quad .Ls2,.Ls2e - .quad .Ls3,.Ls3e - .quad .Ls4,.Ls4e - .quad .Ld1,.Ls1e - .quad .Ld2,.Ls2e - .quad .Ld3,.Ls3e - .quad .Ld4,.Ls4e - .quad .Ls5,.Ls5e - .quad .Ls6,.Ls6e - .quad .Ls7,.Ls7e - .quad .Ls8,.Ls8e - .quad .Ld5,.Ls5e - .quad .Ld6,.Ls6e - .quad .Ld7,.Ls7e - .quad .Ld8,.Ls8e - .quad .Ls9,.Le_quad - .quad .Ld9,.Le_quad - .quad .Ls10,.Le_byte - .quad .Ld10,.Le_byte -#ifdef FIX_ALIGNMENT - .quad .Ls11,.Lzero_rest - .quad .Ld11,.Lzero_rest -#endif - .quad .Le5,.Le_zero - .previous - - /* compute 64-offset for main loop. 8 bytes accuracy with error on the - pessimistic side. this is gross. it would be better to fix the - interface. */ - /* eax: zero, ebx: 64 */ -.Ls1e: addl $8,%eax -.Ls2e: addl $8,%eax -.Ls3e: addl $8,%eax -.Ls4e: addl $8,%eax -.Ls5e: addl $8,%eax -.Ls6e: addl $8,%eax -.Ls7e: addl $8,%eax -.Ls8e: addl $8,%eax - addq %rbx,%rdi /* +64 */ - subq %rax,%rdi /* correct destination with computed offset */ - - shlq $6,%rdx /* loop counter * 64 (stride length) */ - addq %rax,%rdx /* add offset to loopcnt */ - andl $63,%ecx /* remaining bytes */ - addq %rcx,%rdx /* add them */ - jmp .Lzero_rest - - /* exception on quad word loop in tail handling */ - /* ecx: loopcnt/8, %edx: length, rdi: correct */ -.Le_quad: - shll $3,%ecx - andl $7,%edx - addl %ecx,%edx - /* edx: bytes to zero, rdi: dest, eax:zero */ -.Lzero_rest: - cmpl $0,(%rsp) /* zero flag set? 
*/ - jz .Le_zero - movq %rdx,%rcx -.Le_byte: - xorl %eax,%eax -.Le5: rep - stosb - /* when there is another exception while zeroing the rest just return */ -.Le_zero: - movq %rdx,%rax - jmp .Lende - CFI_ENDPROC -ENDPROC(__copy_user_nocache) - - diff --git a/arch/x86_64/lib/csum-copy_64.S b/arch/x86_64/lib/csum-copy_64.S deleted file mode 100644 index f0dba36578ea..000000000000 --- a/arch/x86_64/lib/csum-copy_64.S +++ /dev/null @@ -1,249 +0,0 @@ -/* - * Copyright 2002,2003 Andi Kleen, SuSE Labs. - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file COPYING in the main directory of this archive - * for more details. No warranty for anything given at all. - */ -#include -#include -#include - -/* - * Checksum copy with exception handling. - * On exceptions src_err_ptr or dst_err_ptr is set to -EFAULT and the - * destination is zeroed. - * - * Input - * rdi source - * rsi destination - * edx len (32bit) - * ecx sum (32bit) - * r8 src_err_ptr (int) - * r9 dst_err_ptr (int) - * - * Output - * eax 64bit sum. undefined in case of exception. - * - * Wrappers need to take care of valid exception sum and zeroing. - * They also should align source or destination to 8 bytes. - */ - - .macro source -10: - .section __ex_table,"a" - .align 8 - .quad 10b,.Lbad_source - .previous - .endm - - .macro dest -20: - .section __ex_table,"a" - .align 8 - .quad 20b,.Lbad_dest - .previous - .endm - - .macro ignore L=.Lignore -30: - .section __ex_table,"a" - .align 8 - .quad 30b,\L - .previous - .endm - - -ENTRY(csum_partial_copy_generic) - CFI_STARTPROC - cmpl $3*64,%edx - jle .Lignore - -.Lignore: - subq $7*8,%rsp - CFI_ADJUST_CFA_OFFSET 7*8 - movq %rbx,2*8(%rsp) - CFI_REL_OFFSET rbx, 2*8 - movq %r12,3*8(%rsp) - CFI_REL_OFFSET r12, 3*8 - movq %r14,4*8(%rsp) - CFI_REL_OFFSET r14, 4*8 - movq %r13,5*8(%rsp) - CFI_REL_OFFSET r13, 5*8 - movq %rbp,6*8(%rsp) - CFI_REL_OFFSET rbp, 6*8 - - movq %r8,(%rsp) - movq %r9,1*8(%rsp) - - movl %ecx,%eax - movl %edx,%ecx - - xorl %r9d,%r9d - movq %rcx,%r12 - - shrq $6,%r12 - jz .Lhandle_tail /* < 64 */ - - clc - - /* main loop. 
clear in 64 byte blocks */ - /* r9: zero, r8: temp2, rbx: temp1, rax: sum, rcx: saved length */ - /* r11: temp3, rdx: temp4, r12 loopcnt */ - /* r10: temp5, rbp: temp6, r14 temp7, r13 temp8 */ - .p2align 4 -.Lloop: - source - movq (%rdi),%rbx - source - movq 8(%rdi),%r8 - source - movq 16(%rdi),%r11 - source - movq 24(%rdi),%rdx - - source - movq 32(%rdi),%r10 - source - movq 40(%rdi),%rbp - source - movq 48(%rdi),%r14 - source - movq 56(%rdi),%r13 - - ignore 2f - prefetcht0 5*64(%rdi) -2: - adcq %rbx,%rax - adcq %r8,%rax - adcq %r11,%rax - adcq %rdx,%rax - adcq %r10,%rax - adcq %rbp,%rax - adcq %r14,%rax - adcq %r13,%rax - - decl %r12d - - dest - movq %rbx,(%rsi) - dest - movq %r8,8(%rsi) - dest - movq %r11,16(%rsi) - dest - movq %rdx,24(%rsi) - - dest - movq %r10,32(%rsi) - dest - movq %rbp,40(%rsi) - dest - movq %r14,48(%rsi) - dest - movq %r13,56(%rsi) - -3: - - leaq 64(%rdi),%rdi - leaq 64(%rsi),%rsi - - jnz .Lloop - - adcq %r9,%rax - - /* do last upto 56 bytes */ -.Lhandle_tail: - /* ecx: count */ - movl %ecx,%r10d - andl $63,%ecx - shrl $3,%ecx - jz .Lfold - clc - .p2align 4 -.Lloop_8: - source - movq (%rdi),%rbx - adcq %rbx,%rax - decl %ecx - dest - movq %rbx,(%rsi) - leaq 8(%rsi),%rsi /* preserve carry */ - leaq 8(%rdi),%rdi - jnz .Lloop_8 - adcq %r9,%rax /* add in carry */ - -.Lfold: - /* reduce checksum to 32bits */ - movl %eax,%ebx - shrq $32,%rax - addl %ebx,%eax - adcl %r9d,%eax - - /* do last upto 6 bytes */ -.Lhandle_7: - movl %r10d,%ecx - andl $7,%ecx - shrl $1,%ecx - jz .Lhandle_1 - movl $2,%edx - xorl %ebx,%ebx - clc - .p2align 4 -.Lloop_1: - source - movw (%rdi),%bx - adcl %ebx,%eax - decl %ecx - dest - movw %bx,(%rsi) - leaq 2(%rdi),%rdi - leaq 2(%rsi),%rsi - jnz .Lloop_1 - adcl %r9d,%eax /* add in carry */ - - /* handle last odd byte */ -.Lhandle_1: - testl $1,%r10d - jz .Lende - xorl %ebx,%ebx - source - movb (%rdi),%bl - dest - movb %bl,(%rsi) - addl %ebx,%eax - adcl %r9d,%eax /* carry */ - - CFI_REMEMBER_STATE -.Lende: - movq 2*8(%rsp),%rbx - CFI_RESTORE rbx - movq 3*8(%rsp),%r12 - CFI_RESTORE r12 - movq 4*8(%rsp),%r14 - CFI_RESTORE r14 - movq 5*8(%rsp),%r13 - CFI_RESTORE r13 - movq 6*8(%rsp),%rbp - CFI_RESTORE rbp - addq $7*8,%rsp - CFI_ADJUST_CFA_OFFSET -7*8 - ret - CFI_RESTORE_STATE - - /* Exception handlers. Very simple, zeroing is done in the wrappers */ -.Lbad_source: - movq (%rsp),%rax - testq %rax,%rax - jz .Lende - movl $-EFAULT,(%rax) - jmp .Lende - -.Lbad_dest: - movq 8(%rsp),%rax - testq %rax,%rax - jz .Lende - movl $-EFAULT,(%rax) - jmp .Lende - CFI_ENDPROC -ENDPROC(csum_partial_copy_generic) diff --git a/arch/x86_64/lib/csum-partial_64.c b/arch/x86_64/lib/csum-partial_64.c deleted file mode 100644 index bc503f506903..000000000000 --- a/arch/x86_64/lib/csum-partial_64.c +++ /dev/null @@ -1,150 +0,0 @@ -/* - * arch/x86_64/lib/csum-partial.c - * - * This file contains network checksum routines that are better done - * in an architecture-specific manner due to speed. - */ - -#include -#include -#include - -static inline unsigned short from32to16(unsigned a) -{ - unsigned short b = a >> 16; - asm("addw %w2,%w0\n\t" - "adcw $0,%w0\n" - : "=r" (b) - : "0" (b), "r" (a)); - return b; -} - -/* - * Do a 64-bit checksum on an arbitrary memory area. - * Returns a 32bit checksum. - * - * This isn't as time critical as it used to be because many NICs - * do hardware checksumming these days. - * - * Things tried and found to not make it faster: - * Manual Prefetching - * Unrolling to an 128 bytes inner loop. 
- * Using interleaving with more registers to break the carry chains. - */ -static unsigned do_csum(const unsigned char *buff, unsigned len) -{ - unsigned odd, count; - unsigned long result = 0; - - if (unlikely(len == 0)) - return result; - odd = 1 & (unsigned long) buff; - if (unlikely(odd)) { - result = *buff << 8; - len--; - buff++; - } - count = len >> 1; /* nr of 16-bit words.. */ - if (count) { - if (2 & (unsigned long) buff) { - result += *(unsigned short *)buff; - count--; - len -= 2; - buff += 2; - } - count >>= 1; /* nr of 32-bit words.. */ - if (count) { - unsigned long zero; - unsigned count64; - if (4 & (unsigned long) buff) { - result += *(unsigned int *) buff; - count--; - len -= 4; - buff += 4; - } - count >>= 1; /* nr of 64-bit words.. */ - - /* main loop using 64byte blocks */ - zero = 0; - count64 = count >> 3; - while (count64) { - asm("addq 0*8(%[src]),%[res]\n\t" - "adcq 1*8(%[src]),%[res]\n\t" - "adcq 2*8(%[src]),%[res]\n\t" - "adcq 3*8(%[src]),%[res]\n\t" - "adcq 4*8(%[src]),%[res]\n\t" - "adcq 5*8(%[src]),%[res]\n\t" - "adcq 6*8(%[src]),%[res]\n\t" - "adcq 7*8(%[src]),%[res]\n\t" - "adcq %[zero],%[res]" - : [res] "=r" (result) - : [src] "r" (buff), [zero] "r" (zero), - "[res]" (result)); - buff += 64; - count64--; - } - - /* last upto 7 8byte blocks */ - count %= 8; - while (count) { - asm("addq %1,%0\n\t" - "adcq %2,%0\n" - : "=r" (result) - : "m" (*(unsigned long *)buff), - "r" (zero), "0" (result)); - --count; - buff += 8; - } - result = add32_with_carry(result>>32, - result&0xffffffff); - - if (len & 4) { - result += *(unsigned int *) buff; - buff += 4; - } - } - if (len & 2) { - result += *(unsigned short *) buff; - buff += 2; - } - } - if (len & 1) - result += *buff; - result = add32_with_carry(result>>32, result & 0xffffffff); - if (unlikely(odd)) { - result = from32to16(result); - result = ((result >> 8) & 0xff) | ((result & 0xff) << 8); - } - return result; -} - -/* - * computes the checksum of a memory block at buff, length len, - * and adds in "sum" (32-bit) - * - * returns a 32-bit number suitable for feeding into itself - * or csum_tcpudp_magic - * - * this function must be called with even lengths, except - * for the last fragment, which may be odd - * - * it's best to have buff aligned on a 64-bit boundary - */ -__wsum csum_partial(const void *buff, int len, __wsum sum) -{ - return (__force __wsum)add32_with_carry(do_csum(buff, len), - (__force u32)sum); -} - -EXPORT_SYMBOL(csum_partial); - -/* - * this routine is used for miscellaneous IP-like checksums, mainly - * in icmp.c - */ -__sum16 ip_compute_csum(const void *buff, int len) -{ - return csum_fold(csum_partial(buff,len,0)); -} -EXPORT_SYMBOL(ip_compute_csum); - diff --git a/arch/x86_64/lib/csum-wrappers_64.c b/arch/x86_64/lib/csum-wrappers_64.c deleted file mode 100644 index fd42a4a095fc..000000000000 --- a/arch/x86_64/lib/csum-wrappers_64.c +++ /dev/null @@ -1,135 +0,0 @@ -/* Copyright 2002,2003 Andi Kleen, SuSE Labs. - * Subject to the GNU Public License v.2 - * - * Wrappers of assembly checksum functions for x86-64. - */ - -#include -#include - -/** - * csum_partial_copy_from_user - Copy and checksum from user space. - * @src: source address (user space) - * @dst: destination address - * @len: number of bytes to be copied. - * @isum: initial sum that is added into the result (32bit unfolded) - * @errp: set to -EFAULT for an bad source address. - * - * Returns an 32bit unfolded checksum of the buffer. - * src and dst are best aligned to 64bits. 
- */ -__wsum -csum_partial_copy_from_user(const void __user *src, void *dst, - int len, __wsum isum, int *errp) -{ - might_sleep(); - *errp = 0; - if (likely(access_ok(VERIFY_READ,src, len))) { - /* Why 6, not 7? To handle odd addresses aligned we - would need to do considerable complications to fix the - checksum which is defined as an 16bit accumulator. The - fix alignment code is primarily for performance - compatibility with 32bit and that will handle odd - addresses slowly too. */ - if (unlikely((unsigned long)src & 6)) { - while (((unsigned long)src & 6) && len >= 2) { - __u16 val16; - *errp = __get_user(val16, (const __u16 __user *)src); - if (*errp) - return isum; - *(__u16 *)dst = val16; - isum = (__force __wsum)add32_with_carry( - (__force unsigned)isum, val16); - src += 2; - dst += 2; - len -= 2; - } - } - isum = csum_partial_copy_generic((__force const void *)src, - dst, len, isum, errp, NULL); - if (likely(*errp == 0)) - return isum; - } - *errp = -EFAULT; - memset(dst,0,len); - return isum; -} - -EXPORT_SYMBOL(csum_partial_copy_from_user); - -/** - * csum_partial_copy_to_user - Copy and checksum to user space. - * @src: source address - * @dst: destination address (user space) - * @len: number of bytes to be copied. - * @isum: initial sum that is added into the result (32bit unfolded) - * @errp: set to -EFAULT for an bad destination address. - * - * Returns an 32bit unfolded checksum of the buffer. - * src and dst are best aligned to 64bits. - */ -__wsum -csum_partial_copy_to_user(const void *src, void __user *dst, - int len, __wsum isum, int *errp) -{ - might_sleep(); - if (unlikely(!access_ok(VERIFY_WRITE, dst, len))) { - *errp = -EFAULT; - return 0; - } - - if (unlikely((unsigned long)dst & 6)) { - while (((unsigned long)dst & 6) && len >= 2) { - __u16 val16 = *(__u16 *)src; - isum = (__force __wsum)add32_with_carry( - (__force unsigned)isum, val16); - *errp = __put_user(val16, (__u16 __user *)dst); - if (*errp) - return isum; - src += 2; - dst += 2; - len -= 2; - } - } - - *errp = 0; - return csum_partial_copy_generic(src, (void __force *)dst,len,isum,NULL,errp); -} - -EXPORT_SYMBOL(csum_partial_copy_to_user); - -/** - * csum_partial_copy_nocheck - Copy and checksum. - * @src: source address - * @dst: destination address - * @len: number of bytes to be copied. - * @isum: initial sum that is added into the result (32bit unfolded) - * - * Returns an 32bit unfolded checksum of the buffer. 
- */ -__wsum -csum_partial_copy_nocheck(const void *src, void *dst, int len, __wsum sum) -{ - return csum_partial_copy_generic(src,dst,len,sum,NULL,NULL); -} -EXPORT_SYMBOL(csum_partial_copy_nocheck); - -__sum16 csum_ipv6_magic(const struct in6_addr *saddr, - const struct in6_addr *daddr, - __u32 len, unsigned short proto, __wsum sum) -{ - __u64 rest, sum64; - - rest = (__force __u64)htonl(len) + (__force __u64)htons(proto) + - (__force __u64)sum; - asm(" addq (%[saddr]),%[sum]\n" - " adcq 8(%[saddr]),%[sum]\n" - " adcq (%[daddr]),%[sum]\n" - " adcq 8(%[daddr]),%[sum]\n" - " adcq $0,%[sum]\n" - : [sum] "=r" (sum64) - : "[sum]" (rest),[saddr] "r" (saddr), [daddr] "r" (daddr)); - return csum_fold((__force __wsum)add32_with_carry(sum64 & 0xffffffff, sum64>>32)); -} - -EXPORT_SYMBOL(csum_ipv6_magic); diff --git a/arch/x86_64/lib/delay_64.c b/arch/x86_64/lib/delay_64.c deleted file mode 100644 index 2dbebd308347..000000000000 --- a/arch/x86_64/lib/delay_64.c +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Precise Delay Loops for x86-64 - * - * Copyright (C) 1993 Linus Torvalds - * Copyright (C) 1997 Martin Mares - * - * The __delay function must _NOT_ be inlined as its execution time - * depends wildly on alignment on many x86 processors. - */ - -#include -#include -#include -#include -#include - -#ifdef CONFIG_SMP -#include -#endif - -int read_current_timer(unsigned long *timer_value) -{ - rdtscll(*timer_value); - return 0; -} - -void __delay(unsigned long loops) -{ - unsigned bclock, now; - - rdtscl(bclock); - do - { - rep_nop(); - rdtscl(now); - } - while((now-bclock) < loops); -} -EXPORT_SYMBOL(__delay); - -inline void __const_udelay(unsigned long xloops) -{ - __delay(((xloops * HZ * cpu_data[raw_smp_processor_id()].loops_per_jiffy) >> 32) + 1); -} -EXPORT_SYMBOL(__const_udelay); - -void __udelay(unsigned long usecs) -{ - __const_udelay(usecs * 0x000010c7); /* 2**32 / 1000000 (rounded up) */ -} -EXPORT_SYMBOL(__udelay); - -void __ndelay(unsigned long nsecs) -{ - __const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */ -} -EXPORT_SYMBOL(__ndelay); diff --git a/arch/x86_64/lib/getuser_64.S b/arch/x86_64/lib/getuser_64.S deleted file mode 100644 index 5448876261f8..000000000000 --- a/arch/x86_64/lib/getuser_64.S +++ /dev/null @@ -1,109 +0,0 @@ -/* - * __get_user functions. - * - * (C) Copyright 1998 Linus Torvalds - * (C) Copyright 2005 Andi Kleen - * - * These functions have a non-standard call interface - * to make them more efficient, especially as they - * return an error value in addition to the "real" - * return value. - */ - -/* - * __get_user_X - * - * Inputs: %rcx contains the address. - * The register is modified, but all changes are undone - * before returning because the C code doesn't know about it. - * - * Outputs: %rax is error code (0 or -EFAULT) - * %rdx contains zero-extended value - * - * %r8 is destroyed. - * - * These functions should not modify any other registers, - * as they get called from within inline assembly. 
- */ - -#include -#include -#include -#include -#include -#include - - .text -ENTRY(__get_user_1) - CFI_STARTPROC - GET_THREAD_INFO(%r8) - cmpq threadinfo_addr_limit(%r8),%rcx - jae bad_get_user -1: movzb (%rcx),%edx - xorl %eax,%eax - ret - CFI_ENDPROC -ENDPROC(__get_user_1) - -ENTRY(__get_user_2) - CFI_STARTPROC - GET_THREAD_INFO(%r8) - addq $1,%rcx - jc 20f - cmpq threadinfo_addr_limit(%r8),%rcx - jae 20f - decq %rcx -2: movzwl (%rcx),%edx - xorl %eax,%eax - ret -20: decq %rcx - jmp bad_get_user - CFI_ENDPROC -ENDPROC(__get_user_2) - -ENTRY(__get_user_4) - CFI_STARTPROC - GET_THREAD_INFO(%r8) - addq $3,%rcx - jc 30f - cmpq threadinfo_addr_limit(%r8),%rcx - jae 30f - subq $3,%rcx -3: movl (%rcx),%edx - xorl %eax,%eax - ret -30: subq $3,%rcx - jmp bad_get_user - CFI_ENDPROC -ENDPROC(__get_user_4) - -ENTRY(__get_user_8) - CFI_STARTPROC - GET_THREAD_INFO(%r8) - addq $7,%rcx - jc 40f - cmpq threadinfo_addr_limit(%r8),%rcx - jae 40f - subq $7,%rcx -4: movq (%rcx),%rdx - xorl %eax,%eax - ret -40: subq $7,%rcx - jmp bad_get_user - CFI_ENDPROC -ENDPROC(__get_user_8) - -bad_get_user: - CFI_STARTPROC - xorl %edx,%edx - movq $(-EFAULT),%rax - ret - CFI_ENDPROC -END(bad_get_user) - -.section __ex_table,"a" - .quad 1b,bad_get_user - .quad 2b,bad_get_user - .quad 3b,bad_get_user - .quad 4b,bad_get_user -.previous diff --git a/arch/x86_64/lib/io_64.c b/arch/x86_64/lib/io_64.c deleted file mode 100644 index 87b4a4e18039..000000000000 --- a/arch/x86_64/lib/io_64.c +++ /dev/null @@ -1,23 +0,0 @@ -#include -#include -#include - -void __memcpy_toio(unsigned long dst,const void*src,unsigned len) -{ - __inline_memcpy((void *) dst,src,len); -} -EXPORT_SYMBOL(__memcpy_toio); - -void __memcpy_fromio(void *dst,unsigned long src,unsigned len) -{ - __inline_memcpy(dst,(const void *) src,len); -} -EXPORT_SYMBOL(__memcpy_fromio); - -void memset_io(volatile void __iomem *a, int b, size_t c) -{ - /* XXX: memset can mangle the IO patterns quite a bit. - perhaps it would be better to use a dumb one */ - memset((void *)a,b,c); -} -EXPORT_SYMBOL(memset_io); diff --git a/arch/x86_64/lib/iomap_copy_64.S b/arch/x86_64/lib/iomap_copy_64.S deleted file mode 100644 index 05a95e713da8..000000000000 --- a/arch/x86_64/lib/iomap_copy_64.S +++ /dev/null @@ -1,30 +0,0 @@ -/* - * Copyright 2006 PathScale, Inc. All Rights Reserved. - * - * This file is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. - */ - -#include -#include - -/* - * override generic version in lib/iomap_copy.c - */ -ENTRY(__iowrite32_copy) - CFI_STARTPROC - movl %edx,%ecx - rep movsd - ret - CFI_ENDPROC -ENDPROC(__iowrite32_copy) diff --git a/arch/x86_64/lib/memcpy_64.S b/arch/x86_64/lib/memcpy_64.S deleted file mode 100644 index c22981fa2f3a..000000000000 --- a/arch/x86_64/lib/memcpy_64.S +++ /dev/null @@ -1,131 +0,0 @@ -/* Copyright 2002 Andi Kleen */ - -#include -#include -#include - -/* - * memcpy - Copy a memory block. 
- * - * Input: - * rdi destination - * rsi source - * rdx count - * - * Output: - * rax original destination - */ - - ALIGN -memcpy_c: - CFI_STARTPROC - movq %rdi,%rax - movl %edx,%ecx - shrl $3,%ecx - andl $7,%edx - rep movsq - movl %edx,%ecx - rep movsb - ret - CFI_ENDPROC -ENDPROC(memcpy_c) - -ENTRY(__memcpy) -ENTRY(memcpy) - CFI_STARTPROC - pushq %rbx - CFI_ADJUST_CFA_OFFSET 8 - CFI_REL_OFFSET rbx, 0 - movq %rdi,%rax - - movl %edx,%ecx - shrl $6,%ecx - jz .Lhandle_tail - - .p2align 4 -.Lloop_64: - decl %ecx - - movq (%rsi),%r11 - movq 8(%rsi),%r8 - - movq %r11,(%rdi) - movq %r8,1*8(%rdi) - - movq 2*8(%rsi),%r9 - movq 3*8(%rsi),%r10 - - movq %r9,2*8(%rdi) - movq %r10,3*8(%rdi) - - movq 4*8(%rsi),%r11 - movq 5*8(%rsi),%r8 - - movq %r11,4*8(%rdi) - movq %r8,5*8(%rdi) - - movq 6*8(%rsi),%r9 - movq 7*8(%rsi),%r10 - - movq %r9,6*8(%rdi) - movq %r10,7*8(%rdi) - - leaq 64(%rsi),%rsi - leaq 64(%rdi),%rdi - jnz .Lloop_64 - -.Lhandle_tail: - movl %edx,%ecx - andl $63,%ecx - shrl $3,%ecx - jz .Lhandle_7 - .p2align 4 -.Lloop_8: - decl %ecx - movq (%rsi),%r8 - movq %r8,(%rdi) - leaq 8(%rdi),%rdi - leaq 8(%rsi),%rsi - jnz .Lloop_8 - -.Lhandle_7: - movl %edx,%ecx - andl $7,%ecx - jz .Lende - .p2align 4 -.Lloop_1: - movb (%rsi),%r8b - movb %r8b,(%rdi) - incq %rdi - incq %rsi - decl %ecx - jnz .Lloop_1 - -.Lende: - popq %rbx - CFI_ADJUST_CFA_OFFSET -8 - CFI_RESTORE rbx - ret -.Lfinal: - CFI_ENDPROC -ENDPROC(memcpy) -ENDPROC(__memcpy) - - /* Some CPUs run faster using the string copy instructions. - It is also a lot simpler. Use this when possible */ - - .section .altinstr_replacement,"ax" -1: .byte 0xeb /* jmp */ - .byte (memcpy_c - memcpy) - (2f - 1b) /* offset */ -2: - .previous - .section .altinstructions,"a" - .align 8 - .quad memcpy - .quad 1b - .byte X86_FEATURE_REP_GOOD - /* Replace only beginning, memcpy is used to apply alternatives, so it - * is silly to overwrite itself with nops - reboot is only outcome... */ - .byte 2b - 1b - .byte 2b - 1b - .previous diff --git a/arch/x86_64/lib/memmove_64.c b/arch/x86_64/lib/memmove_64.c deleted file mode 100644 index 751ebae8ec42..000000000000 --- a/arch/x86_64/lib/memmove_64.c +++ /dev/null @@ -1,21 +0,0 @@ -/* Normally compiler builtins are used, but sometimes the compiler calls out - of line code. Based on asm-i386/string.h. - */ -#define _STRING_C -#include -#include - -#undef memmove -void *memmove(void * dest,const void *src,size_t count) -{ - if (dest < src) { - return memcpy(dest,src,count); - } else { - char *p = (char *) dest + count; - char *s = (char *) src + count; - while (count--) - *--p = *--s; - } - return dest; -} -EXPORT_SYMBOL(memmove); diff --git a/arch/x86_64/lib/memset_64.S b/arch/x86_64/lib/memset_64.S deleted file mode 100644 index 2c5948116bd2..000000000000 --- a/arch/x86_64/lib/memset_64.S +++ /dev/null @@ -1,133 +0,0 @@ -/* Copyright 2002 Andi Kleen, SuSE Labs */ - -#include -#include - -/* - * ISO C memset - set a memory block to a byte value. 
- * - * rdi destination - * rsi value (char) - * rdx count (bytes) - * - * rax original destination - */ - ALIGN -memset_c: - CFI_STARTPROC - movq %rdi,%r9 - movl %edx,%r8d - andl $7,%r8d - movl %edx,%ecx - shrl $3,%ecx - /* expand byte value */ - movzbl %sil,%esi - movabs $0x0101010101010101,%rax - mulq %rsi /* with rax, clobbers rdx */ - rep stosq - movl %r8d,%ecx - rep stosb - movq %r9,%rax - ret - CFI_ENDPROC -ENDPROC(memset_c) - -ENTRY(memset) -ENTRY(__memset) - CFI_STARTPROC - movq %rdi,%r10 - movq %rdx,%r11 - - /* expand byte value */ - movzbl %sil,%ecx - movabs $0x0101010101010101,%rax - mul %rcx /* with rax, clobbers rdx */ - - /* align dst */ - movl %edi,%r9d - andl $7,%r9d - jnz .Lbad_alignment - CFI_REMEMBER_STATE -.Lafter_bad_alignment: - - movl %r11d,%ecx - shrl $6,%ecx - jz .Lhandle_tail - - .p2align 4 -.Lloop_64: - decl %ecx - movq %rax,(%rdi) - movq %rax,8(%rdi) - movq %rax,16(%rdi) - movq %rax,24(%rdi) - movq %rax,32(%rdi) - movq %rax,40(%rdi) - movq %rax,48(%rdi) - movq %rax,56(%rdi) - leaq 64(%rdi),%rdi - jnz .Lloop_64 - - /* Handle tail in loops. The loops should be faster than hard - to predict jump tables. */ - .p2align 4 -.Lhandle_tail: - movl %r11d,%ecx - andl $63&(~7),%ecx - jz .Lhandle_7 - shrl $3,%ecx - .p2align 4 -.Lloop_8: - decl %ecx - movq %rax,(%rdi) - leaq 8(%rdi),%rdi - jnz .Lloop_8 - -.Lhandle_7: - movl %r11d,%ecx - andl $7,%ecx - jz .Lende - .p2align 4 -.Lloop_1: - decl %ecx - movb %al,(%rdi) - leaq 1(%rdi),%rdi - jnz .Lloop_1 - -.Lende: - movq %r10,%rax - ret - - CFI_RESTORE_STATE -.Lbad_alignment: - cmpq $7,%r11 - jbe .Lhandle_7 - movq %rax,(%rdi) /* unaligned store */ - movq $8,%r8 - subq %r9,%r8 - addq %r8,%rdi - subq %r8,%r11 - jmp .Lafter_bad_alignment -.Lfinal: - CFI_ENDPROC -ENDPROC(memset) -ENDPROC(__memset) - - /* Some CPUs run faster using the string instructions. - It is also a lot simpler. Use this when possible */ - -#include - - .section .altinstr_replacement,"ax" -1: .byte 0xeb /* jmp */ - .byte (memset_c - memset) - (2f - 1b) /* offset */ -2: - .previous - .section .altinstructions,"a" - .align 8 - .quad memset - .quad 1b - .byte X86_FEATURE_REP_GOOD - .byte .Lfinal - memset - .byte 2b - 1b - .previous diff --git a/arch/x86_64/lib/msr-on-cpu.c b/arch/x86_64/lib/msr-on-cpu.c deleted file mode 100644 index 5672d4190fbe..000000000000 --- a/arch/x86_64/lib/msr-on-cpu.c +++ /dev/null @@ -1 +0,0 @@ -#include "../../x86/lib/msr-on-cpu.c" diff --git a/arch/x86_64/lib/putuser_64.S b/arch/x86_64/lib/putuser_64.S deleted file mode 100644 index 4989f5a8fa9b..000000000000 --- a/arch/x86_64/lib/putuser_64.S +++ /dev/null @@ -1,106 +0,0 @@ -/* - * __put_user functions. - * - * (C) Copyright 1998 Linus Torvalds - * (C) Copyright 2005 Andi Kleen - * - * These functions have a non-standard call interface - * to make them more efficient, especially as they - * return an error value in addition to the "real" - * return value. - */ - -/* - * __put_user_X - * - * Inputs: %rcx contains the address - * %rdx contains new value - * - * Outputs: %rax is error code (0 or -EFAULT) - * - * %r8 is destroyed. - * - * These functions should not modify any other registers, - * as they get called from within inline assembly. 
- */ - -#include -#include -#include -#include -#include -#include - - .text -ENTRY(__put_user_1) - CFI_STARTPROC - GET_THREAD_INFO(%r8) - cmpq threadinfo_addr_limit(%r8),%rcx - jae bad_put_user -1: movb %dl,(%rcx) - xorl %eax,%eax - ret - CFI_ENDPROC -ENDPROC(__put_user_1) - -ENTRY(__put_user_2) - CFI_STARTPROC - GET_THREAD_INFO(%r8) - addq $1,%rcx - jc 20f - cmpq threadinfo_addr_limit(%r8),%rcx - jae 20f - decq %rcx -2: movw %dx,(%rcx) - xorl %eax,%eax - ret -20: decq %rcx - jmp bad_put_user - CFI_ENDPROC -ENDPROC(__put_user_2) - -ENTRY(__put_user_4) - CFI_STARTPROC - GET_THREAD_INFO(%r8) - addq $3,%rcx - jc 30f - cmpq threadinfo_addr_limit(%r8),%rcx - jae 30f - subq $3,%rcx -3: movl %edx,(%rcx) - xorl %eax,%eax - ret -30: subq $3,%rcx - jmp bad_put_user - CFI_ENDPROC -ENDPROC(__put_user_4) - -ENTRY(__put_user_8) - CFI_STARTPROC - GET_THREAD_INFO(%r8) - addq $7,%rcx - jc 40f - cmpq threadinfo_addr_limit(%r8),%rcx - jae 40f - subq $7,%rcx -4: movq %rdx,(%rcx) - xorl %eax,%eax - ret -40: subq $7,%rcx - jmp bad_put_user - CFI_ENDPROC -ENDPROC(__put_user_8) - -bad_put_user: - CFI_STARTPROC - movq $(-EFAULT),%rax - ret - CFI_ENDPROC -END(bad_put_user) - -.section __ex_table,"a" - .quad 1b,bad_put_user - .quad 2b,bad_put_user - .quad 3b,bad_put_user - .quad 4b,bad_put_user -.previous diff --git a/arch/x86_64/lib/rwlock_64.S b/arch/x86_64/lib/rwlock_64.S deleted file mode 100644 index 0cde1f807314..000000000000 --- a/arch/x86_64/lib/rwlock_64.S +++ /dev/null @@ -1,38 +0,0 @@ -/* Slow paths of read/write spinlocks. */ - -#include -#include -#include -#include - -/* rdi: pointer to rwlock_t */ -ENTRY(__write_lock_failed) - CFI_STARTPROC - LOCK_PREFIX - addl $RW_LOCK_BIAS,(%rdi) -1: rep - nop - cmpl $RW_LOCK_BIAS,(%rdi) - jne 1b - LOCK_PREFIX - subl $RW_LOCK_BIAS,(%rdi) - jnz __write_lock_failed - ret - CFI_ENDPROC -END(__write_lock_failed) - -/* rdi: pointer to rwlock_t */ -ENTRY(__read_lock_failed) - CFI_STARTPROC - LOCK_PREFIX - incl (%rdi) -1: rep - nop - cmpl $1,(%rdi) - js 1b - LOCK_PREFIX - decl (%rdi) - js __read_lock_failed - ret - CFI_ENDPROC -END(__read_lock_failed) diff --git a/arch/x86_64/lib/thunk_64.S b/arch/x86_64/lib/thunk_64.S deleted file mode 100644 index 55e586d352d3..000000000000 --- a/arch/x86_64/lib/thunk_64.S +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Save registers before calling assembly functions. This avoids - * disturbance of register allocation in some inline assembly constructs. - * Copyright 2001,2002 by Andi Kleen, SuSE Labs. - * Subject to the GNU public license, v.2. No warranty of any kind. - */ - - #include - #include - #include - #include - - /* rdi: arg1 ... normal C conventions. rax is saved/restored. */ - .macro thunk name,func - .globl \name -\name: - CFI_STARTPROC - SAVE_ARGS - call \func - jmp restore - CFI_ENDPROC - .endm - - /* rdi: arg1 ... normal C conventions. rax is passed from C. 
*/ - .macro thunk_retrax name,func - .globl \name -\name: - CFI_STARTPROC - SAVE_ARGS - call \func - jmp restore_norax - CFI_ENDPROC - .endm - - - .section .sched.text -#ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM - thunk rwsem_down_read_failed_thunk,rwsem_down_read_failed - thunk rwsem_down_write_failed_thunk,rwsem_down_write_failed - thunk rwsem_wake_thunk,rwsem_wake - thunk rwsem_downgrade_thunk,rwsem_downgrade_wake -#endif - - thunk __down_failed,__down - thunk_retrax __down_failed_interruptible,__down_interruptible - thunk_retrax __down_failed_trylock,__down_trylock - thunk __up_wakeup,__up - -#ifdef CONFIG_TRACE_IRQFLAGS - thunk trace_hardirqs_on_thunk,trace_hardirqs_on - thunk trace_hardirqs_off_thunk,trace_hardirqs_off -#endif - - /* SAVE_ARGS below is used only for the .cfi directives it contains. */ - CFI_STARTPROC - SAVE_ARGS -restore: - RESTORE_ARGS - ret - CFI_ENDPROC - - CFI_STARTPROC - SAVE_ARGS -restore_norax: - RESTORE_ARGS 1 - ret - CFI_ENDPROC diff --git a/arch/x86_64/lib/usercopy_64.c b/arch/x86_64/lib/usercopy_64.c deleted file mode 100644 index 893d43f838cc..000000000000 --- a/arch/x86_64/lib/usercopy_64.c +++ /dev/null @@ -1,166 +0,0 @@ -/* - * User address space access functions. - * - * Copyright 1997 Andi Kleen - * Copyright 1997 Linus Torvalds - * Copyright 2002 Andi Kleen - */ -#include -#include - -/* - * Copy a null terminated string from userspace. - */ - -#define __do_strncpy_from_user(dst,src,count,res) \ -do { \ - long __d0, __d1, __d2; \ - might_sleep(); \ - __asm__ __volatile__( \ - " testq %1,%1\n" \ - " jz 2f\n" \ - "0: lodsb\n" \ - " stosb\n" \ - " testb %%al,%%al\n" \ - " jz 1f\n" \ - " decq %1\n" \ - " jnz 0b\n" \ - "1: subq %1,%0\n" \ - "2:\n" \ - ".section .fixup,\"ax\"\n" \ - "3: movq %5,%0\n" \ - " jmp 2b\n" \ - ".previous\n" \ - ".section __ex_table,\"a\"\n" \ - " .align 8\n" \ - " .quad 0b,3b\n" \ - ".previous" \ - : "=r"(res), "=c"(count), "=&a" (__d0), "=&S" (__d1), \ - "=&D" (__d2) \ - : "i"(-EFAULT), "0"(count), "1"(count), "3"(src), "4"(dst) \ - : "memory"); \ -} while (0) - -long -__strncpy_from_user(char *dst, const char __user *src, long count) -{ - long res; - __do_strncpy_from_user(dst, src, count, res); - return res; -} -EXPORT_SYMBOL(__strncpy_from_user); - -long -strncpy_from_user(char *dst, const char __user *src, long count) -{ - long res = -EFAULT; - if (access_ok(VERIFY_READ, src, 1)) - return __strncpy_from_user(dst, src, count); - return res; -} -EXPORT_SYMBOL(strncpy_from_user); - -/* - * Zero Userspace - */ - -unsigned long __clear_user(void __user *addr, unsigned long size) -{ - long __d0; - might_sleep(); - /* no memory constraint because it doesn't change any memory gcc knows - about */ - asm volatile( - " testq %[size8],%[size8]\n" - " jz 4f\n" - "0: movq %[zero],(%[dst])\n" - " addq %[eight],%[dst]\n" - " decl %%ecx ; jnz 0b\n" - "4: movq %[size1],%%rcx\n" - " testl %%ecx,%%ecx\n" - " jz 2f\n" - "1: movb %b[zero],(%[dst])\n" - " incq %[dst]\n" - " decl %%ecx ; jnz 1b\n" - "2:\n" - ".section .fixup,\"ax\"\n" - "3: lea 0(%[size1],%[size8],8),%[size8]\n" - " jmp 2b\n" - ".previous\n" - ".section __ex_table,\"a\"\n" - " .align 8\n" - " .quad 0b,3b\n" - " .quad 1b,2b\n" - ".previous" - : [size8] "=c"(size), [dst] "=&D" (__d0) - : [size1] "r"(size & 7), "[size8]" (size / 8), "[dst]"(addr), - [zero] "r" (0UL), [eight] "r" (8UL)); - return size; -} -EXPORT_SYMBOL(__clear_user); - -unsigned long clear_user(void __user *to, unsigned long n) -{ - if (access_ok(VERIFY_WRITE, to, n)) - return __clear_user(to, n); - return n; -} 
-EXPORT_SYMBOL(clear_user); - -/* - * Return the size of a string (including the ending 0) - * - * Return 0 on exception, a value greater than N if too long - */ - -long __strnlen_user(const char __user *s, long n) -{ - long res = 0; - char c; - - while (1) { - if (res>n) - return n+1; - if (__get_user(c, s)) - return 0; - if (!c) - return res+1; - res++; - s++; - } -} -EXPORT_SYMBOL(__strnlen_user); - -long strnlen_user(const char __user *s, long n) -{ - if (!access_ok(VERIFY_READ, s, n)) - return 0; - return __strnlen_user(s, n); -} -EXPORT_SYMBOL(strnlen_user); - -long strlen_user(const char __user *s) -{ - long res = 0; - char c; - - for (;;) { - if (get_user(c, s)) - return 0; - if (!c) - return res+1; - res++; - s++; - } -} -EXPORT_SYMBOL(strlen_user); - -unsigned long copy_in_user(void __user *to, const void __user *from, unsigned len) -{ - if (access_ok(VERIFY_WRITE, to, len) && access_ok(VERIFY_READ, from, len)) { - return copy_user_generic((__force void *)to, (__force void *)from, len); - } - return len; -} -EXPORT_SYMBOL(copy_in_user); -
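
For reference, the strnlen_user()/__strnlen_user() pair deleted above (and re-added as arch/x86/lib/usercopy_64.c earlier in this patch) uses a return convention that is easy to misread: the result counts the terminating NUL, a string longer than the limit yields n + 1, and 0 is reserved for a faulting access. The plain user-space sketch below illustrates only that convention; demo_strnlen_bounded() is an invented name, and the 0-on-fault case is omitted because ordinary C code has no equivalent of the __get_user() exception path.

#include <stdio.h>

/* Illustrative sketch of the strnlen_user() return convention; not kernel code. */
static long demo_strnlen_bounded(const char *s, long n)
{
	long res = 0;

	for (;;) {
		if (res > n)
			return n + 1;		/* too long: caller sees a value > n */
		if (!s[res])
			return res + 1;		/* length including the terminating NUL */
		res++;
	}
}

int main(void)
{
	printf("%ld\n", demo_strnlen_bounded("abc", 16));	/* prints 4 */
	printf("%ld\n", demo_strnlen_bounded("abcdef", 3));	/* prints 4, i.e. n + 1 */
	return 0;
}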
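
The csum-partial_64.c code moved by this patch accumulates the checksum in a wide 64-bit register and then reduces it with end-around carries, which is what add32_with_carry() and from32to16() perform above. The self-contained sketch below shows that reduction in portable C, assuming a little-endian host; fold64(), fold32() and naive_csum() are invented helper names for this illustration, not kernel interfaces.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Fold a 64-bit accumulator to 32 bits with end-around carry. */
static uint32_t fold64(uint64_t sum)
{
	sum = (sum & 0xffffffffu) + (sum >> 32);
	sum = (sum & 0xffffffffu) + (sum >> 32);	/* absorb the carry bit */
	return (uint32_t)sum;
}

/* Fold a 32-bit value to 16 bits with end-around carry. */
static uint16_t fold32(uint32_t sum)
{
	sum = (sum & 0xffff) + (sum >> 16);
	sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)sum;
}

/* Ones'-complement sum over 16-bit words, little-endian host order. */
static uint16_t naive_csum(const uint8_t *p, size_t len)
{
	uint64_t sum = 0;
	size_t i;

	for (i = 0; i + 1 < len; i += 2)
		sum += (uint32_t)(p[i] | (p[i + 1] << 8));
	if (len & 1)
		sum += p[len - 1];			/* trailing odd byte */
	return fold32(fold64(sum));
}

int main(void)
{
	uint8_t buf[64];

	memset(buf, 0xa5, sizeof(buf));
	printf("0x%04x\n", naive_csum(buf, sizeof(buf)));	/* prints 0xb4b4 */
	return 0;
}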
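
Likewise, the repe scasq/bsfq routines in bitops_64.c above implement the generic find_first_zero_bit() contract: scan whole 64-bit words for one that is not all ones, then locate the first zero bit inside it, returning a value of at least the requested size when no zero bit exists. Below is a portable sketch of that contract, assuming a 64-bit unsigned long as on x86-64; find_first_zero_bit_sketch() is an invented name and __builtin_ctzl() is a GCC/Clang builtin, so this is not the kernel implementation.

#include <stdio.h>

/*
 * Sketch of the find_first_zero_bit() contract (assumes 64-bit
 * unsigned long): word-wise scan, then a bit search in the first
 * word that is not all ones.  Returns >= size when every bit is set.
 */
static long find_first_zero_bit_sketch(const unsigned long *addr,
					unsigned long size)
{
	unsigned long words = (size + 63) / 64;
	unsigned long i;

	for (i = 0; i < words; i++) {
		unsigned long inv = ~addr[i];	/* zero bits become ones */
		if (inv)
			return i * 64 + __builtin_ctzl(inv);
	}
	return words * 64;			/* no zero bit found */
}

int main(void)
{
	unsigned long map[2] = { ~0UL, ~0UL ^ (1UL << 5) };

	/* first zero bit is bit 5 of the second word: prints 69 */
	printf("%ld\n", find_first_zero_bit_sketch(map, 128));
	return 0;
}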