x86/entry/64: Always run ptregs-using syscalls on the slow path
author Andy Lutomirski <luto@kernel.org>
Thu, 28 Jan 2016 23:11:25 +0000 (15:11 -0800)
committer Ingo Molnar <mingo@kernel.org>
Fri, 29 Jan 2016 08:46:38 +0000 (09:46 +0100)
64-bit syscalls currently have an optimization in which they are
called with partial pt_regs.  A small handful require full
pt_regs.

In the 32-bit and compat cases, I cleaned this up by forcing
full pt_regs for all syscalls.  The performance hit doesn't
really matter as the affected system calls are fundamentally
heavy and this is the 32-bit compat case.

I want to clean up the 64-bit case as well, but I don't want to
hurt fast path performance.  To do that, I want to force the
syscalls that use pt_regs onto the slow path.  This will enable
us to make slow path syscalls be real ABI-compliant C functions.

Use the new syscall entry qualification machinery for this.
'stub_clone' is now 'stub_clone/ptregs'.

The next patch will eliminate the stubs, and we'll just have
'sys_clone/ptregs'.

As of this patch, two-phase entry tracing is no longer used.  It
has served its purpose (namely a huge speedup on some workloads
prior to more general opportunistic SYSRET support), and once
the dust settles I'll send patches to back it out.

The implementation is heavily based on a patch from Brian Gerst:

  http://lkml.kernel.org/g/1449666173-15366-1-git-send-email-brgerst@gmail.com

Originally-From: Brian Gerst <brgerst@gmail.com>
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Linux Kernel Mailing List <linux-kernel@vger.kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/b9beda88460bcefec6e7d792bd44eca9b760b0c4.1454022279.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
arch/x86/entry/entry_64.S
arch/x86/entry/syscall_64.c
arch/x86/entry/syscalls/syscall_64.tbl

index 9d34d3cfceb61c1073c507f5a0f90e939eb146a3..f1c8f150728ee80c6cffc10ce3471367aff163e1 100644 (file)
@@ -182,7 +182,15 @@ entry_SYSCALL_64_fastpath:
 #endif
        ja      1f                              /* return -ENOSYS (already in pt_regs->ax) */
        movq    %r10, %rcx
+
+       /*
+        * This call instruction is handled specially in stub_ptregs_64.
+        * It might end up jumping to the slow path.  If it jumps, RAX is
+        * clobbered.
+        */
        call    *sys_call_table(, %rax, 8)
+.Lentry_SYSCALL_64_after_fastpath_call:
+
        movq    %rax, RAX(%rsp)
 1:
 /*
@@ -235,25 +243,13 @@ GLOBAL(int_ret_from_sys_call_irqs_off)
 
        /* Do syscall entry tracing */
 tracesys:
-       movq    %rsp, %rdi
-       movl    $AUDIT_ARCH_X86_64, %esi
-       call    syscall_trace_enter_phase1
-       test    %rax, %rax
-       jnz     tracesys_phase2                 /* if needed, run the slow path */
-       RESTORE_C_REGS_EXCEPT_RAX               /* else restore clobbered regs */
-       movq    ORIG_RAX(%rsp), %rax
-       jmp     entry_SYSCALL_64_fastpath       /* and return to the fast path */
-
-tracesys_phase2:
        SAVE_EXTRA_REGS
        movq    %rsp, %rdi
-       movl    $AUDIT_ARCH_X86_64, %esi
-       movq    %rax, %rdx
-       call    syscall_trace_enter_phase2
+       call    syscall_trace_enter
 
        /*
         * Reload registers from stack in case ptrace changed them.
-        * We don't reload %rax because syscall_trace_entry_phase2() returned
+        * We don't reload %rax because syscall_trace_enter() returned
         * the value it wants us to use in the table lookup.
         */
        RESTORE_C_REGS_EXCEPT_RAX
@@ -355,6 +351,38 @@ opportunistic_sysret_failed:
        jmp     restore_c_regs_and_iret
 END(entry_SYSCALL_64)
 
+ENTRY(stub_ptregs_64)
+       /*
+        * Syscalls marked as needing ptregs land here.
+        * If we are on the fast path, we need to save the extra regs.
+        * If we are on the slow path, the extra regs are already saved.
+        *
+        * RAX stores a pointer to the C function implementing the syscall.
+        */
+       cmpq    $.Lentry_SYSCALL_64_after_fastpath_call, (%rsp) /* is our return address the fast-path call site? */
+       jne     1f
+
+       /* Called from fast path -- pop return address and jump to slow path */
+       popq    %rax                    /* drops the return address; overwrites the handler pointer in RAX */
+       jmp     tracesys        /* tracesys runs SAVE_EXTRA_REGS and then redoes the table lookup */
+
+1:
+       /* Called from C */
+       jmp     *%rax                           /* extra regs already saved: jump straight to the handler */
+END(stub_ptregs_64)
+
+.macro ptregs_stub func
+ENTRY(ptregs_\func)
+       leaq    \func(%rip), %rax               /* RAX = address of the wrapped handler, as stub_ptregs_64 expects */
+       jmp     stub_ptregs_64                  /* shared stub either jumps to the handler or diverts to tracesys */
+END(ptregs_\func)
+.endm
+
+/* Instantiate ptregs_stub for each ptregs-using syscall */
+#define __SYSCALL_64_QUAL_(sym)
+#define __SYSCALL_64_QUAL_ptregs(sym) ptregs_stub sym
+#define __SYSCALL_64(nr, sym, qual) __SYSCALL_64_QUAL_##qual(sym)
+#include <asm/syscalls_64.h>
 
        .macro FORK_LIKE func
 ENTRY(stub_\func)
index a1d408772ae6846393f753544afb1ec16638f486..9dbc5abb6162fa20581069499667a8c49b254868 100644 (file)
@@ -6,11 +6,14 @@
 #include <asm/asm-offsets.h>
 #include <asm/syscall.h>
 
-#define __SYSCALL_64(nr, sym, qual) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ;
+#define __SYSCALL_64_QUAL_(sym) sym
+#define __SYSCALL_64_QUAL_ptregs(sym) ptregs_##sym
+
+#define __SYSCALL_64(nr, sym, qual) extern asmlinkage long __SYSCALL_64_QUAL_##qual(sym)(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
 #include <asm/syscalls_64.h>
 #undef __SYSCALL_64
 
-#define __SYSCALL_64(nr, sym, qual) [nr] = sym,
+#define __SYSCALL_64(nr, sym, qual) [nr] = __SYSCALL_64_QUAL_##qual(sym),
 
 extern long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
 
index dc1040a50bdc21594317f9f9dc4c59196a25ae63..5de342a729d0b0b8efc19497edc0365aae2fcb01 100644 (file)
@@ -21,7 +21,7 @@
 12     common  brk                     sys_brk
 13     64      rt_sigaction            sys_rt_sigaction
 14     common  rt_sigprocmask          sys_rt_sigprocmask
-15     64      rt_sigreturn            stub_rt_sigreturn
+15     64      rt_sigreturn            stub_rt_sigreturn/ptregs
 16     64      ioctl                   sys_ioctl
 17     common  pread64                 sys_pread64
 18     common  pwrite64                sys_pwrite64
 53     common  socketpair              sys_socketpair
 54     64      setsockopt              sys_setsockopt
 55     64      getsockopt              sys_getsockopt
-56     common  clone                   stub_clone
-57     common  fork                    stub_fork
-58     common  vfork                   stub_vfork
-59     64      execve                  stub_execve
+56     common  clone                   stub_clone/ptregs
+57     common  fork                    stub_fork/ptregs
+58     common  vfork                   stub_vfork/ptregs
+59     64      execve                  stub_execve/ptregs
 60     common  exit                    sys_exit
 61     common  wait4                   sys_wait4
 62     common  kill                    sys_kill
 319    common  memfd_create            sys_memfd_create
 320    common  kexec_file_load         sys_kexec_file_load
 321    common  bpf                     sys_bpf
-322    64      execveat                stub_execveat
+322    64      execveat                stub_execveat/ptregs
 323    common  userfaultfd             sys_userfaultfd
 324    common  membarrier              sys_membarrier
 325    common  mlock2                  sys_mlock2
 517    x32     recvfrom                compat_sys_recvfrom
 518    x32     sendmsg                 compat_sys_sendmsg
 519    x32     recvmsg                 compat_sys_recvmsg
-520    x32     execve                  stub_x32_execve
+520    x32     execve                  stub_x32_execve/ptregs
 521    x32     ptrace                  compat_sys_ptrace
 522    x32     rt_sigpending           compat_sys_rt_sigpending
 523    x32     rt_sigtimedwait         compat_sys_rt_sigtimedwait
 542    x32     getsockopt              compat_sys_getsockopt
 543    x32     io_setup                compat_sys_io_setup
 544    x32     io_submit               compat_sys_io_submit
-545    x32     execveat                stub_x32_execveat
+545    x32     execveat                stub_x32_execveat/ptregs