x86/entry: Restore traditional SYSENTER calling convention
author		Andy Lutomirski <luto@kernel.org>
		Thu, 17 Dec 2015 07:18:48 +0000 (23:18 -0800)
committer	Thomas Gleixner <tglx@linutronix.de>
		Mon, 21 Dec 2015 15:05:01 +0000 (16:05 +0100)
It turns out that some Android versions hardcode the SYSENTER
calling convention.  This is buggy and will cause problems no
matter what the kernel does.  Nonetheless, we should try to
support it.

Credit goes to Linus for pointing out a clean way to handle
the SYSENTER/SYSCALL clobber differences while preserving
straightforward DWARF annotations.
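
Concretely, the vDSO stashes whichever register the entry instruction
is about to clobber in EBP, and the kernel recovers the user value
from pt_regs->bp (see the system_call.S hunk below):

	/* SYSENTER reloads ESP and EIP from MSRs, losing the user ESP: */
	movl	%esp, %ebp
	sysenter

	/* SYSCALL overwrites ECX with the return EIP, losing the user ECX: */
	movl	%ecx, %ebp
	syscall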

I believe that the original offending Android commit was:

https://android.googlesource.com/platform/bionic/+/7dc3684d7a2587e43e6d2a8e0e3f39bf759bd535

Reported-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-and-tested-by: Borislav Petkov <bp@alien8.de>
Cc: <mark.gross@intel.com>
Cc: Su Tao <tao.su@intel.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: <frank.wang@intel.com>
Cc: <borun.fu@intel.com>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Mingwei Shi <mingwei.shi@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
arch/x86/entry/common.c
arch/x86/entry/entry_32.S
arch/x86/entry/entry_64_compat.S
arch/x86/entry/vdso/vdso32/system_call.S

diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index a89fdbc1f0beb7e7198c7a625767a2cfe32ca9e3..03663740c86655cabf21504578e97d73d98595be 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -421,7 +421,7 @@ __visible long do_fast_syscall_32(struct pt_regs *regs)
        regs->ip = landing_pad;
 
        /*
-        * Fetch ECX from where the vDSO stashed it.
+        * Fetch EBP from where the vDSO stashed it.
         *
         * WARNING: We are in CONTEXT_USER and RCU isn't paying attention!
         */
@@ -432,10 +432,10 @@ __visible long do_fast_syscall_32(struct pt_regs *regs)
                 * Micro-optimization: the pointer we're following is explicitly
                 * 32 bits, so it can't be out of range.
                 */
-               __get_user(*(u32 *)&regs->cx,
+               __get_user(*(u32 *)&regs->bp,
                            (u32 __user __force *)(unsigned long)(u32)regs->sp)
 #else
-               get_user(*(u32 *)&regs->cx,
+               get_user(*(u32 *)&regs->bp,
                         (u32 __user __force *)(unsigned long)(u32)regs->sp)
 #endif
                ) {
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index fcad8ac30a8e776ee2f99285214d523262ef7751..f3b6d54e0042b7f08c25a82283f88e8193c70c4a 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -292,7 +292,7 @@ ENTRY(entry_SYSENTER_32)
        movl    TSS_sysenter_sp0(%esp), %esp
 sysenter_past_esp:
        pushl   $__USER_DS              /* pt_regs->ss */
-       pushl   %ecx                    /* pt_regs->sp (stashed in cx) */
+       pushl   %ebp                    /* pt_regs->sp (stashed in bp) */
        pushfl                          /* pt_regs->flags (except IF = 0) */
        orl     $X86_EFLAGS_IF, (%esp)  /* Fix IF */
        pushl   $__USER_CS              /* pt_regs->cs */
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index 402e34a21559e3a6227a6e76b57029c8db74236d..6a1ae3751e824d9917e65c136e9f7de3cc5f4f47 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -63,7 +63,7 @@ ENTRY(entry_SYSENTER_compat)
 
        /* Construct struct pt_regs on stack */
        pushq   $__USER32_DS            /* pt_regs->ss */
-       pushq   %rcx                    /* pt_regs->sp */
+       pushq   %rbp                    /* pt_regs->sp (stashed in bp) */
 
        /*
         * Push flags.  This is nasty.  First, interrupts are currently
@@ -82,14 +82,14 @@ ENTRY(entry_SYSENTER_compat)
        pushq   %rdi                    /* pt_regs->di */
        pushq   %rsi                    /* pt_regs->si */
        pushq   %rdx                    /* pt_regs->dx */
-       pushq   %rcx                    /* pt_regs->cx (will be overwritten) */
+       pushq   %rcx                    /* pt_regs->cx */
        pushq   $-ENOSYS                /* pt_regs->ax */
        pushq   %r8                     /* pt_regs->r8  = 0 */
        pushq   %r8                     /* pt_regs->r9  = 0 */
        pushq   %r8                     /* pt_regs->r10 = 0 */
        pushq   %r8                     /* pt_regs->r11 = 0 */
        pushq   %rbx                    /* pt_regs->rbx */
-       pushq   %rbp                    /* pt_regs->rbp */
+       pushq   %rbp                    /* pt_regs->rbp (will be overwritten) */
        pushq   %r8                     /* pt_regs->r12 = 0 */
        pushq   %r8                     /* pt_regs->r13 = 0 */
        pushq   %r8                     /* pt_regs->r14 = 0 */
@@ -179,7 +179,7 @@ ENTRY(entry_SYSCALL_compat)
        pushq   %rdi                    /* pt_regs->di */
        pushq   %rsi                    /* pt_regs->si */
        pushq   %rdx                    /* pt_regs->dx */
-       pushq   %rcx                    /* pt_regs->cx (will be overwritten) */
+       pushq   %rbp                    /* pt_regs->cx (stashed in bp) */
        pushq   $-ENOSYS                /* pt_regs->ax */
        xorq    %r8,%r8
        pushq   %r8                     /* pt_regs->r8  = 0 */
@@ -187,7 +187,7 @@ ENTRY(entry_SYSCALL_compat)
        pushq   %r8                     /* pt_regs->r10 = 0 */
        pushq   %r8                     /* pt_regs->r11 = 0 */
        pushq   %rbx                    /* pt_regs->rbx */
-       pushq   %rbp                    /* pt_regs->rbp */
+       pushq   %rbp                    /* pt_regs->rbp (will be overwritten) */
        pushq   %r8                     /* pt_regs->r12 = 0 */
        pushq   %r8                     /* pt_regs->r13 = 0 */
        pushq   %r8                     /* pt_regs->r14 = 0 */
diff --git a/arch/x86/entry/vdso/vdso32/system_call.S b/arch/x86/entry/vdso/vdso32/system_call.S
index 8f42b1b9e8dfaef61d55ec19ded56522eb8ea455..3a1d9297074bc5e1d2e559735bb5247acffa6164 100644
--- a/arch/x86/entry/vdso/vdso32/system_call.S
+++ b/arch/x86/entry/vdso/vdso32/system_call.S
@@ -21,35 +21,67 @@ __kernel_vsyscall:
        /*
         * Reshuffle regs so that any of the entry instructions
         * will preserve enough state.
+        *
+        * A really nice entry sequence would be:
+        *  pushl %edx
+        *  pushl %ecx
+        *  movl  %esp, %ecx
+        *
+        * Unfortunately, naughty Android versions between July and December
+        * 2015 actually hardcode the traditional Linux SYSENTER entry
+        * sequence.  That is severely broken for a number of reasons (ask
+        * anyone with an AMD CPU, for example).  Nonetheless, we try to keep
+        * it working approximately as well as it ever worked.
+        *
+        * This link may elucidate some of the history:
+        *   https://android-review.googlesource.com/#/q/Iac3295376d61ef83e713ac9b528f3b50aa780cd7
+        * Personally, I find it hard to understand what's going on there.
+        *
+        * Note to future user developers: DO NOT USE SYSENTER IN YOUR CODE.
+        * Execute an indirect call to the address in the AT_SYSINFO auxv
+        * entry.  That is the ONLY correct way to make a fast 32-bit system
+        * call on Linux.  (Open-coding int $0x80 is also fine, but it's
+        * slow.)
         */
+       pushl   %ecx
+       CFI_ADJUST_CFA_OFFSET   4
+       CFI_REL_OFFSET          ecx, 0
        pushl   %edx
        CFI_ADJUST_CFA_OFFSET   4
        CFI_REL_OFFSET          edx, 0
-       pushl   %ecx
+       pushl   %ebp
        CFI_ADJUST_CFA_OFFSET   4
-       CFI_REL_OFFSET          ecx, 0
-       movl    %esp, %ecx
+       CFI_REL_OFFSET          ebp, 0
+
+       #define SYSENTER_SEQUENCE       "movl %esp, %ebp; sysenter"
+       #define SYSCALL_SEQUENCE        "movl %ecx, %ebp; syscall"
 
 #ifdef CONFIG_X86_64
        /* If SYSENTER (Intel) or SYSCALL32 (AMD) is available, use it. */
-       ALTERNATIVE_2 "", "sysenter", X86_FEATURE_SYSENTER32, \
-                         "syscall",  X86_FEATURE_SYSCALL32
+       ALTERNATIVE_2 "", SYSENTER_SEQUENCE, X86_FEATURE_SYSENTER32, \
+                         SYSCALL_SEQUENCE,  X86_FEATURE_SYSCALL32
 #else
-       ALTERNATIVE "", "sysenter", X86_FEATURE_SEP
+       ALTERNATIVE "", SYSENTER_SEQUENCE, X86_FEATURE_SEP
 #endif
 
        /* Enter using int $0x80 */
-       movl    (%esp), %ecx
        int     $0x80
 GLOBAL(int80_landing_pad)
 
-       /* Restore ECX and EDX in case they were clobbered. */
-       popl    %ecx
-       CFI_RESTORE             ecx
+       /*
+        * Restore EDX and ECX in case they were clobbered.  EBP is not
+        * clobbered (the kernel restores it), but it's cleaner and
+        * probably faster to pop it than to adjust ESP using addl.
+        */
+       popl    %ebp
+       CFI_RESTORE             ebp
        CFI_ADJUST_CFA_OFFSET   -4
        popl    %edx
        CFI_RESTORE             edx
        CFI_ADJUST_CFA_OFFSET   -4
+       popl    %ecx
+       CFI_RESTORE             ecx
+       CFI_ADJUST_CFA_OFFSET   -4
        ret
        CFI_ENDPROC
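
For illustration, here is a minimal userspace sketch of the approach the
vDSO comment above prescribes.  It assumes a 32-bit x86 build (-m32) and
glibc's getauxval(); the fast_syscall1() helper and the __NR_getpid
example are illustrative only, not part of this patch:

	/* Call through the vDSO entry point published in AT_SYSINFO. */
	#include <sys/auxv.h>		/* getauxval(), AT_SYSINFO */
	#include <stdio.h>

	static long fast_syscall1(long nr, long arg1)
	{
		long ret;
		unsigned long vsys = getauxval(AT_SYSINFO);

		/*
		 * eax = syscall number, ebx = first argument.  The vDSO
		 * entry point picks sysenter, syscall, or int $0x80 for
		 * the running CPU, and (after this patch) preserves all
		 * registers other than eax.
		 */
		asm volatile("call *%[vsys]"
			     : "=a" (ret)
			     : "a" (nr), "b" (arg1), [vsys] "rm" (vsys)
			     : "memory", "cc");
		return ret;
	}

	int main(void)
	{
		printf("getpid() = %ld\n",
		       fast_syscall1(20 /* __NR_getpid */, 0));
		return 0;
	}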