x86/entry/64: Refactor IRQ stacks and make them NMI-safe
authorAndy Lutomirski <luto@kernel.org>
Tue, 11 Jul 2017 15:33:38 +0000 (10:33 -0500)
committerIngo Molnar <mingo@kernel.org>
Tue, 18 Jul 2017 08:56:22 +0000 (10:56 +0200)
This will allow IRQ stacks to nest inside NMIs or similar entries
that can happen during IRQ stack setup or teardown.

The new macros won't work correctly if they're invoked with IRQs on.
Add a check under CONFIG_DEBUG_ENTRY to detect that.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
[ Use %r10 instead of %r11 in xen_do_hypervisor_callback to make objtool
  and ORC unwinder's lives a little easier. ]
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Jiri Slaby <jslaby@suse.cz>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: live-patching@vger.kernel.org
Link: http://lkml.kernel.org/r/b0b2ff5fb97d2da2e1d7e1f380190c92545c8bb5.1499786555.git.jpoimboe@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
arch/x86/Kconfig.debug
arch/x86/entry/entry_64.S
arch/x86/kernel/process_64.c

index fcb7604172cecb9879652a2b3f768bb9117fdf1a..353ed09f5fba7779fa038d055e6f20346e06c795 100644 (file)
@@ -305,8 +305,6 @@ config DEBUG_ENTRY
          Some of these sanity checks may slow down kernel entries and
          exits or otherwise impact performance.
 
-         This is currently used to help test NMI code.
-
          If unsure, say N.
 
 config DEBUG_NMI_SELFTEST
index a9a8027a6c0eaa748541282448f41b9eec834d6c..0d4483ae6360e69e0e9552b58bc1aeaa68ed677a 100644 (file)
@@ -447,6 +447,59 @@ ENTRY(irq_entries_start)
     .endr
 END(irq_entries_start)
 
+.macro DEBUG_ENTRY_ASSERT_IRQS_OFF
+#ifdef CONFIG_DEBUG_ENTRY
+       pushfq
+       testl $X86_EFLAGS_IF, (%rsp)
+       jz .Lokay_\@
+       ud2
+.Lokay_\@:
+       addq $8, %rsp
+#endif
+.endm
+
+/*
+ * Enters the IRQ stack if we're not already using it.  NMI-safe.  Clobbers
+ * flags and puts old RSP into old_rsp, and leaves all other GPRs alone.
+ * Requires kernel GSBASE.
+ *
+ * The invariant is that, if irq_count != -1, then the IRQ stack is in use.
+ */
+.macro ENTER_IRQ_STACK old_rsp
+       DEBUG_ENTRY_ASSERT_IRQS_OFF
+       movq    %rsp, \old_rsp
+       incl    PER_CPU_VAR(irq_count)
+
+       /*
+        * Right now, if we just incremented irq_count to zero, we've
+        * claimed the IRQ stack but we haven't switched to it yet.
+        *
+        * If anything is added that can interrupt us here without using IST,
+        * it must be *extremely* careful to limit its stack usage.  This
+        * could include kprobes and a hypothetical future IST-less #DB
+        * handler.
+        */
+
+       cmovzq  PER_CPU_VAR(irq_stack_ptr), %rsp
+       pushq   \old_rsp
+.endm
+
+/*
+ * Undoes ENTER_IRQ_STACK.
+ */
+.macro LEAVE_IRQ_STACK
+       DEBUG_ENTRY_ASSERT_IRQS_OFF
+       /* We need to be off the IRQ stack before decrementing irq_count. */
+       popq    %rsp
+
+       /*
+        * As in ENTER_IRQ_STACK, irq_count == 0, we are still claiming
+        * the irq stack but we're not on it.
+        */
+
+       decl    PER_CPU_VAR(irq_count)
+.endm
+
 /*
  * Interrupt entry/exit.
  *
@@ -485,17 +538,7 @@ END(irq_entries_start)
        CALL_enter_from_user_mode
 
 1:
-       /*
-        * Save previous stack pointer, optionally switch to interrupt stack.
-        * irq_count is used to check if a CPU is already on an interrupt stack
-        * or not. While this is essentially redundant with preempt_count it is
-        * a little cheaper to use a separate counter in the PDA (short of
-        * moving irq_enter into assembly, which would be too much work)
-        */
-       movq    %rsp, %rdi
-       incl    PER_CPU_VAR(irq_count)
-       cmovzq  PER_CPU_VAR(irq_stack_ptr), %rsp
-       pushq   %rdi
+       ENTER_IRQ_STACK old_rsp=%rdi
        /* We entered an interrupt context - irqs are off: */
        TRACE_IRQS_OFF
 
@@ -515,10 +558,8 @@ common_interrupt:
 ret_from_intr:
        DISABLE_INTERRUPTS(CLBR_ANY)
        TRACE_IRQS_OFF
-       decl    PER_CPU_VAR(irq_count)
 
-       /* Restore saved previous stack */
-       popq    %rsp
+       LEAVE_IRQ_STACK
 
        testb   $3, CS(%rsp)
        jz      retint_kernel
@@ -891,12 +932,10 @@ bad_gs:
 ENTRY(do_softirq_own_stack)
        pushq   %rbp
        mov     %rsp, %rbp
-       incl    PER_CPU_VAR(irq_count)
-       cmove   PER_CPU_VAR(irq_stack_ptr), %rsp
-       push    %rbp                            /* frame pointer backlink */
+       ENTER_IRQ_STACK old_rsp=%r11
        call    __do_softirq
+       LEAVE_IRQ_STACK
        leaveq
-       decl    PER_CPU_VAR(irq_count)
        ret
 END(do_softirq_own_stack)
 
@@ -923,13 +962,11 @@ ENTRY(xen_do_hypervisor_callback)         /* do_hypervisor_callback(struct *pt_regs) */
  * see the correct pointer to the pt_regs
  */
        movq    %rdi, %rsp                      /* we don't return, adjust the stack frame */
-11:    incl    PER_CPU_VAR(irq_count)
-       movq    %rsp, %rbp
-       cmovzq  PER_CPU_VAR(irq_stack_ptr), %rsp
-       pushq   %rbp                            /* frame pointer backlink */
+
+       ENTER_IRQ_STACK old_rsp=%r10
        call    xen_evtchn_do_upcall
-       popq    %rsp
-       decl    PER_CPU_VAR(irq_count)
+       LEAVE_IRQ_STACK
+
 #ifndef CONFIG_PREEMPT
        call    xen_maybe_preempt_hcall
 #endif
index c3169be4c5967de78efd75690a2bce80ca01506d..2987e3991c2b3fb28f9cdc5e082de33970d1ba32 100644 (file)
@@ -279,6 +279,9 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
        struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
        unsigned prev_fsindex, prev_gsindex;
 
+       WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
+                    this_cpu_read(irq_count) != -1);
+
        switch_fpu_prepare(prev_fpu, cpu);
 
        /* We must save %fs and %gs before load_TLS() because