[PATCH] sched: cleanup context switch locking
author Nick Piggin <nickpiggin@yahoo.com.au>
Sat, 25 Jun 2005 21:57:23 +0000 (14:57 -0700)
committer Linus Torvalds <torvalds@ppc970.osdl.org>
Sat, 25 Jun 2005 23:24:43 +0000 (16:24 -0700)
Instead of requiring architecture code to interact with the scheduler's
locking implementation, provide a couple of defines that can be used by the
architecture to request runqueue unlocked context switches, and ask for
interrupts to be enabled over the context switch.

Also replaces the "switch_lock" used by these architectures with an oncpu
flag (note, not a potentially slow bitflag).  This eliminates one bus
locked memory operation when context switching, and simplifies the
task_running function.

Signed-off-by: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
include/asm-arm/system.h
include/asm-ia64/system.h
include/asm-mips/system.h
include/asm-s390/system.h
include/asm-sparc/system.h
include/asm-sparc64/system.h
include/linux/init_task.h
include/linux/sched.h
kernel/sched.c

index 39dd7008013c839a102bc49f53ba4ba5691ab864..3d0d2860b6db103c3969f60d4cf06975304032d8 100644 (file)
@@ -145,34 +145,12 @@ extern unsigned int user_debug;
 #define set_wmb(var, value) do { var = value; wmb(); } while (0)
 #define nop() __asm__ __volatile__("mov\tr0,r0\t@ nop\n\t");
 
-#ifdef CONFIG_SMP
 /*
- * Define our own context switch locking.  This allows us to enable
- * interrupts over the context switch, otherwise we end up with high
- * interrupt latency.  The real problem area is switch_mm() which may
- * do a full cache flush.
+ * switch_mm() may do a full cache flush over the context switch,
+ * so enable interrupts over the context switch to avoid high
+ * latency.
  */
-#define prepare_arch_switch(rq,next)                                   \
-do {                                                                   \
-       spin_lock(&(next)->switch_lock);                                \
-       spin_unlock_irq(&(rq)->lock);                                   \
-} while (0)
-
-#define finish_arch_switch(rq,prev)                                    \
-       spin_unlock(&(prev)->switch_lock)
-
-#define task_running(rq,p)                                             \
-       ((rq)->curr == (p) || spin_is_locked(&(p)->switch_lock))
-#else
-/*
- * Our UP-case is more simple, but we assume knowledge of how
- * spin_unlock_irq() and friends are implemented.  This avoids
- * us needlessly decrementing and incrementing the preempt count.
- */
-#define prepare_arch_switch(rq,next)   local_irq_enable()
-#define finish_arch_switch(rq,prev)    spin_unlock(&(rq)->lock)
-#define task_running(rq,p)             ((rq)->curr == (p))
-#endif
+#define __ARCH_WANT_INTERRUPTS_ON_CTXSW
 
 /*
  * switch_to(prev, next) should switch from task `prev' to `next'
index 6f516e76d1f0cf0947d99fa6d429f01657343791..cd2cf76b2db1d77cb22a69f928f96c4a74d1c59c 100644 (file)
@@ -183,8 +183,6 @@ do {                                                                \
 
 #ifdef __KERNEL__
 
-#define prepare_to_switch()    do { } while(0)
-
 #ifdef CONFIG_IA32_SUPPORT
 # define IS_IA32_PROCESS(regs) (ia64_psr(regs)->is != 0)
 #else
@@ -274,13 +272,7 @@ extern void ia64_load_extra (struct task_struct *task);
  * of that CPU which will not be released, because there we wait for the
  * tasklist_lock to become available.
  */
-#define prepare_arch_switch(rq, next)          \
-do {                                           \
-       spin_lock(&(next)->switch_lock);        \
-       spin_unlock(&(rq)->lock);               \
-} while (0)
-#define finish_arch_switch(rq, prev)   spin_unlock_irq(&(prev)->switch_lock)
-#define task_running(rq, p)            ((rq)->curr == (p) || spin_is_locked(&(p)->switch_lock))
+#define __ARCH_WANT_UNLOCKED_CTXSW
 
 #define ia64_platform_is(x) (strcmp(x, platform_name) == 0)
 
index 888fd8908467210cc9344e3ce3050eabd28ee2de..169f3d4265b14fdeec80c30982572938bab81fc1 100644 (file)
@@ -422,16 +422,10 @@ extern void __die_if_kernel(const char *, struct pt_regs *, const char *file,
 extern int stop_a_enabled;
 
 /*
- * Taken from include/asm-ia64/system.h; prevents deadlock on SMP
+ * See include/asm-ia64/system.h; prevents deadlock on SMP
  * systems.
  */
-#define prepare_arch_switch(rq, next)          \
-do {                                           \
-       spin_lock(&(next)->switch_lock);        \
-       spin_unlock(&(rq)->lock);               \
-} while (0)
-#define finish_arch_switch(rq, prev)   spin_unlock_irq(&(prev)->switch_lock)
-#define task_running(rq, p)            ((rq)->curr == (p) || spin_is_locked(&(p)->switch_lock))
+#define __ARCH_WANT_UNLOCKED_CTXSW
 
 #define arch_align_stack(x) (x)
 
index e3cb3ce1d24ae3d31a136e8d441602a560c78465..b4a9f05a93d6d74317aa1a3446e67088b2c2609f 100644 (file)
@@ -104,29 +104,18 @@ static inline void restore_access_regs(unsigned int *acrs)
        prev = __switch_to(prev,next);                                       \
 } while (0)
 
-#define prepare_arch_switch(rq, next)  do { } while(0)
-#define task_running(rq, p)            ((rq)->curr == (p))
-
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING
 extern void account_user_vtime(struct task_struct *);
 extern void account_system_vtime(struct task_struct *);
-
-#define finish_arch_switch(rq, prev) do {                                   \
-       set_fs(current->thread.mm_segment);                                  \
-       spin_unlock(&(rq)->lock);                                            \
-       account_system_vtime(prev);                                          \
-       local_irq_enable();                                                  \
-} while (0)
-
 #else
+#define account_system_vtime(prev) do { } while (0)
+#endif
 
-#define finish_arch_switch(rq, prev) do {                                   \
+#define finish_arch_switch(prev) do {                                       \
        set_fs(current->thread.mm_segment);                                  \
-       spin_unlock_irq(&(rq)->lock);                                        \
+       account_system_vtime(prev);                                          \
 } while (0)
 
-#endif
-
 #define nop() __asm__ __volatile__ ("nop")
 
 #define xchg(ptr,x) \
index 80cf20cfaee1cdf1d6213fbd5b9a516a73c9a9ff..898562ebe94c6d8e7abfbc75d8d9f22e0ffd8dc1 100644 (file)
@@ -101,7 +101,7 @@ extern void fpsave(unsigned long *fpregs, unsigned long *fsr,
  * SWITCH_ENTER and SWITH_DO_LAZY_FPU do not work yet (e.g. SMP does not work)
  * XXX WTF is the above comment? Found in late teen 2.4.x.
  */
-#define prepare_arch_switch(rq, next) do { \
+#define prepare_arch_switch(next) do { \
        __asm__ __volatile__( \
        ".globl\tflush_patch_switch\nflush_patch_switch:\n\t" \
        "save %sp, -0x40, %sp; save %sp, -0x40, %sp; save %sp, -0x40, %sp\n\t" \
@@ -109,8 +109,6 @@ extern void fpsave(unsigned long *fpregs, unsigned long *fsr,
        "save %sp, -0x40, %sp\n\t" \
        "restore; restore; restore; restore; restore; restore; restore"); \
 } while(0)
-#define finish_arch_switch(rq, next)   spin_unlock_irq(&(rq)->lock)
-#define task_running(rq, p)            ((rq)->curr == (p))
 
        /* Much care has gone into this code, do not touch it.
         *
index fd12ca386f486047b141926aa41890f2d129d095..f9be2c5b4dc97360013fb5ddfe2d54c23aeaad90 100644 (file)
@@ -139,19 +139,13 @@ extern void __flushw_user(void);
 #define flush_user_windows flushw_user
 #define flush_register_windows flushw_all
 
-#define prepare_arch_switch(rq, next)          \
-do {   spin_lock(&(next)->switch_lock);        \
-       spin_unlock(&(rq)->lock);               \
+/* Don't hold the runqueue lock over context switch */
+#define __ARCH_WANT_UNLOCKED_CTXSW
+#define prepare_arch_switch(next)              \
+do {                                           \
        flushw_all();                           \
 } while (0)
 
-#define finish_arch_switch(rq, prev)           \
-do {   spin_unlock_irq(&(prev)->switch_lock);  \
-} while (0)
-
-#define task_running(rq, p) \
-       ((rq)->curr == (p) || spin_is_locked(&(p)->switch_lock))
-
        /* See what happens when you design the chip correctly?
         *
         * We tell gcc we clobber all non-fixed-usage registers except
index a6a8c1a38d5e4278a8472132d70b44a6589ae654..03206a425d7a3c2a187f702de734c98fe66df8b5 100644 (file)
@@ -108,7 +108,6 @@ extern struct group_info init_groups;
        .blocked        = {{0}},                                        \
        .alloc_lock     = SPIN_LOCK_UNLOCKED,                           \
        .proc_lock      = SPIN_LOCK_UNLOCKED,                           \
-       .switch_lock    = SPIN_LOCK_UNLOCKED,                           \
        .journal_info   = NULL,                                         \
        .cpu_timers     = INIT_CPU_TIMERS(tsk.cpu_timers),              \
 }
index 36a10781c3f3b77076df64eb42cfaa12caafbe45..d27be933742510e60669d84fb19a6401862158af 100644 (file)
@@ -368,6 +368,11 @@ struct signal_struct {
 #endif
 };
 
+/* Context switch must be unlocked if interrupts are to be enabled */
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+# define __ARCH_WANT_UNLOCKED_CTXSW
+#endif
+
 /*
  * Bits in flags field of signal_struct.
  */
@@ -594,6 +599,9 @@ struct task_struct {
 
        int lock_depth;         /* BKL lock depth */
 
+#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
+       int oncpu;
+#endif
        int prio, static_prio;
        struct list_head run_list;
        prio_array_t *array;
@@ -716,8 +724,6 @@ struct task_struct {
        spinlock_t alloc_lock;
 /* Protection of proc_dentry: nesting proc_lock, dcache_lock, write_lock_irq(&tasklist_lock); */
        spinlock_t proc_lock;
-/* context-switch lock */
-       spinlock_t switch_lock;
 
 /* journalling filesystem info */
        void *journal_info;
index 98bf1c091da59f0f32c9786e68f6afe88acf132f..b1410577f9a8a46903a5da1a04f34de6d16c24bc 100644 (file)
@@ -268,14 +268,71 @@ static DEFINE_PER_CPU(struct runqueue, runqueues);
 #define task_rq(p)             cpu_rq(task_cpu(p))
 #define cpu_curr(cpu)          (cpu_rq(cpu)->curr)
 
-/*
- * Default context-switch locking:
- */
 #ifndef prepare_arch_switch
-# define prepare_arch_switch(rq, next) do { } while (0)
-# define finish_arch_switch(rq, next)  spin_unlock_irq(&(rq)->lock)
-# define task_running(rq, p)           ((rq)->curr == (p))
+# define prepare_arch_switch(next)     do { } while (0)
+#endif
+#ifndef finish_arch_switch
+# define finish_arch_switch(prev)      do { } while (0)
+#endif
+
+#ifndef __ARCH_WANT_UNLOCKED_CTXSW
+static inline int task_running(runqueue_t *rq, task_t *p)
+{
+       return rq->curr == p;
+}
+
+static inline void prepare_lock_switch(runqueue_t *rq, task_t *next)
+{
+}
+
+static inline void finish_lock_switch(runqueue_t *rq, task_t *prev)
+{
+       spin_unlock_irq(&rq->lock);
+}
+
+#else /* __ARCH_WANT_UNLOCKED_CTXSW */
+static inline int task_running(runqueue_t *rq, task_t *p)
+{
+#ifdef CONFIG_SMP
+       return p->oncpu;
+#else
+       return rq->curr == p;
+#endif
+}
+
+static inline void prepare_lock_switch(runqueue_t *rq, task_t *next)
+{
+#ifdef CONFIG_SMP
+       /*
+        * We can optimise this out completely for !SMP, because the
+        * SMP rebalancing from interrupt is the only thing that cares
+        * here.
+        */
+       next->oncpu = 1;
+#endif
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+       spin_unlock_irq(&rq->lock);
+#else
+       spin_unlock(&rq->lock);
 #endif
+}
+
+static inline void finish_lock_switch(runqueue_t *rq, task_t *prev)
+{
+#ifdef CONFIG_SMP
+       /*
+        * After ->oncpu is cleared, the task can be moved to a different CPU.
+        * We must ensure this doesn't happen until the switch is completely
+        * finished.
+        */
+       smp_wmb();
+       prev->oncpu = 0;
+#endif
+#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+       local_irq_enable();
+#endif
+}
+#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
 
 /*
  * task_rq_lock - lock the runqueue a given task resides on and disable
@@ -1196,17 +1253,14 @@ void fastcall sched_fork(task_t *p)
        p->state = TASK_RUNNING;
        INIT_LIST_HEAD(&p->run_list);
        p->array = NULL;
-       spin_lock_init(&p->switch_lock);
 #ifdef CONFIG_SCHEDSTATS
        memset(&p->sched_info, 0, sizeof(p->sched_info));
 #endif
+#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
+       p->oncpu = 0;
+#endif
 #ifdef CONFIG_PREEMPT
-       /*
-        * During context-switch we hold precisely one spinlock, which
-        * schedule_tail drops. (in the common case it's this_rq()->lock,
-        * but it also can be p->switch_lock.) So we compensate with a count
-        * of 1. Also, we want to start with kernel preemption disabled.
-        */
+       /* Want to start with kernel preemption disabled. */
        p->thread_info->preempt_count = 1;
 #endif
        /*
@@ -1387,23 +1441,41 @@ void fastcall sched_exit(task_t * p)
        task_rq_unlock(rq, &flags);
 }
 
+/**
+ * prepare_task_switch - prepare to switch tasks
+ * @rq: the runqueue preparing to switch
+ * @next: the task we are going to switch to.
+ *
+ * This is called with the rq lock held and interrupts off. It must
+ * be paired with a subsequent finish_task_switch after the context
+ * switch.
+ *
+ * prepare_task_switch sets up locking and calls architecture specific
+ * hooks.
+ */
+static inline void prepare_task_switch(runqueue_t *rq, task_t *next)
+{
+       prepare_lock_switch(rq, next);
+       prepare_arch_switch(next);
+}
+
 /**
  * finish_task_switch - clean up after a task-switch
  * @prev: the thread we just switched away from.
  *
- * We enter this with the runqueue still locked, and finish_arch_switch()
- * will unlock it along with doing any other architecture-specific cleanup
- * actions.
+ * finish_task_switch must be called after the context switch, paired
+ * with a prepare_task_switch call before the context switch.
+ * finish_task_switch will reconcile locking set up by prepare_task_switch,
+ * and do any other architecture-specific cleanup actions.
  *
  * Note that we may have delayed dropping an mm in context_switch(). If
  * so, we finish that here outside of the runqueue lock.  (Doing it
  * with the lock held can cause deadlocks; see schedule() for
  * details.)
  */
-static inline void finish_task_switch(task_t *prev)
+static inline void finish_task_switch(runqueue_t *rq, task_t *prev)
        __releases(rq->lock)
 {
-       runqueue_t *rq = this_rq();
        struct mm_struct *mm = rq->prev_mm;
        unsigned long prev_task_flags;
 
@@ -1421,7 +1493,8 @@ static inline void finish_task_switch(task_t *prev)
         *              Manfred Spraul <manfred@colorfullife.com>
         */
        prev_task_flags = prev->flags;
-       finish_arch_switch(rq, prev);
+       finish_arch_switch(prev);
+       finish_lock_switch(rq, prev);
        if (mm)
                mmdrop(mm);
        if (unlikely(prev_task_flags & PF_DEAD))
@@ -1435,8 +1508,12 @@ static inline void finish_task_switch(task_t *prev)
 asmlinkage void schedule_tail(task_t *prev)
        __releases(rq->lock)
 {
-       finish_task_switch(prev);
-
+       runqueue_t *rq = this_rq();
+       finish_task_switch(rq, prev);
+#ifdef __ARCH_WANT_UNLOCKED_CTXSW
+       /* In this case, finish_task_switch does not reenable preemption */
+       preempt_enable();
+#endif
        if (current->set_child_tid)
                put_user(current->pid, current->set_child_tid);
 }
@@ -2816,11 +2893,15 @@ switch_tasks:
                rq->curr = next;
                ++*switch_count;
 
-               prepare_arch_switch(rq, next);
+               prepare_task_switch(rq, next);
                prev = context_switch(rq, prev, next);
                barrier();
-
-               finish_task_switch(prev);
+               /*
+                * this_rq must be evaluated again because prev may have moved
+                * CPUs since it called schedule(), thus the 'rq' on its stack
+                * frame will be invalid.
+                */
+               finish_task_switch(this_rq(), prev);
        } else
                spin_unlock_irq(&rq->lock);
 
@@ -4085,6 +4166,9 @@ void __devinit init_idle(task_t *idle, int cpu)
 
        spin_lock_irqsave(&rq->lock, flags);
        rq->curr = rq->idle = idle;
+#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
+       idle->oncpu = 1;
+#endif
        set_tsk_need_resched(idle);
        spin_unlock_irqrestore(&rq->lock, flags);