workqueue: reimplement CPU online rebinding to handle idle workers

author Tejun Heo <tj@kernel.org>

Tue, 17 Jul 2012 19:39:27 +0000 (12:39 -0700)

committer Tejun Heo <tj@kernel.org>

Tue, 17 Jul 2012 19:39:27 +0000 (12:39 -0700)
author Tejun Heo <tj@kernel.org>
Tue, 17 Jul 2012 19:39:27 +0000 (12:39 -0700)
committer Tejun Heo <tj@kernel.org>
Tue, 17 Jul 2012 19:39:27 +0000 (12:39 -0700)
diff --git a/kernel/workqueue.c b/kernel/workqueue.c

index e1d05e51a80adcec89f51c6f68f7823e1a7fdd29..6927fecae412327863d4f02530edfe377a3e1b39 100644 (file)
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -133,6 +133,7 @@ enum {
  
  struct global_cwq;
  struct worker_pool;
+struct idle_rebind;
  
  /*
   * The poor guys doing the actual heavy lifting.  All on-duty workers
@@ -154,7 +155,10 @@ struct worker {
         unsigned long           last_active;    /* L: last active timestamp */
         unsigned int            flags;          /* X: flags */
         int                     id;             /* I: worker id */
-       struct work_struct      rebind_work;    /* L: rebind worker to cpu */
+
+       /* for rebinding worker to CPU */
+       struct idle_rebind      *idle_rebind;   /* L: for idle worker */
+       struct work_struct      rebind_work;    /* L: for busy worker */
  };
  
  struct worker_pool {
@@ -190,6 +194,8 @@ struct global_cwq {
  
         struct worker_pool      pools[2];       /* normal and highpri pools */
  
+       wait_queue_head_t       rebind_hold;    /* rebind hold wait */
+
         struct task_struct      *trustee;       /* L: for gcwq shutdown */
         unsigned int            trustee_state;  /* L: trustee state */
         wait_queue_head_t       trustee_wait;   /* trustee wait */
@@ -1314,13 +1320,37 @@ __acquires(&gcwq->lock)
         }
  }
  
+struct idle_rebind {
+       int                     cnt;            /* # idle workers still to rebind */
+       struct completion       done;           /* completed by last rebinder */
+};
+
+/*
+ * Rebind an idle @worker to its CPU.  During CPU onlining, this has to
+ * happen synchronously for idle workers.  worker_thread() calls this on
+ * seeing %WORKER_REBIND and stays here until rebind_workers() clears it.
+ */
+static void idle_worker_rebind(struct worker *worker)
+{
+       struct global_cwq *gcwq = worker->pool->gcwq;
+
+       /* CPU must be online at this point */
+       WARN_ON(!worker_maybe_bind_and_lock(worker));
+       if (!--worker->idle_rebind->cnt)        /* last one signals the waiter */
+               complete(&worker->idle_rebind->done);
+       spin_unlock_irq(&worker->pool->gcwq->lock);
+
+       /* we did our part, wait for rebind_workers() to finish up */
+       wait_event(gcwq->rebind_hold, !(worker->flags & WORKER_REBIND));
+}
+
  /*
- * Function for worker->rebind_work used to rebind unbound busy workers to
+ * Function for @worker->rebind_work used to rebind unbound busy workers to
   * the associated cpu which is coming back online.  This is scheduled by
   * cpu up but can race with other cpu hotplug operations and may be
   * executed twice without intervening cpu down.
   */
-static void worker_rebind_fn(struct work_struct *work)
+static void busy_worker_rebind_fn(struct work_struct *work)
  {
         struct worker *worker = container_of(work, struct worker, rebind_work);
         struct global_cwq *gcwq = worker->pool->gcwq;
@@ -1331,6 +1361,112 @@ static void worker_rebind_fn(struct work_struct *work)
         spin_unlock_irq(&gcwq->lock);
  }
  
+/**
+ * rebind_workers - rebind all workers of a gcwq to the associated CPU
+ * @gcwq: gcwq of interest
+ *
+ * @gcwq->cpu is coming online.  Rebind all workers to the CPU.  Rebinding
+ * is different for idle and busy ones.
+ *
+ * The idle ones should be rebound synchronously and idle rebinding should
+ * be complete before any worker starts executing work items with
+ * concurrency management enabled; otherwise, scheduler may oops trying to
+ * wake up non-local idle worker from wq_worker_sleeping().
+ *
+ * This is achieved by repeatedly requesting rebinding until all idle
+ * workers are known to have been rebound under @gcwq->lock and holding all
+ * idle workers from becoming busy until idle rebinding is complete.
+ *
+ * Once idle workers are rebound, busy workers can be rebound as they
+ * finish executing their current work items.  Queueing the rebind work at
+ * the head of their scheduled lists is enough.  Note that nr_running will
+ * be properly bumped as busy workers rebind.
+ *
+ * On return, all workers are guaranteed to either be bound or have rebind
+ * work item scheduled.
+ */
+static void rebind_workers(struct global_cwq *gcwq)
+       __releases(&gcwq->lock) __acquires(&gcwq->lock)
+{
+       struct idle_rebind idle_rebind;
+       struct worker_pool *pool;
+       struct worker *worker;
+       struct hlist_node *pos;
+       int i;
+
+       lockdep_assert_held(&gcwq->lock);
+
+       for_each_worker_pool(pool, gcwq)
+               lockdep_assert_held(&pool->manager_mutex);
+
+       /*
+        * Rebind idle workers.  Interlocked both ways.  We wait for
+        * workers to rebind via @idle_rebind.done.  Workers will wait for
+        * us to finish up by watching %WORKER_REBIND.
+        */
+       init_completion(&idle_rebind.done);
+retry:
+       idle_rebind.cnt = 1;    /* our own count, dropped after the kick loop */
+       INIT_COMPLETION(idle_rebind.done);
+
+       /* set REBIND and kick idle ones, we'll wait for these later */
+       for_each_worker_pool(pool, gcwq) {
+               list_for_each_entry(worker, &pool->idle_list, entry) {
+                       if (worker->flags & WORKER_REBIND)
+                               continue;
+
+                       /* morph UNBOUND to REBIND */
+                       worker->flags &= ~WORKER_UNBOUND;
+                       worker->flags |= WORKER_REBIND;
+
+                       idle_rebind.cnt++;
+                       worker->idle_rebind = &idle_rebind;
+
+                       /* worker_thread() will call idle_worker_rebind() */
+                       wake_up_process(worker->task);
+               }
+       }
+
+       if (--idle_rebind.cnt) {
+               spin_unlock_irq(&gcwq->lock);
+               wait_for_completion(&idle_rebind.done);
+               spin_lock_irq(&gcwq->lock);
+               /* busy ones might have become idle while waiting, retry */
+               goto retry;
+       }
+
+       /*
+        * All idle workers are rebound and waiting for %WORKER_REBIND to
+        * be cleared inside idle_worker_rebind().  Clear and release.
+        * Clearing %WORKER_REBIND from this foreign context is safe
+        * because these workers are still guaranteed to be idle.
+        */
+       for_each_worker_pool(pool, gcwq)
+               list_for_each_entry(worker, &pool->idle_list, entry)
+                       worker->flags &= ~WORKER_REBIND;
+
+       wake_up_all(&gcwq->rebind_hold);
+
+       /* rebind busy workers */
+       for_each_busy_worker(worker, i, pos, gcwq) {
+               struct work_struct *rebind_work = &worker->rebind_work;
+
+               /* morph UNBOUND to REBIND */
+               worker->flags &= ~WORKER_UNBOUND;
+               worker->flags |= WORKER_REBIND;
+
+               if (test_and_set_bit(WORK_STRUCT_PENDING_BIT,
+                                    work_data_bits(rebind_work)))
+                       continue;
+
+               /* wq doesn't matter, use the default one */
+               debug_work_activate(rebind_work);
+               insert_work(get_cwq(gcwq->cpu, system_wq), rebind_work,
+                           worker->scheduled.next,
+                           work_color_to_flags(WORK_NO_COLOR));
+       }
+}
+
  static struct worker *alloc_worker(void)
  {
         struct worker *worker;
@@ -1339,7 +1475,7 @@ static struct worker *alloc_worker(void)
         if (worker) {
                 INIT_LIST_HEAD(&worker->entry);
                 INIT_LIST_HEAD(&worker->scheduled);
-               INIT_WORK(&worker->rebind_work, worker_rebind_fn);
+               INIT_WORK(&worker->rebind_work, busy_worker_rebind_fn);
                 /* on creation a worker is in !idle && prep state */
                 worker->flags = WORKER_PREP;
         }
@@ -1829,6 +1965,9 @@ __acquires(&gcwq->lock)
  
         lockdep_copy_map(&lockdep_map, &work->lockdep_map);
  #endif
+       WARN_ON_ONCE(!(worker->flags & (WORKER_UNBOUND | WORKER_REBIND)) &&
+                    raw_smp_processor_id() != gcwq->cpu);
+
         /*
          * A single work shouldn't be executed concurrently by
          * multiple workers on a single cpu.  Check whether anyone is
@@ -1946,11 +2085,20 @@ static int worker_thread(void *__worker)
  woke_up:
         spin_lock_irq(&gcwq->lock);
  
-       /* DIE can be set only while we're idle, checking here is enough */
-       if (worker->flags & WORKER_DIE) {
+       /*
+        * DIE can be set only while idle and REBIND set while busy has
+        * @worker->rebind_work scheduled.  Checking here is enough.
+        */
+       if (unlikely(worker->flags & (WORKER_REBIND | WORKER_DIE))) {
                 spin_unlock_irq(&gcwq->lock);
-               worker->task->flags &= ~PF_WQ_WORKER;
-               return 0;
+
+               if (worker->flags & WORKER_DIE) {
+                       worker->task->flags &= ~PF_WQ_WORKER;
+                       return 0;
+               }
+
+               idle_worker_rebind(worker);
+               goto woke_up;
         }
  
         worker_leave_idle(worker);
@@ -3468,42 +3616,6 @@ static int __cpuinit trustee_thread(void *__gcwq)
                 }
         } while (i && rc >= 0);
  
-       /*
-        * At this point, either draining has completed and no worker
-        * is left, or cpu down has been canceled or the cpu is being
-        * brought back up.  There shouldn't be any idle one left.
-        * Tell the remaining busy ones to rebind once it finishes the
-        * currently scheduled works by scheduling the rebind_work.
-        */
-       for_each_worker_pool(pool, gcwq)
-               WARN_ON(!list_empty(&pool->idle_list));
-
-       /* if we're reassociating, clear DISASSOCIATED */
-       if (gcwq->trustee_state == TRUSTEE_RELEASE)
-               gcwq->flags &= ~GCWQ_DISASSOCIATED;
-
-       for_each_busy_worker(worker, i, pos, gcwq) {
-               struct work_struct *rebind_work = &worker->rebind_work;
-
-               /*
-                * Rebind_work may race with future cpu hotplug
-                * operations.  Use a separate flag to mark that
-                * rebinding is scheduled.
-                */
-               worker->flags |= WORKER_REBIND;
-               worker->flags &= ~WORKER_UNBOUND;
-
-               /* queue rebind_work, wq doesn't matter, use the default one */
-               if (test_and_set_bit(WORK_STRUCT_PENDING_BIT,
-                                    work_data_bits(rebind_work)))
-                       continue;
-
-               debug_work_activate(rebind_work);
-               insert_work(get_cwq(gcwq->cpu, system_wq), rebind_work,
-                           worker->scheduled.next,
-                           work_color_to_flags(WORK_NO_COLOR));
-       }
-
         gcwq_release_management(gcwq);
  
         /* notify completion */
@@ -3609,13 +3721,16 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
                         wait_trustee_state(gcwq, TRUSTEE_DONE);
                 }
  
-               /*
-                * Either DISASSOCIATED is already cleared or no worker is
-                * left on the gcwq.  Safe to clear DISASSOCIATED without
-                * claiming managers.
-                */
+               spin_unlock_irq(&gcwq->lock);
+               gcwq_claim_management(gcwq);
+               spin_lock_irq(&gcwq->lock);
+
                 gcwq->flags &= ~GCWQ_DISASSOCIATED;
  
+               rebind_workers(gcwq);
+
+               gcwq_release_management(gcwq);
+
                 /*
                  * Trustee is done and there might be no worker left.
                  * Put the first_idle in and request a real manager to
@@ -3910,6 +4025,8 @@ static int __init init_workqueues(void)
                         ida_init(&pool->worker_ida);
                 }
  
+               init_waitqueue_head(&gcwq->rebind_hold);
+
                 gcwq->trustee_state = TRUSTEE_DONE;
                 init_waitqueue_head(&gcwq->trustee_wait);
         }
author	Tejun Heo <tj@kernel.org>
	Tue, 17 Jul 2012 19:39:27 +0000 (12:39 -0700)
committer	Tejun Heo <tj@kernel.org>
	Tue, 17 Jul 2012 19:39:27 +0000 (12:39 -0700)