Merge remote-tracking branch 'tip/smp/hotplug' into next.2012.09.25b
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index c1961aed12138a39e04ee352d1a070dd238dceb1..9c71c1b18e0359a6e5d36639804abc325146f09f 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -25,6 +25,7 @@
  */
 
 #include <linux/delay.h>
+#include <linux/oom.h>
 #include <linux/smpboot.h>
 
 #define RCU_KTHREAD_PRIO 1
@@ -119,7 +120,7 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed);
  */
 void rcu_force_quiescent_state(void)
 {
-       force_quiescent_state(&rcu_preempt_state, 0);
+       force_quiescent_state(&rcu_preempt_state);
 }
 EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
 
@@ -137,8 +138,6 @@ static void rcu_preempt_qs(int cpu)
 {
        struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
 
-       rdp->passed_quiesce_gpnum = rdp->gpnum;
-       barrier();
        if (rdp->passed_quiesce == 0)
                trace_rcu_grace_period("rcu_preempt", rdp->gpnum, "cpuqs");
        rdp->passed_quiesce = 1;
@@ -423,9 +422,11 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
        unsigned long flags;
        struct task_struct *t;
 
-       if (!rcu_preempt_blocked_readers_cgp(rnp))
-               return;
        raw_spin_lock_irqsave(&rnp->lock, flags);
+       if (!rcu_preempt_blocked_readers_cgp(rnp)) {
+               raw_spin_unlock_irqrestore(&rnp->lock, flags);
+               return;
+       }
        t = list_entry(rnp->gp_tasks,
                       struct task_struct, rcu_node_entry);
        list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry)
@@ -585,17 +586,23 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
                raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */
        }
 
+       rnp->gp_tasks = NULL;
+       rnp->exp_tasks = NULL;
 #ifdef CONFIG_RCU_BOOST
-       /* In case root is being boosted and leaf is not. */
+       rnp->boost_tasks = NULL;
+       /*
+        * In case root is being boosted and leaf was not.  Make sure
+        * that we boost the tasks blocking the current grace period
+        * in this case.
+        */
        raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
        if (rnp_root->boost_tasks != NULL &&
-           rnp_root->boost_tasks != rnp_root->gp_tasks)
+           rnp_root->boost_tasks != rnp_root->gp_tasks &&
+           rnp_root->boost_tasks != rnp_root->exp_tasks)
                rnp_root->boost_tasks = rnp_root->gp_tasks;
        raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */
 #endif /* #ifdef CONFIG_RCU_BOOST */
 
-       rnp->gp_tasks = NULL;
-       rnp->exp_tasks = NULL;
        return retval;
 }
 
@@ -677,7 +684,7 @@ void synchronize_rcu(void)
 EXPORT_SYMBOL_GPL(synchronize_rcu);
 
 static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq);
-static long sync_rcu_preempt_exp_count;
+static unsigned long sync_rcu_preempt_exp_count;
 static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex);
 
 /*
@@ -792,41 +799,55 @@ void synchronize_rcu_expedited(void)
        unsigned long flags;
        struct rcu_node *rnp;
        struct rcu_state *rsp = &rcu_preempt_state;
-       long snap;
+       unsigned long snap;
        int trycount = 0;
 
        smp_mb(); /* Caller's modifications seen first by other CPUs. */
        snap = ACCESS_ONCE(sync_rcu_preempt_exp_count) + 1;
        smp_mb(); /* Above access cannot bleed into critical section. */
 
+       /*
+        * Block CPU-hotplug operations.  This means that any CPU-hotplug
+        * operation that finds an rcu_node structure with tasks in the
+        * process of being boosted will know that all tasks blocking
+        * this expedited grace period will already be in the process of
+        * being boosted.  This simplifies the process of moving tasks
+        * from leaf to root rcu_node structures.
+        */
+       get_online_cpus();
+
        /*
         * Acquire lock, falling back to synchronize_rcu() if too many
         * lock-acquisition failures.  Of course, if someone does the
         * expedited grace period for us, just leave.
         */
        while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) {
+               if (ULONG_CMP_LT(snap,
+                   ACCESS_ONCE(sync_rcu_preempt_exp_count))) {
+                       put_online_cpus();
+                       goto mb_ret; /* Others did our work for us. */
+               }
                if (trycount++ < 10) {
                        udelay(trycount * num_online_cpus());
                } else {
+                       put_online_cpus();
                        synchronize_rcu();
                        return;
                }
-               if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0)
-                       goto mb_ret; /* Others did our work for us. */
        }
-       if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0)
+       if (ULONG_CMP_LT(snap, ACCESS_ONCE(sync_rcu_preempt_exp_count))) {
+               put_online_cpus();
                goto unlock_mb_ret; /* Others did our work for us. */
+       }
 
        /* force all RCU readers onto ->blkd_tasks lists. */
        synchronize_sched_expedited();
 
-       raw_spin_lock_irqsave(&rsp->onofflock, flags);
-
        /* Initialize ->expmask for all non-leaf rcu_node structures. */
        rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) {
-               raw_spin_lock(&rnp->lock); /* irqs already disabled. */
+               raw_spin_lock_irqsave(&rnp->lock, flags);
                rnp->expmask = rnp->qsmaskinit;
-               raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
+               raw_spin_unlock_irqrestore(&rnp->lock, flags);
        }
 
        /* Snapshot current state of ->blkd_tasks lists. */
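
The snapshot test in the hunk above depends on wraparound-safe unsigned comparison: sync_rcu_preempt_exp_count is now unsigned long and is checked with ULONG_CMP_LT() rather than a signed subtraction, so the "others did our work for us" test stays well defined when the counter wraps. A minimal standalone sketch of the idiom; the macro here is written in the style of the kernel's ULONG_CMP_LT(), and the counter values are invented for illustration:

#include <limits.h>
#include <stdio.h>

/* Wraparound-safe "a is before b" test, modeled on ULONG_CMP_LT(). */
#define ULONG_CMP_LT(a, b)	(ULONG_MAX / 2 < (a) - (b))

int main(void)
{
	unsigned long count = ULONG_MAX;	/* counter about to wrap */
	unsigned long snap = count + 1;		/* snapshot wraps around to 0 */

	count += 2;	/* later expedited grace periods advance count past snap */

	/* Prints "yes": count passed snap even though both wrapped. */
	printf("others did our work: %s\n",
	       ULONG_CMP_LT(snap, count) ? "yes" : "no");
	return 0;
}
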
@@ -835,7 +856,7 @@ void synchronize_rcu_expedited(void)
        if (NUM_RCU_NODES > 1)
                sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp));
 
-       raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
+       put_online_cpus();
 
        /* Wait for snapshotted ->blkd_tasks lists to drain. */
        rnp = rcu_get_root(rsp);
@@ -1192,9 +1213,9 @@ static int rcu_boost_kthread(void *arg)
  * kthread to start boosting them.  If there is an expedited grace
  * period in progress, it is always time to boost.
  *
- * The caller must hold rnp->lock, which this function releases,
- * but irqs remain disabled.  The ->boost_kthread_task is immortal,
- * so we don't need to worry about it going away.
+ * The caller must hold rnp->lock, which this function releases.
+ * The ->boost_kthread_task is immortal, so we don't need to worry
+ * about it going away.
  */
 static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
 {
@@ -1814,16 +1835,16 @@ static void rcu_prepare_for_idle(int cpu)
 #ifdef CONFIG_TREE_PREEMPT_RCU
        if (per_cpu(rcu_preempt_data, cpu).nxtlist) {
                rcu_preempt_qs(cpu);
-               force_quiescent_state(&rcu_preempt_state, 0);
+               force_quiescent_state(&rcu_preempt_state);
        }
 #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
        if (per_cpu(rcu_sched_data, cpu).nxtlist) {
                rcu_sched_qs(cpu);
-               force_quiescent_state(&rcu_sched_state, 0);
+               force_quiescent_state(&rcu_sched_state);
        }
        if (per_cpu(rcu_bh_data, cpu).nxtlist) {
                rcu_bh_qs(cpu);
-               force_quiescent_state(&rcu_bh_state, 0);
+               force_quiescent_state(&rcu_bh_state);
        }
 
        /*
@@ -1851,6 +1872,88 @@ static void rcu_idle_count_callbacks_posted(void)
        __this_cpu_add(rcu_dynticks.nonlazy_posted, 1);
 }
 
+/*
+ * Data for flushing lazy RCU callbacks at OOM time.
+ */
+static atomic_t oom_callback_count;
+static DECLARE_WAIT_QUEUE_HEAD(oom_callback_wq);
+
+/*
+ * RCU OOM callback -- decrement the outstanding count and deliver the
+ * wake-up if we are the last one.
+ */
+static void rcu_oom_callback(struct rcu_head *rhp)
+{
+       if (atomic_dec_and_test(&oom_callback_count))
+               wake_up(&oom_callback_wq);
+}
+
+/*
+ * Post an rcu_oom_notify callback on the current CPU if it has at
+ * least one lazy callback.  This will unnecessarily post callbacks
+ * to CPUs that already have a non-lazy callback at the end of their
+ * callback list, but this is an infrequent operation, so accept some
+ * extra overhead to keep things simple.
+ */
+static void rcu_oom_notify_cpu(void *unused)
+{
+       struct rcu_state *rsp;
+       struct rcu_data *rdp;
+
+       for_each_rcu_flavor(rsp) {
+               rdp = __this_cpu_ptr(rsp->rda);
+               if (rdp->qlen_lazy != 0) {
+                       atomic_inc(&oom_callback_count);
+                       rsp->call(&rdp->oom_head, rcu_oom_callback);
+               }
+       }
+}
+
+/*
+ * If low on memory, ensure that each CPU has a non-lazy callback.
+ * This will wake up CPUs that have only lazy callbacks, in turn
+ * ensuring that they free up the corresponding memory in a timely manner.
+ * Because an uncertain amount of memory will be freed in some uncertain
+ * timeframe, we do not claim to have freed anything.
+ */
+static int rcu_oom_notify(struct notifier_block *self,
+                         unsigned long notused, void *nfreed)
+{
+       int cpu;
+
+       /* Wait for callbacks from earlier instance to complete. */
+       wait_event(oom_callback_wq, atomic_read(&oom_callback_count) == 0);
+
+       /*
+        * Prevent premature wakeup: ensure that all increments happen
+        * before there is a chance of the counter reaching zero.
+        */
+       atomic_set(&oom_callback_count, 1);
+
+       get_online_cpus();
+       for_each_online_cpu(cpu) {
+               smp_call_function_single(cpu, rcu_oom_notify_cpu, NULL, 1);
+               cond_resched();
+       }
+       put_online_cpus();
+
+       /* Unconditionally decrement: no need to wake ourselves up. */
+       atomic_dec(&oom_callback_count);
+
+       return NOTIFY_OK;
+}
+
+static struct notifier_block rcu_oom_nb = {
+       .notifier_call = rcu_oom_notify
+};
+
+static int __init rcu_register_oom_notifier(void)
+{
+       register_oom_notifier(&rcu_oom_nb);
+       return 0;
+}
+early_initcall(rcu_register_oom_notifier);
+
 #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
 
 #ifdef CONFIG_RCU_CPU_STALL_INFO
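
The oom_callback_count handling above uses a "bias the count by one" idiom: the counter starts at 1 so it cannot reach zero while callbacks are still being posted, and the poster drops its own reference only after the posting loop. Below is a userspace analogue of that counting pattern using C11 atomics and pthreads; the names and thread structure are invented for illustration and are not the kernel's API:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int callback_count;
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t done = PTHREAD_COND_INITIALIZER;

/* Stand-in for rcu_oom_callback(): last one to finish delivers the wakeup. */
static void *callback(void *unused)
{
	(void)unused;
	if (atomic_fetch_sub(&callback_count, 1) == 1) {
		pthread_mutex_lock(&lock);
		pthread_cond_broadcast(&done);
		pthread_mutex_unlock(&lock);
	}
	return NULL;
}

int main(void)
{
	pthread_t tid[4];
	int i;

	atomic_store(&callback_count, 1);	/* bias: hold our own reference */
	for (i = 0; i < 4; i++) {
		atomic_fetch_add(&callback_count, 1);	/* one per posted callback */
		pthread_create(&tid[i], NULL, callback, NULL);
	}

	/* Drop the bias; only now can the count legitimately reach zero. */
	if (atomic_fetch_sub(&callback_count, 1) == 1)
		printf("all callbacks already finished\n");

	/* Wait for the count to drain, rechecking the predicate under the lock. */
	pthread_mutex_lock(&lock);
	while (atomic_load(&callback_count) != 0)
		pthread_cond_wait(&done, &lock);
	pthread_mutex_unlock(&lock);

	for (i = 0; i < 4; i++)
		pthread_join(tid[i], NULL);
	printf("count drained to zero\n");
	return 0;
}
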
@@ -1861,11 +1964,15 @@ static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
 {
        struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
        struct timer_list *tltp = &rdtp->idle_gp_timer;
+       char c;
 
-       sprintf(cp, "drain=%d %c timer=%lu",
-               rdtp->dyntick_drain,
-               rdtp->dyntick_holdoff == jiffies ? 'H' : '.',
-               timer_pending(tltp) ? tltp->expires - jiffies : -1);
+       c = rdtp->dyntick_holdoff == jiffies ? 'H' : '.';
+       if (timer_pending(tltp))
+               sprintf(cp, "drain=%d %c timer=%lu",
+                       rdtp->dyntick_drain, c, tltp->expires - jiffies);
+       else
+               sprintf(cp, "drain=%d %c timer not pending",
+                       rdtp->dyntick_drain, c);
 }
 
 #else /* #ifdef CONFIG_RCU_FAST_NO_HZ */
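
One effect of the hunk above: when no timer was pending, the old format handed -1 to a %lu conversion, which prints as an enormous unsigned value rather than anything meaningful. A quick standalone illustration (the buffer name is arbitrary):

#include <stdio.h>

int main(void)
{
	char buf[64];

	/* What the old code reported when the timer was not pending: */
	sprintf(buf, "drain=0 . timer=%lu", (unsigned long)-1);
	printf("%s\n", buf);	/* timer=18446744073709551615 on a 64-bit build */
	return 0;
}
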
@@ -1933,11 +2040,10 @@ static void zero_cpu_stall_ticks(struct rcu_data *rdp)
 /* Increment ->ticks_this_gp for all flavors of RCU. */
 static void increment_cpu_stall_ticks(void)
 {
-       __get_cpu_var(rcu_sched_data).ticks_this_gp++;
-       __get_cpu_var(rcu_bh_data).ticks_this_gp++;
-#ifdef CONFIG_TREE_PREEMPT_RCU
-       __get_cpu_var(rcu_preempt_data).ticks_this_gp++;
-#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
+       struct rcu_state *rsp;
+
+       for_each_rcu_flavor(rsp)
+               __this_cpu_ptr(rsp->rda)->ticks_this_gp++;
 }
 
 #else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */