rcu: Add call_rcu_tasks()
author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Fri, 27 Jun 2014 20:42:20 +0000 (13:42 -0700)
committer: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Sun, 7 Sep 2014 23:27:19 +0000 (16:27 -0700)
This commit adds a new RCU-tasks flavor of RCU, which provides
call_rcu_tasks().  This RCU flavor's quiescent states are voluntary
context switch (not preemption!) and userspace execution (not the idle
loop -- use some sort of schedule_on_each_cpu() if you need to handle the
idle tasks).  Note that unlike other RCU flavors, these quiescent states
occur in tasks, not necessarily CPUs.  Includes fixes from Steven Rostedt.

This RCU flavor is assumed to have very infrequent latency-tolerant
updaters.  This assumption permits significant simplifications, including
a single global callback list protected by a single global lock, along
with a single task-private linked list containing all tasks that have not
yet passed through a quiescent state.  If experience shows this assumption
to be incorrect, the required additional complexity will be added.

Suggested-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
include/linux/init_task.h
include/linux/rcupdate.h
include/linux/sched.h
init/Kconfig
kernel/rcu/tiny.c
kernel/rcu/tree.c
kernel/rcu/update.c

index 2bb4c4f3531ab4b724a1050bf31df9640386daaa..dffd9258ee60b61c25351c6b1d2111fe63c81d09 100644 (file)
@@ -117,6 +117,14 @@ extern struct group_info init_groups;
 #else
 #define INIT_TASK_RCU_PREEMPT(tsk)
 #endif
+#ifdef CONFIG_TASKS_RCU
+#define INIT_TASK_RCU_TASKS(tsk)                                       \
+       .rcu_tasks_holdout = false,                                     \
+       .rcu_tasks_holdout_list =                                       \
+               LIST_HEAD_INIT(tsk.rcu_tasks_holdout_list),
+#else /* #ifdef CONFIG_TASKS_RCU */
+#define INIT_TASK_RCU_TASKS(tsk)
+#endif /* #else #ifdef CONFIG_TASKS_RCU */
 
 extern struct cred init_cred;
 
@@ -224,6 +232,7 @@ extern struct task_group root_task_group;
        INIT_FTRACE_GRAPH                                               \
        INIT_TRACE_RECURSION                                            \
        INIT_TASK_RCU_PREEMPT(tsk)                                      \
+       INIT_TASK_RCU_TASKS(tsk)                                        \
        INIT_CPUSET_SEQ(tsk)                                            \
        INIT_RT_MUTEXES(tsk)                                            \
        INIT_VTIME(tsk)                                                 \
index d231aa17b1d7490092b1994facbe21c496bead66..3432063f4c873660dcfd1402e4ef1dab94ca680c 100644 (file)
@@ -197,6 +197,26 @@ void call_rcu_sched(struct rcu_head *head,
 
 void synchronize_sched(void);
 
+/**
+ * call_rcu_tasks() - Queue a callback to run after a task-based grace period
+ * @head: structure to be used for queueing the RCU updates.
+ * @func: actual callback function to be invoked after the grace period
+ *
+ * The callback function will be invoked some time after a full grace
+ * period elapses, in other words after all currently executing RCU
+ * read-side critical sections have completed. call_rcu_tasks() assumes
+ * that the read-side critical sections end at a voluntary context
+ * switch (not a preemption!), entry into idle, or transition to usermode
+ * execution.  As such, there are no read-side primitives analogous to
+ * rcu_read_lock() and rcu_read_unlock() because this primitive is intended
+ * to determine that all tasks have passed through a safe state, not so
+ * much for data-structure synchronization.
+ *
+ * See the description of call_rcu() for more detailed information on
+ * memory ordering guarantees.
+ */
+void call_rcu_tasks(struct rcu_head *head, void (*func)(struct rcu_head *head));
+
 #ifdef CONFIG_PREEMPT_RCU
 
 void __rcu_read_lock(void);
@@ -294,6 +314,22 @@ static inline void rcu_user_hooks_switch(struct task_struct *prev,
                rcu_irq_exit(); \
        } while (0)
 
+/*
+ * Note a voluntary context switch for RCU-tasks benefit.  This is a
+ * macro rather than an inline function to avoid #include hell.
+ *
+ * With CONFIG_TASKS_RCU, this clears (t)->rcu_tasks_holdout if it is
+ * currently set, testing first so that the flag is stored to only when
+ * it actually changes.  The test and the store run with preemption
+ * disabled.  Because this is a macro, the argument may be evaluated
+ * more than once and must therefore be free of side effects.
+ *
+ * Without CONFIG_TASKS_RCU, this expands to an empty statement and the
+ * argument is not evaluated at all.
+ */
+#ifdef CONFIG_TASKS_RCU
+#define rcu_note_voluntary_context_switch(t) \
+       do { \
+               preempt_disable(); /* Exclude synchronize_sched(); */ \
+               if (ACCESS_ONCE((t)->rcu_tasks_holdout)) \
+                       ACCESS_ONCE((t)->rcu_tasks_holdout) = false; \
+               preempt_enable(); \
+       } while (0)
+#else /* #ifdef CONFIG_TASKS_RCU */
+#define rcu_note_voluntary_context_switch(t)   do { } while (0)
+#endif /* #else #ifdef CONFIG_TASKS_RCU */
+
 #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) || defined(CONFIG_SMP)
 bool __rcu_is_watching(void);
 #endif /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) || defined(CONFIG_SMP) */
index 5c2c885ee52b3996a2665dc3d8c0e21ff9245aaf..eaacac4ae77d58eb3e6f5c237d0b3e1bd1e8f962 100644 (file)
@@ -1270,6 +1270,11 @@ struct task_struct {
 #ifdef CONFIG_TREE_PREEMPT_RCU
        struct rcu_node *rcu_blocked_node;
 #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
+#ifdef CONFIG_TASKS_RCU
+       unsigned long rcu_tasks_nvcsw;
+       bool rcu_tasks_holdout;
+       struct list_head rcu_tasks_holdout_list;
+#endif /* #ifdef CONFIG_TASKS_RCU */
 
 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
        struct sched_info sched_info;
@@ -2000,28 +2005,24 @@ extern void task_clear_jobctl_pending(struct task_struct *task,
                                      unsigned int mask);
 
 #ifdef CONFIG_PREEMPT_RCU
-
 #define RCU_READ_UNLOCK_BLOCKED (1 << 0) /* blocked while in RCU read-side. */
 #define RCU_READ_UNLOCK_NEED_QS (1 << 1) /* RCU core needs CPU response. */
+#endif /* #ifdef CONFIG_PREEMPT_RCU */
 
 static inline void rcu_copy_process(struct task_struct *p)
 {
+#ifdef CONFIG_PREEMPT_RCU
        p->rcu_read_lock_nesting = 0;
        p->rcu_read_unlock_special = 0;
-#ifdef CONFIG_TREE_PREEMPT_RCU
        p->rcu_blocked_node = NULL;
-#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
        INIT_LIST_HEAD(&p->rcu_node_entry);
+#endif /* #ifdef CONFIG_PREEMPT_RCU */
+#ifdef CONFIG_TASKS_RCU
+       p->rcu_tasks_holdout = false; /* p starts out as a non-holdout. */
+       INIT_LIST_HEAD(&p->rcu_tasks_holdout_list); /* Self-linked, on no list. */
+#endif /* #ifdef CONFIG_TASKS_RCU */
 }
 
-#else
-
-static inline void rcu_copy_process(struct task_struct *p)
-{
-}
-
-#endif
-
 static inline void tsk_restore_flags(struct task_struct *task,
                                unsigned long orig_flags, unsigned long flags)
 {
index e84c6423a2e5a2dbe80157b13f8b16d17c3e2d06..c4539c4e177f4c3fdadae755cc6a2b2930485cff 100644 (file)
@@ -507,6 +507,16 @@ config PREEMPT_RCU
          This option enables preemptible-RCU code that is common between
          TREE_PREEMPT_RCU and, in the old days, TINY_PREEMPT_RCU.
 
+config TASKS_RCU
+       bool "Task_based RCU implementation using voluntary context switch"
+       default n
+       help
+         This option enables a task-based RCU implementation that uses
+         only voluntary context switch (not preemption!), idle, and
+         user-mode execution as quiescent states.
+
+         If unsure, say N.
+
 config RCU_STALL_COMMON
        def_bool ( TREE_RCU || TREE_PREEMPT_RCU || RCU_TRACE )
        help
index d9efcc13008c00201c130f87135348c7238118ff..717f00854fc073fcb04bd9b3dde7c2e509db529e 100644 (file)
@@ -254,6 +254,8 @@ void rcu_check_callbacks(int cpu, int user)
                rcu_sched_qs(cpu);
        else if (!in_softirq())
                rcu_bh_qs(cpu);
+       if (user)
+               rcu_note_voluntary_context_switch(current);
 }
 
 /*
index 1b70cb6fbe3ccda0466f3f0004865d82cdd9399d..8ad91d1e317dca532a30567645d0f192a8e93d3f 100644 (file)
@@ -2410,6 +2410,8 @@ void rcu_check_callbacks(int cpu, int user)
        rcu_preempt_check_callbacks(cpu);
        if (rcu_pending(cpu))
                invoke_rcu_core();
+       if (user)
+               rcu_note_voluntary_context_switch(current);
        trace_rcu_utilization(TPS("End scheduler-tick"));
 }
 
index 4056d7992a6c3d86d7a41478aeb35279cd5cde66..19b3dacb0753cee633a56ae2a6b8ab8fd931d40c 100644 (file)
@@ -47,6 +47,7 @@
 #include <linux/hardirq.h>
 #include <linux/delay.h>
 #include <linux/module.h>
+#include <linux/kthread.h>
 
 #define CREATE_TRACE_POINTS
 
@@ -347,3 +348,173 @@ static int __init check_cpu_stall_init(void)
 early_initcall(check_cpu_stall_init);
 
 #endif /* #ifdef CONFIG_RCU_STALL_COMMON */
+
+#ifdef CONFIG_TASKS_RCU
+
+/*
+ * Simple variant of RCU whose quiescent states are voluntary context switch,
+ * user-space execution, and idle.  As such, grace periods can take one good
+ * long time.  There are no read-side primitives similar to rcu_read_lock()
+ * and rcu_read_unlock() because this implementation is intended to get
+ * the system into a safe state for some of the manipulations involved in
+ * tracing and the like.  Finally, this implementation does not support
+ * high call_rcu_tasks() rates from multiple CPUs.  If this is required,
+ * per-CPU callback lists will be needed.
+ */
+
+/*
+ * Global list of callbacks and associated lock.  The list is singly
+ * linked through the rcu_head ->next fields.  rcu_tasks_cbs_tail points
+ * at the ->next field of the last queued callback, or at
+ * rcu_tasks_cbs_head itself when the list is empty, so that appending
+ * a callback requires no list traversal.  The code in this file reads
+ * and updates the head and tail only while holding rcu_tasks_cbs_lock
+ * with interrupts disabled.
+ */
+static struct rcu_head *rcu_tasks_cbs_head;
+static struct rcu_head **rcu_tasks_cbs_tail = &rcu_tasks_cbs_head;
+static DEFINE_RAW_SPINLOCK(rcu_tasks_cbs_lock);
+
+/*
+ * Post an RCU-tasks callback.  The callback is appended to the tail of
+ * the global callback list under rcu_tasks_cbs_lock; rcu_tasks_kthread()
+ * later dequeues the whole list and invokes @func on @rhp once an
+ * RCU-tasks grace period has elapsed.
+ */
+void call_rcu_tasks(struct rcu_head *rhp, void (*func)(struct rcu_head *rhp))
+{
+       unsigned long irqflags;
+
+       /* Fully initialize the callback before making it visible. */
+       rhp->func = func;
+       rhp->next = NULL;
+
+       raw_spin_lock_irqsave(&rcu_tasks_cbs_lock, irqflags);
+       *rcu_tasks_cbs_tail = rhp;              /* Link in after old tail... */
+       rcu_tasks_cbs_tail = &rhp->next;        /* ...and become new tail. */
+       raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, irqflags);
+}
+EXPORT_SYMBOL_GPL(call_rcu_tasks);
+
+/*
+ * See if task @t has stopped holding out, and if so, remove it from the
+ * holdout list.  A task remains a holdout only while all of the following
+ * hold: its ->rcu_tasks_holdout flag is still set, its ->nvcsw count
+ * still matches the ->rcu_tasks_nvcsw snapshot, and it is still ->on_rq.
+ *
+ * On removal, the reference on @t is dropped via put_task_struct().
+ * That may have been the last reference, so the caller must not
+ * dereference @t after this function returns.
+ */
+static void check_holdout_task(struct task_struct *t)
+{
+       /* Still holding out?  Then there is nothing to do on this pass. */
+       if (ACCESS_ONCE(t->rcu_tasks_holdout) &&
+           t->rcu_tasks_nvcsw == ACCESS_ONCE(t->nvcsw) &&
+           ACCESS_ONCE(t->on_rq))
+               return;
+
+       /* No longer a holdout: clear the flag, unlink, drop the reference. */
+       ACCESS_ONCE(t->rcu_tasks_holdout) = false;
+       list_del_rcu(&t->rcu_tasks_holdout_list);
+       put_task_struct(t);
+}
+
+/*
+ * RCU-tasks kthread that detects grace periods and invokes callbacks.
+ *
+ * Each iteration dequeues all pending callbacks, builds a list of
+ * holdout tasks, polls that list once per second until it drains, and
+ * then invokes the dequeued callbacks.  @arg is unused.  This function
+ * never returns.
+ */
+static int __noreturn rcu_tasks_kthread(void *arg)
+{
+       unsigned long flags;
+       struct task_struct *g, *t;
+       struct task_struct *t1; /* Successor of t during holdout scans. */
+       struct rcu_head *list;
+       struct rcu_head *next;
+       LIST_HEAD(rcu_tasks_holdouts);
+
+       /* FIXME: Add housekeeping affinity. */
+
+       /*
+        * Each pass through the following loop makes one check for
+        * newly arrived callbacks, and, if there are some, waits for
+        * one RCU-tasks grace period and then invokes the callbacks.
+        * This loop is terminated by the system going down.  ;-)
+        */
+       for (;;) {
+
+               /* Pick up any new callbacks. */
+               raw_spin_lock_irqsave(&rcu_tasks_cbs_lock, flags);
+               list = rcu_tasks_cbs_head;
+               rcu_tasks_cbs_head = NULL;
+               rcu_tasks_cbs_tail = &rcu_tasks_cbs_head;
+               raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags);
+
+               /* If there were none, wait a bit and start over. */
+               if (!list) {
+                       schedule_timeout_interruptible(HZ);
+                       WARN_ON(signal_pending(current));
+                       continue;
+               }
+
+               /*
+                * Wait for all pre-existing t->on_rq and t->nvcsw
+                * transitions to complete.  Invoking synchronize_sched()
+                * suffices because all these transitions occur with
+                * interrupts disabled.  Without this synchronize_sched(),
+                * a read-side critical section that started before the
+                * grace period might be incorrectly seen as having started
+                * after the grace period.
+                *
+                * This synchronize_sched() also dispenses with the
+                * need for a memory barrier on the first store to
+                * ->rcu_tasks_holdout, as it forces the store to happen
+                * after the beginning of the grace period.
+                */
+               synchronize_sched();
+
+               /*
+                * There were callbacks, so we need to wait for an
+                * RCU-tasks grace period.  Start off by scanning
+                * the task list for tasks that are not already
+                * voluntarily blocked.  Mark these tasks and make
+                * a list of them in rcu_tasks_holdouts.  Each task
+                * placed on the list has a reference held on it, which
+                * check_holdout_task() drops when removing the task.
+                */
+               rcu_read_lock();
+               for_each_process_thread(g, t) {
+                       if (t != current && ACCESS_ONCE(t->on_rq) &&
+                           !is_idle_task(t)) {
+                               get_task_struct(t);
+                               t->rcu_tasks_nvcsw = ACCESS_ONCE(t->nvcsw);
+                               ACCESS_ONCE(t->rcu_tasks_holdout) = true;
+                               list_add(&t->rcu_tasks_holdout_list,
+                                        &rcu_tasks_holdouts);
+                       }
+               }
+               rcu_read_unlock();
+
+               /*
+                * Each pass through the following loop scans the list
+                * of holdout tasks, removing any that are no longer
+                * holdouts.  When the list is empty, we are done.
+                *
+                * check_holdout_task() can unlink t and then drop what
+                * might be the last reference to it, so t must not be
+                * dereferenced afterwards.  The _safe iterator fetches
+                * the successor before the loop body runs, which keeps
+                * the traversal from reading a freed task_struct.  Only
+                * this kthread modifies rcu_tasks_holdouts, so the saved
+                * successor cannot be removed behind our back.
+                */
+               while (!list_empty(&rcu_tasks_holdouts)) {
+                       schedule_timeout_interruptible(HZ);
+                       WARN_ON(signal_pending(current));
+                       rcu_read_lock();
+                       list_for_each_entry_safe(t, t1, &rcu_tasks_holdouts,
+                                                rcu_tasks_holdout_list)
+                               check_holdout_task(t);
+                       rcu_read_unlock();
+               }
+
+               /*
+                * Because ->on_rq and ->nvcsw are not guaranteed
+                * to have full memory barriers prior to them in the
+                * schedule() path, memory reordering on other CPUs could
+                * cause their RCU-tasks read-side critical sections to
+                * extend past the end of the grace period.  However,
+                * because these ->nvcsw updates are carried out with
+                * interrupts disabled, we can use synchronize_sched()
+                * to force the needed ordering on all such CPUs.
+                *
+                * This synchronize_sched() also confines all
+                * ->rcu_tasks_holdout accesses to be within the grace
+                * period, avoiding the need for memory barriers for
+                * ->rcu_tasks_holdout accesses.
+                */
+               synchronize_sched();
+
+               /*
+                * Invoke the callbacks.  Fetch ->next first because the
+                * callback function is free to reuse or free its rcu_head.
+                */
+               while (list) {
+                       next = list->next;
+                       local_bh_disable();
+                       list->func(list);
+                       local_bh_enable();
+                       list = next;
+                       cond_resched();
+               }
+       }
+}
+
+/*
+ * Spawn rcu_tasks_kthread() at boot time.  Failure to create the kthread
+ * is treated as fatal via BUG_ON().  Always returns 0.
+ */
+static int __init rcu_spawn_tasks_kthread(void)
+{
+       struct task_struct __maybe_unused *kt =
+               kthread_run(rcu_tasks_kthread, NULL, "rcu_tasks_kthread");
+
+       BUG_ON(IS_ERR(kt));
+       return 0;
+}
+early_initcall(rcu_spawn_tasks_kthread);
+
+#endif /* #ifdef CONFIG_TASKS_RCU */