sched/wait, RCU: Introduce rcuwait machinery
author Davidlohr Bueso <dave@stgolabs.net>
Wed, 11 Jan 2017 15:22:25 +0000 (07:22 -0800)
committer Ingo Molnar <mingo@kernel.org>
Sat, 14 Jan 2017 10:14:33 +0000 (11:14 +0100)
rcuwait provides support for (single) RCU-safe task wait/wake functionality,
with the caveat that it must not be called after exit_notify(), such that
we avoid racing with rcu delayed_put_task_struct callbacks, task_struct
being rcu unaware in this context -- for which we similarly have
task_rcu_dereference() magic, but with different return semantics, which
can conflict with the wakeup side.

The interfaces are quite straightforward:

  rcuwait_wait_event()
  rcuwait_wake_up()

More details are in the comments, but it is worth mentioning at least that
users must provide proper serialization when waiting on a condition, so that
concurrent waiters do not corrupt one another. Care must also be taken with
the ordering between the task and the condition when calling the wakeup -- we
cannot miss wakeups. When porting users from waitqueues, for example, this
ordering was a given because everything is done under the q->lock. As such,
rcuwait can remove sources of non-preemptible unbounded work for realtime.

Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: dave@stgolabs.net
Link: http://lkml.kernel.org/r/1484148146-14210-2-git-send-email-dave@stgolabs.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
include/linux/rcuwait.h [new file with mode: 0644]
kernel/exit.c

diff --git a/include/linux/rcuwait.h b/include/linux/rcuwait.h
new file mode 100644 (file)
index 0000000..0e93d56
--- /dev/null
@@ -0,0 +1,63 @@
+#ifndef _LINUX_RCUWAIT_H_
+#define _LINUX_RCUWAIT_H_
+
+#include <linux/rcupdate.h>
+
+/*
+ * rcuwait provides a way of blocking and waking up a single
+ * task in an rcu-safe manner; where it is forbidden to use
+ * after exit_notify(). task_struct is not properly rcu protected,
+ * unless dealing with rcu-aware lists, ie: find_task_by_*().
+ *
+ * Alternatively we have task_rcu_dereference(), but the return
+ * semantics have different implications which would break the
+ * wakeup side. The only time @task is non-nil is when a user is
+ * blocked (or checking if it needs to) on a condition, and reset
+ * as soon as we know that the condition has succeeded and are
+ * awoken.
+ */
+struct rcuwait {
+       /* The one task waiting on this object; NULL while nobody is waiting. */
+       struct task_struct *task;
+};
+
+/* Static initializer for a struct rcuwait with no waiter; @name is unused. */
+#define __RCUWAIT_INITIALIZER(name)            \
+       { .task = NULL, }
+
+/* Run-time initialization of @w: start out with no waiter registered. */
+static inline void rcuwait_init(struct rcuwait *w)
+{
+       w->task = NULL;
+}
+
+extern void rcuwait_wake_up(struct rcuwait *w);
+
+/*
+ * rcuwait_wait_event - sleep uninterruptibly until @condition is true
+ * @w:         the rcuwait in which the current task is published as waiter
+ * @condition: C expression; evaluated on every pass of the wait loop, so
+ *             it may be evaluated more than once
+ *
+ * The caller is responsible for locking around rcuwait_wait_event(),
+ * such that writes to @task are properly serialized.
+ */
+#define rcuwait_wait_event(w, condition)                               \
+({                                                                     \
+       /*                                                              \
+        * Complain if we are called after do_exit()/exit_notify(),     \
+        * as we cannot rely on the rcu critical region for the         \
+        * wakeup side.                                                 \
+        */                                                             \
+       WARN_ON(current->exit_state);                                   \
+                                                                       \
+       rcu_assign_pointer((w)->task, current);                         \
+       for (;;) {                                                      \
+               /*                                                      \
+                * Implicit barrier (A) pairs with (B) in               \
+                * rcuwait_wake_up().                                   \
+                */                                                     \
+               set_current_state(TASK_UNINTERRUPTIBLE);                \
+               if (condition)                                          \
+                       break;                                          \
+                                                                       \
+               schedule();                                             \
+       }                                                               \
+                                                                       \
+       WRITE_ONCE((w)->task, NULL);                                    \
+       __set_current_state(TASK_RUNNING);                              \
+})
+
+#endif /* _LINUX_RCUWAIT_H_ */
index 27c68653e2fcd7092028f38ac33126ba667aeae7..a9441da69e296fa15ad9069778e3d03f6dc8910c 100644 (file)
@@ -55,6 +55,7 @@
 #include <linux/shm.h>
 #include <linux/kcov.h>
 #include <linux/random.h>
+#include <linux/rcuwait.h>
 
 #include <linux/uaccess.h>
 #include <asm/unistd.h>
@@ -282,6 +283,35 @@ retry:
        return task;
 }
 
+/*
+ * rcuwait_wake_up - wake the task, if any, currently waiting on @w
+ * @w: the rcuwait whose waiter should be woken
+ *
+ * The caller is expected to have made the wait condition true before
+ * calling this; see the ordering diagram below.
+ */
+void rcuwait_wake_up(struct rcuwait *w)
+{
+       struct task_struct *task;
+
+       rcu_read_lock();
+
+       /*
+        * Order condition vs @task, such that everything prior to the load
+        * of @task is visible. This is the condition as to why the user called
+        * rcuwait_wake_up() in the first place. Pairs with set_current_state()
+        * barrier (A) in rcuwait_wait_event().
+        *
+        *    WAIT                WAKE
+        *    [S] tsk = current   [S] cond = true
+        *        MB (A)              MB (B)
+        *    [L] cond            [L] tsk
+        *
+        * (B) orders a prior store against a later load, which a read
+        * barrier does not do; it has to be a full barrier, otherwise the
+        * waiter can miss cond while we miss tsk, and the wakeup is lost.
+        */
+       smp_mb(); /* (B) */
+
+       /*
+        * Avoid using task_rcu_dereference() magic as long as we are careful,
+        * see comment in rcuwait_wait_event() regarding ->exit_state.
+        */
+       task = rcu_dereference(w->task);
+       if (task)
+               wake_up_process(task);
+       rcu_read_unlock();
+}
+
 struct task_struct *try_get_task_struct(struct task_struct **ptask)
 {
        struct task_struct *task;