eventfd: document lockless access in eventfd_poll

author Paolo Bonzini <pbonzini@redhat.com>

Tue, 22 Mar 2016 21:27:14 +0000 (14:27 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Tue, 22 Mar 2016 22:36:02 +0000 (15:36 -0700)
author Paolo Bonzini <pbonzini@redhat.com>
Tue, 22 Mar 2016 21:27:14 +0000 (14:27 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Tue, 22 Mar 2016 22:36:02 +0000 (15:36 -0700)
diff --git a/fs/eventfd.c b/fs/eventfd.c

index ed70cf9fdc7b3aa4e3ae6295f6071aaf87501ca7..1231cd1999d8fe96ca85ecd9fccf6e2df8d58766 100644 (file)
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -121,8 +121,46 @@ static unsigned int eventfd_poll(struct file *file, poll_table *wait)
         u64 count;
  
         poll_wait(file, &ctx->wqh, wait);
-       smp_rmb();
-       count = ctx->count;
+
+       /*
+        * All writes to ctx->count occur within ctx->wqh.lock.  This read
+        * can be done outside ctx->wqh.lock because we know that poll_wait
+        * takes that lock (through add_wait_queue) if our caller will sleep.
+        *
+        * The read _can_ therefore seep into add_wait_queue's critical
+        * section, but cannot move above it!  add_wait_queue's spin_lock acts
+        * as an acquire barrier and ensures that the read be ordered properly
+        * against the writes.  The following CAN happen and is safe:
+        *
+        *     poll                               write
+        *     -----------------                  ------------
+        *     lock ctx->wqh.lock (in poll_wait)
+        *     count = ctx->count
+        *     __add_wait_queue
+        *     unlock ctx->wqh.lock
+        *                                        lock ctx->qwh.lock
+        *                                        ctx->count += n
+        *                                        if (waitqueue_active)
+        *                                          wake_up_locked_poll
+        *                                        unlock ctx->qwh.lock
+        *     eventfd_poll returns 0
+        *
+        * but the following, which would miss a wakeup, cannot happen:
+        *
+        *     poll                               write
+        *     -----------------                  ------------
+        *     count = ctx->count (INVALID!)
+        *                                        lock ctx->qwh.lock
+        *                                        ctx->count += n
+        *                                        **waitqueue_active is false**
+        *                                        **no wake_up_locked_poll!**
+        *                                        unlock ctx->qwh.lock
+        *     lock ctx->wqh.lock (in poll_wait)
+        *     __add_wait_queue
+        *     unlock ctx->wqh.lock
+        *     eventfd_poll returns 0
+        */
+       count = READ_ONCE(ctx->count);
  
         if (count > 0)
                 events |= POLLIN;
author	Paolo Bonzini <pbonzini@redhat.com>
	Tue, 22 Mar 2016 21:27:14 +0000 (14:27 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Tue, 22 Mar 2016 22:36:02 +0000 (15:36 -0700)