epoll: support for disabling items, and a self-test app
authorPaton J. Lewis <palewis@adobe.com>
Fri, 5 Oct 2012 00:13:39 +0000 (17:13 -0700)
committerLinus Torvalds <torvalds@linux-foundation.org>
Fri, 5 Oct 2012 18:05:00 +0000 (03:05 +0900)
Enhanced epoll_ctl to support EPOLL_CTL_DISABLE, which disables an epoll
item.  If epoll_ctl doesn't return -EBUSY in this case, it is then safe to
delete the epoll item in a multi-threaded environment.  Also added a new
test_epoll self- test app to both demonstrate the need for this feature
and test it.

Signed-off-by: Paton J. Lewis <palewis@adobe.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Jason Baron <jbaron@redhat.com>
Cc: Paul Holland <pholland@adobe.com>
Cc: Davide Libenzi <davidel@xmailserver.org>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
fs/eventpoll.c
include/linux/eventpoll.h
tools/testing/selftests/Makefile
tools/testing/selftests/epoll/Makefile [new file with mode: 0644]
tools/testing/selftests/epoll/test_epoll.c [new file with mode: 0644]

index cd96649bfe62da9e408dbd629f56f6452c139bc9..da72250ddc1cf2331336a23cbe8239bf880a10cf 100644 (file)
@@ -346,7 +346,7 @@ static inline struct epitem *ep_item_from_epqueue(poll_table *p)
 /* Tells if the epoll_ctl(2) operation needs an event copy from userspace */
 static inline int ep_op_has_event(int op)
 {
-       return op != EPOLL_CTL_DEL;
+       return op == EPOLL_CTL_ADD || op == EPOLL_CTL_MOD;
 }
 
 /* Initialize the poll safe wake up structure */
@@ -676,6 +676,34 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
        return 0;
 }
 
+/*
+ * Disables a "struct epitem" in the eventpoll set. Returns -EBUSY if the item
+ * had no event flags set, indicating that another thread may be currently
+ * handling that item's events (in the case that EPOLLONESHOT was being
+ * used). Otherwise a zero result indicates that the item has been disabled
+ * from receiving events. A disabled item may be re-enabled via
+ * EPOLL_CTL_MOD. Must be called with "mtx" held.
+ */
+static int ep_disable(struct eventpoll *ep, struct epitem *epi)
+{
+       int result = 0;
+       unsigned long flags;
+
+       spin_lock_irqsave(&ep->lock, flags);
+       if (epi->event.events & ~EP_PRIVATE_BITS) {
+               if (ep_is_linked(&epi->rdllink))
+                       list_del_init(&epi->rdllink);
+               /* Ensure ep_poll_callback will not add epi back onto ready
+                  list: */
+               epi->event.events &= EP_PRIVATE_BITS;
+               }
+       else
+               result = -EBUSY;
+       spin_unlock_irqrestore(&ep->lock, flags);
+
+       return result;
+}
+
 static void ep_free(struct eventpoll *ep)
 {
        struct rb_node *rbp;
@@ -1020,8 +1048,6 @@ static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
        rb_insert_color(&epi->rbn, &ep->rbr);
 }
 
-
-
 #define PATH_ARR_SIZE 5
 /*
  * These are the number paths of length 1 to 5, that we are allowing to emanate
@@ -1787,6 +1813,12 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
                } else
                        error = -ENOENT;
                break;
+       case EPOLL_CTL_DISABLE:
+               if (epi)
+                       error = ep_disable(ep, epi);
+               else
+                       error = -ENOENT;
+               break;
        }
        mutex_unlock(&ep->mtx);
 
index f4bb378ccf6a355bbe49e79f56019f9ef386d1d4..41085d0f3955b3ff247b426de9452b731d00c19b 100644 (file)
@@ -25,6 +25,7 @@
 #define EPOLL_CTL_ADD 1
 #define EPOLL_CTL_DEL 2
 #define EPOLL_CTL_MOD 3
+#define EPOLL_CTL_DISABLE 4
 
 /*
  * Request the handling of system wakeup events so as to prevent system suspends
index 85baf11e2acd7d11aa4990a0f7f53f8d28689a20..43480149119ee773f0a32cc8abe995903609fd11 100644 (file)
@@ -1,4 +1,4 @@
-TARGETS = breakpoints kcmp mqueue vm cpu-hotplug memory-hotplug
+TARGETS = breakpoints kcmp mqueue vm cpu-hotplug memory-hotplug epoll
 
 all:
        for TARGET in $(TARGETS); do \
diff --git a/tools/testing/selftests/epoll/Makefile b/tools/testing/selftests/epoll/Makefile
new file mode 100644 (file)
index 0000000..19806ed
--- /dev/null
@@ -0,0 +1,11 @@
+# Makefile for epoll selftests
+
+all: test_epoll
+%: %.c
+       gcc -pthread -g -o $@ $^
+
+run_tests: all
+       ./test_epoll
+
+clean:
+       $(RM) test_epoll
diff --git a/tools/testing/selftests/epoll/test_epoll.c b/tools/testing/selftests/epoll/test_epoll.c
new file mode 100644 (file)
index 0000000..e0fcff1
--- /dev/null
@@ -0,0 +1,344 @@
+/*
+ *  tools/testing/selftests/epoll/test_epoll.c
+ *
+ *  Copyright 2012 Adobe Systems Incorporated
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  Paton J. Lewis <palewis@adobe.com>
+ *
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/epoll.h>
+#include <sys/socket.h>
+
+/*
+ * A pointer to an epoll_item_private structure will be stored in the epoll
+ * item's event structure so that we can get access to the epoll_item_private
+ * data after calling epoll_wait:
+ */
+struct epoll_item_private {
+       int index;  /* Position of this struct within the epoll_items array. */
+       int fd;
+       uint32_t events;
+       pthread_mutex_t mutex;  /* Guards the following variables... */
+       int stop;
+       int status;  /* Stores any error encountered while handling item. */
+       /* The following variable allows us to test whether we have encountered
+          a problem while attempting to cancel and delete the associated
+          event. When the test program exits, 'deleted' should be exactly
+          one. If it is greater than one, then the failed test reflects a real
+          world situation where we would have tried to access the epoll item's
+          private data after deleting it: */
+       int deleted;
+};
+
+struct epoll_item_private *epoll_items;
+
+/*
+ * Delete the specified item from the epoll set. In a real-world secneario this
+ * is where we would free the associated data structure, but in this testing
+ * environment we retain the structure so that we can test for double-deletion:
+ */
+void delete_item(int index)
+{
+       __sync_fetch_and_add(&epoll_items[index].deleted, 1);
+}
+
+/*
+ * A pointer to a read_thread_data structure will be passed as the argument to
+ * each read thread:
+ */
+struct read_thread_data {
+       int stop;
+       int status;  /* Indicates any error encountered by the read thread. */
+       int epoll_set;
+};
+
+/*
+ * The function executed by the read threads:
+ */
+void *read_thread_function(void *function_data)
+{
+       struct read_thread_data *thread_data =
+               (struct read_thread_data *)function_data;
+       struct epoll_event event_data;
+       struct epoll_item_private *item_data;
+       char socket_data;
+
+       /* Handle events until we encounter an error or this thread's 'stop'
+          condition is set: */
+       while (1) {
+               int result = epoll_wait(thread_data->epoll_set,
+                                       &event_data,
+                                       1,      /* Number of desired events */
+                                       1000);  /* Timeout in ms */
+               if (result < 0) {
+                       /* Breakpoints signal all threads. Ignore that while
+                          debugging: */
+                       if (errno == EINTR)
+                               continue;
+                       thread_data->status = errno;
+                       return 0;
+               } else if (thread_data->stop)
+                       return 0;
+               else if (result == 0)  /* Timeout */
+                       continue;
+
+               /* We need the mutex here because checking for the stop
+                  condition and re-enabling the epoll item need to be done
+                  together as one atomic operation when EPOLL_CTL_DISABLE is
+                  available: */
+               item_data = (struct epoll_item_private *)event_data.data.ptr;
+               pthread_mutex_lock(&item_data->mutex);
+
+               /* Remove the item from the epoll set if we want to stop
+                  handling that event: */
+               if (item_data->stop)
+                       delete_item(item_data->index);
+               else {
+                       /* Clear the data that was written to the other end of
+                          our non-blocking socket: */
+                       do {
+                               if (read(item_data->fd, &socket_data, 1) < 1) {
+                                       if ((errno == EAGAIN) ||
+                                           (errno == EWOULDBLOCK))
+                                               break;
+                                       else
+                                               goto error_unlock;
+                               }
+                       } while (item_data->events & EPOLLET);
+
+                       /* The item was one-shot, so re-enable it: */
+                       event_data.events = item_data->events;
+                       if (epoll_ctl(thread_data->epoll_set,
+                                                 EPOLL_CTL_MOD,
+                                                 item_data->fd,
+                                                 &event_data) < 0)
+                               goto error_unlock;
+               }
+
+               pthread_mutex_unlock(&item_data->mutex);
+       }
+
+error_unlock:
+       thread_data->status = item_data->status = errno;
+       pthread_mutex_unlock(&item_data->mutex);
+       return 0;
+}
+
+/*
+ * A pointer to a write_thread_data structure will be passed as the argument to
+ * the write thread:
+ */
+struct write_thread_data {
+       int stop;
+       int status;  /* Indicates any error encountered by the write thread. */
+       int n_fds;
+       int *fds;
+};
+
+/*
+ * The function executed by the write thread. It writes a single byte to each
+ * socket in turn until the stop condition for this thread is set. If writing to
+ * a socket would block (i.e. errno was EAGAIN), we leave that socket alone for
+ * the moment and just move on to the next socket in the list. We don't care
+ * about the order in which we deliver events to the epoll set. In fact we don't
+ * care about the data we're writing to the pipes at all; we just want to
+ * trigger epoll events:
+ */
+void *write_thread_function(void *function_data)
+{
+       const char data = 'X';
+       int index;
+       struct write_thread_data *thread_data =
+               (struct write_thread_data *)function_data;
+       while (!write_thread_data->stop)
+               for (index = 0;
+                    !thread_data->stop && (index < thread_data->n_fds);
+                    ++index)
+                       if ((write(thread_data->fds[index], &data, 1) < 1) &&
+                               (errno != EAGAIN) &&
+                               (errno != EWOULDBLOCK)) {
+                               write_thread_data->status = errno;
+                               return;
+                       }
+}
+
+/*
+ * Arguments are currently ignored:
+ */
+int main(int argc, char **argv)
+{
+       const int n_read_threads = 100;
+       const int n_epoll_items = 500;
+       int index;
+       int epoll_set = epoll_create1(0);
+       struct write_thread_data write_thread_data = {
+               0, 0, n_epoll_items, malloc(n_epoll_items * sizeof(int))
+       };
+       struct read_thread_data *read_thread_data =
+               malloc(n_read_threads * sizeof(struct read_thread_data));
+       pthread_t *read_threads = malloc(n_read_threads * sizeof(pthread_t));
+       pthread_t write_thread;
+
+       printf("-----------------\n");
+       printf("Runing test_epoll\n");
+       printf("-----------------\n");
+
+       epoll_items = malloc(n_epoll_items * sizeof(struct epoll_item_private));
+
+       if (epoll_set < 0 || epoll_items == 0 || write_thread_data.fds == 0 ||
+               read_thread_data == 0 || read_threads == 0)
+               goto error;
+
+       if (sysconf(_SC_NPROCESSORS_ONLN) < 2) {
+               printf("Error: please run this test on a multi-core system.\n");
+               goto error;
+       }
+
+       /* Create the socket pairs and epoll items: */
+       for (index = 0; index < n_epoll_items; ++index) {
+               int socket_pair[2];
+               struct epoll_event event_data;
+               if (socketpair(AF_UNIX,
+                              SOCK_STREAM | SOCK_NONBLOCK,
+                              0,
+                              socket_pair) < 0)
+                       goto error;
+               write_thread_data.fds[index] = socket_pair[0];
+               epoll_items[index].index = index;
+               epoll_items[index].fd = socket_pair[1];
+               if (pthread_mutex_init(&epoll_items[index].mutex, NULL) != 0)
+                       goto error;
+               /* We always use EPOLLONESHOT because this test is currently
+                  structured to demonstrate the need for EPOLL_CTL_DISABLE,
+                  which only produces useful information in the EPOLLONESHOT
+                  case (without EPOLLONESHOT, calling epoll_ctl with
+                  EPOLL_CTL_DISABLE will never return EBUSY). If support for
+                  testing events without EPOLLONESHOT is desired, it should
+                  probably be implemented in a separate unit test. */
+               epoll_items[index].events = EPOLLIN | EPOLLONESHOT;
+               if (index < n_epoll_items / 2)
+                       epoll_items[index].events |= EPOLLET;
+               epoll_items[index].stop = 0;
+               epoll_items[index].status = 0;
+               epoll_items[index].deleted = 0;
+               event_data.events = epoll_items[index].events;
+               event_data.data.ptr = &epoll_items[index];
+               if (epoll_ctl(epoll_set,
+                             EPOLL_CTL_ADD,
+                             epoll_items[index].fd,
+                             &event_data) < 0)
+                       goto error;
+       }
+
+       /* Create and start the read threads: */
+       for (index = 0; index < n_read_threads; ++index) {
+               read_thread_data[index].stop = 0;
+               read_thread_data[index].status = 0;
+               read_thread_data[index].epoll_set = epoll_set;
+               if (pthread_create(&read_threads[index],
+                                  NULL,
+                                  read_thread_function,
+                                  &read_thread_data[index]) != 0)
+                       goto error;
+       }
+
+       if (pthread_create(&write_thread,
+                          NULL,
+                          write_thread_function,
+                          &write_thread_data) != 0)
+               goto error;
+
+       /* Cancel all event pollers: */
+#ifdef EPOLL_CTL_DISABLE
+       for (index = 0; index < n_epoll_items; ++index) {
+               pthread_mutex_lock(&epoll_items[index].mutex);
+               ++epoll_items[index].stop;
+               if (epoll_ctl(epoll_set,
+                             EPOLL_CTL_DISABLE,
+                             epoll_items[index].fd,
+                             NULL) == 0)
+                       delete_item(index);
+               else if (errno != EBUSY) {
+                       pthread_mutex_unlock(&epoll_items[index].mutex);
+                       goto error;
+               }
+               /* EBUSY means events were being handled; allow the other thread
+                  to delete the item. */
+               pthread_mutex_unlock(&epoll_items[index].mutex);
+       }
+#else
+       for (index = 0; index < n_epoll_items; ++index) {
+               pthread_mutex_lock(&epoll_items[index].mutex);
+               ++epoll_items[index].stop;
+               pthread_mutex_unlock(&epoll_items[index].mutex);
+               /* Wait in case a thread running read_thread_function is
+                  currently executing code between epoll_wait and
+                  pthread_mutex_lock with this item. Note that a longer delay
+                  would make double-deletion less likely (at the expense of
+                  performance), but there is no guarantee that any delay would
+                  ever be sufficient. Note also that we delete all event
+                  pollers at once for testing purposes, but in a real-world
+                  environment we are likely to want to be able to cancel event
+                  pollers at arbitrary times. Therefore we can't improve this
+                  situation by just splitting this loop into two loops
+                  (i.e. signal 'stop' for all items, sleep, and then delete all
+                  items). We also can't fix the problem via EPOLL_CTL_DEL
+                  because that command can't prevent the case where some other
+                  thread is executing read_thread_function within the region
+                  mentioned above: */
+               usleep(1);
+               pthread_mutex_lock(&epoll_items[index].mutex);
+               if (!epoll_items[index].deleted)
+                       delete_item(index);
+               pthread_mutex_unlock(&epoll_items[index].mutex);
+       }
+#endif
+
+       /* Shut down the read threads: */
+       for (index = 0; index < n_read_threads; ++index)
+               __sync_fetch_and_add(&read_thread_data[index].stop, 1);
+       for (index = 0; index < n_read_threads; ++index) {
+               if (pthread_join(read_threads[index], NULL) != 0)
+                       goto error;
+               if (read_thread_data[index].status)
+                       goto error;
+       }
+
+       /* Shut down the write thread: */
+       __sync_fetch_and_add(&write_thread_data.stop, 1);
+       if ((pthread_join(write_thread, NULL) != 0) || write_thread_data.status)
+               goto error;
+
+       /* Check for final error conditions: */
+       for (index = 0; index < n_epoll_items; ++index) {
+               if (epoll_items[index].status != 0)
+                       goto error;
+               if (pthread_mutex_destroy(&epoll_items[index].mutex) < 0)
+                       goto error;
+       }
+       for (index = 0; index < n_epoll_items; ++index)
+               if (epoll_items[index].deleted != 1) {
+                       printf("Error: item data deleted %1d times.\n",
+                                  epoll_items[index].deleted);
+                       goto error;
+               }
+
+       printf("[PASS]\n");
+       return 0;
+
+ error:
+       printf("[FAIL]\n");
+       return errno;
+}