hotplug cpu: move tasks in empty cpusets to parent

author Cliff Wickman <cpw@sgi.com>

Thu, 7 Feb 2008 08:14:43 +0000 (00:14 -0800)

committer Linus Torvalds <torvalds@woody.linux-foundation.org>

Thu, 7 Feb 2008 16:42:22 +0000 (08:42 -0800)
author Cliff Wickman <cpw@sgi.com>
Thu, 7 Feb 2008 08:14:43 +0000 (00:14 -0800)
committer Linus Torvalds <torvalds@woody.linux-foundation.org>
Thu, 7 Feb 2008 16:42:22 +0000 (08:42 -0800)
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h

index 8675c691d3e20fdf4ee9a55a0ca525ff6bcbc848..ff9055fc3d2a046a75e78b8c967b5c732ee7eb24 100644 (file)
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -318,6 +318,7 @@ struct task_struct *cgroup_iter_next(struct cgroup *cont,
                                         struct cgroup_iter *it);
  void cgroup_iter_end(struct cgroup *cont, struct cgroup_iter *it);
  int cgroup_scan_tasks(struct cgroup_scanner *scan);
+int cgroup_attach_task(struct cgroup *, struct task_struct *);
  
  #else /* !CONFIG_CGROUPS */
  
diff --git a/kernel/cgroup.c b/kernel/cgroup.c

index bcc7a6e8e3c00dd69d462d3bc87299e9bf0ee095..2c5cccbe12e2a7ac0cac841f8320dc21693c205f 100644 (file)
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -489,7 +489,7 @@ static struct css_set *find_css_set(
   * Any task can increment and decrement the count field without lock.
   * So in general, code holding cgroup_mutex can't rely on the count
   * field not changing.  However, if the count goes to zero, then only
- * attach_task() can increment it again.  Because a count of zero
+ * cgroup_attach_task() can increment it again.  Because a count of zero
   * means that no tasks are currently attached, therefore there is no
   * way a task attached to that cgroup can fork (the other way to
   * increment the count).  So code holding cgroup_mutex can safely
@@ -520,17 +520,17 @@ static struct css_set *find_css_set(
   *     The task_lock() exception
   *
   * The need for this exception arises from the action of
- * attach_task(), which overwrites one tasks cgroup pointer with
+ * cgroup_attach_task(), which overwrites one tasks cgroup pointer with
   * another.  It does so using cgroup_mutexe, however there are
   * several performance critical places that need to reference
   * task->cgroup without the expense of grabbing a system global
   * mutex.  Therefore except as noted below, when dereferencing or, as
- * in attach_task(), modifying a task'ss cgroup pointer we use
+ * in cgroup_attach_task(), modifying a task's cgroup pointer we use
   * task_lock(), which acts on a spinlock (task->alloc_lock) already in
   * the task_struct routinely used for such matters.
   *
   * P.S.  One more locking exception.  RCU is used to guard the
- * update of a tasks cgroup pointer by attach_task()
+ * update of a tasks cgroup pointer by cgroup_attach_task()
   */
  
  /**
@@ -1194,7 +1194,7 @@ static void get_first_subsys(const struct cgroup *cgrp,
   * Call holding cgroup_mutex.  May take task_lock of
   * the task 'pid' during call.
   */
-static int attach_task(struct cgroup *cgrp, struct task_struct *tsk)
+int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
  {
         int retval = 0;
         struct cgroup_subsys *ss;
@@ -1287,7 +1287,7 @@ static int attach_task_by_pid(struct cgroup *cgrp, char *pidbuf)
                 get_task_struct(tsk);
         }
  
-       ret = attach_task(cgrp, tsk);
+       ret = cgroup_attach_task(cgrp, tsk);
         put_task_struct(tsk);
         return ret;
  }
@@ -2514,7 +2514,7 @@ out:
   *  - Used for /proc/<pid>/cgroup.
   *  - No need to task_lock(tsk) on this tsk->cgroup reference, as it
   *    doesn't really matter if tsk->cgroup changes after we read it,
- *    and we take cgroup_mutex, keeping attach_task() from changing it
+ *    and we take cgroup_mutex, keeping cgroup_attach_task() from changing it
   *    anyway.  No need to check that tsk->cgroup != NULL, thanks to
   *    the_top_cgroup_hack in cgroup_exit(), which sets an exiting tasks
   *    cgroup to top_cgroup.
@@ -2625,7 +2625,7 @@ static struct file_operations proc_cgroupstats_operations = {
   * A pointer to the shared css_set was automatically copied in
   * fork.c by dup_task_struct().  However, we ignore that copy, since
   * it was not made under the protection of RCU or cgroup_mutex, so
- * might no longer be a valid cgroup pointer.  attach_task() might
+ * might no longer be a valid cgroup pointer.  cgroup_attach_task() might
   * have already changed current->cgroups, allowing the previously
   * referenced cgroup group to be removed and freed.
   *
@@ -2704,8 +2704,8 @@ void cgroup_post_fork(struct task_struct *child)
   *    attach us to a different cgroup, decrementing the count on
   *    the first cgroup that we never incremented.  But in this case,
   *    top_cgroup isn't going away, and either task has PF_EXITING set,
- *    which wards off any attach_task() attempts, or task is a failed
- *    fork, never visible to attach_task.
+ *    which wards off any cgroup_attach_task() attempts, or task is a failed
+ *    fork, never visible to cgroup_attach_task.
   *
   */
  void cgroup_exit(struct task_struct *tsk, int run_callbacks)
@@ -2845,7 +2845,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys)
         }
  
         /* All seems fine. Finish by moving the task into the new cgroup */
-       ret = attach_task(child, tsk);
+       ret = cgroup_attach_task(child, tsk);
         mutex_unlock(&cgroup_mutex);
  
   out_release:
diff --git a/kernel/cpuset.c b/kernel/cpuset.c

index cfaf6419d817e0a387f1c39d878f342e279818b6..d94a8f7c4c294454366ea4bdc116fbdee7f3f840 100644 (file)
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -56,6 +56,8 @@
  #include <asm/atomic.h>
  #include <linux/mutex.h>
  #include <linux/kfifo.h>
+#include <linux/workqueue.h>
+#include <linux/cgroup.h>
  
  /*
   * Tracks how many cpusets are currently defined in system.
@@ -96,6 +98,9 @@ struct cpuset {
  
         /* partition number for rebuild_sched_domains() */
         int pn;
+
+       /* used for walking a cpuset hierarchy */
+       struct list_head stack_list;
  };
  
  /* Retrieve the cpuset for a cgroup */
@@ -111,7 +116,10 @@ static inline struct cpuset *task_cs(struct task_struct *task)
         return container_of(task_subsys_state(task, cpuset_subsys_id),
                             struct cpuset, css);
  }
-
+struct cpuset_hotplug_scanner {
+       struct cgroup_scanner scan;     /* handed to cgroup_scan_tasks() */
+       struct cgroup *to;              /* cgroup the scanned tasks are attached to */
+};
  
  /* bits in struct cpuset flags field */
  typedef enum {
@@ -1687,53 +1695,146 @@ int __init cpuset_init(void)
         return 0;
  }
  
+/**
+ * cpuset_do_move_task - move a given task to another cpuset
+ * @tsk: the task to move
+ * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner
+ *
+ * Called by cgroup_scan_tasks() for each task in a cgroup.
+ * Returns nothing; the result of cgroup_attach_task() is ignored.
+ */
+void cpuset_do_move_task(struct task_struct *tsk, struct cgroup_scanner *scan)
+{
+       struct cpuset_hotplug_scanner *chsp;
+
+       chsp = container_of(scan, struct cpuset_hotplug_scanner, scan);
+       cgroup_attach_task(chsp->to, tsk);
+}
+
+/**
+ * move_member_tasks_to_cpuset - move tasks from one cpuset to another
+ * @from: cpuset in which the tasks currently reside
+ * @to: cpuset to which the tasks will be moved
+ *
+ * Called with cgroup_mutex held.  callback_mutex must not be held, as
+ * the cgroup_attach_task() path will take it.
+ *
+ * cgroup_scan_tasks() visits every task in @from's cgroup and calls
+ * cpuset_do_move_task() on it; a scan failure is only logged.
+ */
+static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
+{
+       struct cpuset_hotplug_scanner scan;
+
+       scan.scan.cg = from->css.cgroup;
+       scan.scan.test_task = NULL; /* select all tasks in cgroup */
+       scan.scan.process_task = cpuset_do_move_task;
+       scan.scan.heap = NULL;
+       scan.to = to->css.cgroup;
+
+       if (cgroup_scan_tasks(&scan.scan))
+               printk(KERN_ERR "move_member_tasks_to_cpuset: "
+                               "cgroup_scan_tasks failed\n");
+}
+
  /*
   * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs
   * or memory nodes, we need to walk over the cpuset hierarchy,
   * removing that CPU or node from all cpusets.  If this removes the
- * last CPU or node from a cpuset, then the guarantee_online_cpus()
- * or guarantee_online_mems() code will use that emptied cpusets
- * parent online CPUs or nodes.  Cpusets that were already empty of
- * CPUs or nodes are left empty.
- *
- * This routine is intentionally inefficient in a couple of regards.
- * It will check all cpusets in a subtree even if the top cpuset of
- * the subtree has no offline CPUs or nodes.  It checks both CPUs and
- * nodes, even though the caller could have been coded to know that
- * only one of CPUs or nodes needed to be checked on a given call.
- * This was done to minimize text size rather than cpu cycles.
+ * last CPU or node from a cpuset, then move the tasks in the empty
+ * cpuset to its next-highest non-empty parent.
   *
- * Call with both manage_mutex and callback_mutex held.
+ * The parent cpuset has some superset of the 'mems' nodes that the
+ * newly empty cpuset held, so no migration of memory is necessary.
   *
- * Recursive, on depth of cpuset subtree.
+ * Called with cgroup_lock() held; the caller drops callback_mutex first.
   */
+static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
+{
+       struct cpuset *parent;
+
+       /* the cgroup's css_sets list is in use if there are tasks
+          in the cpuset; the list is empty if there are none;
+          the cs->css.refcnt seems always 0 */
+       if (list_empty(&cs->css.cgroup->css_sets))
+               return;
  
-static void guarantee_online_cpus_mems_in_subtree(const struct cpuset *cur)
+       /*
+        * Find its next-highest non-empty parent, (top cpuset
+        * has online cpus, so can't be empty).
+        */
+       parent = cs->parent;
+       while (cpus_empty(parent->cpus_allowed)) {
+               /*
+                * this empty cpuset should now be considered to
+                * have been used, and therefore eligible for
+                * release when empty (if it is notify_on_release)
+                */
+               parent = parent->parent;
+       }
+
+       move_member_tasks_to_cpuset(cs, parent);
+}
+
+/*
+ * Walk the specified cpuset subtree and look for empty cpusets.
+ * The tasks of such cpuset must be moved to a parent cpuset.
+ *
+ * Note that such a notify_on_release cpuset must have had, at some time,
+ * member tasks or cpuset descendants and cpus and memory, before it can
+ * be a candidate for release.
+ *
+ * Called with manage_mutex held.  We take callback_mutex to modify
+ * cpus_allowed and mems_allowed.
+ *
+ * This walk processes the tree from top to bottom, completing one layer
+ * before dropping down to the next.  It always processes a node before
+ * any of its children.
+ *
+ * For now, since we lack memory hot unplug, we'll never see a cpuset
+ * that has tasks along with an empty 'mems'.  But if we did see such
+ * a cpuset, we'd handle it just like we do if its 'cpus' was empty.
+ */
+static void scan_for_empty_cpusets(const struct cpuset *root)
  {
+       struct cpuset *cp;      /* scans cpusets being updated */
+       struct cpuset *child;   /* scans child cpusets of cp */
+       struct list_head queue;
         struct cgroup *cont;
-       struct cpuset *c;
  
-       /* Each of our child cpusets mems must be online */
-       list_for_each_entry(cont, &cur->css.cgroup->children, sibling) {
-               c = cgroup_cs(cont);
-               guarantee_online_cpus_mems_in_subtree(c);
-               if (!cpus_empty(c->cpus_allowed))
-                       guarantee_online_cpus(c, &c->cpus_allowed);
-               if (!nodes_empty(c->mems_allowed))
-                       guarantee_online_mems(c, &c->mems_allowed);
+       INIT_LIST_HEAD(&queue);
+
+       list_add_tail((struct list_head *)&root->stack_list, &queue);
+
+       mutex_lock(&callback_mutex);
+       while (!list_empty(&queue)) {
+               cp = container_of(queue.next, struct cpuset, stack_list);
+               list_del(queue.next);
+               list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
+                       child = cgroup_cs(cont);
+                       list_add_tail(&child->stack_list, &queue);
+               }
+               cont = cp->css.cgroup;
+               /* Remove offline cpus and mems from this cpuset. */
+               cpus_and(cp->cpus_allowed, cp->cpus_allowed, cpu_online_map);
+               nodes_and(cp->mems_allowed, cp->mems_allowed,
+                                               node_states[N_HIGH_MEMORY]);
+               if ((cpus_empty(cp->cpus_allowed) ||
+                    nodes_empty(cp->mems_allowed))) {
+                       /* Move tasks from the empty cpuset to a parent */
+                       mutex_unlock(&callback_mutex);
+                       remove_tasks_in_empty_cpuset(cp);
+                       mutex_lock(&callback_mutex);
+               }
         }
+       mutex_unlock(&callback_mutex);
+       return;
  }
  
  /*
   * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track
   * cpu_online_map and node_states[N_HIGH_MEMORY].  Force the top cpuset to
- * track what's online after any CPU or memory node hotplug or unplug
- * event.
- *
- * To ensure that we don't remove a CPU or node from the top cpuset
- * that is currently in use by a child cpuset (which would violate
- * the rule that cpusets must be subsets of their parent), we first
- * call the recursive routine guarantee_online_cpus_mems_in_subtree().
+ * track what's online after any CPU or memory node hotplug or unplug event.
   *
   * Since there are two callers of this routine, one for CPU hotplug
   * events and one for memory node hotplug events, we could have coded
@@ -1744,13 +1845,11 @@ static void guarantee_online_cpus_mems_in_subtree(const struct cpuset *cur)
  static void common_cpu_mem_hotplug_unplug(void)
  {
         cgroup_lock();
-       mutex_lock(&callback_mutex);
  
-       guarantee_online_cpus_mems_in_subtree(&top_cpuset);
         top_cpuset.cpus_allowed = cpu_online_map;
         top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
+       scan_for_empty_cpusets(&top_cpuset);
  
-       mutex_unlock(&callback_mutex);
         cgroup_unlock();
  }
author	Cliff Wickman <cpw@sgi.com>
	Thu, 7 Feb 2008 08:14:43 +0000 (00:14 -0800)
committer	Linus Torvalds <torvalds@woody.linux-foundation.org>
	Thu, 7 Feb 2008 16:42:22 +0000 (08:42 -0800)
include/linux/cgroup.h		patch \| blob \| blame \| history
kernel/cgroup.c		patch \| blob \| blame \| history
kernel/cpuset.c		patch \| blob \| blame \| history