#include <linux/pid_namespace.h>
#include <linux/idr.h>
#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
-#include <linux/flex_array.h> /* used in cgroup_attach_task */
#include <linux/kthread.h>
#include <linux/delay.h>
INIT_LIST_HEAD(&cset->cgrp_links);
INIT_LIST_HEAD(&cset->tasks);
INIT_LIST_HEAD(&cset->mg_tasks);
+ INIT_LIST_HEAD(&cset->mg_node);
INIT_HLIST_NODE(&cset->hlist);
/* Copy the set of subsystem state objects generated in
}
EXPORT_SYMBOL_GPL(task_cgroup_path);
-/*
- * Control Group taskset
- */
-struct task_and_cgroup {
- struct task_struct *task;
- struct cgroup *cgrp;
- struct css_set *cset;
-};
-
+/* used to track tasks and csets during migration */
struct cgroup_taskset {
- struct task_and_cgroup single;
- struct flex_array *tc_array;
- int tc_array_len;
- int idx;
+ /* the src and dst cset lists running through cset->mg_node */
+ struct list_head src_csets;
+ struct list_head dst_csets;
+
+ /*
+ * Fields for cgroup_taskset_*() iteration.
+ *
+ * Before migration is committed, the target migration tasks are on
+ * ->mg_tasks of the csets on ->src_csets. After, on ->mg_tasks of
+ * the csets on ->dst_csets. ->csets points to either ->src_csets
+ * or ->dst_csets depending on whether migration is committed.
+ *
+ * ->cur_cset and ->cur_task point to the current task position
+ * during iteration.
+ */
+ struct list_head *csets;
+ struct css_set *cur_cset;
+ struct task_struct *cur_task;
};
/**
*/
struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset)
{
- if (tset->tc_array) {
- tset->idx = 0;
- return cgroup_taskset_next(tset);
- } else {
- return tset->single.task;
- }
+ tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node);
+ tset->cur_task = NULL;
+
+ return cgroup_taskset_next(tset);
}
/**
*/
struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
{
- struct task_and_cgroup *tc;
+ struct css_set *cset = tset->cur_cset;
+ struct task_struct *task = tset->cur_task;
- if (!tset->tc_array || tset->idx >= tset->tc_array_len)
- return NULL;
+ while (&cset->mg_node != tset->csets) {
+ if (!task)
+ task = list_first_entry(&cset->mg_tasks,
+ struct task_struct, cg_list);
+ else
+ task = list_next_entry(task, cg_list);
- tc = flex_array_get(tset->tc_array, tset->idx++);
- return tc->task;
+ if (&task->cg_list != &cset->mg_tasks) {
+ tset->cur_cset = cset;
+ tset->cur_task = task;
+ return task;
+ }
+
+ cset = list_next_entry(cset, mg_node);
+ task = NULL;
+ }
+
+ return NULL;
}
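
For orientation, a controller consumes a taskset from its ->attach() callback with the two iterators above. A minimal sketch follows, assuming the two-argument ->attach(css, tset) signature visible at the call site later in this patch; example_attach() and the pr_debug() message are purely illustrative and not part of the patch:

	/* illustrative only: walk every task targeted by this migration */
	static void example_attach(struct cgroup_subsys_state *css,
				   struct cgroup_taskset *tset)
	{
		struct task_struct *task;

		for (task = cgroup_taskset_first(tset); task;
		     task = cgroup_taskset_next(tset))
			pr_debug("attaching pid %d\n", task_pid_nr(task));
	}
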
/**
WARN_ON_ONCE(tsk->flags & PF_EXITING);
old_cset = task_css_set(tsk);
+ get_css_set(new_cset);
+
task_lock(tsk);
rcu_assign_pointer(tsk->cgroups, new_cset);
task_unlock(tsk);
- list_move(&tsk->cg_list, &new_cset->tasks);
+ list_move(&tsk->cg_list, &new_cset->mg_tasks);
/*
* We just gained a reference on old_cset by taking it from the
static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *leader,
bool threadgroup)
{
- int ret, i, group_size;
- struct cgroupfs_root *root = cgrp->root;
+ struct cgroup_taskset tset = {
+ .src_csets = LIST_HEAD_INIT(tset.src_csets),
+ .dst_csets = LIST_HEAD_INIT(tset.dst_csets),
+ .csets = &tset.src_csets,
+ };
struct cgroup_subsys_state *css, *failed_css = NULL;
- /* threadgroup list cursor and array */
- struct task_struct *task;
- struct task_and_cgroup *tc;
- struct flex_array *group;
- struct cgroup_taskset tset = { };
-
- /*
- * step 0: in order to do expensive, possibly blocking operations for
- * every thread, we cannot iterate the thread group list, since it needs
- * rcu or tasklist locked. instead, build an array of all threads in the
- * group - group_rwsem prevents new threads from appearing, and if
- * threads exit, this will just be an over-estimate.
- */
- if (threadgroup)
- group_size = get_nr_threads(leader);
- else
- group_size = 1;
- /* flex_array supports very large thread-groups better than kmalloc. */
- group = flex_array_alloc(sizeof(*tc), group_size, GFP_KERNEL);
- if (!group)
- return -ENOMEM;
- /* pre-allocate to guarantee space while iterating in rcu read-side. */
- ret = flex_array_prealloc(group, 0, group_size, GFP_KERNEL);
- if (ret)
- goto out_free_group_list;
+ struct css_set *cset, *tmp_cset;
+ struct task_struct *task, *tmp_task;
+ int i, ret;
- i = 0;
/*
* Prevent freeing of tasks while we take a snapshot. Tasks that are
* already PF_EXITING could be freed from underneath us unless we
* take an rcu_read_lock.
*/
- down_read(&css_set_rwsem);
+ down_write(&css_set_rwsem);
rcu_read_lock();
task = leader;
do {
- struct task_and_cgroup ent;
+ struct cgroup *src_cgrp;
/* @task either already exited or can't exit until the end */
if (task->flags & PF_EXITING)
goto next;
- /* as per above, nr_threads may decrease, but not increase. */
- BUG_ON(i >= group_size);
- ent.task = task;
- ent.cgrp = task_cgroup_from_root(task, root);
+ cset = task_css_set(task);
+ src_cgrp = task_cgroup_from_root(task, cgrp->root);
+
/* nothing to do if this task is already in the cgroup */
- if (ent.cgrp == cgrp)
+ if (src_cgrp == cgrp)
goto next;
- /*
- * saying GFP_ATOMIC has no effect here because we did prealloc
- * earlier, but it's good form to communicate our expectations.
- */
- ret = flex_array_put(group, i, &ent, GFP_ATOMIC);
- BUG_ON(ret != 0);
- i++;
+
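+ /*
+  * The first task seen from a cset tags it with its source cgroup,
+  * pins it and links it onto tset.src_csets; later tasks from the
+  * same cset skip this step.
+  */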
+ if (!cset->mg_src_cgrp) {
+ WARN_ON(!list_empty(&cset->mg_tasks));
+ WARN_ON(!list_empty(&cset->mg_node));
+
+ cset->mg_src_cgrp = src_cgrp;
+ list_add(&cset->mg_node, &tset.src_csets);
+ get_css_set(cset);
+ }
+
+ list_move(&task->cg_list, &cset->mg_tasks);
next:
if (!threadgroup)
break;
} while_each_thread(leader, task);
rcu_read_unlock();
- up_read(&css_set_rwsem);
- /* remember the number of threads in the array for later. */
- group_size = i;
- tset.tc_array = group;
- tset.tc_array_len = group_size;
+ up_write(&css_set_rwsem);
/* methods shouldn't be called if no task is actually migrating */
- ret = 0;
- if (!group_size)
- goto out_free_group_list;
+ if (list_empty(&tset.src_csets))
+ return 0;
/*
* step 1: check that we can legitimately attach to the cgroup.
* step 2: make sure css_sets exist for all threads to be migrated.
* we use find_css_set, which allocates a new one if necessary.
*/
- for (i = 0; i < group_size; i++) {
- struct css_set *old_cset;
+ list_for_each_entry(cset, &tset.src_csets, mg_node) {
+ struct css_set *dst_cset;
- tc = flex_array_get(group, i);
- old_cset = task_css_set(tc->task);
- tc->cset = find_css_set(old_cset, cgrp);
- if (!tc->cset) {
+ dst_cset = find_css_set(cset, cgrp);
+ if (!dst_cset) {
ret = -ENOMEM;
- goto out_put_css_set_refs;
+ goto out_release_tset;
}
+
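+ /*
+  * Several src csets may map to the same dst cset; link it onto
+  * tset.dst_csets only once and drop the duplicate reference
+  * returned by find_css_set() otherwise.
+  */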
+ if (list_empty(&dst_cset->mg_node))
+ list_add(&dst_cset->mg_node, &tset.dst_csets);
+ else
+ put_css_set(dst_cset, false);
+
+ cset->mg_dst_cset = dst_cset;
}
/*
* failure cases after here, so this is the commit point.
*/
down_write(&css_set_rwsem);
- for (i = 0; i < group_size; i++) {
- tc = flex_array_get(group, i);
- cgroup_task_migrate(tc->cgrp, tc->task, tc->cset);
+ list_for_each_entry(cset, &tset.src_csets, mg_node) {
+ list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list)
+ cgroup_task_migrate(cset->mg_src_cgrp, task,
+ cset->mg_dst_cset);
}
up_write(&css_set_rwsem);
- /* nothing is sensitive to fork() after this point. */
+
+ /* migration is committed, all target tasks are now on dst_csets */
+ tset.csets = &tset.dst_csets;
+
+ /* nothing is sensitive to fork() after this point */
/*
* step 4: do subsystem attach callbacks.
if (css->ss->attach)
css->ss->attach(css, &tset);
- /*
- * step 5: success! and cleanup
- */
ret = 0;
-out_put_css_set_refs:
- if (ret) {
- for (i = 0; i < group_size; i++) {
- tc = flex_array_get(group, i);
- if (!tc->cset)
- break;
- put_css_set(tc->cset, false);
- }
- }
+ goto out_release_tset;
+
out_cancel_attach:
- if (ret) {
- for_each_css(css, i, cgrp) {
- if (css == failed_css)
- break;
- if (css->ss->cancel_attach)
- css->ss->cancel_attach(css, &tset);
- }
+ for_each_css(css, i, cgrp) {
+ if (css == failed_css)
+ break;
+ if (css->ss->cancel_attach)
+ css->ss->cancel_attach(css, &tset);
}
-out_free_group_list:
- flex_array_free(group);
+out_release_tset:
+ down_write(&css_set_rwsem);
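+ /*
+  * On both success and failure, put the migrating tasks back on their
+  * csets' ->tasks lists and clear the migration state from all src and
+  * dst csets, dropping the references taken above.
+  */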
+ list_splice_init(&tset.dst_csets, &tset.src_csets);
+ list_for_each_entry_safe(cset, tmp_cset, &tset.src_csets, mg_node) {
+ list_splice_init(&cset->mg_tasks, &cset->tasks);
+ cset->mg_dst_cset = NULL;
+ cset->mg_src_cgrp = NULL;
+ list_del_init(&cset->mg_node);
+ put_css_set_locked(cset, false);
+ }
+ up_write(&css_set_rwsem);
return ret;
}
atomic_set(&init_css_set.refcount, 1);
INIT_LIST_HEAD(&init_css_set.cgrp_links);
INIT_LIST_HEAD(&init_css_set.tasks);
+ INIT_LIST_HEAD(&init_css_set.mg_tasks);
+ INIT_LIST_HEAD(&init_css_set.mg_node);
INIT_HLIST_NODE(&init_css_set.hlist);
css_set_count = 1;
init_cgroup_root(&cgroup_dummy_root);