#include <linux/pid_namespace.h>
#include <linux/idr.h>
#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
-#include <linux/flex_array.h> /* used in cgroup_attach_task */
#include <linux/kthread.h>
#include <linux/delay.h>
INIT_LIST_HEAD(&cset->cgrp_links);
INIT_LIST_HEAD(&cset->tasks);
INIT_LIST_HEAD(&cset->mg_tasks);
+ INIT_LIST_HEAD(&cset->mg_node);
INIT_HLIST_NODE(&cset->hlist);
/* Copy the set of subsystem state objects generated in
}
EXPORT_SYMBOL_GPL(task_cgroup_path);
-/*
- * Control Group taskset
- */
-struct task_and_cgroup {
- struct task_struct *task;
- struct cgroup *cgrp;
- struct css_set *cset;
-};
-
+/* used to track tasks and csets during migration */
struct cgroup_taskset {
- struct task_and_cgroup single;
- struct flex_array *tc_array;
- int tc_array_len;
- int idx;
+ /* the src and dst cset lists running through cset->mg_node */
+ struct list_head src_csets;
+ struct list_head dst_csets;
+
+ /*
+ * Fields for cgroup_taskset_*() iteration.
+ *
+ * Before migration is committed, the target migration tasks are on
+ * ->mg_tasks of the csets on ->src_csets. After, on ->mg_tasks of
+ * the csets on ->dst_csets. ->csets points to either ->src_csets
+ * or ->dst_csets depending on whether migration is committed.
+ *
+ * ->cur_cset and ->cur_task point to the current task position
+ * during iteration.
+ */
+ struct list_head *csets;
+ struct css_set *cur_cset;
+ struct task_struct *cur_task;
};
/**
*/
struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset)
{
- if (tset->tc_array) {
- tset->idx = 0;
- return cgroup_taskset_next(tset);
- } else {
- return tset->single.task;
- }
+ tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node);
+ tset->cur_task = NULL;
+
+ return cgroup_taskset_next(tset);
}
/**
*/
struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
{
- struct task_and_cgroup *tc;
+ struct css_set *cset = tset->cur_cset;
+ struct task_struct *task = tset->cur_task;
- if (!tset->tc_array || tset->idx >= tset->tc_array_len)
- return NULL;
+ while (&cset->mg_node != tset->csets) {
+ if (!task)
+ task = list_first_entry(&cset->mg_tasks,
+ struct task_struct, cg_list);
+ else
+ task = list_next_entry(task, cg_list);
- tc = flex_array_get(tset->tc_array, tset->idx++);
- return tc->task;
+ if (&task->cg_list != &cset->mg_tasks) {
+ tset->cur_cset = cset;
+ tset->cur_task = task;
+ return task;
+ }
+
+ cset = list_next_entry(cset, mg_node);
+ task = NULL;
+ }
+
+ return NULL;
}
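
For orientation, a controller consumes a taskset from its ->attach() callback with the two iterators above. A minimal sketch follows, assuming the two-argument ->attach(css, tset) signature visible at the call site later in this patch; example_attach() and the pr_debug() message are purely illustrative and not part of the patch:

	/* illustrative only: walk every task targeted by this migration */
	static void example_attach(struct cgroup_subsys_state *css,
				   struct cgroup_taskset *tset)
	{
		struct task_struct *task;

		for (task = cgroup_taskset_first(tset); task;
		     task = cgroup_taskset_next(tset))
			pr_debug("attaching pid %d\n", task_pid_nr(task));
	}
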
/**
WARN_ON_ONCE(tsk->flags & PF_EXITING);
old_cset = task_css_set(tsk);
+ get_css_set(new_cset);
+
task_lock(tsk);
rcu_assign_pointer(tsk->cgroups, new_cset);
task_unlock(tsk);
- list_move(&tsk->cg_list, &new_cset->tasks);
+ list_move(&tsk->cg_list, &new_cset->mg_tasks);
/*
* We just gained a reference on old_cset by taking it from the
static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *leader,
bool threadgroup)
{
- int ret, i, group_size;
- struct cgroupfs_root *root = cgrp->root;
+ struct cgroup_taskset tset = {
+ .src_csets = LIST_HEAD_INIT(tset.src_csets),
+ .dst_csets = LIST_HEAD_INIT(tset.dst_csets),
+ .csets = &tset.src_csets,
+ };
struct cgroup_subsys_state *css, *failed_css = NULL;
- /* threadgroup list cursor and array */
- struct task_struct *task;
- struct task_and_cgroup *tc;
- struct flex_array *group;
- struct cgroup_taskset tset = { };
-
- /*
- * step 0: in order to do expensive, possibly blocking operations for
- * every thread, we cannot iterate the thread group list, since it needs
- * rcu or tasklist locked. instead, build an array of all threads in the
- * group - group_rwsem prevents new threads from appearing, and if
- * threads exit, this will just be an over-estimate.
- */
- if (threadgroup)
- group_size = get_nr_threads(leader);
- else
- group_size = 1;
- /* flex_array supports very large thread-groups better than kmalloc. */
- group = flex_array_alloc(sizeof(*tc), group_size, GFP_KERNEL);
- if (!group)
- return -ENOMEM;
- /* pre-allocate to guarantee space while iterating in rcu read-side. */
- ret = flex_array_prealloc(group, 0, group_size, GFP_KERNEL);
- if (ret)
- goto out_free_group_list;
+ struct css_set *cset, *tmp_cset;
+ struct task_struct *task, *tmp_task;
+ int i, ret;
- i = 0;
/*
* Prevent freeing of tasks while we take a snapshot. Tasks that are
* already PF_EXITING could be freed from underneath us unless we
* take an rcu_read_lock.
*/
- down_read(&css_set_rwsem);
+ down_write(&css_set_rwsem);
rcu_read_lock();
task = leader;
do {
- struct task_and_cgroup ent;
+ struct cgroup *src_cgrp;
/* @task either already exited or can't exit until the end */
if (task->flags & PF_EXITING)
goto next;
- /* as per above, nr_threads may decrease, but not increase. */
- BUG_ON(i >= group_size);
- ent.task = task;
- ent.cgrp = task_cgroup_from_root(task, root);
+ cset = task_css_set(task);
+ src_cgrp = task_cgroup_from_root(task, cgrp->root);
+
/* nothing to do if this task is already in the cgroup */
- if (ent.cgrp == cgrp)
+ if (src_cgrp == cgrp)
goto next;
- /*
- * saying GFP_ATOMIC has no effect here because we did prealloc
- * earlier, but it's good form to communicate our expectations.
- */
- ret = flex_array_put(group, i, &ent, GFP_ATOMIC);
- BUG_ON(ret != 0);
- i++;
+
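+ /*
+  * The first task seen from a cset tags it with its source cgroup,
+  * pins it and links it onto tset.src_csets; later tasks from the
+  * same cset skip this step.
+  */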
+ if (!cset->mg_src_cgrp) {
+ WARN_ON(!list_empty(&cset->mg_tasks));
+ WARN_ON(!list_empty(&cset->mg_node));
+
+ cset->mg_src_cgrp = src_cgrp;
+ list_add(&cset->mg_node, &tset.src_csets);
+ get_css_set(cset);
+ }
+
+ list_move(&task->cg_list, &cset->mg_tasks);
next:
if (!threadgroup)
break;
} while_each_thread(leader, task);
rcu_read_unlock();
- up_read(&css_set_rwsem);
- /* remember the number of threads in the array for later. */
- group_size = i;
- tset.tc_array = group;
- tset.tc_array_len = group_size;
+ up_write(&css_set_rwsem);
/* methods shouldn't be called if no task is actually migrating */
- ret = 0;
- if (!group_size)
- goto out_free_group_list;
+ if (list_empty(&tset.src_csets))
+ return 0;
/*
* step 1: check that we can legitimately attach to the cgroup.
* step 2: make sure css_sets exist for all threads to be migrated.
* we use find_css_set, which allocates a new one if necessary.
*/
- for (i = 0; i < group_size; i++) {
- struct css_set *old_cset;
+ list_for_each_entry(cset, &tset.src_csets, mg_node) {
+ struct css_set *dst_cset;
- tc = flex_array_get(group, i);
- old_cset = task_css_set(tc->task);
- tc->cset = find_css_set(old_cset, cgrp);
- if (!tc->cset) {
+ dst_cset = find_css_set(cset, cgrp);
+ if (!dst_cset) {
ret = -ENOMEM;
- goto out_put_css_set_refs;
+ goto out_release_tset;
}
+
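+ /*
+  * Several src csets may map to the same dst cset; link it onto
+  * tset.dst_csets only once and drop the duplicate reference
+  * returned by find_css_set() otherwise.
+  */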
+ if (list_empty(&dst_cset->mg_node))
+ list_add(&dst_cset->mg_node, &tset.dst_csets);
+ else
+ put_css_set(dst_cset, false);
+
+ cset->mg_dst_cset = dst_cset;
}
/*
* failure cases after here, so this is the commit point.
*/
down_write(&css_set_rwsem);
- for (i = 0; i < group_size; i++) {
- tc = flex_array_get(group, i);
- cgroup_task_migrate(tc->cgrp, tc->task, tc->cset);
+ list_for_each_entry(cset, &tset.src_csets, mg_node) {
+ list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list)
+ cgroup_task_migrate(cset->mg_src_cgrp, task,
+ cset->mg_dst_cset);
}
up_write(&css_set_rwsem);
- /* nothing is sensitive to fork() after this point. */
+
+ /* migration is committed, all target tasks are now on dst_csets */
+ tset.csets = &tset.dst_csets;
+
+ /* nothing is sensitive to fork() after this point */
/*
* step 4: do subsystem attach callbacks.
if (css->ss->attach)
css->ss->attach(css, &tset);
- /*
- * step 5: success! and cleanup
- */
ret = 0;
-out_put_css_set_refs:
- if (ret) {
- for (i = 0; i < group_size; i++) {
- tc = flex_array_get(group, i);
- if (!tc->cset)
- break;
- put_css_set(tc->cset, false);
- }
- }
+ goto out_release_tset;
+
out_cancel_attach:
- if (ret) {
- for_each_css(css, i, cgrp) {
- if (css == failed_css)
- break;
- if (css->ss->cancel_attach)
- css->ss->cancel_attach(css, &tset);
- }
+ for_each_css(css, i, cgrp) {
+ if (css == failed_css)
+ break;
+ if (css->ss->cancel_attach)
+ css->ss->cancel_attach(css, &tset);
}
-out_free_group_list:
- flex_array_free(group);
+out_release_tset:
+ down_write(&css_set_rwsem);
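+ /*
+  * On both success and failure, put the migrating tasks back on their
+  * csets' ->tasks lists and clear the migration state from all src and
+  * dst csets, dropping the references taken above.
+  */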
+ list_splice_init(&tset.dst_csets, &tset.src_csets);
+ list_for_each_entry_safe(cset, tmp_cset, &tset.src_csets, mg_node) {
+ list_splice_init(&cset->mg_tasks, &cset->tasks);
+ cset->mg_dst_cset = NULL;
+ cset->mg_src_cgrp = NULL;
+ list_del_init(&cset->mg_node);
+ put_css_set_locked(cset, false);
+ }
+ up_write(&css_set_rwsem);
return ret;
}
atomic_set(&init_css_set.refcount, 1);
INIT_LIST_HEAD(&init_css_set.cgrp_links);
INIT_LIST_HEAD(&init_css_set.tasks);
+ INIT_LIST_HEAD(&init_css_set.mg_tasks);
+ INIT_LIST_HEAD(&init_css_set.mg_node);
INIT_HLIST_NODE(&init_css_set.hlist);
css_set_count = 1;
init_cgroup_root(&cgroup_dummy_root);