cgroup: mark subsystems with broken hierarchy support and whine if cgroups are nested...
authorTejun Heo <tj@kernel.org>
Thu, 13 Sep 2012 19:20:58 +0000 (12:20 -0700)
committerTejun Heo <tj@kernel.org>
Fri, 14 Sep 2012 19:01:16 +0000 (12:01 -0700)
Currently, cgroup hierarchy support is a mess.  cpu related subsystems
behave correctly - configuration, accounting and control on a parent
properly cover its children.  blkio and freezer completely ignore
hierarchy and treat all cgroups as if they're directly under the root
cgroup.  Others show yet different behaviors.

These differing interpretations of cgroup hierarchy make using cgroup
confusing and it impossible to co-mount controllers into the same
hierarchy and obtain sane behavior.

Eventually, we want full hierarchy support from all subsystems and
probably a unified hierarchy.  Users using separate hierarchies
expecting completely different behaviors depending on the mounted
subsystem is deterimental to making any progress on this front.

This patch adds cgroup_subsys.broken_hierarchy and sets it to %true
for controllers which are lacking in hierarchy support.  The goal of
this patch is two-fold.

* Move users away from using hierarchy on currently non-hierarchical
  subsystems, so that implementing proper hierarchy support on those
  doesn't surprise them.

* Keep track of which controllers are broken how and nudge the
  subsystems to implement proper hierarchy support.

For now, start with a single warning message.  We can whine louder
later on.

v2: Fixed a typo spotted by Michal. Warning message updated.

v3: Updated memcg part so that it doesn't generate warning in the
    cases where .use_hierarchy=false doesn't make the behavior
    different from root.use_hierarchy=true.  Fixed a typo spotted by
    Glauber.

v4: Check ->broken_hierarchy after cgroup creation is complete so that
    ->create() can affect the result per Michal.  Dropped unnecessary
    memcg root handling per Michal.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Li Zefan <lizefan@huawei.com>
Acked-by: Serge E. Hallyn <serue@us.ibm.com>
Cc: Glauber Costa <glommer@parallels.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Paul Turner <pjt@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Thomas Graf <tgraf@suug.ch>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
Cc: Neil Horman <nhorman@tuxdriver.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
block/blk-cgroup.c
include/linux/cgroup.h
kernel/cgroup.c
kernel/cgroup_freezer.c
kernel/events/core.c
mm/memcontrol.c
net/core/netprio_cgroup.c
net/sched/cls_cgroup.c
security/device_cgroup.c

index f3b44a65fc7ad5f127bee8bcbadf5b486a7e5c71..cafcd743118969daec377f52f09e41594d188347 100644 (file)
@@ -737,6 +737,14 @@ struct cgroup_subsys blkio_subsys = {
        .subsys_id = blkio_subsys_id,
        .base_cftypes = blkcg_files,
        .module = THIS_MODULE,
+
+       /*
+        * blkio subsystem is utterly broken in terms of hierarchy support.
+        * It treats all cgroups equally regardless of where they're
+        * located in the hierarchy - all cgroups are treated as if they're
+        * right below the root.  Fix it and remove the following.
+        */
+       .broken_hierarchy = true,
 };
 EXPORT_SYMBOL_GPL(blkio_subsys);
 
index c90eaa80344017fd8a8e7555508a20949020f61f..68e8df70487edb9323aca100e12634a4fd5832fd 100644 (file)
@@ -496,6 +496,21 @@ struct cgroup_subsys {
         */
        bool __DEPRECATED_clear_css_refs;
 
+       /*
+        * If %false, this subsystem is properly hierarchical -
+        * configuration, resource accounting and restriction on a parent
+        * cgroup cover those of its children.  If %true, hierarchy support
+        * is broken in some ways - some subsystems ignore hierarchy
+        * completely while others are only implemented half-way.
+        *
+        * It's now disallowed to create nested cgroups if the subsystem is
+        * broken and cgroup core will emit a warning message on such
+        * cases.  Eventually, all subsystems will be made properly
+        * hierarchical and this will go away.
+        */
+       bool broken_hierarchy;
+       bool warned_broken_hierarchy;
+
 #define MAX_CGROUP_TYPE_NAMELEN 32
        const char *name;
 
index 79818507e444aa3050c9994841258292c294bd2e..b7d9606b17d75871ecb8f4a74390d89a044d10d8 100644 (file)
@@ -3954,8 +3954,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
                set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
 
        for_each_subsys(root, ss) {
-               struct cgroup_subsys_state *css = ss->create(cgrp);
+               struct cgroup_subsys_state *css;
 
+               css = ss->create(cgrp);
                if (IS_ERR(css)) {
                        err = PTR_ERR(css);
                        goto err_destroy;
@@ -3969,6 +3970,15 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
                /* At error, ->destroy() callback has to free assigned ID. */
                if (clone_children(parent) && ss->post_clone)
                        ss->post_clone(cgrp);
+
+               if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
+                   parent->parent) {
+                       pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
+                                  current->comm, current->pid, ss->name);
+                       if (!strcmp(ss->name, "memory"))
+                               pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n");
+                       ss->warned_broken_hierarchy = true;
+               }
        }
 
        list_add(&cgrp->sibling, &cgrp->parent->children);
index 3649fc6b3eaa9a2aa78b6a0ef2af73d1a9fd9775..b1724ce98981d25e76980193b7e3984c9f7717cd 100644 (file)
@@ -373,4 +373,12 @@ struct cgroup_subsys freezer_subsys = {
        .can_attach     = freezer_can_attach,
        .fork           = freezer_fork,
        .base_cftypes   = files,
+
+       /*
+        * freezer subsys doesn't handle hierarchy at all.  Frozen state
+        * should be inherited through the hierarchy - if a parent is
+        * frozen, all its children should be frozen.  Fix it and remove
+        * the following.
+        */
+       .broken_hierarchy = true,
 };
index b7935fcec7d923b0b0b89fe0fe7dfdf61967b447..f18a0a56e5aa541dc194ebeac3b87e11b2fd1f27 100644 (file)
@@ -7285,5 +7285,12 @@ struct cgroup_subsys perf_subsys = {
        .destroy        = perf_cgroup_destroy,
        .exit           = perf_cgroup_exit,
        .attach         = perf_cgroup_attach,
+
+       /*
+        * perf_event cgroup doesn't handle nesting correctly.
+        * ctx->nr_cgroups adjustments should be propagated through the
+        * cgroup hierarchy.  Fix it and remove the following.
+        */
+       .broken_hierarchy = true,
 };
 #endif /* CONFIG_CGROUP_PERF */
index 795e525afaba8914f3f8863de6a7299d84898ada..a72f2ffdc3d07b70925eaf2824c8a9ba7994c41c 100644 (file)
@@ -4973,6 +4973,13 @@ mem_cgroup_create(struct cgroup *cont)
        } else {
                res_counter_init(&memcg->res, NULL);
                res_counter_init(&memcg->memsw, NULL);
+               /*
+                * Deeper hierachy with use_hierarchy == false doesn't make
+                * much sense so let cgroup subsystem know about this
+                * unfortunate state in our controller.
+                */
+               if (parent && parent != root_mem_cgroup)
+                       mem_cgroup_subsys.broken_hierarchy = true;
        }
        memcg->last_scanned_node = MAX_NUMNODES;
        INIT_LIST_HEAD(&memcg->oom_notify);
index c75e3f9d060f8e3d086b747255ab65c8104f7dde..34f3615b30ca81d49161f902e8bad47bc07770a6 100644 (file)
@@ -330,7 +330,17 @@ struct cgroup_subsys net_prio_subsys = {
        .subsys_id      = net_prio_subsys_id,
 #endif
        .base_cftypes   = ss_files,
-       .module         = THIS_MODULE
+       .module         = THIS_MODULE,
+
+       /*
+        * net_prio has artificial limit on the number of cgroups and
+        * disallows nesting making it impossible to co-mount it with other
+        * hierarchical subsystems.  Remove the artificially low PRIOIDX_SZ
+        * limit and properly nest configuration such that children follow
+        * their parents' configurations by default and are allowed to
+        * override and remove the following.
+        */
+       .broken_hierarchy = true,
 };
 
 static int netprio_device_event(struct notifier_block *unused,
index 7743ea8d1d387d920dcee43abe4a3bfeb8502bfd..907daf99ab2e1b502715305aee89568ad76e63c5 100644 (file)
@@ -82,6 +82,15 @@ struct cgroup_subsys net_cls_subsys = {
 #endif
        .base_cftypes   = ss_files,
        .module         = THIS_MODULE,
+
+       /*
+        * While net_cls cgroup has the rudimentary hierarchy support of
+        * inheriting the parent's classid on cgroup creation, it doesn't
+        * properly propagates config changes in ancestors to their
+        * descendents.  A child should follow the parent's configuration
+        * but be allowed to override it.  Fix it and remove the following.
+        */
+       .broken_hierarchy = true,
 };
 
 struct cls_cgroup_head {
index 442204cc22d91772251043f3a051d6cce93bcfc7..4b877a92a7ea3dc3a0307f5c5efb9e78c3289b17 100644 (file)
@@ -457,6 +457,15 @@ struct cgroup_subsys devices_subsys = {
        .destroy = devcgroup_destroy,
        .subsys_id = devices_subsys_id,
        .base_cftypes = dev_cgroup_files,
+
+       /*
+        * While devices cgroup has the rudimentary hierarchy support which
+        * checks the parent's restriction, it doesn't properly propagates
+        * config changes in ancestors to their descendents.  A child
+        * should only be allowed to add more restrictions to the parent's
+        * configuration.  Fix it and remove the following.
+        */
+       .broken_hierarchy = true,
 };
 
 int __devcgroup_inode_permission(struct inode *inode, int mask)