writeback: fix incorrect calculation of available memory for memcg domains
authorTejun Heo <tj@kernel.org>
Tue, 29 Sep 2015 17:04:26 +0000 (13:04 -0400)
committerJens Axboe <axboe@fb.com>
Mon, 12 Oct 2015 16:31:13 +0000 (10:31 -0600)
For memcg domains, the amount of available memory was calculated as

 min(the amount currently in use + headroom according to memcg,
     total clean memory)

This isn't quite correct as what should be capped by the amount of
clean memory is the headroom, not the sum of memory in use and
headroom.  For example, if a memcg domain has a significant amount of
dirty memory, the above can lead to a value which is lower than the
current amount in use which doesn't make much sense.  In most
circumstances, the above leads to a number which is somewhat but not
drastically lower.

As the amount of memory which can be readily allocated to the memcg
domain is capped by the amount of system-wide clean memory which is
not already assigned to the memcg itself, the number we want is

 the amount currently in use +
 min(headroom according to memcg, clean memory elsewhere in the system)

This patch updates mem_cgroup_wb_stats() to return the number of
filepages and headroom instead of the calculated available pages.
mdtc_cap_avail() is renamed to mdtc_calc_avail() and performs the
above calculation from file, headroom, dirty and globally clean pages.

v2: Dummy mem_cgroup_wb_stats() implementation wasn't updated leading
    to build failure when !CGROUP_WRITEBACK.  Fixed.

Signed-off-by: Tejun Heo <tj@kernel.org>
Fixes: c2aa723a6093 ("writeback: implement memcg writeback domain based throttling")
Signed-off-by: Jens Axboe <axboe@fb.com>
include/linux/memcontrol.h
mm/memcontrol.c
mm/page-writeback.c

index 6452ff4c463fd8715edc9a5043bc812521a18d6a..3e3318ddfc0e3e09a0e15825f78eb6052d628d78 100644 (file)
@@ -676,8 +676,9 @@ enum {
 
 struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg);
 struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb);
-void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pavail,
-                        unsigned long *pdirty, unsigned long *pwriteback);
+void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
+                        unsigned long *pheadroom, unsigned long *pdirty,
+                        unsigned long *pwriteback);
 
 #else  /* CONFIG_CGROUP_WRITEBACK */
 
@@ -687,7 +688,8 @@ static inline struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
 }
 
 static inline void mem_cgroup_wb_stats(struct bdi_writeback *wb,
-                                      unsigned long *pavail,
+                                      unsigned long *pfilepages,
+                                      unsigned long *pheadroom,
                                       unsigned long *pdirty,
                                       unsigned long *pwriteback)
 {
index 1fedbde68f595c2b83d5aa84962a667c1ada120a..882c10cfd0ba4037faab3e071169ed8f08d669d7 100644 (file)
@@ -3740,44 +3740,43 @@ struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
 /**
  * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg
  * @wb: bdi_writeback in question
- * @pavail: out parameter for number of available pages
+ * @pfilepages: out parameter for number of file pages
+ * @pheadroom: out parameter for number of allocatable pages according to memcg
  * @pdirty: out parameter for number of dirty pages
  * @pwriteback: out parameter for number of pages under writeback
  *
- * Determine the numbers of available, dirty, and writeback pages in @wb's
- * memcg.  Dirty and writeback are self-explanatory.  Available is a bit
- * more involved.
+ * Determine the numbers of file, headroom, dirty, and writeback pages in
+ * @wb's memcg.  File, dirty and writeback are self-explanatory.  Headroom
+ * is a bit more involved.
  *
- * A memcg's headroom is "min(max, high) - used".  The available memory is
- * calculated as the lowest headroom of itself and the ancestors plus the
- * number of pages already being used for file pages.  Note that this
- * doesn't consider the actual amount of available memory in the system.
- * The caller should further cap *@pavail accordingly.
+ * A memcg's headroom is "min(max, high) - used".  In the hierarchy, the
+ * headroom is calculated as the lowest headroom of itself and the
+ * ancestors.  Note that this doesn't consider the actual amount of
+ * available memory in the system.  The caller should further cap
+ * *@pheadroom accordingly.
  */
-void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pavail,
-                        unsigned long *pdirty, unsigned long *pwriteback)
+void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
+                        unsigned long *pheadroom, unsigned long *pdirty,
+                        unsigned long *pwriteback)
 {
        struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
        struct mem_cgroup *parent;
-       unsigned long head_room = PAGE_COUNTER_MAX;
-       unsigned long file_pages;
 
        *pdirty = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_DIRTY);
 
        /* this should eventually include NR_UNSTABLE_NFS */
        *pwriteback = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
+       *pfilepages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) |
+                                                    (1 << LRU_ACTIVE_FILE));
+       *pheadroom = PAGE_COUNTER_MAX;
 
-       file_pages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) |
-                                                   (1 << LRU_ACTIVE_FILE));
        while ((parent = parent_mem_cgroup(memcg))) {
                unsigned long ceiling = min(memcg->memory.limit, memcg->high);
                unsigned long used = page_counter_read(&memcg->memory);
 
-               head_room = min(head_room, ceiling - min(ceiling, used));
+               *pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
                memcg = parent;
        }
-
-       *pavail = file_pages + head_room;
 }
 
 #else  /* CONFIG_CGROUP_WRITEBACK */
index 56c0bffa9f49f6e1c7a84537bc781ce71697d42b..2c90357c34ea4c4d81521b7bf14d669d6565cc07 100644 (file)
@@ -684,13 +684,19 @@ static unsigned long hard_dirty_limit(struct wb_domain *dom,
        return max(thresh, dom->dirty_limit);
 }
 
-/* memory available to a memcg domain is capped by system-wide clean memory */
-static void mdtc_cap_avail(struct dirty_throttle_control *mdtc)
+/*
+ * Memory which can be further allocated to a memcg domain is capped by
+ * system-wide clean memory excluding the amount being used in the domain.
+ */
+static void mdtc_calc_avail(struct dirty_throttle_control *mdtc,
+                           unsigned long filepages, unsigned long headroom)
 {
        struct dirty_throttle_control *gdtc = mdtc_gdtc(mdtc);
-       unsigned long clean = gdtc->avail - min(gdtc->avail, gdtc->dirty);
+       unsigned long clean = filepages - min(filepages, mdtc->dirty);
+       unsigned long global_clean = gdtc->avail - min(gdtc->avail, gdtc->dirty);
+       unsigned long other_clean = global_clean - min(global_clean, clean);
 
-       mdtc->avail = min(mdtc->avail, clean);
+       mdtc->avail = filepages + min(headroom, other_clean);
 }
 
 /**
@@ -1564,16 +1570,16 @@ static void balance_dirty_pages(struct address_space *mapping,
                }
 
                if (mdtc) {
-                       unsigned long writeback;
+                       unsigned long filepages, headroom, writeback;
 
                        /*
                         * If @wb belongs to !root memcg, repeat the same
                         * basic calculations for the memcg domain.
                         */
-                       mem_cgroup_wb_stats(wb, &mdtc->avail, &mdtc->dirty,
-                                           &writeback);
-                       mdtc_cap_avail(mdtc);
+                       mem_cgroup_wb_stats(wb, &filepages, &headroom,
+                                           &mdtc->dirty, &writeback);
                        mdtc->dirty += writeback;
+                       mdtc_calc_avail(mdtc, filepages, headroom);
 
                        domain_dirty_limits(mdtc);
 
@@ -1895,10 +1901,11 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb)
                return true;
 
        if (mdtc) {
-               unsigned long writeback;
+               unsigned long filepages, headroom, writeback;
 
-               mem_cgroup_wb_stats(wb, &mdtc->avail, &mdtc->dirty, &writeback);
-               mdtc_cap_avail(mdtc);
+               mem_cgroup_wb_stats(wb, &filepages, &headroom, &mdtc->dirty,
+                                   &writeback);
+               mdtc_calc_avail(mdtc, filepages, headroom);
                domain_dirty_limits(mdtc);      /* ditto, ignore writeback */
 
                if (mdtc->dirty > mdtc->bg_thresh)