mm, vmstat: remove zone and node double accounting by approximating retries
author Mel Gorman <mgorman@techsingularity.net>
Thu, 28 Jul 2016 22:47:05 +0000 (15:47 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Thu, 28 Jul 2016 23:07:41 +0000 (16:07 -0700)
The number of LRU pages, dirty pages and writeback pages must be
accounted for on both zones and nodes because of the reclaim retry
logic, compaction retry logic and highmem calculations all depending on
per-zone stats.

Many lowmem allocations are immune from OOM kill due to a check in
__alloc_pages_may_oom for (ac->high_zoneidx < ZONE_NORMAL) since commit
03668b3ceb0c ("oom: avoid oom killer for lowmem allocations").  The
exception is costly high-order allocations or allocations that cannot
fail.  If the __alloc_pages_may_oom avoids OOM-kill for low-order lowmem
allocations then it would fall through to __alloc_pages_direct_compact.

This patch will blindly retry reclaim for zone-constrained allocations
in should_reclaim_retry up to MAX_RECLAIM_RETRIES.  This is not ideal
but without per-zone stats there are not many alternatives.  The impact
is that zone-constrained allocations may delay before considering the
OOM killer.

As there is no guarantee enough memory can ever be freed to satisfy
compaction, this patch avoids retrying compaction for zone-constrained
allocations.

In combination, that means that the per-node stats can be used when
deciding whether to continue reclaim using a rough approximation.  While
it is possible this will make the wrong decision on occasion, it will
not infinite loop as the number of reclaim attempts is capped by
MAX_RECLAIM_RETRIES.

The final step is calculating the number of dirtyable highmem pages.  As
those calculations only care about the global count of file pages in
highmem, this patch uses a global counter instead of per-zone stats,
which is sufficient.

In combination, this allows the per-zone LRU and dirty state counters to
be removed.

[mgorman@techsingularity.net: fix acct_highmem_file_pages()]
Link: http://lkml.kernel.org/r/1468853426-12858-4-git-send-email-mgorman@techsingularity.net
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Suggested-by: Michal Hocko <mhocko@kernel.org>
Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Rik van Riel <riel@surriel.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
include/linux/mm_inline.h
include/linux/mmzone.h
include/linux/swap.h
mm/compaction.c
mm/migrate.c
mm/page-writeback.c
mm/page_alloc.c
mm/vmscan.c
mm/vmstat.c

index 9aadcc7818575236c7d457ff9c664993801a4ff9..dd22b08c47be657dbf2ec5ddf18dd64f60227d95 100644 (file)
@@ -4,6 +4,22 @@
 #include <linux/huge_mm.h>
 #include <linux/swap.h>
 
+#ifdef CONFIG_HIGHMEM
+extern atomic_t highmem_file_pages;
+
+static inline void acct_highmem_file_pages(int zid, enum lru_list lru,
+                                                       int nr_pages)
+{
+       if (is_highmem_idx(zid) && is_file_lru(lru))
+               atomic_add(nr_pages, &highmem_file_pages);
+}
+#else
+static inline void acct_highmem_file_pages(int zid, enum lru_list lru,
+                                                       int nr_pages)
+{
+}
+#endif
+
 /**
  * page_is_file_cache - should the page be on a file LRU or anon LRU?
  * @page: the page to test
@@ -29,9 +45,7 @@ static __always_inline void __update_lru_size(struct lruvec *lruvec,
        struct pglist_data *pgdat = lruvec_pgdat(lruvec);
 
        __mod_node_page_state(pgdat, NR_LRU_BASE + lru, nr_pages);
-       __mod_zone_page_state(&pgdat->node_zones[zid],
-               NR_ZONE_LRU_BASE + !!is_file_lru(lru),
-               nr_pages);
+       acct_highmem_file_pages(zid, lru, nr_pages);
 }
 
 static __always_inline void update_lru_size(struct lruvec *lruvec,
index bd33e6f1bed0c4774542f451709e20a411798d6e..a3b7f45aac5612509d241937715ef5e4893d0729 100644 (file)
@@ -110,10 +110,6 @@ struct zone_padding {
 enum zone_stat_item {
        /* First 128 byte cacheline (assuming 64 bit words) */
        NR_FREE_PAGES,
-       NR_ZONE_LRU_BASE, /* Used only for compaction and reclaim retry */
-       NR_ZONE_LRU_ANON = NR_ZONE_LRU_BASE,
-       NR_ZONE_LRU_FILE,
-       NR_ZONE_WRITE_PENDING,  /* Count of dirty, writeback and unstable pages */
        NR_MLOCK,               /* mlock()ed pages found and moved off LRU */
        NR_SLAB_RECLAIMABLE,
        NR_SLAB_UNRECLAIMABLE,
index b17cc4830fa670512abdc3987f58fc55e583f0bc..cc753c639e3ddab6c89ea1aff7a29743a1f9a406 100644 (file)
@@ -307,7 +307,6 @@ extern void lru_cache_add_active_or_unevictable(struct page *page,
                                                struct vm_area_struct *vma);
 
 /* linux/mm/vmscan.c */
-extern unsigned long zone_reclaimable_pages(struct zone *zone);
 extern unsigned long pgdat_reclaimable_pages(struct pglist_data *pgdat);
 extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
                                        gfp_t gfp_mask, nodemask_t *mask);
index e5995f38d6773ee98c46863cbd077443cb6548d2..cd93ea24c565c17206ff56817a25b6929a94d632 100644 (file)
@@ -1438,6 +1438,11 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
 {
        struct zone *zone;
        struct zoneref *z;
+       pg_data_t *last_pgdat = NULL;
+
+       /* Do not retry compaction for zone-constrained allocations */
+       if (ac->high_zoneidx < ZONE_NORMAL)
+               return false;
 
        /*
         * Make sure at least one zone would pass __compaction_suitable if we continue
@@ -1448,14 +1453,27 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
                unsigned long available;
                enum compact_result compact_result;
 
+               if (last_pgdat == zone->zone_pgdat)
+                       continue;
+
+               /*
+                * This over-estimates the number of pages available for
+                * reclaim/compaction but walking the LRU would take too
+                * long. The consequences are that compaction may retry
+                * longer than it should for a zone-constrained allocation
+                * request.
+                */
+               last_pgdat = zone->zone_pgdat;
+               available = pgdat_reclaimable_pages(zone->zone_pgdat) / order;
+
                /*
                 * Do not consider all the reclaimable memory because we do not
                 * want to trash just for a single high order allocation which
                 * is even not guaranteed to appear even if __compaction_suitable
                 * is happy about the watermark check.
                 */
-               available = zone_reclaimable_pages(zone) / order;
                available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
+               available = min(zone->managed_pages, available);
                compact_result = __compaction_suitable(zone, order, alloc_flags,
                                ac_classzone_idx(ac), available);
                if (compact_result != COMPACT_SKIPPED &&
index ed0268268e93f5eac48c02be4737cea269c0fa5a..ed2f85e61de13ee825140aadb0270a6bef8f09a8 100644 (file)
@@ -513,9 +513,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
                }
                if (dirty && mapping_cap_account_dirty(mapping)) {
                        __dec_node_state(oldzone->zone_pgdat, NR_FILE_DIRTY);
-                       __dec_zone_state(oldzone, NR_ZONE_WRITE_PENDING);
                        __inc_node_state(newzone->zone_pgdat, NR_FILE_DIRTY);
-                       __inc_zone_state(newzone, NR_ZONE_WRITE_PENDING);
                }
        }
        local_irq_enable();
index 3c02aa603f5a1a2344490a580d593f3fb0953d29..0bca2376bd42831ebb23b0f084dcbcb16e601d99 100644 (file)
@@ -299,6 +299,9 @@ static unsigned long node_dirtyable_memory(struct pglist_data *pgdat)
 
        return nr_pages;
 }
+#ifdef CONFIG_HIGHMEM
+atomic_t highmem_file_pages;
+#endif
 
 static unsigned long highmem_dirtyable_memory(unsigned long total)
 {
@@ -306,18 +309,17 @@ static unsigned long highmem_dirtyable_memory(unsigned long total)
        int node;
        unsigned long x = 0;
        int i;
+       unsigned long dirtyable = atomic_read(&highmem_file_pages);
 
        for_each_node_state(node, N_HIGH_MEMORY) {
                for (i = ZONE_NORMAL + 1; i < MAX_NR_ZONES; i++) {
                        struct zone *z;
-                       unsigned long dirtyable;
 
                        if (!is_highmem_idx(i))
                                continue;
 
                        z = &NODE_DATA(node)->node_zones[i];
-                       dirtyable = zone_page_state(z, NR_FREE_PAGES) +
-                               zone_page_state(z, NR_ZONE_LRU_FILE);
+                       dirtyable += zone_page_state(z, NR_FREE_PAGES);
 
                        /* watch for underflows */
                        dirtyable -= min(dirtyable, high_wmark_pages(z));
@@ -2460,7 +2462,6 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
 
                mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_DIRTY);
                __inc_node_page_state(page, NR_FILE_DIRTY);
-               __inc_zone_page_state(page, NR_ZONE_WRITE_PENDING);
                __inc_node_page_state(page, NR_DIRTIED);
                __inc_wb_stat(wb, WB_RECLAIMABLE);
                __inc_wb_stat(wb, WB_DIRTIED);
@@ -2482,7 +2483,6 @@ void account_page_cleaned(struct page *page, struct address_space *mapping,
        if (mapping_cap_account_dirty(mapping)) {
                mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY);
                dec_node_page_state(page, NR_FILE_DIRTY);
-               dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
                dec_wb_stat(wb, WB_RECLAIMABLE);
                task_io_account_cancelled_write(PAGE_SIZE);
        }
@@ -2739,7 +2739,6 @@ int clear_page_dirty_for_io(struct page *page)
                if (TestClearPageDirty(page)) {
                        mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY);
                        dec_node_page_state(page, NR_FILE_DIRTY);
-                       dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
                        dec_wb_stat(wb, WB_RECLAIMABLE);
                        ret = 1;
                }
@@ -2786,7 +2785,6 @@ int test_clear_page_writeback(struct page *page)
        if (ret) {
                mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_WRITEBACK);
                dec_node_page_state(page, NR_WRITEBACK);
-               dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
                inc_node_page_state(page, NR_WRITTEN);
        }
        unlock_page_memcg(page);
@@ -2841,7 +2839,6 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
        if (!ret) {
                mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_WRITEBACK);
                inc_node_page_state(page, NR_WRITEBACK);
-               inc_zone_page_state(page, NR_ZONE_WRITE_PENDING);
        }
        unlock_page_memcg(page);
        return ret;
index 03e67f2dfdaa25a4d3dc967a4ccc01087a55f59c..f1b5a0bc11f27edda7b74dac8bca2a4a4c8af985 100644 (file)
@@ -3402,6 +3402,7 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
 {
        struct zone *zone;
        struct zoneref *z;
+       pg_data_t *current_pgdat = NULL;
 
        /*
         * Make sure we converge to OOM if we cannot make any progress
@@ -3411,27 +3412,56 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
                return false;
 
        /*
-        * Keep reclaiming pages while there is a chance this will lead somewhere.
-        * If none of the target zones can satisfy our allocation request even
-        * if all reclaimable pages are considered then we are screwed and have
-        * to go OOM.
+        * Blindly retry lowmem allocation requests that are often ignored by
+        * the OOM killer up to MAX_RECLAIM_RETRIES as we do not have a reliable
+        * and fast means of calculating reclaimable, dirty and writeback pages
+        * in eligible zones.
+        */
+       if (ac->high_zoneidx < ZONE_NORMAL)
+               goto out;
+
+       /*
+        * Keep reclaiming pages while there is a chance this will lead
+        * somewhere.  If none of the target zones can satisfy our allocation
+        * request even if all reclaimable pages are considered then we are
+        * screwed and have to go OOM.
         */
        for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
                                        ac->nodemask) {
                unsigned long available;
                unsigned long reclaimable;
+               int zid;
 
-               available = reclaimable = zone_reclaimable_pages(zone);
+               if (current_pgdat == zone->zone_pgdat)
+                       continue;
+
+               current_pgdat = zone->zone_pgdat;
+               available = reclaimable = pgdat_reclaimable_pages(current_pgdat);
                available -= DIV_ROUND_UP(no_progress_loops * available,
                                          MAX_RECLAIM_RETRIES);
-               available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
+
+               /* Account for all free pages on eligible zones */
+               for (zid = 0; zid <= zone_idx(zone); zid++) {
+                       struct zone *acct_zone = &current_pgdat->node_zones[zid];
+
+                       available += zone_page_state_snapshot(acct_zone, NR_FREE_PAGES);
+               }
 
                /*
                 * Would the allocation succeed if we reclaimed the whole
-                * available?
+                * available? This is approximate because there is no
+                * accurate count of reclaimable pages per zone.
                 */
-               if (__zone_watermark_ok(zone, order, min_wmark_pages(zone),
-                               ac_classzone_idx(ac), alloc_flags, available)) {
+               for (zid = 0; zid <= zone_idx(zone); zid++) {
+                       struct zone *check_zone = &current_pgdat->node_zones[zid];
+                       unsigned long estimate;
+
+                       estimate = min(check_zone->managed_pages, available);
+                       if (!__zone_watermark_ok(check_zone, order,
+                                       min_wmark_pages(check_zone), ac_classzone_idx(ac),
+                                       alloc_flags, estimate))
+                               continue;
+
                        /*
                         * If we didn't make any progress and have a lot of
                         * dirty + writeback pages then we should wait for
@@ -3441,15 +3471,16 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
                        if (!did_some_progress) {
                                unsigned long write_pending;
 
-                               write_pending = zone_page_state_snapshot(zone,
-                                                       NR_ZONE_WRITE_PENDING);
+                               write_pending =
+                                       node_page_state(current_pgdat, NR_WRITEBACK) +
+                                       node_page_state(current_pgdat, NR_FILE_DIRTY);
 
                                if (2 * write_pending > reclaimable) {
                                        congestion_wait(BLK_RW_ASYNC, HZ/10);
                                        return true;
                                }
                        }
-
+out:
                        /*
                         * Memory allocation/reclaim might be called from a WQ
                         * context and the current implementation of the WQ
index d5ee6d998b5e68b734f5fb67564c4aaae5389ccb..5625eccc014002447dea8c79084f74c5d85f556b 100644 (file)
@@ -194,22 +194,6 @@ static bool sane_reclaim(struct scan_control *sc)
 }
 #endif
 
-/*
- * This misses isolated pages which are not accounted for to save counters.
- * As the data only determines if reclaim or compaction continues, it is
- * not expected that isolated pages will be a dominating factor.
- */
-unsigned long zone_reclaimable_pages(struct zone *zone)
-{
-       unsigned long nr;
-
-       nr = zone_page_state_snapshot(zone, NR_ZONE_LRU_FILE);
-       if (get_nr_swap_pages() > 0)
-               nr += zone_page_state_snapshot(zone, NR_ZONE_LRU_ANON);
-
-       return nr;
-}
-
 unsigned long pgdat_reclaimable_pages(struct pglist_data *pgdat)
 {
        unsigned long nr;
index ac509572a50b6adebe64e611dbb8171c37858949..91ecca96dcaed727c2fc5608be7cb53dc4aec7e3 100644 (file)
@@ -921,9 +921,6 @@ int fragmentation_index(struct zone *zone, unsigned int order)
 const char * const vmstat_text[] = {
        /* enum zone_stat_item countes */
        "nr_free_pages",
-       "nr_zone_anon_lru",
-       "nr_zone_file_lru",
-       "nr_zone_write_pending",
        "nr_mlock",
        "nr_slab_reclaimable",
        "nr_slab_unreclaimable",