mm: fix 100% CPU kswapd busyloop on unreclaimable nodes

author Johannes Weiner <hannes@cmpxchg.org>

Wed, 3 May 2017 21:51:51 +0000 (14:51 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Wed, 3 May 2017 22:52:07 +0000 (15:52 -0700)
author Johannes Weiner <hannes@cmpxchg.org>
Wed, 3 May 2017 21:51:51 +0000 (14:51 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Wed, 3 May 2017 22:52:07 +0000 (15:52 -0700)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h

index 8e02b3750fe0f6e18afeb8cfc096705f23405728..d2c50ab6ae40e579a8fb7f7a32e8442e16440f3f 100644 (file)
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -630,6 +630,8 @@ typedef struct pglist_data {
         int kswapd_order;
         enum zone_type kswapd_classzone_idx;
  
+       int kswapd_failures;            /* Number of 'reclaimed == 0' runs */
+
  #ifdef CONFIG_COMPACTION
         int kcompactd_max_order;
         enum zone_type kcompactd_classzone_idx;
diff --git a/mm/internal.h b/mm/internal.h

index 266efaeaa370a46debcc5b6b614a72e33833ac4d..e5a0e0ec2177bc661e785021a604efe9bd691a44 100644 (file)
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -80,6 +80,12 @@ static inline void set_page_refcounted(struct page *page)
  
  extern unsigned long highest_memmap_pfn;
  
+/*
+ * Maximum number of reclaim retries without progress before the OOM
+ * killer is considered the only way forward.
+ */
+#define MAX_RECLAIM_RETRIES 16
+
  /*
   * in mm/vmscan.c:
   */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c

index bd01501efab9141f9071dc741abf17719de7b027..42c0543e46c3f2ba5eb206f6767cf26d08799f92 100644 (file)
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3521,12 +3521,6 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
         return false;
  }
  
-/*
- * Maximum number of reclaim retries without any progress before OOM killer
- * is consider as the only way to move forward.
- */
-#define MAX_RECLAIM_RETRIES 16
-
  /*
   * Checks whether it makes sense to retry the reclaim to make a forward progress
   * for the given allocation request.
@@ -4534,7 +4528,8 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
                         K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
                         K(node_page_state(pgdat, NR_UNSTABLE_NFS)),
                         node_page_state(pgdat, NR_PAGES_SCANNED),
-                       !pgdat_reclaimable(pgdat) ? "yes" : "no");
+                       pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ?
+                               "yes" : "no");
         }
  
         for_each_populated_zone(zone) {
diff --git a/mm/vmscan.c b/mm/vmscan.c

index bc8031ef994d57a1d1622468f8df6d745853562b..667644e53b5c488ffa2c8e8bb90e7d0bc3bd5510 100644 (file)
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2620,6 +2620,15 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
         } while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
                                          sc->nr_scanned - nr_scanned, sc));
  
+       /*
+        * Kswapd gives up on balancing particular nodes after too
+        * many failures to reclaim anything from them and goes to
+        * sleep. On reclaim progress, reset the failure counter. A
+        * successful direct reclaim run will revive a dormant kswapd.
+        */
+       if (reclaimable)
+               pgdat->kswapd_failures = 0;
+
         return reclaimable;
  }
  
@@ -2694,10 +2703,6 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
                                                  GFP_KERNEL | __GFP_HARDWALL))
                                 continue;
  
-                       if (sc->priority != DEF_PRIORITY &&
-                           !pgdat_reclaimable(zone->zone_pgdat))
-                               continue;       /* Let kswapd poll it */
-
                         /*
                          * If we already have plenty of memory free for
                          * compaction in this zone, don't free any more.
@@ -2817,7 +2822,7 @@ retry:
         return 0;
  }
  
-static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
+static bool allow_direct_reclaim(pg_data_t *pgdat)
  {
         struct zone *zone;
         unsigned long pfmemalloc_reserve = 0;
@@ -2825,6 +2830,9 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
         int i;
         bool wmark_ok;
  
+       if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
+               return true;
+
         for (i = 0; i <= ZONE_NORMAL; i++) {
                 zone = &pgdat->node_zones[i];
                 if (!managed_zone(zone) ||
@@ -2905,7 +2913,7 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
  
                 /* Throttle based on the first usable node */
                 pgdat = zone->zone_pgdat;
-               if (pfmemalloc_watermark_ok(pgdat))
+               if (allow_direct_reclaim(pgdat))
                         goto out;
                 break;
         }
@@ -2927,14 +2935,14 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
          */
         if (!(gfp_mask & __GFP_FS)) {
                 wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
-                       pfmemalloc_watermark_ok(pgdat), HZ);
+                       allow_direct_reclaim(pgdat), HZ);
  
                 goto check_pending;
         }
  
         /* Throttle until kswapd wakes the process */
         wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
-               pfmemalloc_watermark_ok(pgdat));
+               allow_direct_reclaim(pgdat));
  
  check_pending:
         if (fatal_signal_pending(current))
@@ -3114,7 +3122,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx)
  
         /*
          * The throttled processes are normally woken up in balance_pgdat() as
-        * soon as pfmemalloc_watermark_ok() is true. But there is a potential
+        * soon as allow_direct_reclaim() is true. But there is a potential
          * race between when kswapd checks the watermarks and a process gets
          * throttled. There is also a potential race if processes get
          * throttled, kswapd wakes, a large process exits thereby balancing the
@@ -3128,6 +3136,10 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx)
         if (waitqueue_active(&pgdat->pfmemalloc_wait))
                 wake_up_all(&pgdat->pfmemalloc_wait);
  
+       /* Hopeless node, leave it to direct reclaim */
+       if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
+               return true;
+
         for (i = 0; i <= classzone_idx; i++) {
                 struct zone *zone = pgdat->node_zones + i;
  
@@ -3214,9 +3226,9 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
         count_vm_event(PAGEOUTRUN);
  
         do {
+               unsigned long nr_reclaimed = sc.nr_reclaimed;
                 bool raise_priority = true;
  
-               sc.nr_reclaimed = 0;
                 sc.reclaim_idx = classzone_idx;
  
                 /*
@@ -3295,7 +3307,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
                  * able to safely make forward progress. Wake them
                  */
                 if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
-                               pfmemalloc_watermark_ok(pgdat))
+                               allow_direct_reclaim(pgdat))
                         wake_up_all(&pgdat->pfmemalloc_wait);
  
                 /* Check if kswapd should be suspending */
@@ -3306,10 +3318,14 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
                  * Raise priority if scanning rate is too low or there was no
                  * progress in reclaiming pages
                  */
-               if (raise_priority || !sc.nr_reclaimed)
+               nr_reclaimed = sc.nr_reclaimed - nr_reclaimed;
+               if (raise_priority || !nr_reclaimed)
                         sc.priority--;
         } while (sc.priority >= 1);
  
+       if (!sc.nr_reclaimed)
+               pgdat->kswapd_failures++;
+
  out:
         /*
          * Return the order kswapd stopped reclaiming at as
@@ -3509,6 +3525,10 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
         if (!waitqueue_active(&pgdat->kswapd_wait))
                 return;
  
+       /* Hopeless node, leave it to direct reclaim */
+       if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
+               return;
+
         /* Only wake kswapd if all zones are unbalanced */
         for (z = 0; z <= classzone_idx; z++) {
                 zone = pgdat->node_zones + z;
@@ -3779,9 +3799,6 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
             sum_zone_node_page_state(pgdat->node_id, NR_SLAB_RECLAIMABLE) <= pgdat->min_slab_pages)
                 return NODE_RECLAIM_FULL;
  
-       if (!pgdat_reclaimable(pgdat))
-               return NODE_RECLAIM_FULL;
-
         /*
          * Do not scan if the allocation should not be delayed.
          */
diff --git a/mm/vmstat.c b/mm/vmstat.c

index 5a4f5c5a31e88ee558f536d22f61f05a3fd13c45..baee70dafba898f4a8d4387460d29eaf66a3ddb0 100644 (file)
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1425,7 +1425,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
                    "\n  node_unreclaimable:  %u"
                    "\n  start_pfn:           %lu"
                    "\n  node_inactive_ratio: %u",
-                  !pgdat_reclaimable(zone->zone_pgdat),
+                  pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES,
                    zone->zone_start_pfn,
                    zone->zone_pgdat->inactive_ratio);
         seq_putc(m, '\n');
author	Johannes Weiner <hannes@cmpxchg.org>
	Wed, 3 May 2017 21:51:51 +0000 (14:51 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Wed, 3 May 2017 22:52:07 +0000 (15:52 -0700)
include/linux/mmzone.h		patch | blob | blame | history
mm/internal.h		patch | blob | blame | history
mm/page_alloc.c		patch | blob | blame | history
mm/vmscan.c		patch | blob | blame | history
mm/vmstat.c		patch | blob | blame | history