mm, oom: do not rely on TIF_MEMDIE for memory reserves access

author Michal Hocko <mhocko@suse.com>

Wed, 6 Sep 2017 23:24:50 +0000 (16:24 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Thu, 7 Sep 2017 00:27:30 +0000 (17:27 -0700)
author Michal Hocko <mhocko@suse.com>
Wed, 6 Sep 2017 23:24:50 +0000 (16:24 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Thu, 7 Sep 2017 00:27:30 +0000 (17:27 -0700)
diff --git a/mm/internal.h b/mm/internal.h

index 781c0d54d75a6ab2f9f76dd51ab4185b624cde47..1df011f624801ffbdf3379a6d17218d9b32280e1 100644 (file)
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -480,6 +480,17 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
  /* Mask to get the watermark bits */
  #define ALLOC_WMARK_MASK       (ALLOC_NO_WATERMARKS-1)
  
+/*
+ * Only MMU archs have async oom victim reclaim (aka the oom_reaper), so
+ * we cannot assume that reduced access to memory reserves is sufficient
+ * for !MMU.
+ */
+#ifdef CONFIG_MMU
+#define ALLOC_OOM              0x08
+#else
+#define ALLOC_OOM              ALLOC_NO_WATERMARKS
+#endif
+
  #define ALLOC_HARDER           0x10 /* try to alloc harder */
  #define ALLOC_HIGH             0x20 /* __GFP_HIGH set */
  #define ALLOC_CPUSET           0x40 /* check for correct cpuset */
diff --git a/mm/oom_kill.c b/mm/oom_kill.c

index 9e8b4f030c1c43cb92da706306e1b9390658af7b..c9f3569a76c796bc051349be9e5f5c868accdd8e 100644 (file)
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -824,7 +824,8 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
  
         /*
          * If the task is already exiting, don't alarm the sysadmin or kill
-        * its children or threads, just set TIF_MEMDIE so it can die quickly
+        * its children or threads, just give it access to memory reserves
+        * so it can die quickly
          */
         task_lock(p);
         if (task_will_free_mem(p)) {
@@ -889,9 +890,9 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
         count_memcg_event_mm(mm, OOM_KILL);
  
         /*
-        * We should send SIGKILL before setting TIF_MEMDIE in order to prevent
-        * the OOM victim from depleting the memory reserves from the user
-        * space under its control.
+        * We should send SIGKILL before granting access to memory reserves
+        * in order to prevent the OOM victim from depleting the memory
+        * reserves from the user space under its control.
          */
         do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true);
         mark_oom_victim(victim);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c

index a4562c058ec4d00a53db2f4647a56b99f73bf3cc..a9add06fe7687eba0812a112f5aa213693401d9d 100644 (file)
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2951,7 +2951,7 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
  {
         long min = mark;
         int o;
-       const bool alloc_harder = (alloc_flags & ALLOC_HARDER);
+       const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM));
  
         /* free_pages may go negative - that's OK */
         free_pages -= (1 << order) - 1;
@@ -2964,10 +2964,21 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
          * the high-atomic reserves. This will over-estimate the size of the
          * atomic reserve but it avoids a search.
          */
-       if (likely(!alloc_harder))
+       if (likely(!alloc_harder)) {
                 free_pages -= z->nr_reserved_highatomic;
-       else
-               min -= min / 4;
+       } else {
+               /*
+                * OOM victims can try even harder than normal ALLOC_HARDER
+                * users on the grounds that they are definitely going to be
+                * in the exit path shortly and free memory. Any allocation they
+                * make during the free path will be small and short-lived.
+                */
+               if (alloc_flags & ALLOC_OOM)
+                       min -= min / 2;
+               else
+                       min -= min / 4;
+       }
+
  
  #ifdef CONFIG_CMA
         /* If allocation can't use CMA areas don't use free CMA pages */
@@ -3205,7 +3216,7 @@ static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask)
          * of allowed nodes.
          */
         if (!(gfp_mask & __GFP_NOMEMALLOC))
-               if (test_thread_flag(TIF_MEMDIE) ||
+               if (tsk_is_oom_victim(current) ||
                     (current->flags & (PF_MEMALLOC | PF_EXITING)))
                         filter &= ~SHOW_MEM_FILTER_NODES;
         if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM))
@@ -3668,21 +3679,46 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
         return alloc_flags;
  }
  
-bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
+static bool oom_reserves_allowed(struct task_struct *tsk)
  {
-       if (unlikely(gfp_mask & __GFP_NOMEMALLOC))
+       if (!tsk_is_oom_victim(tsk))
+               return false;
+
+       /*
+        * !MMU doesn't have the oom reaper, so give access to memory
+        * reserves only to the thread with TIF_MEMDIE set.
+        */
+       if (!IS_ENABLED(CONFIG_MMU) && !test_thread_flag(TIF_MEMDIE))
                 return false;
  
+       return true;
+}
+
+/*
+ * Distinguish requests which really need access to full memory
+ * reserves from oom victims, which can live with a portion of them.
+ */
+static inline int __gfp_pfmemalloc_flags(gfp_t gfp_mask)
+{
+       if (unlikely(gfp_mask & __GFP_NOMEMALLOC))
+               return 0;
         if (gfp_mask & __GFP_MEMALLOC)
-               return true;
+               return ALLOC_NO_WATERMARKS;
         if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
-               return true;
-       if (!in_interrupt() &&
-                       ((current->flags & PF_MEMALLOC) ||
-                        unlikely(test_thread_flag(TIF_MEMDIE))))
-               return true;
+               return ALLOC_NO_WATERMARKS;
+       if (!in_interrupt()) {
+               if (current->flags & PF_MEMALLOC)
+                       return ALLOC_NO_WATERMARKS;
+               else if (oom_reserves_allowed(current))
+                       return ALLOC_OOM;
+       }
  
-       return false;
+       return 0;
+}
+
+bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
+{
+       return !!__gfp_pfmemalloc_flags(gfp_mask);
  }
  
  /*
@@ -3835,6 +3871,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
         unsigned long alloc_start = jiffies;
         unsigned int stall_timeout = 10 * HZ;
         unsigned int cpuset_mems_cookie;
+       int reserve_flags;
  
         /*
          * In the slowpath, we sanity check order to avoid ever trying to
@@ -3940,15 +3977,16 @@ retry:
         if (gfp_mask & __GFP_KSWAPD_RECLAIM)
                 wake_all_kswapds(order, ac);
  
-       if (gfp_pfmemalloc_allowed(gfp_mask))
-               alloc_flags = ALLOC_NO_WATERMARKS;
+       reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
+       if (reserve_flags)
+               alloc_flags = reserve_flags;
  
         /*
          * Reset the zonelist iterators if memory policies can be ignored.
          * These allocations are high priority and system rather than user
          * orientated.
          */
-       if (!(alloc_flags & ALLOC_CPUSET) || (alloc_flags & ALLOC_NO_WATERMARKS)) {
+       if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) {
                 ac->zonelist = node_zonelist(numa_node_id(), gfp_mask);
                 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
                                         ac->high_zoneidx, ac->nodemask);
@@ -4025,8 +4063,8 @@ retry:
                 goto got_pg;
  
         /* Avoid allocations with no watermarks from looping endlessly */
-       if (test_thread_flag(TIF_MEMDIE) &&
-           (alloc_flags == ALLOC_NO_WATERMARKS ||
+       if (tsk_is_oom_victim(current) &&
+           (alloc_flags == ALLOC_OOM ||
              (gfp_mask & __GFP_NOMEMALLOC)))
                 goto nopage;
author	Michal Hocko <mhocko@suse.com>
	Wed, 6 Sep 2017 23:24:50 +0000 (16:24 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Thu, 7 Sep 2017 00:27:30 +0000 (17:27 -0700)
mm/internal.h		patch | blob | blame | history
mm/oom_kill.c		patch | blob | blame | history
mm/page_alloc.c		patch | blob | blame | history