mm, oom: fortify task_will_free_mem()
authorMichal Hocko <mhocko@suse.com>
Thu, 28 Jul 2016 22:44:52 +0000 (15:44 -0700)
committerLinus Torvalds <torvalds@linux-foundation.org>
Thu, 28 Jul 2016 23:07:41 +0000 (16:07 -0700)
task_will_free_mem is rather weak.  It doesn't really tell whether the
task has a chance to drop its mm.  98748bd72200 ("oom: consider
multi-threaded tasks in task_will_free_mem") made a first step towards making
it more robust for multi-threaded applications, so now we know that the
whole process is going down and will probably drop the mm.

This patch builds on top of that to handle more complex scenarios where mm
is shared between different processes - CLONE_VM without CLONE_SIGHAND, or
in-kernel use_mm().

Make sure that all processes sharing the mm are killed or exiting.  This
will allow us to replace try_oom_reaper by wake_oom_reaper because
task_will_free_mem implies the task is reapable now.  Therefore all paths
which bypass the oom killer are now reapable and so they shouldn't lock up
the oom killer.

Link: http://lkml.kernel.org/r/1466426628-15074-8-git-send-email-mhocko@kernel.org
Signed-off-by: Michal Hocko <mhocko@suse.com>
Acked-by: Oleg Nesterov <oleg@redhat.com>
Cc: Vladimir Davydov <vdavydov@virtuozzo.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
include/linux/oom.h
mm/memcontrol.c
mm/oom_kill.c

index 606137b3b778e224291ba89e2df3d87277134aa8..5bc0457ee3a88955f64b750a06858d45cb74b5da 100644 (file)
@@ -73,9 +73,9 @@ static inline bool oom_task_origin(const struct task_struct *p)
 extern void mark_oom_victim(struct task_struct *tsk);
 
 #ifdef CONFIG_MMU
-extern void try_oom_reaper(struct task_struct *tsk);
+extern void wake_oom_reaper(struct task_struct *tsk);
 #else
-static inline void try_oom_reaper(struct task_struct *tsk)
+static inline void wake_oom_reaper(struct task_struct *tsk)
 {
 }
 #endif
@@ -107,27 +107,7 @@ extern void oom_killer_enable(void);
 
 extern struct task_struct *find_lock_task_mm(struct task_struct *p);
 
-static inline bool task_will_free_mem(struct task_struct *task)
-{
-       struct signal_struct *sig = task->signal;
-
-       /*
-        * A coredumping process may sleep for an extended period in exit_mm(),
-        * so the oom killer cannot assume that the process will promptly exit
-        * and release memory.
-        */
-       if (sig->flags & SIGNAL_GROUP_COREDUMP)
-               return false;
-
-       if (!(task->flags & PF_EXITING))
-               return false;
-
-       /* Make sure that the whole thread group is going down */
-       if (!thread_group_empty(task) && !(sig->flags & SIGNAL_GROUP_EXIT))
-               return false;
-
-       return true;
-}
+bool task_will_free_mem(struct task_struct *task);
 
 /* sysctls */
 extern int sysctl_oom_dump_tasks;
index f3a84c64f35cf807c5e6d6ddbf6c601d5ff334a3..3e8f9e5e9291453222949dd4b13906940b63b187 100644 (file)
@@ -1276,9 +1276,9 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
         * select it.  The goal is to allow it to allocate so that it may
         * quickly exit and free its memory.
         */
-       if (fatal_signal_pending(current) || task_will_free_mem(current)) {
+       if (task_will_free_mem(current)) {
                mark_oom_victim(current);
-               try_oom_reaper(current);
+               wake_oom_reaper(current);
                goto unlock;
        }
 
index 38f89ac2df7f649561723b50c7688f36f614e97a..8ee92fb769689b79e720a6bb45bf8deaa6f11fd4 100644 (file)
@@ -596,7 +596,7 @@ static int oom_reaper(void *unused)
        return 0;
 }
 
-static void wake_oom_reaper(struct task_struct *tsk)
+void wake_oom_reaper(struct task_struct *tsk)
 {
        if (!oom_reaper_th)
                return;
@@ -614,46 +614,6 @@ static void wake_oom_reaper(struct task_struct *tsk)
        wake_up(&oom_reaper_wait);
 }
 
-/* Check if we can reap the given task. This has to be called with stable
- * tsk->mm
- */
-void try_oom_reaper(struct task_struct *tsk)
-{
-       struct mm_struct *mm = tsk->mm;
-       struct task_struct *p;
-
-       if (!mm)
-               return;
-
-       /*
-        * There might be other threads/processes which are either not
-        * dying or even not killable.
-        */
-       if (atomic_read(&mm->mm_users) > 1) {
-               rcu_read_lock();
-               for_each_process(p) {
-                       if (!process_shares_mm(p, mm))
-                               continue;
-                       if (fatal_signal_pending(p))
-                               continue;
-
-                       /*
-                        * If the task is exiting make sure the whole thread group
-                        * is exiting and cannot acces mm anymore.
-                        */
-                       if (signal_group_exit(p->signal))
-                               continue;
-
-                       /* Give up */
-                       rcu_read_unlock();
-                       return;
-               }
-               rcu_read_unlock();
-       }
-
-       wake_oom_reaper(tsk);
-}
-
 static int __init oom_init(void)
 {
        oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper");
@@ -665,10 +625,6 @@ static int __init oom_init(void)
        return 0;
 }
 subsys_initcall(oom_init)
-#else
-static void wake_oom_reaper(struct task_struct *tsk)
-{
-}
 #endif
 
 /**
@@ -745,6 +701,81 @@ void oom_killer_enable(void)
        oom_killer_disabled = false;
 }
 
+static inline bool __task_will_free_mem(struct task_struct *task)
+{
+       struct signal_struct *sig = task->signal;
+
+       /*
+        * A coredumping process may sleep for an extended period in exit_mm(),
+        * so the oom killer cannot assume that the process will promptly exit
+        * and release memory.
+        */
+       if (sig->flags & SIGNAL_GROUP_COREDUMP)
+               return false;
+
+       if (sig->flags & SIGNAL_GROUP_EXIT)
+               return true;    /* the whole thread group is going down */
+
+       if (thread_group_empty(task) && (task->flags & PF_EXITING))
+               return true;    /* no other threads and this one is exiting */
+
+       return false;
+}
+
+/*
+ * Checks whether the given task is dying or exiting and likely to
+ * release its address space. This means that all threads and processes
+ * sharing the same mm have to be killed or exiting.
+ */
+bool task_will_free_mem(struct task_struct *task)
+{
+       struct mm_struct *mm;
+       struct task_struct *p;
+       bool ret = true;        /* stays true when no foreign mm sharer is found */
+
+       if (!__task_will_free_mem(task))
+               return false;
+
+       /*
+        * If the process has passed exit_mm we have to skip it because
+        * we have lost a link to other tasks sharing this mm, we do not
+        * have anything to reap and the task might then get stuck waiting
+        * for parent as zombie and we do not want it to hold TIF_MEMDIE
+        */
+       p = find_lock_task_mm(task);
+       if (!p)
+               return false;
+
+       mm = p->mm;
+       if (atomic_read(&mm->mm_users) <= 1) {
+               task_unlock(p);
+               return true;
+       }
+
+       /* pin the mm to not get freed and reused */
+       atomic_inc(&mm->mm_count);
+       task_unlock(p);
+
+       /*
+        * This is really pessimistic: we have no reliable way to find only the
+        * external processes sharing our mm, so every process is checked.
+        */
+       rcu_read_lock();
+       for_each_process(p) {
+               if (!process_shares_mm(p, mm))
+                       continue;
+               if (same_thread_group(task, p))
+                       continue;
+               ret = __task_will_free_mem(p);
+               if (!ret)
+                       break;
+       }
+       rcu_read_unlock();
+       mmdrop(mm);
+
+       return ret;
+}
+
 /*
  * Must be called while holding a reference to p, which will be released upon
  * returning.
@@ -766,15 +797,12 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
         * If the task is already exiting, don't alarm the sysadmin or kill
         * its children or threads, just set TIF_MEMDIE so it can die quickly
         */
-       task_lock(p);
-       if (p->mm && task_will_free_mem(p)) {
+       if (task_will_free_mem(p)) {
                mark_oom_victim(p);
-               try_oom_reaper(p);
-               task_unlock(p);
+               wake_oom_reaper(p);
                put_task_struct(p);
                return;
        }
-       task_unlock(p);
 
        if (__ratelimit(&oom_rs))
                dump_header(oc, p);
@@ -944,10 +972,9 @@ bool out_of_memory(struct oom_control *oc)
         * But don't select if current has already released its mm and cleared
         * TIF_MEMDIE flag at exit_mm(), otherwise an OOM livelock may occur.
         */
-       if (current->mm &&
-           (fatal_signal_pending(current) || task_will_free_mem(current))) {
+       if (current->mm && task_will_free_mem(current)) {
                mark_oom_victim(current);
-               try_oom_reaper(current);
+               wake_oom_reaper(current);
                return true;
        }