return todo ? -EBUSY : 0;
}
-static bool __check_frozen_processes(void)
-{
- struct task_struct *g, *p;
-
- for_each_process_thread(g, p)
- if (p != current && !freezer_should_skip(p) && !frozen(p))
- return false;
-
- return true;
-}
-
-/*
- * Returns true if all freezable tasks (except for current) are frozen already
- */
-static bool check_frozen_processes(void)
-{
- bool ret;
-
- read_lock(&tasklist_lock);
- ret = __check_frozen_processes();
- read_unlock(&tasklist_lock);
- return ret;
-}
-
/**
* freeze_processes - Signal user space processes to enter the refrigerator.
* The current thread will not be frozen. The same process that calls
int freeze_processes(void)
{
int error;
- int oom_kills_saved;
error = __usermodehelper_disable(UMH_FREEZING);
if (error)
pm_wakeup_clear();
pr_info("Freezing user space processes ... ");
pm_freezing = true;
- oom_kills_saved = oom_kills_count();
error = try_to_freeze_tasks(true);
if (!error) {
__usermodehelper_set_disable_depth(UMH_DISABLED);
- oom_killer_disable();
-
- /*
- * There might have been an OOM kill while we were
- * freezing tasks and the killed task might be still
- * on the way out so we have to double check for race.
- */
- if (oom_kills_count() != oom_kills_saved &&
- !check_frozen_processes()) {
- __usermodehelper_set_disable_depth(UMH_ENABLED);
- pr_cont("OOM in progress.");
- error = -EBUSY;
- } else {
- pr_cont("done.");
- }
+ pr_cont("done.");
}
pr_cont("\n");
BUG_ON(in_atomic());
+ /*
+	 * Now that the whole userspace is frozen we need to disable
+ * the OOM killer to disallow any further interference with
+ * killable tasks.
+ */
+ if (!error && !oom_killer_disable())
+ error = -EBUSY;
+
if (error)
thaw_processes();
return error;
}
/*
- * Number of OOM killer invocations (including memcg OOM killer).
- * Primarily used by PM freezer to check for potential races with
- * OOM killed frozen task.
+ * Number of OOM victims in flight
*/
-static atomic_t oom_kills = ATOMIC_INIT(0);
+static atomic_t oom_victims = ATOMIC_INIT(0);
+static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait);
-int oom_kills_count(void)
-{
- return atomic_read(&oom_kills);
-}
-
-void note_oom_kill(void)
-{
- atomic_inc(&oom_kills);
-}
+bool oom_killer_disabled __read_mostly;
+static DECLARE_RWSEM(oom_sem);
/**
 * mark_tsk_oom_victim - marks the given task as an OOM victim.
* @tsk: task to mark
+ *
+ * Has to be called with oom_sem taken for read and never after the
+ * OOM killer has already been disabled.
*/
void mark_tsk_oom_victim(struct task_struct *tsk)
{
- set_tsk_thread_flag(tsk, TIF_MEMDIE);
-
+ WARN_ON(oom_killer_disabled);
+ /* OOM killer might race with memcg OOM */
+ if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE))
+ return;
/*
* Make sure that the task is woken up from uninterruptible sleep
	 * if it is frozen because OOM killer wouldn't be able to free
	 * any memory and livelock. freezing_slow_path will tell the freezer
	 * that TIF_MEMDIE tasks should be ignored.
*/
__thaw_task(tsk);
+ atomic_inc(&oom_victims);
}
/**
* unmark_oom_victim - unmarks the current task as OOM victim.
+ *
+ * Wakes up all waiters in oom_killer_disable()
*/
void unmark_oom_victim(void)
{
- clear_thread_flag(TIF_MEMDIE);
+ if (!test_and_clear_thread_flag(TIF_MEMDIE))
+ return;
+
+ down_read(&oom_sem);
+ /*
+	 * There is no need to signal the last oom_victim if there
+ * is nobody who cares.
+ */
+ if (!atomic_dec_return(&oom_victims) && oom_killer_disabled)
+ wake_up_all(&oom_victims_wait);
+ up_read(&oom_sem);
+}
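
Taken together, mark_tsk_oom_victim() and unmark_oom_victim() implement a small in-flight-victim protocol around the oom_victims counter. A minimal sketch of how the two sides are expected to pair up, with illustrative callers (the real call sites are the OOM kill path and the task exit path) and the documented oom_sem-for-read rule made explicit:

/* Kill side (illustrative): victims are only marked while the OOM killer
 * is not disabled, i.e. under oom_sem taken for read. */
static void kill_one_victim(struct task_struct *victim)
{
	down_read(&oom_sem);
	if (!oom_killer_disabled) {
		mark_tsk_oom_victim(victim);	/* sets TIF_MEMDIE, bumps oom_victims */
		force_sig(SIGKILL, victim);	/* illustrative way to deliver the kill */
	}
	up_read(&oom_sem);
}

/* Victim side (illustrative): run once from the exit path after the
 * address space has been released. */
static void victim_is_gone(void)
{
	unmark_oom_victim();	/* drops oom_victims, may wake oom_killer_disable() */
}
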
+
+/**
+ * oom_killer_disable - disable OOM killer
+ *
+ * Forces all page allocations to fail rather than trigger OOM killer.
+ * Will block and wait until all existing OOM victims have exited.
+ *
+ * The function cannot be called when there are runnable user tasks because
+ * the userspace would see unexpected allocation failures as a result. Any
+ * new use of this function should be discussed with the MM people.
+ *
+ * Returns true if successful and false if the OOM killer cannot be
+ * disabled.
+ */
+bool oom_killer_disable(void)
+{
+ /*
+	 * Make sure not to race with an ongoing OOM killer invocation and
+	 * that the current task is not an OOM victim itself.
+ */
+ down_write(&oom_sem);
+ if (test_thread_flag(TIF_MEMDIE)) {
+ up_write(&oom_sem);
+ return false;
+ }
+
+ oom_killer_disabled = true;
+ up_write(&oom_sem);
+
+ wait_event(oom_victims_wait, !atomic_read(&oom_victims));
+
+ return true;
+}
+
+/**
+ * oom_killer_enable - enable OOM killer
+ */
+void oom_killer_enable(void)
+{
+ down_write(&oom_sem);
+ oom_killer_disabled = false;
+ up_write(&oom_sem);
}
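
oom_killer_disable() and oom_killer_enable() are meant to bracket a section during which no user task can run, as in the freeze_processes() path above (the matching thaw path is expected to call oom_killer_enable()). A minimal sketch of the intended pairing, with the quiesce work itself purely hypothetical:

/* Hypothetical quiesce path: user space must already be frozen here, so
 * nobody can observe the forced allocation failures. */
static int quiesce_for_snapshot(void)
{
	int error;

	if (!oom_killer_disable())	/* waits for in-flight OOM victims to exit */
		return -EBUSY;		/* the current task is itself an OOM victim */

	error = do_snapshot_work();	/* hypothetical work done with OOM disabled */

	oom_killer_enable();		/* every successful disable needs a matching enable */
	return error;
}
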
#define K(x) ((x) << (PAGE_SHIFT-10))
}
/**
- * out_of_memory - kill the "best" process when we run out of memory
+ * __out_of_memory - kill the "best" process when we run out of memory
* @zonelist: zonelist pointer
* @gfp_mask: memory allocation flags
* @order: amount of memory being requested as a power of 2
* OR try to be smart about which process to kill. Note that we
* don't have to be perfect here, we just have to be good.
*/
-void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
+static void __out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
int order, nodemask_t *nodemask, bool force_kill)
{
const nodemask_t *mpol_mask;
schedule_timeout_killable(1);
}
+/**
+ * out_of_memory - tries to invoke the OOM killer.
+ * @zonelist: zonelist pointer
+ * @gfp_mask: memory allocation flags
+ * @order: amount of memory being requested as a power of 2
+ * @nodemask: nodemask passed to page allocator
+ * @force_kill: true if a task must be killed, even if others are exiting
+ *
+ * Invokes __out_of_memory() if the OOM killer has not been disabled by
+ * oom_killer_disable() and returns true. Otherwise returns false without
+ * invoking the OOM killer.
+ */
+bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
+ int order, nodemask_t *nodemask, bool force_kill)
+{
+ bool ret = false;
+
+ down_read(&oom_sem);
+ if (!oom_killer_disabled) {
+ __out_of_memory(zonelist, gfp_mask, order, nodemask, force_kill);
+ ret = true;
+ }
+ up_read(&oom_sem);
+
+ return ret;
+}
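
A caller is expected to treat a false return as "no progress was made and none is coming", so the allocation fails instead of retrying forever; that is what makes oom_killer_disable() effective. A minimal sketch of the intended caller pattern, mirroring the __alloc_pages_may_oom() hunk below (the wrapper name here is illustrative):

static struct page *oom_slowpath(struct zonelist *zonelist, gfp_t gfp_mask,
				 int order, nodemask_t *nodemask,
				 unsigned long *did_some_progress)
{
	*did_some_progress = 0;

	/* When the OOM killer is disabled, out_of_memory() returns false and
	 * no victim is selected; leaving *did_some_progress at zero turns the
	 * allocation into a failure rather than an endless retry loop. */
	if (out_of_memory(zonelist, gfp_mask, order, nodemask, false))
		*did_some_progress = 1;

	return NULL;	/* caller retries or fails based on *did_some_progress */
}
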
+
/*
* The pagefault handler calls here because it is out of memory, so kill a
* memory-hogging task. If any populated zone has ZONE_OOM_LOCKED set, a
{
struct zonelist *zonelist;
+ down_read(&oom_sem);
if (mem_cgroup_oom_synchronize(true))
- return;
+ goto unlock;
zonelist = node_zonelist(first_memory_node, GFP_KERNEL);
if (oom_zonelist_trylock(zonelist, GFP_KERNEL)) {
- out_of_memory(NULL, 0, 0, NULL, false);
+ if (!oom_killer_disabled)
+ __out_of_memory(NULL, 0, 0, NULL, false);
+ else
+ /*
+			 * There shouldn't be any user tasks runnable while the
+			 * OOM killer is disabled, so the current task has to
+			 * be a racing OOM victim which oom_killer_disable()
+			 * is waiting for.
+ */
+ WARN_ON(test_thread_flag(TIF_MEMDIE));
+
oom_zonelist_unlock(zonelist, GFP_KERNEL);
}
+unlock:
+ up_read(&oom_sem);
}
PB_migrate, PB_migrate_end);
}
-bool oom_killer_disabled __read_mostly;
-
#ifdef CONFIG_DEBUG_VM
static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
{
*did_some_progress = 0;
- if (oom_killer_disabled)
- return NULL;
-
/*
* Acquire the per-zone oom lock for each zone. If that
* fails, somebody else is making progress for us.
return NULL;
}
- /*
- * PM-freezer should be notified that there might be an OOM killer on
- * its way to kill and wake somebody up. This is too early and we might
- * end up not killing anything but false positives are acceptable.
- * See freeze_processes.
- */
- note_oom_kill();
-
/*
* Go through the zonelist yet one more time, keep very high watermark
* here, this is only to catch a parallel oom killing, we must fail if
goto out;
}
/* Exhausted what can be done so it's blamo time */
- out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false);
- *did_some_progress = 1;
+ if (out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false))
+ *did_some_progress = 1;
out:
oom_zonelist_unlock(ac->zonelist, gfp_mask);
return page;