Merge tag 'v3.10.63' into update
[GitHub/mt8127/android_kernel_alcatel_ttab.git] / mm / memcontrol.c
index 010d6c14129ae320ab9655a573d49af33e034659..6bc20990c12edb204075d33cbdf14518039361b3 100644 (file)
@@ -302,6 +302,7 @@ struct mem_cgroup {
 
        bool            oom_lock;
        atomic_t        under_oom;
+       atomic_t        oom_wakeups;
 
        atomic_t        refcnt;
 
@@ -379,7 +380,7 @@ struct mem_cgroup {
 static size_t memcg_size(void)
 {
        return sizeof(struct mem_cgroup) +
-               nr_node_ids * sizeof(struct mem_cgroup_per_node);
+               nr_node_ids * sizeof(struct mem_cgroup_per_node *);
 }
 
 /* internal only representation about the status of kmem accounting. */
@@ -540,6 +541,21 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
        return (memcg == root_mem_cgroup);
 }
 
+#ifdef CONFIG_SWAP
+/* add_to_swap -> get_swap_page_by_memcg -> .. */
+bool memcg_is_root(struct page *page)
+{
+       struct page_cgroup *pc;
+
+       if (mem_cgroup_disabled())
+               return true;
+
+       pc = lookup_page_cgroup(page);
+
+       return mem_cgroup_is_root(pc->mem_cgroup);
+}
+#endif
+
 /* Writing them here to avoid exposing memcg's inner layout */
 #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
 
@@ -1199,7 +1215,6 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
 
                        mz = mem_cgroup_zoneinfo(root, nid, zid);
                        iter = &mz->reclaim_iter[reclaim->priority];
-                       last_visited = iter->last_visited;
                        if (prev && reclaim->generation != iter->generation) {
                                iter->last_visited = NULL;
                                goto out_unlock;
@@ -1218,20 +1233,19 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
                         * is alive.
                         */
                        dead_count = atomic_read(&root->dead_count);
-                       smp_rmb();
-                       last_visited = iter->last_visited;
-                       if (last_visited) {
-                               if ((dead_count != iter->last_dead_count) ||
-                                       !css_tryget(&last_visited->css)) {
+                       if (dead_count == iter->last_dead_count) {
+                               smp_rmb();
+                               last_visited = iter->last_visited;
+                               if (last_visited && last_visited != root &&
+                                   !css_tryget(&last_visited->css))
                                        last_visited = NULL;
-                               }
                        }
                }
 
                memcg = __mem_cgroup_iter_next(root, last_visited);
 
                if (reclaim) {
-                       if (last_visited)
+                       if (last_visited && last_visited != root)
                                css_put(&last_visited->css);
 
                        iter->last_visited = memcg;
@@ -2077,15 +2091,18 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
        return total;
 }
 
+static DEFINE_SPINLOCK(memcg_oom_lock);
+
 /*
  * Check OOM-Killer is already running under our hierarchy.
  * If someone is running, return false.
- * Has to be called with memcg_oom_lock
  */
-static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg)
+static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
 {
        struct mem_cgroup *iter, *failed = NULL;
 
+       spin_lock(&memcg_oom_lock);
+
        for_each_mem_cgroup_tree(iter, memcg) {
                if (iter->oom_lock) {
                        /*
@@ -2099,33 +2116,33 @@ static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg)
                        iter->oom_lock = true;
        }
 
-       if (!failed)
-               return true;
-
-       /*
-        * OK, we failed to lock the whole subtree so we have to clean up
-        * what we set up to the failing subtree
-        */
-       for_each_mem_cgroup_tree(iter, memcg) {
-               if (iter == failed) {
-                       mem_cgroup_iter_break(memcg, iter);
-                       break;
+       if (failed) {
+               /*
+                * OK, we failed to lock the whole subtree so we have
+                * to clean up what we set up to the failing subtree
+                */
+               for_each_mem_cgroup_tree(iter, memcg) {
+                       if (iter == failed) {
+                               mem_cgroup_iter_break(memcg, iter);
+                               break;
+                       }
+                       iter->oom_lock = false;
                }
-               iter->oom_lock = false;
        }
-       return false;
+
+       spin_unlock(&memcg_oom_lock);
+
+       return !failed;
 }
 
-/*
- * Has to be called with memcg_oom_lock
- */
-static int mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
+static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
 {
        struct mem_cgroup *iter;
 
+       spin_lock(&memcg_oom_lock);
        for_each_mem_cgroup_tree(iter, memcg)
                iter->oom_lock = false;
-       return 0;
+       spin_unlock(&memcg_oom_lock);
 }
 
 static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
@@ -2149,7 +2166,6 @@ static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
                atomic_add_unless(&iter->under_oom, -1, 0);
 }
 
-static DEFINE_SPINLOCK(memcg_oom_lock);
 static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
 
 struct oom_wait_info {
@@ -2179,6 +2195,7 @@ static int memcg_oom_wake_function(wait_queue_t *wait,
 
 static void memcg_wakeup_oom(struct mem_cgroup *memcg)
 {
+       atomic_inc(&memcg->oom_wakeups);
        /* for filtering, pass "memcg" as argument. */
        __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
 }
@@ -2189,57 +2206,97 @@ static void memcg_oom_recover(struct mem_cgroup *memcg)
                memcg_wakeup_oom(memcg);
 }
 
-/*
- * try to call OOM killer. returns false if we should exit memory-reclaim loop.
+static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
+{
+       if (!current->memcg_oom.may_oom)
+               return;
+       /*
+        * We are in the middle of the charge context here, so we
+        * don't want to block when potentially sitting on a callstack
+        * that holds all kinds of filesystem and mm locks.
+        *
+        * Also, the caller may handle a failed allocation gracefully
+        * (like optional page cache readahead) and so an OOM killer
+        * invocation might not even be necessary.
+        *
+        * That's why we don't do anything here except remember the
+        * OOM context and then deal with it at the end of the page
+        * fault when the stack is unwound, the locks are released,
+        * and when we know whether the fault was overall successful.
+        */
+       css_get(&memcg->css);
+       current->memcg_oom.memcg = memcg;
+       current->memcg_oom.gfp_mask = mask;
+       current->memcg_oom.order = order;
+}
+
+/**
+ * mem_cgroup_oom_synchronize - complete memcg OOM handling
+ * @handle: actually kill/wait or just clean up the OOM state
+ *
+ * This has to be called at the end of a page fault if the memcg OOM
+ * handler was enabled.
+ *
+ * Memcg supports userspace OOM handling where failed allocations must
+ * sleep on a waitqueue until the userspace task resolves the
+ * situation.  Sleeping directly in the charge context with all kinds
+ * of locks held is not a good idea, instead we remember an OOM state
+ * in the task and mem_cgroup_oom_synchronize() has to be called at
+ * the end of the page fault to complete the OOM handling.
+ *
+ * Returns %true if an ongoing memcg OOM situation was detected and
+ * completed, %false otherwise.
  */
-static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask,
-                                 int order)
+bool mem_cgroup_oom_synchronize(bool handle)
 {
+       struct mem_cgroup *memcg = current->memcg_oom.memcg;
        struct oom_wait_info owait;
-       bool locked, need_to_kill;
+       bool locked;
+
+       /* OOM is global, do not handle */
+       if (!memcg)
+               return false;
+
+       if (!handle)
+               goto cleanup;
 
        owait.memcg = memcg;
        owait.wait.flags = 0;
        owait.wait.func = memcg_oom_wake_function;
        owait.wait.private = current;
        INIT_LIST_HEAD(&owait.wait.task_list);
-       need_to_kill = true;
-       mem_cgroup_mark_under_oom(memcg);
 
-       /* At first, try to OOM lock hierarchy under memcg.*/
-       spin_lock(&memcg_oom_lock);
-       locked = mem_cgroup_oom_lock(memcg);
-       /*
-        * Even if signal_pending(), we can't quit charge() loop without
-        * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL
-        * under OOM is always welcomed, use TASK_KILLABLE here.
-        */
        prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
-       if (!locked || memcg->oom_kill_disable)
-               need_to_kill = false;
+       mem_cgroup_mark_under_oom(memcg);
+
+       locked = mem_cgroup_oom_trylock(memcg);
+
        if (locked)
                mem_cgroup_oom_notify(memcg);
-       spin_unlock(&memcg_oom_lock);
 
-       if (need_to_kill) {
+       if (locked && !memcg->oom_kill_disable) {
+               mem_cgroup_unmark_under_oom(memcg);
                finish_wait(&memcg_oom_waitq, &owait.wait);
-               mem_cgroup_out_of_memory(memcg, mask, order);
+               mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask,
+                                        current->memcg_oom.order);
        } else {
                schedule();
+               mem_cgroup_unmark_under_oom(memcg);
                finish_wait(&memcg_oom_waitq, &owait.wait);
        }
-       spin_lock(&memcg_oom_lock);
-       if (locked)
-               mem_cgroup_oom_unlock(memcg);
-       memcg_wakeup_oom(memcg);
-       spin_unlock(&memcg_oom_lock);
 
-       mem_cgroup_unmark_under_oom(memcg);
-
-       if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
-               return false;
-       /* Give chance to dying process */
-       schedule_timeout_uninterruptible(1);
+       if (locked) {
+               mem_cgroup_oom_unlock(memcg);
+               /*
+                * There is no guarantee that an OOM-lock contender
+                * sees the wakeups triggered by the OOM kill
+                * uncharges.  Wake any sleepers explicitely.
+                */
+               memcg_oom_recover(memcg);
+       }
+cleanup:
+       current->memcg_oom.memcg = NULL;
+       css_put(&memcg->css);
        return true;
 }
 
@@ -2552,12 +2609,11 @@ enum {
        CHARGE_RETRY,           /* need to retry but retry is not bad */
        CHARGE_NOMEM,           /* we can't do more. return -ENOMEM */
        CHARGE_WOULDBLOCK,      /* GFP_WAIT wasn't set and no enough res. */
-       CHARGE_OOM_DIE,         /* the current is killed because of OOM */
 };
 
 static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
                                unsigned int nr_pages, unsigned int min_pages,
-                               bool oom_check)
+                               bool invoke_oom)
 {
        unsigned long csize = nr_pages * PAGE_SIZE;
        struct mem_cgroup *mem_over_limit;
@@ -2614,14 +2670,10 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
        if (mem_cgroup_wait_acct_move(mem_over_limit))
                return CHARGE_RETRY;
 
-       /* If we don't need to call oom-killer at el, return immediately */
-       if (!oom_check)
-               return CHARGE_NOMEM;
-       /* check OOM */
-       if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask, get_order(csize)))
-               return CHARGE_OOM_DIE;
+       if (invoke_oom)
+               mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(csize));
 
-       return CHARGE_RETRY;
+       return CHARGE_NOMEM;
 }
 
 /*
@@ -2665,6 +2717,9 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
                     || fatal_signal_pending(current)))
                goto bypass;
 
+       if (unlikely(task_in_memcg_oom(current)))
+               goto bypass;
+
        /*
         * We always charge the cgroup the mm_struct belongs to.
         * The mm_struct's mem_cgroup changes on task migration if the
@@ -2724,7 +2779,7 @@ again:
        }
 
        do {
-               bool oom_check;
+               bool invoke_oom = oom && !nr_oom_retries;
 
                /* If killed, bypass charge */
                if (fatal_signal_pending(current)) {
@@ -2732,14 +2787,8 @@ again:
                        goto bypass;
                }
 
-               oom_check = false;
-               if (oom && !nr_oom_retries) {
-                       oom_check = true;
-                       nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
-               }
-
-               ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, nr_pages,
-                   oom_check);
+               ret = mem_cgroup_do_charge(memcg, gfp_mask, batch,
+                                          nr_pages, invoke_oom);
                switch (ret) {
                case CHARGE_OK:
                        break;
@@ -2752,16 +2801,12 @@ again:
                        css_put(&memcg->css);
                        goto nomem;
                case CHARGE_NOMEM: /* OOM routine works */
-                       if (!oom) {
+                       if (!oom || invoke_oom) {
                                css_put(&memcg->css);
                                goto nomem;
                        }
-                       /* If oom, we never return -ENOMEM */
                        nr_oom_retries--;
                        break;
-               case CHARGE_OOM_DIE: /* Killed by OOM Killer */
-                       css_put(&memcg->css);
-                       goto bypass;
                }
        } while (ret != CHARGE_OK);
 
@@ -3141,8 +3186,6 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
                        return -ENOMEM;
                }
 
-               INIT_WORK(&s->memcg_params->destroy,
-                               kmem_cache_destroy_work_func);
                s->memcg_params->is_root_cache = true;
 
                /*
@@ -3190,11 +3233,11 @@ int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
        if (!s->memcg_params)
                return -ENOMEM;
 
-       INIT_WORK(&s->memcg_params->destroy,
-                       kmem_cache_destroy_work_func);
        if (memcg) {
                s->memcg_params->memcg = memcg;
                s->memcg_params->root_cache = root_cache;
+               INIT_WORK(&s->memcg_params->destroy,
+                               kmem_cache_destroy_work_func);
        } else
                s->memcg_params->is_root_cache = true;
 
@@ -5588,7 +5631,13 @@ static int compare_thresholds(const void *a, const void *b)
        const struct mem_cgroup_threshold *_a = a;
        const struct mem_cgroup_threshold *_b = b;
 
-       return _a->threshold - _b->threshold;
+       if (_a->threshold > _b->threshold)
+               return 1;
+
+       if (_a->threshold < _b->threshold)
+               return -1;
+
+       return 0;
 }
 
 static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
@@ -6300,16 +6349,6 @@ mem_cgroup_css_online(struct cgroup *cont)
 
        error = memcg_init_kmem(memcg, &mem_cgroup_subsys);
        mutex_unlock(&memcg_create_mutex);
-       if (error) {
-               /*
-                * We call put now because our (and parent's) refcnts
-                * are already in place. mem_cgroup_put() will internally
-                * call __mem_cgroup_free, so return directly
-                */
-               mem_cgroup_put(memcg);
-               if (parent->use_hierarchy)
-                       mem_cgroup_put(parent);
-       }
        return error;
 }
 
@@ -6334,9 +6373,23 @@ static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg)
 static void mem_cgroup_css_offline(struct cgroup *cont)
 {
        struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
+       struct cgroup *iter;
 
        mem_cgroup_invalidate_reclaim_iterators(memcg);
+
+       /*
+        * This requires that offlining is serialized.  Right now that is
+        * guaranteed because css_killed_work_fn() holds the cgroup_mutex.
+        */
+       rcu_read_lock();
+       cgroup_for_each_descendant_post(iter, cont) {
+               rcu_read_unlock();
+               mem_cgroup_reparent_charges(mem_cgroup_from_cont(iter));
+               rcu_read_lock();
+       }
+       rcu_read_unlock();
        mem_cgroup_reparent_charges(memcg);
+
        mem_cgroup_destroy_all_caches(memcg);
 }