Merge tag 'v3.10.63' into update

[GitHub/mt8127/android_kernel_alcatel_ttab.git] / mm / memcontrol.c
diff --git a/mm/memcontrol.c b/mm/memcontrol.c

index 010d6c14129ae320ab9655a573d49af33e034659..6bc20990c12edb204075d33cbdf14518039361b3 100644 (file)
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -302,6 +302,7 @@ struct mem_cgroup {
  
         bool            oom_lock;
         atomic_t        under_oom;
+       atomic_t        oom_wakeups;
  
         atomic_t        refcnt;
  
@@ -379,7 +380,7 @@ struct mem_cgroup {
  static size_t memcg_size(void)
  {
         return sizeof(struct mem_cgroup) +
-               nr_node_ids * sizeof(struct mem_cgroup_per_node);
+               nr_node_ids * sizeof(struct mem_cgroup_per_node *);
  }
  
  /* internal only representation about the status of kmem accounting. */
@@ -540,6 +541,21 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
         return (memcg == root_mem_cgroup);
  }
  
+#ifdef CONFIG_SWAP
+/* add_to_swap -> get_swap_page_by_memcg -> .. */
+bool memcg_is_root(struct page *page)
+{
+       struct page_cgroup *pc;
+
+       if (mem_cgroup_disabled())
+               return true;
+
+       pc = lookup_page_cgroup(page);
+
+       return mem_cgroup_is_root(pc->mem_cgroup);
+}
+#endif
+
  /* Writing them here to avoid exposing memcg's inner layout */
  #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
  
@@ -1199,7 +1215,6 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
  
                         mz = mem_cgroup_zoneinfo(root, nid, zid);
                         iter = &mz->reclaim_iter[reclaim->priority];
-                       last_visited = iter->last_visited;
                         if (prev && reclaim->generation != iter->generation) {
                                 iter->last_visited = NULL;
                                 goto out_unlock;
@@ -1218,20 +1233,19 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
                          * is alive.
                          */
                         dead_count = atomic_read(&root->dead_count);
-                       smp_rmb();
-                       last_visited = iter->last_visited;
-                       if (last_visited) {
-                               if ((dead_count != iter->last_dead_count) ||
-                                       !css_tryget(&last_visited->css)) {
+                       if (dead_count == iter->last_dead_count) {
+                               smp_rmb();
+                               last_visited = iter->last_visited;
+                               if (last_visited && last_visited != root &&
+                                   !css_tryget(&last_visited->css))
                                         last_visited = NULL;
-                               }
                         }
                 }
  
                 memcg = __mem_cgroup_iter_next(root, last_visited);
  
                 if (reclaim) {
-                       if (last_visited)
+                       if (last_visited && last_visited != root)
                                 css_put(&last_visited->css);
  
                         iter->last_visited = memcg;
@@ -2077,15 +2091,18 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
         return total;
  }
  
+static DEFINE_SPINLOCK(memcg_oom_lock);
+
  /*
   * Check OOM-Killer is already running under our hierarchy.
   * If someone is running, return false.
- * Has to be called with memcg_oom_lock
   */
-static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg)
+static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
  {
         struct mem_cgroup *iter, *failed = NULL;
  
+       spin_lock(&memcg_oom_lock);
+
         for_each_mem_cgroup_tree(iter, memcg) {
                 if (iter->oom_lock) {
                         /*
@@ -2099,33 +2116,33 @@ static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg)
                         iter->oom_lock = true;
         }
  
-       if (!failed)
-               return true;
-
-       /*
-        * OK, we failed to lock the whole subtree so we have to clean up
-        * what we set up to the failing subtree
-        */
-       for_each_mem_cgroup_tree(iter, memcg) {
-               if (iter == failed) {
-                       mem_cgroup_iter_break(memcg, iter);
-                       break;
+       if (failed) {
+               /*
+                * OK, we failed to lock the whole subtree so we have
+                * to clean up what we set up to the failing subtree
+                */
+               for_each_mem_cgroup_tree(iter, memcg) {
+                       if (iter == failed) {
+                               mem_cgroup_iter_break(memcg, iter);
+                               break;
+                       }
+                       iter->oom_lock = false;
                 }
-               iter->oom_lock = false;
         }
-       return false;
+
+       spin_unlock(&memcg_oom_lock);
+
+       return !failed;
  }
  
-/*
- * Has to be called with memcg_oom_lock
- */
-static int mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
+static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
  {
         struct mem_cgroup *iter;
  
+       spin_lock(&memcg_oom_lock);
         for_each_mem_cgroup_tree(iter, memcg)
                 iter->oom_lock = false;
-       return 0;
+       spin_unlock(&memcg_oom_lock);
  }
  
  static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
@@ -2149,7 +2166,6 @@ static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
                 atomic_add_unless(&iter->under_oom, -1, 0);
  }
  
-static DEFINE_SPINLOCK(memcg_oom_lock);
  static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
  
  struct oom_wait_info {
@@ -2179,6 +2195,7 @@ static int memcg_oom_wake_function(wait_queue_t *wait,
  
  static void memcg_wakeup_oom(struct mem_cgroup *memcg)
  {
+       atomic_inc(&memcg->oom_wakeups);
         /* for filtering, pass "memcg" as argument. */
         __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
  }
@@ -2189,57 +2206,97 @@ static void memcg_oom_recover(struct mem_cgroup *memcg)
                 memcg_wakeup_oom(memcg);
  }
  
-/*
- * try to call OOM killer. returns false if we should exit memory-reclaim loop.
+static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
+{
+       if (!current->memcg_oom.may_oom)
+               return;
+       /*
+        * We are in the middle of the charge context here, so we
+        * don't want to block when potentially sitting on a callstack
+        * that holds all kinds of filesystem and mm locks.
+        *
+        * Also, the caller may handle a failed allocation gracefully
+        * (like optional page cache readahead) and so an OOM killer
+        * invocation might not even be necessary.
+        *
+        * That's why we don't do anything here except remember the
+        * OOM context and then deal with it at the end of the page
+        * fault when the stack is unwound, the locks are released,
+        * and when we know whether the fault was overall successful.
+        */
+       css_get(&memcg->css);
+       current->memcg_oom.memcg = memcg;
+       current->memcg_oom.gfp_mask = mask;
+       current->memcg_oom.order = order;
+}
+
+/**
+ * mem_cgroup_oom_synchronize - complete memcg OOM handling
+ * @handle: actually kill/wait or just clean up the OOM state
+ *
+ * This has to be called at the end of a page fault if the memcg OOM
+ * handler was enabled.
+ *
+ * Memcg supports userspace OOM handling where failed allocations must
+ * sleep on a waitqueue until the userspace task resolves the
+ * situation.  Sleeping directly in the charge context with all kinds
+ * of locks held is not a good idea, instead we remember an OOM state
+ * in the task and mem_cgroup_oom_synchronize() has to be called at
+ * the end of the page fault to complete the OOM handling.
+ *
+ * Returns %true if an ongoing memcg OOM situation was detected and
+ * completed, %false otherwise.
   */
-static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask,
-                                 int order)
+bool mem_cgroup_oom_synchronize(bool handle)
  {
+       struct mem_cgroup *memcg = current->memcg_oom.memcg;
         struct oom_wait_info owait;
-       bool locked, need_to_kill;
+       bool locked;
+
+       /* OOM is global, do not handle */
+       if (!memcg)
+               return false;
+
+       if (!handle)
+               goto cleanup;
  
         owait.memcg = memcg;
         owait.wait.flags = 0;
         owait.wait.func = memcg_oom_wake_function;
         owait.wait.private = current;
         INIT_LIST_HEAD(&owait.wait.task_list);
-       need_to_kill = true;
-       mem_cgroup_mark_under_oom(memcg);
  
-       /* At first, try to OOM lock hierarchy under memcg.*/
-       spin_lock(&memcg_oom_lock);
-       locked = mem_cgroup_oom_lock(memcg);
-       /*
-        * Even if signal_pending(), we can't quit charge() loop without
-        * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL
-        * under OOM is always welcomed, use TASK_KILLABLE here.
-        */
         prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
-       if (!locked || memcg->oom_kill_disable)
-               need_to_kill = false;
+       mem_cgroup_mark_under_oom(memcg);
+
+       locked = mem_cgroup_oom_trylock(memcg);
+
         if (locked)
                 mem_cgroup_oom_notify(memcg);
-       spin_unlock(&memcg_oom_lock);
  
-       if (need_to_kill) {
+       if (locked && !memcg->oom_kill_disable) {
+               mem_cgroup_unmark_under_oom(memcg);
                 finish_wait(&memcg_oom_waitq, &owait.wait);
-               mem_cgroup_out_of_memory(memcg, mask, order);
+               mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask,
+                                        current->memcg_oom.order);
         } else {
                 schedule();
+               mem_cgroup_unmark_under_oom(memcg);
                 finish_wait(&memcg_oom_waitq, &owait.wait);
         }
-       spin_lock(&memcg_oom_lock);
-       if (locked)
-               mem_cgroup_oom_unlock(memcg);
-       memcg_wakeup_oom(memcg);
-       spin_unlock(&memcg_oom_lock);
  
-       mem_cgroup_unmark_under_oom(memcg);
-
-       if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
-               return false;
-       /* Give chance to dying process */
-       schedule_timeout_uninterruptible(1);
+       if (locked) {
+               mem_cgroup_oom_unlock(memcg);
+               /*
+                * There is no guarantee that an OOM-lock contender
+                * sees the wakeups triggered by the OOM kill
+                * uncharges.  Wake any sleepers explicitly.
+                */
+               memcg_oom_recover(memcg);
+       }
+cleanup:
+       current->memcg_oom.memcg = NULL;
+       css_put(&memcg->css);
         return true;
  }
  
@@ -2552,12 +2609,11 @@ enum {
         CHARGE_RETRY,           /* need to retry but retry is not bad */
         CHARGE_NOMEM,           /* we can't do more. return -ENOMEM */
         CHARGE_WOULDBLOCK,      /* GFP_WAIT wasn't set and no enough res. */
-       CHARGE_OOM_DIE,         /* the current is killed because of OOM */
  };
  
  static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
                                 unsigned int nr_pages, unsigned int min_pages,
-                               bool oom_check)
+                               bool invoke_oom)
  {
         unsigned long csize = nr_pages * PAGE_SIZE;
         struct mem_cgroup *mem_over_limit;
@@ -2614,14 +2670,10 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
         if (mem_cgroup_wait_acct_move(mem_over_limit))
                 return CHARGE_RETRY;
  
-       /* If we don't need to call oom-killer at el, return immediately */
-       if (!oom_check)
-               return CHARGE_NOMEM;
-       /* check OOM */
-       if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask, get_order(csize)))
-               return CHARGE_OOM_DIE;
+       if (invoke_oom)
+               mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(csize));
  
-       return CHARGE_RETRY;
+       return CHARGE_NOMEM;
  }
  
  /*
@@ -2665,6 +2717,9 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
                      || fatal_signal_pending(current)))
                 goto bypass;
  
+       if (unlikely(task_in_memcg_oom(current)))
+               goto bypass;
+
         /*
          * We always charge the cgroup the mm_struct belongs to.
          * The mm_struct's mem_cgroup changes on task migration if the
@@ -2724,7 +2779,7 @@ again:
         }
  
         do {
-               bool oom_check;
+               bool invoke_oom = oom && !nr_oom_retries;
  
                 /* If killed, bypass charge */
                 if (fatal_signal_pending(current)) {
@@ -2732,14 +2787,8 @@ again:
                         goto bypass;
                 }
  
-               oom_check = false;
-               if (oom && !nr_oom_retries) {
-                       oom_check = true;
-                       nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
-               }
-
-               ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, nr_pages,
-                   oom_check);
+               ret = mem_cgroup_do_charge(memcg, gfp_mask, batch,
+                                          nr_pages, invoke_oom);
                 switch (ret) {
                 case CHARGE_OK:
                         break;
@@ -2752,16 +2801,12 @@ again:
                         css_put(&memcg->css);
                         goto nomem;
                 case CHARGE_NOMEM: /* OOM routine works */
-                       if (!oom) {
+                       if (!oom || invoke_oom) {
                                 css_put(&memcg->css);
                                 goto nomem;
                         }
-                       /* If oom, we never return -ENOMEM */
                         nr_oom_retries--;
                         break;
-               case CHARGE_OOM_DIE: /* Killed by OOM Killer */
-                       css_put(&memcg->css);
-                       goto bypass;
                 }
         } while (ret != CHARGE_OK);
  
@@ -3141,8 +3186,6 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
                         return -ENOMEM;
                 }
  
-               INIT_WORK(&s->memcg_params->destroy,
-                               kmem_cache_destroy_work_func);
                 s->memcg_params->is_root_cache = true;
  
                 /*
@@ -3190,11 +3233,11 @@ int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
         if (!s->memcg_params)
                 return -ENOMEM;
  
-       INIT_WORK(&s->memcg_params->destroy,
-                       kmem_cache_destroy_work_func);
         if (memcg) {
                 s->memcg_params->memcg = memcg;
                 s->memcg_params->root_cache = root_cache;
+               INIT_WORK(&s->memcg_params->destroy,
+                               kmem_cache_destroy_work_func);
         } else
                 s->memcg_params->is_root_cache = true;
  
@@ -5588,7 +5631,13 @@ static int compare_thresholds(const void *a, const void *b)
         const struct mem_cgroup_threshold *_a = a;
         const struct mem_cgroup_threshold *_b = b;
  
-       return _a->threshold - _b->threshold;
+       if (_a->threshold > _b->threshold)
+               return 1;
+
+       if (_a->threshold < _b->threshold)
+               return -1;
+
+       return 0;
  }
  
  static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
@@ -6300,16 +6349,6 @@ mem_cgroup_css_online(struct cgroup *cont)
  
         error = memcg_init_kmem(memcg, &mem_cgroup_subsys);
         mutex_unlock(&memcg_create_mutex);
-       if (error) {
-               /*
-                * We call put now because our (and parent's) refcnts
-                * are already in place. mem_cgroup_put() will internally
-                * call __mem_cgroup_free, so return directly
-                */
-               mem_cgroup_put(memcg);
-               if (parent->use_hierarchy)
-                       mem_cgroup_put(parent);
-       }
         return error;
  }
  
@@ -6334,9 +6373,23 @@ static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg)
  static void mem_cgroup_css_offline(struct cgroup *cont)
  {
         struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
+       struct cgroup *iter;
  
         mem_cgroup_invalidate_reclaim_iterators(memcg);
+
+       /*
+        * This requires that offlining is serialized.  Right now that is
+        * guaranteed because css_killed_work_fn() holds the cgroup_mutex.
+        */
+       rcu_read_lock();
+       cgroup_for_each_descendant_post(iter, cont) {
+               rcu_read_unlock();
+               mem_cgroup_reparent_charges(mem_cgroup_from_cont(iter));
+               rcu_read_lock();
+       }
+       rcu_read_unlock();
         mem_cgroup_reparent_charges(memcg);
+
         mem_cgroup_destroy_all_caches(memcg);
  }