memcg: fix deadlock between cpuset and memcg

author Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>

Thu, 13 Jan 2011 23:47:41 +0000 (15:47 -0800)

committer Linus Torvalds <torvalds@linux-foundation.org>

Fri, 14 Jan 2011 01:32:51 +0000 (17:32 -0800)
author Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Thu, 13 Jan 2011 23:47:41 +0000 (15:47 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 14 Jan 2011 01:32:51 +0000 (17:32 -0800)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c

index 1b44ad64f2817a591feb873bb5eecc41aeae5919..c339d7431bda9e19993023479a0582f56e1f4ca3 100644 (file)
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -292,7 +292,6 @@ static struct move_charge_struct {
         unsigned long moved_charge;
         unsigned long moved_swap;
         struct task_struct *moving_task;        /* a task moving charges */
-       struct mm_struct *mm;
         wait_queue_head_t waitq;                /* a waitq for other context */
  } mc = {
         .lock = __SPIN_LOCK_UNLOCKED(mc.lock),
@@ -4681,7 +4680,7 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
         unsigned long precharge;
         struct vm_area_struct *vma;
  
-       /* We've already held the mmap_sem */
+       down_read(&mm->mmap_sem);
         for (vma = mm->mmap; vma; vma = vma->vm_next) {
                 struct mm_walk mem_cgroup_count_precharge_walk = {
                         .pmd_entry = mem_cgroup_count_precharge_pte_range,
@@ -4693,6 +4692,7 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
                 walk_page_range(vma->vm_start, vma->vm_end,
                                         &mem_cgroup_count_precharge_walk);
         }
+       up_read(&mm->mmap_sem);
  
         precharge = mc.precharge;
         mc.precharge = 0;
@@ -4702,10 +4702,15 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
  
  static int mem_cgroup_precharge_mc(struct mm_struct *mm)
  {
-       return mem_cgroup_do_precharge(mem_cgroup_count_precharge(mm));
+       unsigned long precharge = mem_cgroup_count_precharge(mm);
+
+       VM_BUG_ON(mc.moving_task);
+       mc.moving_task = current;
+       return mem_cgroup_do_precharge(precharge);
  }
  
-static void mem_cgroup_clear_mc(void)
+/* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */
+static void __mem_cgroup_clear_mc(void)
  {
         struct mem_cgroup *from = mc.from;
         struct mem_cgroup *to = mc.to;
@@ -4740,23 +4745,28 @@ static void mem_cgroup_clear_mc(void)
                                                 PAGE_SIZE * mc.moved_swap);
                 }
                 /* we've already done mem_cgroup_get(mc.to) */
-
                 mc.moved_swap = 0;
         }
-       if (mc.mm) {
-               up_read(&mc.mm->mmap_sem);
-               mmput(mc.mm);
-       }
+       memcg_oom_recover(from);
+       memcg_oom_recover(to);
+       wake_up_all(&mc.waitq);
+}
+
+static void mem_cgroup_clear_mc(void)
+{
+       struct mem_cgroup *from = mc.from;
+
+       /*
+        * we must clear moving_task before waking up waiters at the end of
+        * task migration.
+        */
+       mc.moving_task = NULL;
+       __mem_cgroup_clear_mc();
         spin_lock(&mc.lock);
         mc.from = NULL;
         mc.to = NULL;
         spin_unlock(&mc.lock);
-       mc.moving_task = NULL;
-       mc.mm = NULL;
         mem_cgroup_end_move(from);
-       memcg_oom_recover(from);
-       memcg_oom_recover(to);
-       wake_up_all(&mc.waitq);
  }
  
  static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
@@ -4778,38 +4788,23 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
                         return 0;
                 /* We move charges only when we move a owner of the mm */
                 if (mm->owner == p) {
-                       /*
-                        * We do all the move charge works under one mmap_sem to
-                        * avoid deadlock with down_write(&mmap_sem)
-                        * -> try_charge() -> if (mc.moving_task) -> sleep.
-                        */
-                       down_read(&mm->mmap_sem);
-
                         VM_BUG_ON(mc.from);
                         VM_BUG_ON(mc.to);
                         VM_BUG_ON(mc.precharge);
                         VM_BUG_ON(mc.moved_charge);
                         VM_BUG_ON(mc.moved_swap);
-                       VM_BUG_ON(mc.moving_task);
-                       VM_BUG_ON(mc.mm);
-
                         mem_cgroup_start_move(from);
                         spin_lock(&mc.lock);
                         mc.from = from;
                         mc.to = mem;
-                       mc.precharge = 0;
-                       mc.moved_charge = 0;
-                       mc.moved_swap = 0;
                         spin_unlock(&mc.lock);
-                       mc.moving_task = current;
-                       mc.mm = mm;
+                       /* We set mc.moving_task later */
  
                         ret = mem_cgroup_precharge_mc(mm);
                         if (ret)
                                 mem_cgroup_clear_mc();
-                       /* We call up_read() and mmput() in clear_mc(). */
-               } else
-                       mmput(mm);
+               }
+               mmput(mm);
         }
         return ret;
  }
@@ -4898,7 +4893,19 @@ static void mem_cgroup_move_charge(struct mm_struct *mm)
         struct vm_area_struct *vma;
  
         lru_add_drain_all();
-       /* We've already held the mmap_sem */
+retry:
+       if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
+               /*
+                * Someone who is holding the mmap_sem might be waiting in
+                * waitq. So we cancel all extra charges, wake up all waiters,
+                * and retry. Because we cancel precharges, we might not be able
+                * to move enough charges, but moving charge is a best-effort
+                * feature anyway, so it wouldn't be a big problem.
+                */
+               __mem_cgroup_clear_mc();
+               cond_resched();
+               goto retry;
+       }
         for (vma = mm->mmap; vma; vma = vma->vm_next) {
                 int ret;
                 struct mm_walk mem_cgroup_move_charge_walk = {
@@ -4917,6 +4924,7 @@ static void mem_cgroup_move_charge(struct mm_struct *mm)
                          */
                         break;
         }
+       up_read(&mm->mmap_sem);
  }
  
  static void mem_cgroup_move_task(struct cgroup_subsys *ss,
@@ -4925,11 +4933,17 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
                                 struct task_struct *p,
                                 bool threadgroup)
  {
-       if (!mc.mm)
+       struct mm_struct *mm;
+
+       if (!mc.to)
                 /* no need to move charge */
                 return;
  
-       mem_cgroup_move_charge(mc.mm);
+       mm = get_task_mm(p);
+       if (mm) {
+               mem_cgroup_move_charge(mm);
+               mmput(mm);
+       }
         mem_cgroup_clear_mc();
  }
  #else  /* !CONFIG_MMU */
author	Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
	Thu, 13 Jan 2011 23:47:41 +0000 (15:47 -0800)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Fri, 14 Jan 2011 01:32:51 +0000 (17:32 -0800)