mm/memory-failure: split thp earlier in memory error handling
author Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Wed, 24 Jun 2015 23:56:45 +0000 (16:56 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Thu, 25 Jun 2015 00:49:42 +0000 (17:49 -0700)
memory_failure() doesn't handle thp itself at this time and needs to split
it before doing isolation.  Currently thp is split in the middle of
hwpoison_user_mappings(), but there are corner cases where memory_failure()
wrongly tries to handle thp without splitting.

1) "non anonymous" thp, which is not a normal operating mode of thp,
   but a memory error could hit a thp before anon_vma is initialized.  In
   such a case, split_huge_page() fails and me_huge_page() (intended for
   hugetlb) is called for thp, which triggers BUG_ON in page_hstate().

2) !PageLRU case, where hwpoison_user_mappings() returns with
   SWAP_SUCCESS and the result is the same as case 1.

memory_failure() can't avoid splitting, so let's split it earlier, which
also reduces the amount of code that has to handle both normal pages and
thp.

Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Tony Luck <tony.luck@intel.com>
Cc: "Kirill A. Shutemov" <kirill@shutemov.name>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
mm/memory-failure.c

index 8e71b6e641ad92528bd127862382594ba6b9ceda..17a8e3bc3b0151a1ce8194ca42628b922ed23cd1 100644 (file)
@@ -928,7 +928,6 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
        int ret;
        int kill = 1, forcekill;
        struct page *hpage = *hpagep;
-       struct page *ppage;
 
        /*
         * Here we are interested only in user-mapped pages, so skip any
@@ -977,59 +976,6 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
                }
        }
 
-       /*
-        * ppage: poisoned page
-        *   if p is regular page(4k page)
-        *        ppage == real poisoned page;
-        *   else p is hugetlb or THP, ppage == head page.
-        */
-       ppage = hpage;
-
-       if (PageTransHuge(hpage)) {
-               /*
-                * Verify that this isn't a hugetlbfs head page, the check for
-                * PageAnon is just for avoid tripping a split_huge_page
-                * internal debug check, as split_huge_page refuses to deal with
-                * anything that isn't an anon page. PageAnon can't go away fro
-                * under us because we hold a refcount on the hpage, without a
-                * refcount on the hpage. split_huge_page can't be safely called
-                * in the first place, having a refcount on the tail isn't
-                * enough * to be safe.
-                */
-               if (!PageHuge(hpage) && PageAnon(hpage)) {
-                       if (unlikely(split_huge_page(hpage))) {
-                               /*
-                                * FIXME: if splitting THP is failed, it is
-                                * better to stop the following operation rather
-                                * than causing panic by unmapping. System might
-                                * survive if the page is freed later.
-                                */
-                               printk(KERN_INFO
-                                       "MCE %#lx: failed to split THP\n", pfn);
-
-                               BUG_ON(!PageHWPoison(p));
-                               return SWAP_FAIL;
-                       }
-                       /*
-                        * We pinned the head page for hwpoison handling,
-                        * now we split the thp and we are interested in
-                        * the hwpoisoned raw page, so move the refcount
-                        * to it. Similarly, page lock is shifted.
-                        */
-                       if (hpage != p) {
-                               if (!(flags & MF_COUNT_INCREASED)) {
-                                       put_page(hpage);
-                                       get_page(p);
-                               }
-                               lock_page(p);
-                               unlock_page(hpage);
-                               *hpagep = p;
-                       }
-                       /* THP is split, so ppage should be the real poisoned page. */
-                       ppage = p;
-               }
-       }
-
        /*
         * First collect all the processes that have the page
         * mapped in dirty form.  This has to be done before try_to_unmap,
@@ -1039,12 +985,12 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
         * there's nothing that can be done.
         */
        if (kill)
-               collect_procs(ppage, &tokill, flags & MF_ACTION_REQUIRED);
+               collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED);
 
-       ret = try_to_unmap(ppage, ttu);
+       ret = try_to_unmap(hpage, ttu);
        if (ret != SWAP_SUCCESS)
                printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
-                               pfn, page_mapcount(ppage));
+                               pfn, page_mapcount(hpage));
 
        /*
         * Now that the dirty bit has been propagated to the
@@ -1056,7 +1002,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
         * use a more force-full uncatchable kill to prevent
         * any accesses to the poisoned memory.
         */
-       forcekill = PageDirty(ppage) || (flags & MF_MUST_KILL);
+       forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL);
        kill_procs(&tokill, forcekill, trapno,
                      ret != SWAP_SUCCESS, p, pfn, flags);
 
@@ -1102,6 +1048,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
        struct page_state *ps;
        struct page *p;
        struct page *hpage;
+       struct page *orig_head;
        int res;
        unsigned int nr_pages;
        unsigned long page_flags;
@@ -1117,7 +1064,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
        }
 
        p = pfn_to_page(pfn);
-       hpage = compound_head(p);
+       orig_head = hpage = compound_head(p);
        if (TestSetPageHWPoison(p)) {
                printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn);
                return 0;
@@ -1180,6 +1127,21 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
                }
        }
 
+       if (!PageHuge(p) && PageTransHuge(hpage)) {
+               if (!PageAnon(hpage)) {
+                       pr_err("MCE: %#lx: non anonymous thp\n", pfn);
+                       put_page(p);
+                       return -EBUSY;
+               }
+               if (unlikely(split_huge_page(hpage))) {
+                       pr_err("MCE: %#lx: thp split failed\n", pfn);
+                       put_page(p);
+                       return -EBUSY;
+               }
+               VM_BUG_ON_PAGE(!page_count(p), p);
+               hpage = compound_head(p);
+       }
+
        /*
         * We ignore non-LRU pages for good reasons.
         * - PG_locked is only well defined for LRU pages and a few others
@@ -1189,9 +1151,9 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
         * walked by the page reclaim code, however that's not a big loss.
         */
        if (!PageHuge(p)) {
-               if (!PageLRU(hpage))
-                       shake_page(hpage, 0);
-               if (!PageLRU(hpage)) {
+               if (!PageLRU(p))
+                       shake_page(p, 0);
+               if (!PageLRU(p)) {
                        /*
                         * shake_page could have turned it free.
                         */
@@ -1212,7 +1174,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
         * The page could have changed compound pages during the locking.
         * If this happens just bail out.
         */
-       if (compound_head(p) != hpage) {
+       if (PageCompound(p) && compound_head(p) != orig_head) {
                action_result(pfn, MSG_DIFFERENT_COMPOUND, IGNORED);
                res = -EBUSY;
                goto out;