mm: hwpoison: dissolve in-use hugepage in unrecoverable memory error
author: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Mon, 10 Jul 2017 22:47:50 +0000 (15:47 -0700)
committer: Linus Torvalds <torvalds@linux-foundation.org>
Mon, 10 Jul 2017 23:32:30 +0000 (16:32 -0700)
Currently me_huge_page() relies on dequeue_hwpoisoned_huge_page() to
keep the error hugepage away from the system, which is OK but not good
enough because the hugepage still has a refcount and unpoison doesn't
work on the error hugepage (PageHWPoison flags are cleared but pages are
still leaked.) And there's the "wasting healthy subpages" issue too.  This
patch reworks me_huge_page() to solve these issues.

For hugetlb files, we recently gained truncation code, so let's use it in
a hugetlbfs-specific ->error_remove_page().

For anonymous hugepage, it's helpful to dissolve the error page after
freeing it into free hugepage list.  Migration entry and PageHWPoison in
the head page prevent the access to it.

TODO: dissolve_free_huge_page() can fail but we haven't considered it yet.
It's not critical (and at least no worse than now) because in such a case
the error hugepage just stays in free hugepage list without being
dissolved.  By virtue of PageHWPoison in head page, it's never allocated
to processes.

[akpm@linux-foundation.org: fix unused var warnings]
Fixes: 23a003bfd23ea9ea0b7756b920e51f64b284b468 ("mm/madvise: pass return code of memory_failure() to userspace")
Link: http://lkml.kernel.org/r/20170417055948.GM31394@yexl-desktop
Link: http://lkml.kernel.org/r/1496305019-5493-8-git-send-email-n-horiguchi@ah.jp.nec.com
Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
fs/hugetlbfs/inode.c
mm/memory-failure.c

index d44f5456eb9baf943186a2d145736c6435f891fb..52388611635e29df0bd6286f3d97b0f1a137498e 100644 (file)
@@ -851,6 +851,16 @@ static int hugetlbfs_migrate_page(struct address_space *mapping,
        return MIGRATEPAGE_SUCCESS;
 }
 
+static int hugetlbfs_error_remove_page(struct address_space *mapping,
+                               struct page *page)
+{
+       struct inode *inode = mapping->host;
+
+       remove_huge_page(page);
+       hugetlb_fix_reserve_counts(inode);
+       return 0;
+}
+
 static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
        struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb);
@@ -966,6 +976,7 @@ static const struct address_space_operations hugetlbfs_aops = {
        .write_end      = hugetlbfs_write_end,
        .set_page_dirty = hugetlbfs_set_page_dirty,
        .migratepage    = hugetlbfs_migrate_page,
+       .error_remove_page      = hugetlbfs_error_remove_page,
 };
 
 
index 5db3827f0d3690025f8d280061352f1231098aea..6f8f69f4a98669dabe2747c598adfd08d57767d5 100644 (file)
@@ -554,6 +554,39 @@ static int delete_from_lru_cache(struct page *p)
        return -EIO;
 }
 
+static int truncate_error_page(struct page *p, unsigned long pfn,
+                               struct address_space *mapping)
+{
+       int ret = MF_FAILED;
+
+       if (mapping->a_ops->error_remove_page) {
+               int err = mapping->a_ops->error_remove_page(mapping, p);
+
+               if (err != 0) {
+                       pr_info("Memory failure: %#lx: Failed to punch page: %d\n",
+                               pfn, err);
+               } else if (page_has_private(p) &&
+                          !try_to_release_page(p, GFP_NOIO)) {
+                       pr_info("Memory failure: %#lx: failed to release buffers\n",
+                               pfn);
+               } else {
+                       ret = MF_RECOVERED;
+               }
+       } else {
+               /*
+                * If the file system doesn't support it just invalidate
+                * This fails on dirty or anything with private pages
+                */
+               if (invalidate_inode_page(p))
+                       ret = MF_RECOVERED;
+               else
+                       pr_info("Memory failure: %#lx: Failed to invalidate\n",
+                               pfn);
+       }
+
+       return ret;
+}
+
 /*
  * Error hit kernel page.
  * Do nothing, try to be lucky and not touch this instead. For a few cases we
@@ -578,8 +611,6 @@ static int me_unknown(struct page *p, unsigned long pfn)
  */
 static int me_pagecache_clean(struct page *p, unsigned long pfn)
 {
-       int err;
-       int ret = MF_FAILED;
        struct address_space *mapping;
 
        delete_from_lru_cache(p);
@@ -611,30 +642,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
         *
         * Open: to take i_mutex or not for this? Right now we don't.
         */
-       if (mapping->a_ops->error_remove_page) {
-               err = mapping->a_ops->error_remove_page(mapping, p);
-               if (err != 0) {
-                       pr_info("Memory failure: %#lx: Failed to punch page: %d\n",
-                               pfn, err);
-               } else if (page_has_private(p) &&
-                               !try_to_release_page(p, GFP_NOIO)) {
-                       pr_info("Memory failure: %#lx: failed to release buffers\n",
-                               pfn);
-               } else {
-                       ret = MF_RECOVERED;
-               }
-       } else {
-               /*
-                * If the file system doesn't support it just invalidate
-                * This fails on dirty or anything with private pages
-                */
-               if (invalidate_inode_page(p))
-                       ret = MF_RECOVERED;
-               else
-                       pr_info("Memory failure: %#lx: Failed to invalidate\n",
-                               pfn);
-       }
-       return ret;
+       return truncate_error_page(p, pfn, mapping);
 }
 
 /*
@@ -740,24 +748,29 @@ static int me_huge_page(struct page *p, unsigned long pfn)
 {
        int res = 0;
        struct page *hpage = compound_head(p);
+       struct address_space *mapping;
 
        if (!PageHuge(hpage))
                return MF_DELAYED;
 
-       /*
-        * We can safely recover from error on free or reserved (i.e.
-        * not in-use) hugepage by dequeuing it from freelist.
-        * To check whether a hugepage is in-use or not, we can't use
-        * page->lru because it can be used in other hugepage operations,
-        * such as __unmap_hugepage_range() and gather_surplus_pages().
-        * So instead we use page_mapping() and PageAnon().
-        */
-       if (!(page_mapping(hpage) || PageAnon(hpage))) {
-               res = dequeue_hwpoisoned_huge_page(hpage);
-               if (!res)
-                       return MF_RECOVERED;
+       mapping = page_mapping(hpage);
+       if (mapping) {
+               res = truncate_error_page(hpage, pfn, mapping);
+       } else {
+               unlock_page(hpage);
+               /*
+                * migration entry prevents later access on error anonymous
+                * hugepage, so we can free and dissolve it into buddy to
+                * save healthy subpages.
+                */
+               if (PageAnon(hpage))
+                       put_page(hpage);
+               dissolve_free_huge_page(p);
+               res = MF_RECOVERED;
+               lock_page(hpage);
        }
-       return MF_DELAYED;
+
+       return res;
 }
 
 /*
@@ -856,7 +869,7 @@ static int page_action(struct page_state *ps, struct page *p,
        count = page_count(p) - 1;
        if (ps->action == me_swapcache_dirty && result == MF_DELAYED)
                count--;
-       if (count != 0) {
+       if (count > 0) {
                pr_err("Memory failure: %#lx: %s still referenced by %d users\n",
                       pfn, action_page_types[ps->type], count);
                result = MF_FAILED;