mm: close PageTail race
authorDavid Rientjes <rientjes@google.com>
Mon, 3 Mar 2014 23:38:18 +0000 (15:38 -0800)
committerLinus Torvalds <torvalds@linux-foundation.org>
Tue, 4 Mar 2014 15:55:47 +0000 (07:55 -0800)
Commit bf6bddf1924e ("mm: introduce compaction and migration for
ballooned pages") introduces page_count(page) into memory compaction
which dereferences page->first_page if PageTail(page).

This results in a very rare NULL pointer dereference on the
aforementioned page_count(page).  Indeed, anything that does
compound_head(), including page_count() is susceptible to racing with
prep_compound_page() and seeing a NULL or dangling page->first_page
pointer.

This patch uses Andrea's implementation of compound_trans_head() that
deals with such a race and makes it the default compound_head()
implementation.  This includes a read memory barrier that ensures that
if PageTail(head) is true that we return a head page that is neither
NULL nor dangling.  The patch then adds a store memory barrier to
prep_compound_page() to ensure page->first_page is set.

This is the safest way to ensure we see the head page that we are
expecting, PageTail(page) is already in the unlikely() path and the
memory barriers are unfortunately required.

Hugetlbfs is the exception, we don't enforce a store memory barrier
during init since no race is possible.

Signed-off-by: David Rientjes <rientjes@google.com>
Cc: Holger Kiehl <Holger.Kiehl@dwd.de>
Cc: Christoph Lameter <cl@linux.com>
Cc: Rafael Aquini <aquini@redhat.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
drivers/block/aoe/aoecmd.c
drivers/vfio/vfio_iommu_type1.c
fs/proc/page.c
include/linux/huge_mm.h
include/linux/mm.h
mm/ksm.c
mm/memory-failure.c
mm/page_alloc.c
mm/swap.c

index 8184451b57c04999bd23c286080e14ca7f069031..422b7d84f686b90147f97565210687d343a3d9eb 100644 (file)
@@ -874,7 +874,7 @@ bio_pageinc(struct bio *bio)
                /* Non-zero page count for non-head members of
                 * compound pages is no longer allowed by the kernel.
                 */
-               page = compound_trans_head(bv.bv_page);
+               page = compound_head(bv.bv_page);
                atomic_inc(&page->_count);
        }
 }
@@ -887,7 +887,7 @@ bio_pagedec(struct bio *bio)
        struct bvec_iter iter;
 
        bio_for_each_segment(bv, bio, iter) {
-               page = compound_trans_head(bv.bv_page);
+               page = compound_head(bv.bv_page);
                atomic_dec(&page->_count);
        }
 }
index 4fb7a8f83c8a99ff8d3412a5328b2407c98409a8..54af4e93369583e5629a4bac6d1ecdbd74f885cf 100644 (file)
@@ -186,12 +186,12 @@ static bool is_invalid_reserved_pfn(unsigned long pfn)
        if (pfn_valid(pfn)) {
                bool reserved;
                struct page *tail = pfn_to_page(pfn);
-               struct page *head = compound_trans_head(tail);
+               struct page *head = compound_head(tail);
                reserved = !!(PageReserved(head));
                if (head != tail) {
                        /*
                         * "head" is not a dangling pointer
-                        * (compound_trans_head takes care of that)
+                        * (compound_head takes care of that)
                         * but the hugepage may have been split
                         * from under us (and we may not hold a
                         * reference count on the head page so it can
index 02174a610315ebb218880a8744e05b2244b6d26f..e647c55275d9ff9f96b92ac859e65c10fc03d318 100644 (file)
@@ -121,9 +121,8 @@ u64 stable_page_flags(struct page *page)
         * just checks PG_head/PG_tail, so we need to check PageLRU/PageAnon
         * to make sure a given page is a thp, not a non-huge compound page.
         */
-       else if (PageTransCompound(page) &&
-                (PageLRU(compound_trans_head(page)) ||
-                 PageAnon(compound_trans_head(page))))
+       else if (PageTransCompound(page) && (PageLRU(compound_head(page)) ||
+                                            PageAnon(compound_head(page))))
                u |= 1 << KPF_THP;
 
        /*
index db512014e061b27abae7d689175e82d74348683b..b826239bdce0b26a614ae0802782860348dd6fc6 100644 (file)
@@ -157,46 +157,6 @@ static inline int hpage_nr_pages(struct page *page)
                return HPAGE_PMD_NR;
        return 1;
 }
-/*
- * compound_trans_head() should be used instead of compound_head(),
- * whenever the "page" passed as parameter could be the tail of a
- * transparent hugepage that could be undergoing a
- * __split_huge_page_refcount(). The page structure layout often
- * changes across releases and it makes extensive use of unions. So if
- * the page structure layout will change in a way that
- * page->first_page gets clobbered by __split_huge_page_refcount, the
- * implementation making use of smp_rmb() will be required.
- *
- * Currently we define compound_trans_head as compound_head, because
- * page->private is in the same union with page->first_page, and
- * page->private isn't clobbered. However this also means we're
- * currently leaving dirt into the page->private field of anonymous
- * pages resulting from a THP split, instead of setting page->private
- * to zero like for every other page that has PG_private not set. But
- * anonymous pages don't use page->private so this is not a problem.
- */
-#if 0
-/* This will be needed if page->private will be clobbered in split_huge_page */
-static inline struct page *compound_trans_head(struct page *page)
-{
-       if (PageTail(page)) {
-               struct page *head;
-               head = page->first_page;
-               smp_rmb();
-               /*
-                * head may be a dangling pointer.
-                * __split_huge_page_refcount clears PageTail before
-                * overwriting first_page, so if PageTail is still
-                * there it means the head pointer isn't dangling.
-                */
-               if (PageTail(page))
-                       return head;
-       }
-       return page;
-}
-#else
-#define compound_trans_head(page) compound_head(page)
-#endif
 
 extern int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
                                unsigned long addr, pmd_t pmd, pmd_t *pmdp);
@@ -226,7 +186,6 @@ static inline int split_huge_page(struct page *page)
        do { } while (0)
 #define split_huge_page_pmd_mm(__mm, __address, __pmd) \
        do { } while (0)
-#define compound_trans_head(page) compound_head(page)
 static inline int hugepage_madvise(struct vm_area_struct *vma,
                                   unsigned long *vm_flags, int advice)
 {
index f28f46eade6a642873246fea247f3f5455a18acb..03ab3e58f51119eb2fb446ff4278265f9322bbbc 100644 (file)
@@ -399,8 +399,18 @@ static inline void compound_unlock_irqrestore(struct page *page,
 
 static inline struct page *compound_head(struct page *page)
 {
-       if (unlikely(PageTail(page)))
-               return page->first_page;
+       if (unlikely(PageTail(page))) {
+               struct page *head = page->first_page;
+
+               /*
+                * page->first_page may be a dangling pointer to an old
+                * compound page, so recheck that it is still a tail
+                * page before returning.
+                */
+               smp_rmb();
+               if (likely(PageTail(page)))
+                       return head;
+       }
        return page;
 }
 
index aa4c7c7250c11a95b9676b70c7cc38f987d88717..68710e80994afed815c58b59a1a3cf421df8101f 100644 (file)
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -444,7 +444,7 @@ static void break_cow(struct rmap_item *rmap_item)
 static struct page *page_trans_compound_anon(struct page *page)
 {
        if (PageTransCompound(page)) {
-               struct page *head = compound_trans_head(page);
+               struct page *head = compound_head(page);
                /*
                 * head may actually be splitted and freed from under
                 * us but it's ok here.
index 2f2f34a4e77de18e47bc815e3fd90ace33967e55..90002ea43638cd0dd669d12092837714ec3113f3 100644 (file)
@@ -1651,7 +1651,7 @@ int soft_offline_page(struct page *page, int flags)
 {
        int ret;
        unsigned long pfn = page_to_pfn(page);
-       struct page *hpage = compound_trans_head(page);
+       struct page *hpage = compound_head(page);
 
        if (PageHWPoison(page)) {
                pr_info("soft offline: %#lx page already poisoned\n", pfn);
index e3758a09a009747bd17cb75442d0fcae69a74cc4..3d1bf889465a0cc79ca0e84f254b6044eb309c33 100644 (file)
@@ -369,9 +369,11 @@ void prep_compound_page(struct page *page, unsigned long order)
        __SetPageHead(page);
        for (i = 1; i < nr_pages; i++) {
                struct page *p = page + i;
-               __SetPageTail(p);
                set_page_count(p, 0);
                p->first_page = page;
+               /* Make sure p->first_page is always valid for PageTail() */
+               smp_wmb();
+               __SetPageTail(p);
        }
 }
 
index b31ba67d440ac997a05de372e831046bf98d9070..0092097b3f4ce5e22844759a9e264ef143d05c7c 100644 (file)
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -98,7 +98,7 @@ static void put_compound_page(struct page *page)
        }
 
        /* __split_huge_page_refcount can run under us */
-       page_head = compound_trans_head(page);
+       page_head = compound_head(page);
 
        /*
         * THP can not break up slab pages so avoid taking
@@ -253,7 +253,7 @@ bool __get_page_tail(struct page *page)
         */
        unsigned long flags;
        bool got;
-       struct page *page_head = compound_trans_head(page);
+       struct page *page_head = compound_head(page);
 
        /* Ref to put_compound_page() comment. */
        if (!__compound_tail_refcounted(page_head)) {