mm/huge_memory.c: respect FOLL_FORCE/FOLL_COW for thp
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 03a89a2f464bef283770e84ad7186a5cc0915924..3877483a20fd1801b32efe4706f34ba1d7d97a7b 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1166,7 +1166,7 @@ alloc:
 
        if (unlikely(!new_page)) {
                count_vm_event(THP_FAULT_FALLBACK);
-               if (is_huge_zero_pmd(orig_pmd)) {
+               if (!page) {
                        ret = do_huge_pmd_wp_zero_page_fallback(mm, vma,
                                        address, pmd, orig_pmd, haddr);
                } else {
@@ -1190,7 +1190,7 @@ alloc:
                goto out;
        }
 
-       if (is_huge_zero_pmd(orig_pmd))
+       if (!page)
                clear_huge_page(new_page, haddr, HPAGE_PMD_NR);
        else
                copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
@@ -1215,7 +1215,7 @@ alloc:
                page_add_new_anon_rmap(new_page, vma, haddr);
                set_pmd_at(mm, haddr, pmd, entry);
                update_mmu_cache_pmd(vma, address, pmd);
-               if (is_huge_zero_pmd(orig_pmd)) {
+               if (!page) {
                        add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
                        put_huge_zero_page();
                } else {
@@ -1235,6 +1235,18 @@ out_unlock:
        return ret;
 }
 
+/*
+ * FOLL_FORCE can write to even unwritable pmd's, but only
+ * after we've gone through a COW cycle and they are dirty.
+ */
+static inline bool can_follow_write_pmd(pmd_t pmd, struct page *page,
+                                       unsigned int flags)
+{
+       return pmd_write(pmd) ||
+               ((flags & FOLL_FORCE) && (flags & FOLL_COW) &&
+                page && PageAnon(page));
+}
+
 struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
                                   unsigned long addr,
                                   pmd_t *pmd,
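
For illustration only (not part of the patch): the new can_follow_write_pmd() gates forced writes such as ptrace and /proc/pid/mem access, which pass FOLL_FORCE and, after a COW cycle, FOLL_COW. A minimal userspace sketch that exercises this path by writing through /proc/self/mem into a read-only private mapping; MADV_HUGEPAGE is only a best-effort hint, so THP backing is assumed rather than guaranteed:

    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int main(void)
    {
            size_t len = 2UL << 20;         /* one PMD-sized (2 MiB) region */
            char *p = mmap(NULL, len, PROT_READ,
                           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
            if (p == MAP_FAILED) { perror("mmap"); return 1; }
            madvise(p, len, MADV_HUGEPAGE); /* best-effort THP hint */

            int fd = open("/proc/self/mem", O_RDWR);
            if (fd < 0) { perror("open"); return 1; }

            /* The kernel services this with get_user_pages(FOLL_FORCE):
             * it must COW the page first, then permit the write that
             * pmd_write() alone would forbid. */
            if (pwrite(fd, "x", 1, (off_t)(uintptr_t)p) != 1)
                    perror("pwrite");
            printf("first byte: %c\n", *p); /* 'x' on success */
            close(fd);
            return 0;
    }
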
@@ -1245,15 +1257,16 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
 
        assert_spin_locked(&mm->page_table_lock);
 
-       if (flags & FOLL_WRITE && !pmd_write(*pmd))
-               goto out;
-
        /* Avoid dumping huge zero page */
        if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd))
                return ERR_PTR(-EFAULT);
 
        page = pmd_page(*pmd);
        VM_BUG_ON(!PageHead(page));
+
+       if (flags & FOLL_WRITE && !can_follow_write_pmd(*pmd, page, flags))
+               return NULL;
+
        if (flags & FOLL_TOUCH) {
                pmd_t _pmd;
                /*
@@ -1288,64 +1301,104 @@ out:
 int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
                                unsigned long addr, pmd_t pmd, pmd_t *pmdp)
 {
+       struct anon_vma *anon_vma = NULL;
        struct page *page;
        unsigned long haddr = addr & HPAGE_PMD_MASK;
+       int page_nid = -1, this_nid = numa_node_id();
        int target_nid;
-       int current_nid = -1;
-       bool migrated;
+       bool page_locked;
+       bool migrated = false;
 
        spin_lock(&mm->page_table_lock);
        if (unlikely(!pmd_same(pmd, *pmdp)))
                goto out_unlock;
 
        page = pmd_page(pmd);
-       get_page(page);
-       current_nid = page_to_nid(page);
+       page_nid = page_to_nid(page);
        count_vm_numa_event(NUMA_HINT_FAULTS);
-       if (current_nid == numa_node_id())
+       if (page_nid == this_nid)
                count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
 
+       /*
+        * Acquire the page lock to serialise THP migrations but avoid dropping
+        * page_table_lock if at all possible
+        */
+       page_locked = trylock_page(page);
        target_nid = mpol_misplaced(page, vma, haddr);
        if (target_nid == -1) {
-               put_page(page);
-               goto clear_pmdnuma;
+               /* If the page was locked, there are no parallel migrations */
+               if (page_locked)
+                       goto clear_pmdnuma;
+
+               /*
+                * Otherwise wait for potential migrations and retry. We do
+                * relock and check_same as the page may no longer be mapped.
+                * As the fault is being retried, do not account for it.
+                */
+               spin_unlock(&mm->page_table_lock);
+               wait_on_page_locked(page);
+               page_nid = -1;
+               goto out;
        }
 
-       /* Acquire the page lock to serialise THP migrations */
+       /* Page is misplaced, serialise migrations and parallel THP splits */
+       get_page(page);
        spin_unlock(&mm->page_table_lock);
-       lock_page(page);
+       if (!page_locked)
+               lock_page(page);
+       anon_vma = page_lock_anon_vma_read(page);
 
        /* Confirm the PTE did not change while locked */
        spin_lock(&mm->page_table_lock);
        if (unlikely(!pmd_same(pmd, *pmdp))) {
                unlock_page(page);
                put_page(page);
+               page_nid = -1;
                goto out_unlock;
        }
-       spin_unlock(&mm->page_table_lock);
 
-       /* Migrate the THP to the requested node */
+       /* Bail if we fail to protect against THP splits for any reason */
+       if (unlikely(!anon_vma)) {
+               put_page(page);
+               page_nid = -1;
+               goto clear_pmdnuma;
+       }
+
+       /*
+        * The page_table_lock above provides a memory barrier
+        * with change_protection_range.
+        */
+       if (mm_tlb_flush_pending(mm))
+               flush_tlb_range(vma, haddr, haddr + HPAGE_PMD_SIZE);
+
+       /*
+        * Migrate the THP to the requested node, returns with page unlocked
+        * and pmd_numa cleared.
+        */
+       spin_unlock(&mm->page_table_lock);
        migrated = migrate_misplaced_transhuge_page(mm, vma,
                                pmdp, pmd, addr, page, target_nid);
-       if (!migrated)
-               goto check_same;
+       if (migrated)
+               page_nid = target_nid;
 
-       task_numa_fault(target_nid, HPAGE_PMD_NR, true);
-       return 0;
-
-check_same:
-       spin_lock(&mm->page_table_lock);
-       if (unlikely(!pmd_same(pmd, *pmdp)))
-               goto out_unlock;
+       goto out;
 clear_pmdnuma:
+       BUG_ON(!PageLocked(page));
        pmd = pmd_mknonnuma(pmd);
        set_pmd_at(mm, haddr, pmdp, pmd);
        VM_BUG_ON(pmd_numa(*pmdp));
        update_mmu_cache_pmd(vma, addr, pmdp);
+       unlock_page(page);
 out_unlock:
        spin_unlock(&mm->page_table_lock);
-       if (current_nid != -1)
-               task_numa_fault(current_nid, HPAGE_PMD_NR, false);
+
+out:
+       if (anon_vma)
+               page_unlock_anon_vma_read(anon_vma);
+
+       if (page_nid != -1)
+               task_numa_fault(page_nid, HPAGE_PMD_NR, migrated);
+
        return 0;
 }
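
The trylock-then-wait dance above follows a common idiom: opportunistically take a sleeping lock while a spinlock is held, and only on failure drop the spinlock, block, relock, and revalidate. A rough userspace analogue, compiled with -pthread (the mutex stands in for the page lock, the spinlock for page_table_lock, and the generation counter for the pmd_same() recheck; all names here are hypothetical):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_spinlock_t ptl;          /* plays page_table_lock */
    static pthread_mutex_t page_lock = PTHREAD_MUTEX_INITIALIZER;
    static int generation;                  /* plays *pmdp for pmd_same() */

    static void handle_fault(void)
    {
            pthread_spin_lock(&ptl);
            int seen = generation;

            /* Opportunistic: avoid dropping the spinlock if possible. */
            if (pthread_mutex_trylock(&page_lock) != 0) {
                    /* Cannot sleep under a spinlock: drop, block, relock. */
                    pthread_spin_unlock(&ptl);
                    pthread_mutex_lock(&page_lock);
                    pthread_spin_lock(&ptl);
                    if (generation != seen) {       /* like !pmd_same() */
                            pthread_spin_unlock(&ptl);
                            pthread_mutex_unlock(&page_lock);
                            return;                 /* retry the fault */
                    }
            }
            /* ... work serialised against "migration" goes here ... */
            pthread_spin_unlock(&ptl);
            pthread_mutex_unlock(&page_lock);
    }

    int main(void)
    {
            pthread_spin_init(&ptl, PTHREAD_PROCESS_PRIVATE);
            handle_fault();
            puts("ok");
            return 0;
    }
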
 
@@ -1693,21 +1746,24 @@ static int __split_huge_page_map(struct page *page,
        if (pmd) {
                pgtable = pgtable_trans_huge_withdraw(mm);
                pmd_populate(mm, &_pmd, pgtable);
+               if (pmd_write(*pmd))
+                       BUG_ON(page_mapcount(page) != 1);
 
                haddr = address;
                for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
                        pte_t *pte, entry;
                        BUG_ON(PageCompound(page+i));
+                       /*
+                        * Note that pmd_numa is not transferred deliberately
+                        * to avoid any possibility that pte_numa leaks to
+                        * a PROT_NONE VMA by accident.
+                        */
                        entry = mk_pte(page + i, vma->vm_page_prot);
                        entry = maybe_mkwrite(pte_mkdirty(entry), vma);
                        if (!pmd_write(*pmd))
                                entry = pte_wrprotect(entry);
-                       else
-                               BUG_ON(page_mapcount(page) != 1);
                        if (!pmd_young(*pmd))
                                entry = pte_mkold(entry);
-                       if (pmd_numa(*pmd))
-                               entry = pte_mknuma(entry);
                        pte = pte_offset_map(&_pmd, haddr);
                        BUG_ON(!pte_none(*pte));
                        set_pte_at(mm, haddr, pte, entry);
@@ -2286,6 +2342,8 @@ static void collapse_huge_page(struct mm_struct *mm,
                goto out;
 
        vma = find_vma(mm, address);
+       if (!vma)
+               goto out;
        hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
        hend = vma->vm_end & HPAGE_PMD_MASK;
        if (address < hstart || address + HPAGE_PMD_SIZE > hend)
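
The hstart/hend computation above rounds vm_start up and vm_end down to the 2 MiB PMD boundary before checking that a whole huge page fits; the added NULL check matters because find_vma() returns NULL when address lies above every mapping. A standalone sketch of the arithmetic (the addresses are made up for the example):

    #include <stdio.h>

    #define HPAGE_PMD_SIZE  (2UL << 20)
    #define HPAGE_PMD_MASK  (~(HPAGE_PMD_SIZE - 1))

    int main(void)
    {
            unsigned long vm_start = 0x00601234, vm_end = 0x00a00000;
            unsigned long address  = 0x00800000;

            /* Round vm_start up, vm_end down, to the PMD boundary. */
            unsigned long hstart = (vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
            unsigned long hend   = vm_end & HPAGE_PMD_MASK;

            printf("hstart=%#lx hend=%#lx\n", hstart, hend);
            printf("huge page fits: %d\n",
                   address >= hstart && address + HPAGE_PMD_SIZE <= hend);
            return 0;
    }
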
@@ -2325,7 +2383,12 @@ static void collapse_huge_page(struct mm_struct *mm,
                pte_unmap(pte);
                spin_lock(&mm->page_table_lock);
                BUG_ON(!pmd_none(*pmd));
-               set_pmd_at(mm, address, pmd, _pmd);
+               /*
+                * We can only use set_pmd_at when establishing
+                * hugepmds and never for establishing regular pmds that
+                * point to regular pagetables. Use pmd_populate for that.
+                */
+               pmd_populate(mm, pmd, pmd_pgtable(_pmd));
                spin_unlock(&mm->page_table_lock);
                anon_vma_unlock_write(vma->anon_vma);
                goto out;
@@ -2692,6 +2755,7 @@ void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
 
        mmun_start = haddr;
        mmun_end   = haddr + HPAGE_PMD_SIZE;
+again:
        mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
        spin_lock(&mm->page_table_lock);
        if (unlikely(!pmd_trans_huge(*pmd))) {
@@ -2714,7 +2778,14 @@ void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
        split_huge_page(page);
 
        put_page(page);
-       BUG_ON(pmd_trans_huge(*pmd));
+
+       /*
+        * We don't always have down_write of mmap_sem here: a racing
+        * do_huge_pmd_wp_page() might have copied-on-write to another
+        * huge page before our split_huge_page() got the anon_vma lock.
+        */
+       if (unlikely(pmd_trans_huge(*pmd)))
+               goto again;
 }
 
 void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address,
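
Replacing the BUG_ON with a retry reflects that a racing copy-on-write can legitimately reinstall a huge pmd between the split and the check. A toy single-threaded sketch of the retry-instead-of-assert pattern (split(), maybe_refault() and the counters are hypothetical stand-ins, not kernel APIs):

    #include <stdatomic.h>
    #include <stdio.h>

    static _Atomic int huge = 1;    /* plays pmd_trans_huge(*pmd) */
    static int raced;

    static void split(void) { atomic_store(&huge, 0); }

    /* Models do_huge_pmd_wp_page() winning the race exactly once. */
    static void maybe_refault(void)
    {
            static int fired;
            if (!fired) { fired = 1; raced = 1; atomic_store(&huge, 1); }
    }

    int main(void)
    {
            int tries = 0;
            do {                            /* the "again:" loop */
                    split();
                    maybe_refault();
                    tries++;
            } while (atomic_load(&huge));   /* retry instead of BUG_ON */
            printf("split stuck after %d tries (raced=%d)\n", tries, raced);
            return 0;
    }
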