mm: numa: Add THP migration for the NUMA working set scanning fault case.
author Mel Gorman <mgorman@suse.de>
Mon, 19 Nov 2012 12:35:47 +0000 (12:35 +0000)
committer Mel Gorman <mgorman@suse.de>
Tue, 11 Dec 2012 14:42:57 +0000 (14:42 +0000)
Note: This is very heavily based on a patch from Peter Zijlstra with
fixes from Ingo Molnar, Hugh Dickins and Johannes Weiner.  That patch
put a lot of migration logic into mm/huge_memory.c where it does
not belong. This version tries to share some of the migration
logic with migrate_misplaced_page.  However, it should be noted
that now migrate.c is doing more with the pagetable manipulation
than is preferred. The end result is barely recognisable so as
before, the signed-offs had to be removed but will be re-added if
the original authors are ok with it.

Add THP migration for the NUMA working set scanning fault case.

It uses the page lock to serialize. No migration pte dance is
necessary because the pte is already unmapped when we decide
to migrate.

[dhillf@gmail.com: Fix memory leak on isolation failure]
[dhillf@gmail.com: Fix transfer of last_nid information]
Signed-off-by: Mel Gorman <mgorman@suse.de>
include/linux/migrate.h
mm/huge_memory.c
mm/internal.h
mm/memcontrol.c
mm/migrate.c

index 91556889adac5ebcef5d916878cc10ce7a943a98..51eac4bdc6067522e98882ca311b24bb17892171 100644 (file)
@@ -79,6 +79,12 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping,
 extern int migrate_misplaced_page(struct page *page, int node);
 extern int migrate_misplaced_page(struct page *page, int node);
 extern bool migrate_ratelimited(int node);
+extern int migrate_misplaced_transhuge_page(struct mm_struct *mm,
+                       struct vm_area_struct *vma,
+                       pmd_t *pmd, pmd_t entry,
+                       unsigned long address,
+                       struct page *page, int node);
+
 #else
 static inline int migrate_misplaced_page(struct page *page, int node)
 {
@@ -88,6 +94,15 @@ static inline bool migrate_ratelimited(int node)
 {
        return false;
 }
+
+static inline int migrate_misplaced_transhuge_page(struct mm_struct *mm,
+                       struct vm_area_struct *vma,
+                       pmd_t *pmd, pmd_t entry,
+                       unsigned long address,
+                       struct page *page, int node)
+{
+       return -EAGAIN; /* stub for !CONFIG_NUMA_BALANCING: never migrates; NOTE(review): nonzero, so a bool caller reads it as true */
+}
 #endif /* CONFIG_NUMA_BALANCING */
 
 #endif /* _LINUX_MIGRATE_H */
index 199b261a257e77811a38e9ec9015068ed6d5f733..711baf84b153f36b5f365d1732a1c1a58b031374 100644 (file)
@@ -600,7 +600,7 @@ out:
 }
 __setup("transparent_hugepage=", setup_transparent_hugepage);
 
-static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
+pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
 {
        if (likely(vma->vm_flags & VM_WRITE))
                pmd = pmd_mkwrite(pmd);
@@ -1023,10 +1023,12 @@ out:
 int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
                                unsigned long addr, pmd_t pmd, pmd_t *pmdp)
 {
-       struct page *page = NULL;
+       struct page *page;
        unsigned long haddr = addr & HPAGE_PMD_MASK;
        int target_nid;
        int current_nid = -1;
+       bool migrated;
+       bool page_locked = false;
 
        spin_lock(&mm->page_table_lock);
        if (unlikely(!pmd_same(pmd, *pmdp)))
@@ -1034,42 +1036,61 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
        page = pmd_page(pmd);
        get_page(page);
-       spin_unlock(&mm->page_table_lock);
        current_nid = page_to_nid(page);
        count_vm_numa_event(NUMA_HINT_FAULTS);
        if (current_nid == numa_node_id())
                count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
 
        target_nid = mpol_misplaced(page, vma, haddr);
-       if (target_nid == -1)
+       if (target_nid == -1) {
+               put_page(page);
                goto clear_pmdnuma;
+       }
 
-       /*
-        * Due to lacking code to migrate thp pages, we'll split
-        * (which preserves the special PROT_NONE) and re-take the
-        * fault on the normal pages.
-        */
-       split_huge_page(page);
-       put_page(page);
-
-       return 0;
+       /* Acquire the page lock to serialise THP migrations */
+       spin_unlock(&mm->page_table_lock);
+       lock_page(page);
+       page_locked = true;
 
-clear_pmdnuma:
+       /* Confirm the PTE did not while locked */
        spin_lock(&mm->page_table_lock);
-       if (unlikely(!pmd_same(pmd, *pmdp)))
+       if (unlikely(!pmd_same(pmd, *pmdp))) {
+               unlock_page(page);
+               put_page(page);
                goto out_unlock;
+       }
+       spin_unlock(&mm->page_table_lock);
+
+       /* Migrate the THP to the requested node */
+       migrated = migrate_misplaced_transhuge_page(mm, vma,
+                               pmdp, pmd, addr,
+                               page, target_nid);
+       if (migrated)
+               current_nid = target_nid;
+       else {
+               spin_lock(&mm->page_table_lock);
+               if (unlikely(!pmd_same(pmd, *pmdp))) {
+                       unlock_page(page);
+                       goto out_unlock;
+               }
+               goto clear_pmdnuma;
+       }
+
+       task_numa_fault(current_nid, HPAGE_PMD_NR, migrated);
+       return 0;
 
+clear_pmdnuma:
        pmd = pmd_mknonnuma(pmd);
        set_pmd_at(mm, haddr, pmdp, pmd);
        VM_BUG_ON(pmd_numa(*pmdp));
        update_mmu_cache_pmd(vma, addr, pmdp);
+       if (page_locked)
+               unlock_page(page);
 
 out_unlock:
        spin_unlock(&mm->page_table_lock);
-       if (page) {
-               put_page(page);
-               task_numa_fault(numa_node_id(), HPAGE_PMD_NR, false);
-       }
+       if (current_nid != -1)
+               task_numa_fault(current_nid, HPAGE_PMD_NR, migrated);
        return 0;
 }
 
index a4fa284f6bc213300942e2006b2161c73ffc2af7..7e60ac826f2b09ec3a0e286f88c2e0391ea12074 100644 (file)
@@ -212,15 +212,18 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page)
 {
        if (TestClearPageMlocked(page)) {
                unsigned long flags;
+               int nr_pages = hpage_nr_pages(page);
 
                local_irq_save(flags);
-               __dec_zone_page_state(page, NR_MLOCK);
+               __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
                SetPageMlocked(newpage);
-               __inc_zone_page_state(newpage, NR_MLOCK);
+               __mod_zone_page_state(page_zone(newpage), NR_MLOCK, nr_pages);
                local_irq_restore(flags);
        }
 }
 
+extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
+
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 extern unsigned long vma_address(struct page *page,
                                 struct vm_area_struct *vma);
index dd39ba000b31f98730c6fd6d22bdf695ced3bedf..d97af9636ab26b1a19b370d408d1e49ef92e4d0b 100644 (file)
@@ -3288,15 +3288,18 @@ void mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
                                  struct mem_cgroup **memcgp)
 {
        struct mem_cgroup *memcg = NULL;
+       unsigned int nr_pages = 1;
        struct page_cgroup *pc;
        enum charge_type ctype;
 
        *memcgp = NULL;
 
-       VM_BUG_ON(PageTransHuge(page));
        if (mem_cgroup_disabled())
                return;
 
+       if (PageTransHuge(page))
+               nr_pages <<= compound_order(page);
+
        pc = lookup_page_cgroup(page);
        lock_page_cgroup(pc);
        if (PageCgroupUsed(pc)) {
@@ -3358,7 +3361,7 @@ void mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
         * charged to the res_counter since we plan on replacing the
         * old one and only one page is going to be left afterwards.
         */
-       __mem_cgroup_commit_charge(memcg, newpage, 1, ctype, false);
+       __mem_cgroup_commit_charge(memcg, newpage, nr_pages, ctype, false);
 }
 
 /* remove redundant charge if migration failed*/
index 2a5ce135eef0304af92ddd675364f2fcbccf8d21..c9400960fd52d78893d54bfe8f5b9548bb3fe4ee 100644 (file)
@@ -410,7 +410,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
  */
 void migrate_page_copy(struct page *newpage, struct page *page)
 {
-       if (PageHuge(page))
+       if (PageHuge(page) || PageTransHuge(page))
                copy_huge_page(newpage, page);
        else
                copy_highpage(newpage, page);
@@ -1491,25 +1491,10 @@ bool migrate_ratelimited(int node)
        return true;
 }
 
-/*
- * Attempt to migrate a misplaced page to the specified destination
- * node. Caller is expected to have an elevated reference count on
- * the page that will be dropped by this function before returning.
- */
-int migrate_misplaced_page(struct page *page, int node)
+/* Returns true if the node is migrate rate-limited after the update */
+bool numamigrate_update_ratelimit(pg_data_t *pgdat)
 {
-       pg_data_t *pgdat = NODE_DATA(node);
-       int isolated = 0;
-       LIST_HEAD(migratepages);
-
-       /*
-        * Don't migrate pages that are mapped in multiple processes.
-        * TODO: Handle false sharing detection instead of this hammer
-        */
-       if (page_mapcount(page) != 1) {
-               put_page(page);
-               goto out;
-       }
+       bool rate_limited = false;
 
        /*
         * Rate-limit the amount of data that is being migrated to a node.
@@ -1522,13 +1507,18 @@ int migrate_misplaced_page(struct page *page, int node)
                pgdat->numabalancing_migrate_next_window = jiffies +
                        msecs_to_jiffies(migrate_interval_millisecs);
        }
-       if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) {
-               spin_unlock(&pgdat->numabalancing_migrate_lock);
-               put_page(page);
-               goto out;
-       }
-       pgdat->numabalancing_migrate_nr_pages++;
+       if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages)
+               rate_limited = true;
+       else
+               pgdat->numabalancing_migrate_nr_pages++;
        spin_unlock(&pgdat->numabalancing_migrate_lock);
+       
+       return rate_limited;
+}
+
+int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
+{
+       int ret = 0;
 
        /* Avoid migrating to a node that is nearly full */
        if (migrate_balanced_pgdat(pgdat, 1)) {
@@ -1536,13 +1526,18 @@ int migrate_misplaced_page(struct page *page, int node)
 
                if (isolate_lru_page(page)) {
                        put_page(page);
-                       goto out;
+                       return 0;
                }
-               isolated = 1;
 
+               /* Page is isolated */
+               ret = 1;
                page_lru = page_is_file_cache(page);
-               inc_zone_page_state(page, NR_ISOLATED_ANON + page_lru);
-               list_add(&page->lru, &migratepages);
+               if (!PageTransHuge(page))
+                       inc_zone_page_state(page, NR_ISOLATED_ANON + page_lru);
+               else
+                       mod_zone_page_state(page_zone(page),
+                                       NR_ISOLATED_ANON + page_lru,
+                                       HPAGE_PMD_NR);
        }
 
        /*
@@ -1555,23 +1550,177 @@ int migrate_misplaced_page(struct page *page, int node)
         */
        put_page(page);
 
-       if (isolated) {
-               int nr_remaining;
-
-               nr_remaining = migrate_pages(&migratepages,
-                               alloc_misplaced_dst_page,
-                               node, false, MIGRATE_ASYNC,
-                               MR_NUMA_MISPLACED);
-               if (nr_remaining) {
-                       putback_lru_pages(&migratepages);
-                       isolated = 0;
-               } else
-                       count_vm_numa_event(NUMA_PAGE_MIGRATE);
+       return ret;
+}
+
+/*
+ * Attempt to migrate a misplaced page to the specified destination node.
+ * The caller's elevated page reference is dropped before returning.
+ * Returns nonzero if the page was isolated and migrated, 0 otherwise.
+ */
+int migrate_misplaced_page(struct page *page, int node)
+{
+       pg_data_t *pgdat = NODE_DATA(node);
+       int isolated = 0;
+       int nr_remaining;
+       LIST_HEAD(migratepages);
+
+       /*
+        * Don't migrate pages that are mapped in multiple processes.
+        * TODO: Handle false sharing detection instead of this hammer
+        */
+       if (page_mapcount(page) != 1) {
+               put_page(page);
+               goto out;
        }
+
+       /*
+        * Rate-limit the amount of data that is being migrated to a node.
+        * Optimal placement is no good if the memory bus is saturated and
+        * all the time is being spent migrating!
+        */
+       if (numamigrate_update_ratelimit(pgdat)) {
+               put_page(page);
+               goto out;
+       }
+
+       isolated = numamigrate_isolate_page(pgdat, page); /* drops the caller's reference on success and on failure */
+       if (!isolated)
+               goto out;
+
+       list_add(&page->lru, &migratepages);
+       nr_remaining = migrate_pages(&migratepages,
+                       alloc_misplaced_dst_page,
+                       node, false, MIGRATE_ASYNC,
+                       MR_NUMA_MISPLACED);
+       if (nr_remaining) {
+               putback_lru_pages(&migratepages);
+               isolated = 0; /* migration failed: report "not migrated" */
+       } else
+               count_vm_numa_event(NUMA_PAGE_MIGRATE);
        BUG_ON(!list_empty(&migratepages));
 out:
        return isolated;
 }
+
+int migrate_misplaced_transhuge_page(struct mm_struct *mm,
+                               struct vm_area_struct *vma,
+                               pmd_t *pmd, pmd_t entry,
+                               unsigned long address,
+                               struct page *page, int node)
+{
+       unsigned long haddr = address & HPAGE_PMD_MASK;
+       pg_data_t *pgdat = NODE_DATA(node);
+       int isolated = 0;       /* return value: nonzero once the page has been isolated */
+       struct page *new_page = NULL;
+       struct mem_cgroup *memcg = NULL;
+       int page_lru = page_is_file_cache(page);
+
+       /*
+        * Don't migrate pages that are mapped in multiple processes.
+        * TODO: Handle false sharing detection instead of this hammer
+        */
+       if (page_mapcount(page) != 1)
+               goto out_dropref;
+
+       /*
+        * Rate-limit the amount of data that is being migrated to a node.
+        * Optimal placement is no good if the memory bus is saturated and
+        * all the time is being spent migrating!
+        */
+       if (numamigrate_update_ratelimit(pgdat))
+               goto out_dropref;
+
+       new_page = alloc_pages_node(node,
+               (GFP_TRANSHUGE | GFP_THISNODE) & ~__GFP_WAIT, HPAGE_PMD_ORDER);
+       if (!new_page)
+               goto out_dropref;
+       page_xchg_last_nid(new_page, page_last_nid(page));
+
+       isolated = numamigrate_isolate_page(pgdat, page);
+       if (!isolated) {
+               put_page(new_page);
+               goto out_keep_locked;   /* numamigrate_isolate_page() already dropped the page reference */
+       }
+
+       /* Prepare a page as a migration target */
+       __set_page_locked(new_page);
+       SetPageSwapBacked(new_page);
+
+       /* anon mapping, we can simply copy page->mapping to the new page: */
+       new_page->mapping = page->mapping;
+       new_page->index = page->index;
+       migrate_page_copy(new_page, page);
+       WARN_ON(PageLRU(new_page));
+
+       /* Recheck the target PMD */
+       spin_lock(&mm->page_table_lock);
+       if (unlikely(!pmd_same(*pmd, entry))) {
+               spin_unlock(&mm->page_table_lock);
+
+               /* Reverse changes made by migrate_page_copy() */
+               if (TestClearPageActive(new_page))
+                       SetPageActive(page);
+               if (TestClearPageUnevictable(new_page))
+                       SetPageUnevictable(page);
+               mlock_migrate_page(page, new_page);
+
+               unlock_page(new_page);
+               put_page(new_page);             /* Free it */
+
+               unlock_page(page);
+               putback_lru_page(page);
+
+               count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
+               goto out;       /* page is already unlocked; the nonzero return stops the caller unlocking it again */
+       }
+
+       /*
+        * Traditional migration needs to prepare the memcg charge
+        * transaction early to prevent the old page from being
+        * uncharged when installing migration entries.  Here we can
+        * save the potential rollback and start the charge transfer
+        * only when migration is already known to end successfully.
+        */
+       mem_cgroup_prepare_migration(page, new_page, &memcg);
+
+       entry = mk_pmd(new_page, vma->vm_page_prot);
+       entry = pmd_mknonnuma(entry);
+       entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+       entry = pmd_mkhuge(entry);
+
+       page_add_new_anon_rmap(new_page, vma, haddr);
+
+       set_pmd_at(mm, haddr, pmd, entry);
+       update_mmu_cache_pmd(vma, address, &entry);     /* expects a pmd_t pointer, not a pmd_t value */
+       page_remove_rmap(page);
+       /*
+        * Finish the charge transaction under the page table lock to
+        * prevent split_huge_page() from dividing up the charge
+        * before it's fully transferred to the new page.
+        */
+       mem_cgroup_end_migration(memcg, page, new_page, true);
+       spin_unlock(&mm->page_table_lock);
+
+       unlock_page(new_page);
+       unlock_page(page);
+       put_page(page);                 /* Drop the rmap reference */
+       put_page(page);                 /* Drop the LRU isolation reference */
+
+       count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR);
+       count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR);
+
+out:
+       mod_zone_page_state(page_zone(page),
+                       NR_ISOLATED_ANON + page_lru,
+                       -HPAGE_PMD_NR);
+       return isolated;
+
+out_dropref:
+       put_page(page);         /* drop the caller's reference; the page stays locked for the caller to unlock */
+out_keep_locked:
+       return 0;
+}
 #endif /* CONFIG_NUMA_BALANCING */
 
 #endif /* CONFIG_NUMA */