mm/migrate.c

   1 /*
   2  * Memory Migration functionality - linux/mm/migrate.c
   3  *
   4  * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter
   5  *
   6  * Page migration was first developed in the context of the memory hotplug
   7  * project. The main authors of the migration code are:
   8  *
   9  * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
  10  * Hirokazu Takahashi <taka@valinux.co.jp>
  11  * Dave Hansen <haveblue@us.ibm.com>
  12  * Christoph Lameter
  13  */
  14
  15 #include <linux/migrate.h>
  16 #include <linux/export.h>
  17 #include <linux/swap.h>
  18 #include <linux/swapops.h>
  19 #include <linux/pagemap.h>
  20 #include <linux/buffer_head.h>
  21 #include <linux/mm_inline.h>
  22 #include <linux/nsproxy.h>
  23 #include <linux/pagevec.h>
  24 #include <linux/ksm.h>
  25 #include <linux/rmap.h>
  26 #include <linux/topology.h>
  27 #include <linux/cpu.h>
  28 #include <linux/cpuset.h>
  29 #include <linux/writeback.h>
  30 #include <linux/mempolicy.h>
  31 #include <linux/vmalloc.h>
  32 #include <linux/security.h>
  33 #include <linux/backing-dev.h>
  34 #include <linux/memcontrol.h>
  35 #include <linux/syscalls.h>
  36 #include <linux/hugetlb.h>
  37 #include <linux/hugetlb_cgroup.h>
  38 #include <linux/gfp.h>
  39 #include <linux/balloon_compaction.h>
  40 #include <linux/ptrace.h>
  41
  42 #include <asm/tlbflush.h>
  43
  44 #define CREATE_TRACE_POINTS
  45 #include <trace/events/migrate.h>
  46
  47 #include "internal.h"
  48
  49 /*
  50  * migrate_prep() needs to be called before we start compiling a list of pages
  51  * to be migrated using isolate_lru_page(). If scheduling work on other CPUs is
  52  * undesirable, use migrate_prep_local()
  53  */
  54 int migrate_prep(void)
  55 {
  56         /*
  57          * Clear the LRU lists so pages can be isolated.
  58          * Note that pages may be moved off the LRU after we have
  59          * drained them. Those pages will fail to migrate like other
  60          * pages that may be busy.
  61          */
  62         lru_add_drain_all();
  63
  64         return 0;
  65 }
  66
  67 /* Do the necessary work of migrate_prep but not if it involves other CPUs */
  68 int migrate_prep_local(void)
  69 {
  70         lru_add_drain();
  71
  72         return 0;
  73 }
  74
  75 /*
  76  * Add isolated pages on the list back to the LRU under page lock
  77  * to avoid leaking evictable pages back onto unevictable list.
  78  */
  79 void putback_lru_pages(struct list_head *l)
  80 {
  81         struct page *page;
  82         struct page *page2;
  83
  84         list_for_each_entry_safe(page, page2, l, lru) {
  85                 list_del(&page->lru);
  86                 dec_zone_page_state(page, NR_ISOLATED_ANON +
  87                                 page_is_file_cache(page));
  88                         putback_lru_page(page);
  89         }
  90 }
  91
  92 /*
  93  * Put previously isolated pages back onto the appropriate lists
  94  * from where they were once taken off for compaction/migration.
  95  *
  96  * This function shall be used instead of putback_lru_pages(),
  97  * whenever the isolated pageset has been built by isolate_migratepages_range()
  98  */
  99 void putback_movable_pages(struct list_head *l)
 100 {
 101         struct page *page;
 102         struct page *page2;
 103
 104         list_for_each_entry_safe(page, page2, l, lru) {
 105                 list_del(&page->lru);
 106                 dec_zone_page_state(page, NR_ISOLATED_ANON +
 107                                 page_is_file_cache(page));
 108                 if (unlikely(isolated_balloon_page(page)))
 109                         balloon_page_putback(page);
 110                 else
 111                         putback_lru_page(page);
 112         }
 113 }
 114
 115 /*
 116  * Restore a potential migration pte to a working pte entry
 117  */
 118 static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
 119                                  unsigned long addr, void *old)
 120 {
 121         struct mm_struct *mm = vma->vm_mm;
 122         swp_entry_t entry;
 123         pmd_t *pmd;
 124         pte_t *ptep, pte;
 125         spinlock_t *ptl;
 126
 127         if (unlikely(PageHuge(new))) {
 128                 ptep = huge_pte_offset(mm, addr);
 129                 if (!ptep)
 130                         goto out;
 131                 ptl = &mm->page_table_lock;
 132         } else {
 133                 pmd = mm_find_pmd(mm, addr);
 134                 if (!pmd)
 135                         goto out;
 136                 if (pmd_trans_huge(*pmd))
 137                         goto out;
 138
 139                 ptep = pte_offset_map(pmd, addr);
 140
 141                 /*
 142                  * Peek to check is_swap_pte() before taking ptlock?  No, we
 143                  * can race mremap's move_ptes(), which skips anon_vma lock.
 144                  */
 145
 146                 ptl = pte_lockptr(mm, pmd);
 147         }
 148
 149         spin_lock(ptl);
 150         pte = *ptep;
 151         if (!is_swap_pte(pte))
 152                 goto unlock;
 153
 154         entry = pte_to_swp_entry(pte);
 155
 156         if (!is_migration_entry(entry) ||
 157             migration_entry_to_page(entry) != old)
 158                 goto unlock;
 159
 160         get_page(new);
 161         pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
 162         if (is_write_migration_entry(entry))
 163                 pte = pte_mkwrite(pte);
 164 #ifdef CONFIG_HUGETLB_PAGE
 165         if (PageHuge(new)) {
 166                 pte = pte_mkhuge(pte);
 167                 pte = arch_make_huge_pte(pte, vma, new, 0);
 168         }
 169 #endif
 170         flush_dcache_page(new);
 171         set_pte_at(mm, addr, ptep, pte);
 172
 173         if (PageHuge(new)) {
 174                 if (PageAnon(new))
 175                         hugepage_add_anon_rmap(new, vma, addr);
 176                 else
 177                         page_dup_rmap(new);
 178         } else if (PageAnon(new))
 179                 page_add_anon_rmap(new, vma, addr);
 180         else
 181                 page_add_file_rmap(new);
 182
 183         /* No need to invalidate - it was non-present before */
 184         update_mmu_cache(vma, addr, ptep);
 185 unlock:
 186         pte_unmap_unlock(ptep, ptl);
 187 out:
 188         return SWAP_AGAIN;
 189 }
 190
 191 /*
 192  * Get rid of all migration entries and replace them by
 193  * references to the indicated page.
 194  */
 195 static void remove_migration_ptes(struct page *old, struct page *new)
 196 {
 197         rmap_walk(new, remove_migration_pte, old);
 198 }
 199
 200 /*
 201  * Something used the pte of a page under migration. We need to
 202  * get to the page and wait until migration is finished.
 203  * When we return from this function the fault will be retried.
 204  */
 205 static void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
 206                                 spinlock_t *ptl)
 207 {
 208         pte_t pte;
 209         swp_entry_t entry;
 210         struct page *page;
 211
 212         spin_lock(ptl);
 213         pte = *ptep;
 214         if (!is_swap_pte(pte))
 215                 goto out;
 216
 217         entry = pte_to_swp_entry(pte);
 218         if (!is_migration_entry(entry))
 219                 goto out;
 220
 221         page = migration_entry_to_page(entry);
 222
 223         /*
 224          * Once radix-tree replacement of page migration started, page_count
 225          * *must* be zero. And, we don't want to call wait_on_page_locked()
 226          * against a page without get_page().
 227          * So, we use get_page_unless_zero(), here. Even failed, page fault
 228          * will occur again.
 229          */
 230         if (!get_page_unless_zero(page))
 231                 goto out;
 232         pte_unmap_unlock(ptep, ptl);
 233         wait_on_page_locked(page);
 234         put_page(page);
 235         return;
 236 out:
 237         pte_unmap_unlock(ptep, ptl);
 238 }
 239
 240 void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
 241                                 unsigned long address)
 242 {
 243         spinlock_t *ptl = pte_lockptr(mm, pmd);
 244         pte_t *ptep = pte_offset_map(pmd, address);
 245         __migration_entry_wait(mm, ptep, ptl);
 246 }
 247
 248 void migration_entry_wait_huge(struct mm_struct *mm, pte_t *pte)
 249 {
 250         spinlock_t *ptl = &(mm)->page_table_lock;
 251         __migration_entry_wait(mm, pte, ptl);
 252 }
 253
 254 #ifdef CONFIG_BLOCK
 255 /* Returns true if all buffers are successfully locked */
 256 static bool buffer_migrate_lock_buffers(struct buffer_head *head,
 257                                                         enum migrate_mode mode)
 258 {
 259         struct buffer_head *bh = head;
 260
 261         /* Simple case, sync compaction */
 262         if (mode != MIGRATE_ASYNC) {
 263                 do {
 264                         get_bh(bh);
 265                         lock_buffer(bh);
 266                         bh = bh->b_this_page;
 267
 268                 } while (bh != head);
 269
 270                 return true;
 271         }
 272
 273         /* async case, we cannot block on lock_buffer so use trylock_buffer */
 274         do {
 275                 get_bh(bh);
 276                 if (!trylock_buffer(bh)) {
 277                         /*
 278                          * We failed to lock the buffer and cannot stall in
 279                          * async migration. Release the taken locks
 280                          */
 281                         struct buffer_head *failed_bh = bh;
 282                         put_bh(failed_bh);
 283                         bh = head;
 284                         while (bh != failed_bh) {
 285                                 unlock_buffer(bh);
 286                                 put_bh(bh);
 287                                 bh = bh->b_this_page;
 288                         }
 289                         return false;
 290                 }
 291
 292                 bh = bh->b_this_page;
 293         } while (bh != head);
 294         return true;
 295 }
 296 #else
 297 static inline bool buffer_migrate_lock_buffers(struct buffer_head *head,
 298                                                         enum migrate_mode mode)
 299 {
 300         return true;
 301 }
 302 #endif /* CONFIG_BLOCK */
 303
 304 /*
 305  * Replace the page in the mapping.
 306  *
 307  * The number of remaining references must be:
 308  * 1 for anonymous pages without a mapping
 309  * 2 for pages with a mapping
 310  * 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
 311  */
 312 int migrate_page_move_mapping(struct address_space *mapping,
 313                 struct page *newpage, struct page *page,
 314                 struct buffer_head *head, enum migrate_mode mode)
 315 {
 316         struct zone *oldzone, *newzone;
 317         int dirty;
 318         int expected_count = 0;
 319         void **pslot;
 320
 321         if (!mapping) {
 322                 /* Anonymous page without mapping */
 323                 if (page_count(page) != 1)
 324                         return -EAGAIN;
 325                 return MIGRATEPAGE_SUCCESS;
 326         }
 327
 328         oldzone = page_zone(page);
 329         newzone = page_zone(newpage);
 330
 331         spin_lock_irq(&mapping->tree_lock);
 332
 333         pslot = radix_tree_lookup_slot(&mapping->page_tree,
 334                                         page_index(page));
 335
 336         expected_count = 2 + page_has_private(page);
 337         if (page_count(page) != expected_count ||
 338                 radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) {
 339                 spin_unlock_irq(&mapping->tree_lock);
 340                 return -EAGAIN;
 341         }
 342
 343         if (!page_freeze_refs(page, expected_count)) {
 344                 spin_unlock_irq(&mapping->tree_lock);
 345                 return -EAGAIN;
 346         }
 347
 348         /*
 349          * In the async migration case of moving a page with buffers, lock the
 350          * buffers using trylock before the mapping is moved. If the mapping
 351          * was moved, we later failed to lock the buffers and could not move
 352          * the mapping back due to an elevated page count, we would have to
 353          * block waiting on other references to be dropped.
 354          */
 355         if (mode == MIGRATE_ASYNC && head &&
 356                         !buffer_migrate_lock_buffers(head, mode)) {
 357                 page_unfreeze_refs(page, expected_count);
 358                 spin_unlock_irq(&mapping->tree_lock);
 359                 return -EAGAIN;
 360         }
 361
 362         /*
 363          * Now we know that no one else is looking at the page.
 364          */
 365         get_page(newpage);      /* add cache reference */
 366         if (PageSwapCache(page)) {
 367                 SetPageSwapCache(newpage);
 368                 set_page_private(newpage, page_private(page));
 369         }
 370
 371         /* Move dirty while page refs frozen and newpage not yet exposed */
 372         dirty = PageDirty(page);
 373         if (dirty) {
 374                 ClearPageDirty(page);
 375                 SetPageDirty(newpage);
 376         }
 377
 378         radix_tree_replace_slot(pslot, newpage);
 379
 380         /*
 381          * Drop cache reference from old page by unfreezing
 382          * to one less reference.
 383          * We know this isn't the last reference.
 384          */
 385         page_unfreeze_refs(page, expected_count - 1);
 386
 387         spin_unlock(&mapping->tree_lock);
 388         /* Leave irq disabled to prevent preemption while updating stats */
 389
 390         /*
 391          * If moved to a different zone then also account
 392          * the page for that zone. Other VM counters will be
 393          * taken care of when we establish references to the
 394          * new page and drop references to the old page.
 395          *
 396          * Note that anonymous pages are accounted for
 397          * via NR_FILE_PAGES and NR_ANON_PAGES if they
 398          * are mapped to swap space.
 399          */
 400         if (newzone != oldzone) {
 401                 __dec_zone_state(oldzone, NR_FILE_PAGES);
 402                 __inc_zone_state(newzone, NR_FILE_PAGES);
 403                 if (PageSwapBacked(page) && !PageSwapCache(page)) {
 404                         __dec_zone_state(oldzone, NR_SHMEM);
 405                         __inc_zone_state(newzone, NR_SHMEM);
 406                 }
 407                 if (dirty && mapping_cap_account_dirty(mapping)) {
 408                         __dec_zone_state(oldzone, NR_FILE_DIRTY);
 409                         __inc_zone_state(newzone, NR_FILE_DIRTY);
 410                 }
 411         }
 412         local_irq_enable();
 413
 414         return MIGRATEPAGE_SUCCESS;
 415 }
 416 EXPORT_SYMBOL(migrate_page_move_mapping);
 417
 418 /*
 419  * The expected number of remaining references is the same as that
 420  * of migrate_page_move_mapping().
 421  */
 422 int migrate_huge_page_move_mapping(struct address_space *mapping,
 423                                    struct page *newpage, struct page *page)
 424 {
 425         int expected_count;
 426         void **pslot;
 427
 428         if (!mapping) {
 429                 if (page_count(page) != 1)
 430                         return -EAGAIN;
 431                 return MIGRATEPAGE_SUCCESS;
 432         }
 433
 434         spin_lock_irq(&mapping->tree_lock);
 435
 436         pslot = radix_tree_lookup_slot(&mapping->page_tree,
 437                                         page_index(page));
 438
 439         expected_count = 2 + page_has_private(page);
 440         if (page_count(page) != expected_count ||
 441                 radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) {
 442                 spin_unlock_irq(&mapping->tree_lock);
 443                 return -EAGAIN;
 444         }
 445
 446         if (!page_freeze_refs(page, expected_count)) {
 447                 spin_unlock_irq(&mapping->tree_lock);
 448                 return -EAGAIN;
 449         }
 450
 451         get_page(newpage);
 452
 453         radix_tree_replace_slot(pslot, newpage);
 454
 455         page_unfreeze_refs(page, expected_count - 1);
 456
 457         spin_unlock_irq(&mapping->tree_lock);
 458         return MIGRATEPAGE_SUCCESS;
 459 }
 460
 461 /*
 462  * Copy the page to its new location
 463  */
 464 void migrate_page_copy(struct page *newpage, struct page *page)
 465 {
 466         if (PageHuge(page) || PageTransHuge(page))
 467                 copy_huge_page(newpage, page);
 468         else
 469                 copy_highpage(newpage, page);
 470
 471         if (PageError(page))
 472                 SetPageError(newpage);
 473         if (PageReferenced(page))
 474                 SetPageReferenced(newpage);
 475         if (PageUptodate(page))
 476                 SetPageUptodate(newpage);
 477         if (TestClearPageActive(page)) {
 478                 VM_BUG_ON(PageUnevictable(page));
 479                 SetPageActive(newpage);
 480         } else if (TestClearPageUnevictable(page))
 481                 SetPageUnevictable(newpage);
 482         if (PageChecked(page))
 483                 SetPageChecked(newpage);
 484         if (PageMappedToDisk(page))
 485                 SetPageMappedToDisk(newpage);
 486
 487         /* Move dirty on pages not done by migrate_page_move_mapping() */
 488         if (PageDirty(page))
 489                 SetPageDirty(newpage);
 490
 491         mlock_migrate_page(newpage, page);
 492         ksm_migrate_page(newpage, page);
 493         /*
 494          * Please do not reorder this without considering how mm/ksm.c's
 495          * get_ksm_page() depends upon ksm_migrate_page() and PageSwapCache().
 496          */
 497         ClearPageSwapCache(page);
 498         ClearPagePrivate(page);
 499         set_page_private(page, 0);
 500
 501         /*
 502          * If any waiters have accumulated on the new page then
 503          * wake them up.
 504          */
 505         if (PageWriteback(newpage))
 506                 end_page_writeback(newpage);
 507 }
 508 EXPORT_SYMBOL(migrate_page_copy);
 509
 510 /************************************************************
 511  *                    Migration functions
 512  ***********************************************************/
 513
 514 /* Always fail migration. Used for mappings that are not movable */
 515 int fail_migrate_page(struct address_space *mapping,
 516                         struct page *newpage, struct page *page)
 517 {
 518         return -EIO;
 519 }
 520 EXPORT_SYMBOL(fail_migrate_page);
 521
 522 /*
 523  * Common logic to directly migrate a single page suitable for
 524  * pages that do not use PagePrivate/PagePrivate2.
 525  *
 526  * Pages are locked upon entry and exit.
 527  */
 528 int migrate_page(struct address_space *mapping,
 529                 struct page *newpage, struct page *page,
 530                 enum migrate_mode mode)
 531 {
 532         int rc;
 533
 534         BUG_ON(PageWriteback(page));    /* Writeback must be complete */
 535
 536         rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode);
 537
 538         if (rc != MIGRATEPAGE_SUCCESS)
 539                 return rc;
 540
 541         migrate_page_copy(newpage, page);
 542         return MIGRATEPAGE_SUCCESS;
 543 }
 544 EXPORT_SYMBOL(migrate_page);
 545
 546 #ifdef CONFIG_BLOCK
 547 /*
 548  * Migration function for pages with buffers. This function can only be used
 549  * if the underlying filesystem guarantees that no other references to "page"
 550  * exist.
 551  */
 552 int buffer_migrate_page(struct address_space *mapping,
 553                 struct page *newpage, struct page *page, enum migrate_mode mode)
 554 {
 555         struct buffer_head *bh, *head;
 556         int rc;
 557
 558         if (!page_has_buffers(page))
 559                 return migrate_page(mapping, newpage, page, mode);
 560
 561         head = page_buffers(page);
 562
 563         rc = migrate_page_move_mapping(mapping, newpage, page, head, mode);
 564
 565         if (rc != MIGRATEPAGE_SUCCESS)
 566                 return rc;
 567
 568         /*
 569          * In the async case, migrate_page_move_mapping locked the buffers
 570          * with an IRQ-safe spinlock held. In the sync case, the buffers
 571          * need to be locked now
 572          */
 573         if (mode != MIGRATE_ASYNC)
 574                 BUG_ON(!buffer_migrate_lock_buffers(head, mode));
 575
 576         ClearPagePrivate(page);
 577         set_page_private(newpage, page_private(page));
 578         set_page_private(page, 0);
 579         put_page(page);
 580         get_page(newpage);
 581
 582         bh = head;
 583         do {
 584                 set_bh_page(bh, newpage, bh_offset(bh));
 585                 bh = bh->b_this_page;
 586
 587         } while (bh != head);
 588
 589         SetPagePrivate(newpage);
 590
 591         migrate_page_copy(newpage, page);
 592
 593         bh = head;
 594         do {
 595                 unlock_buffer(bh);
 596                 put_bh(bh);
 597                 bh = bh->b_this_page;
 598
 599         } while (bh != head);
 600
 601         return MIGRATEPAGE_SUCCESS;
 602 }
 603 EXPORT_SYMBOL(buffer_migrate_page);
 604 #endif
 605
 606 /*
 607  * Writeback a page to clean the dirty state
 608  */
 609 static int writeout(struct address_space *mapping, struct page *page)
 610 {
 611         struct writeback_control wbc = {
 612                 .sync_mode = WB_SYNC_NONE,
 613                 .nr_to_write = 1,
 614                 .range_start = 0,
 615                 .range_end = LLONG_MAX,
 616                 .for_reclaim = 1
 617         };
 618         int rc;
 619
 620         if (!mapping->a_ops->writepage)
 621                 /* No write method for the address space */
 622                 return -EINVAL;
 623
 624         if (!clear_page_dirty_for_io(page))
 625                 /* Someone else already triggered a write */
 626                 return -EAGAIN;
 627
 628         /*
 629          * A dirty page may imply that the underlying filesystem has
 630          * the page on some queue. So the page must be clean for
 631          * migration. Writeout may mean we loose the lock and the
 632          * page state is no longer what we checked for earlier.
 633          * At this point we know that the migration attempt cannot
 634          * be successful.
 635          */
 636         remove_migration_ptes(page, page);
 637
 638         rc = mapping->a_ops->writepage(page, &wbc);
 639
 640         if (rc != AOP_WRITEPAGE_ACTIVATE)
 641                 /* unlocked. Relock */
 642                 lock_page(page);
 643
 644         return (rc < 0) ? -EIO : -EAGAIN;
 645 }
 646
 647 /*
 648  * Default handling if a filesystem does not provide a migration function.
 649  */
 650 static int fallback_migrate_page(struct address_space *mapping,
 651         struct page *newpage, struct page *page, enum migrate_mode mode)
 652 {
 653         if (PageDirty(page)) {
 654                 /* Only writeback pages in full synchronous migration */
 655                 if (mode != MIGRATE_SYNC)
 656                         return -EBUSY;
 657                 return writeout(mapping, page);
 658         }
 659
 660         /*
 661          * Buffers may be managed in a filesystem specific way.
 662          * We must have no buffers or drop them.
 663          */
 664         if (page_has_private(page) &&
 665             !try_to_release_page(page, GFP_KERNEL))
 666                 return -EAGAIN;
 667
 668         return migrate_page(mapping, newpage, page, mode);
 669 }
 670
 671 /*
 672  * Move a page to a newly allocated page
 673  * The page is locked and all ptes have been successfully removed.
 674  *
 675  * The new page will have replaced the old page if this function
 676  * is successful.
 677  *
 678  * Return value:
 679  *   < 0 - error code
 680  *  MIGRATEPAGE_SUCCESS - success
 681  */
 682 static int move_to_new_page(struct page *newpage, struct page *page,
 683                                 int remap_swapcache, enum migrate_mode mode)
 684 {
 685         struct address_space *mapping;
 686         int rc;
 687
 688         /*
 689          * Block others from accessing the page when we get around to
 690          * establishing additional references. We are the only one
 691          * holding a reference to the new page at this point.
 692          */
 693         if (!trylock_page(newpage))
 694                 BUG();
 695
 696         /* Prepare mapping for the new page.*/
 697         newpage->index = page->index;
 698         newpage->mapping = page->mapping;
 699         if (PageSwapBacked(page))
 700                 SetPageSwapBacked(newpage);
 701
 702         mapping = page_mapping(page);
 703         if (!mapping)
 704                 rc = migrate_page(mapping, newpage, page, mode);
 705         else if (mapping->a_ops->migratepage)
 706                 /*
 707                  * Most pages have a mapping and most filesystems provide a
 708                  * migratepage callback. Anonymous pages are part of swap
 709                  * space which also has its own migratepage callback. This
 710                  * is the most common path for page migration.
 711                  */
 712                 rc = mapping->a_ops->migratepage(mapping,
 713                                                 newpage, page, mode);
 714         else
 715                 rc = fallback_migrate_page(mapping, newpage, page, mode);
 716
 717         if (rc != MIGRATEPAGE_SUCCESS) {
 718                 newpage->mapping = NULL;
 719         } else {
 720                 if (remap_swapcache)
 721                         remove_migration_ptes(page, newpage);
 722                 page->mapping = NULL;
 723         }
 724
 725         unlock_page(newpage);
 726
 727         return rc;
 728 }
 729
 730 static int __unmap_and_move(struct page *page, struct page *newpage,
 731                                 int force, enum migrate_mode mode)
 732 {
 733         int rc = -EAGAIN;
 734         int remap_swapcache = 1;
 735         struct mem_cgroup *mem;
 736         struct anon_vma *anon_vma = NULL;
 737
 738         if (!trylock_page(page)) {
 739                 if (!force || mode == MIGRATE_ASYNC)
 740                         goto out;
 741
 742                 /*
 743                  * It's not safe for direct compaction to call lock_page.
 744                  * For example, during page readahead pages are added locked
 745                  * to the LRU. Later, when the IO completes the pages are
 746                  * marked uptodate and unlocked. However, the queueing
 747                  * could be merging multiple pages for one bio (e.g.
 748                  * mpage_readpages). If an allocation happens for the
 749                  * second or third page, the process can end up locking
 750                  * the same page twice and deadlocking. Rather than
 751                  * trying to be clever about what pages can be locked,
 752                  * avoid the use of lock_page for direct compaction
 753                  * altogether.
 754                  */
 755                 if (current->flags & PF_MEMALLOC)
 756                         goto out;
 757
 758                 lock_page(page);
 759         }
 760
 761         /* charge against new page */
 762         mem_cgroup_prepare_migration(page, newpage, &mem);
 763
 764         if (PageWriteback(page)) {
 765                 /*
 766                  * Only in the case of a full synchronous migration is it
 767                  * necessary to wait for PageWriteback. In the async case,
 768                  * the retry loop is too short and in the sync-light case,
 769                  * the overhead of stalling is too much
 770                  */
 771                 if (mode != MIGRATE_SYNC) {
 772                         rc = -EBUSY;
 773                         goto uncharge;
 774                 }
 775                 if (!force)
 776                         goto uncharge;
 777                 wait_on_page_writeback(page);
 778         }
 779         /*
 780          * By try_to_unmap(), page->mapcount goes down to 0 here. In this case,
 781          * we cannot notice that anon_vma is freed while we migrates a page.
 782          * This get_anon_vma() delays freeing anon_vma pointer until the end
 783          * of migration. File cache pages are no problem because of page_lock()
 784          * File Caches may use write_page() or lock_page() in migration, then,
 785          * just care Anon page here.
 786          */
 787         if (PageAnon(page) && !PageKsm(page)) {
 788                 /*
 789                  * Only page_lock_anon_vma_read() understands the subtleties of
 790                  * getting a hold on an anon_vma from outside one of its mms.
 791                  */
 792                 anon_vma = page_get_anon_vma(page);
 793                 if (anon_vma) {
 794                         /*
 795                          * Anon page
 796                          */
 797                 } else if (PageSwapCache(page)) {
 798                         /*
 799                          * We cannot be sure that the anon_vma of an unmapped
 800                          * swapcache page is safe to use because we don't
 801                          * know in advance if the VMA that this page belonged
 802                          * to still exists. If the VMA and others sharing the
 803                          * data have been freed, then the anon_vma could
 804                          * already be invalid.
 805                          *
 806                          * To avoid this possibility, swapcache pages get
 807                          * migrated but are not remapped when migration
 808                          * completes
 809                          */
 810                         remap_swapcache = 0;
 811                 } else {
 812                         goto uncharge;
 813                 }
 814         }
 815
 816         if (unlikely(balloon_page_movable(page))) {
 817                 /*
 818                  * A ballooned page does not need any special attention from
 819                  * physical to virtual reverse mapping procedures.
 820                  * Skip any attempt to unmap PTEs or to remap swap cache,
 821                  * in order to avoid burning cycles at rmap level, and perform
 822                  * the page migration right away (proteced by page lock).
 823                  */
 824                 rc = balloon_page_migrate(newpage, page, mode);
 825                 goto uncharge;
 826         }
 827
 828         /*
 829          * Corner case handling:
 830          * 1. When a new swap-cache page is read into, it is added to the LRU
 831          * and treated as swapcache but it has no rmap yet.
 832          * Calling try_to_unmap() against a page->mapping==NULL page will
 833          * trigger a BUG.  So handle it here.
 834          * 2. An orphaned page (see truncate_complete_page) might have
 835          * fs-private metadata. The page can be picked up due to memory
 836          * offlining.  Everywhere else except page reclaim, the page is
 837          * invisible to the vm, so the page can not be migrated.  So try to
 838          * free the metadata, so the page can be freed.
 839          */
 840         if (!page->mapping) {
 841                 VM_BUG_ON(PageAnon(page));
 842                 if (page_has_private(page)) {
 843                         try_to_free_buffers(page);
 844                         goto uncharge;
 845                 }
 846                 goto skip_unmap;
 847         }
 848
 849         /* Establish migration ptes or remove ptes */
 850         try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
 851
 852 skip_unmap:
 853         if (!page_mapped(page))
 854                 rc = move_to_new_page(newpage, page, remap_swapcache, mode);
 855
 856         if (rc && remap_swapcache)
 857                 remove_migration_ptes(page, page);
 858
 859         /* Drop an anon_vma reference if we took one */
 860         if (anon_vma)
 861                 put_anon_vma(anon_vma);
 862
 863 uncharge:
 864         mem_cgroup_end_migration(mem, page, newpage,
 865                                  (rc == MIGRATEPAGE_SUCCESS ||
 866                                   rc == MIGRATEPAGE_BALLOON_SUCCESS));
 867         unlock_page(page);
 868 out:
 869         return rc;
 870 }
 871
 872 /*
 873  * Obtain the lock on page, remove all ptes and migrate the page
 874  * to the newly allocated page in newpage.
 875  */
 876 static int unmap_and_move(new_page_t get_new_page, unsigned long private,
 877                         struct page *page, int force, enum migrate_mode mode)
 878 {
 879         int rc = 0;
 880         int *result = NULL;
 881         struct page *newpage = get_new_page(page, private, &result);
 882
 883         if (!newpage)
 884                 return -ENOMEM;
 885
 886         if (page_count(page) == 1) {
 887                 /* page was freed from under us. So we are done. */
 888                 goto out;
 889         }
 890
 891         if (unlikely(PageTransHuge(page)))
 892                 if (unlikely(split_huge_page(page)))
 893                         goto out;
 894
 895         rc = __unmap_and_move(page, newpage, force, mode);
 896
 897         if (unlikely(rc == MIGRATEPAGE_BALLOON_SUCCESS)) {
 898                 /*
 899                  * A ballooned page has been migrated already.
 900                  * Now, it's the time to wrap-up counters,
 901                  * handle the page back to Buddy and return.
 902                  */
 903                 dec_zone_page_state(page, NR_ISOLATED_ANON +
 904                                     page_is_file_cache(page));
 905                 balloon_page_free(page);
 906                 return MIGRATEPAGE_SUCCESS;
 907         }
 908 out:
 909         if (rc != -EAGAIN) {
 910                 /*
 911                  * A page that has been migrated has all references
 912                  * removed and will be freed. A page that has not been
 913                  * migrated will have kepts its references and be
 914                  * restored.
 915                  */
 916                 list_del(&page->lru);
 917                 dec_zone_page_state(page, NR_ISOLATED_ANON +
 918                                 page_is_file_cache(page));
 919                 putback_lru_page(page);
 920         }
 921         /*
 922          * Move the new page to the LRU. If migration was not successful
 923          * then this will free the page.
 924          */
 925         putback_lru_page(newpage);
 926         if (result) {
 927                 if (rc)
 928                         *result = rc;
 929                 else
 930                         *result = page_to_nid(newpage);
 931         }
 932         return rc;
 933 }
 934
 935 /*
 936  * Counterpart of unmap_and_move_page() for hugepage migration.
 937  *
 938  * This function doesn't wait the completion of hugepage I/O
 939  * because there is no race between I/O and migration for hugepage.
 940  * Note that currently hugepage I/O occurs only in direct I/O
 941  * where no lock is held and PG_writeback is irrelevant,
 942  * and writeback status of all subpages are counted in the reference
 943  * count of the head page (i.e. if all subpages of a 2MB hugepage are
 944  * under direct I/O, the reference of the head page is 512 and a bit more.)
 945  * This means that when we try to migrate hugepage whose subpages are
 946  * doing direct I/O, some references remain after try_to_unmap() and
 947  * hugepage migration fails without data corruption.
 948  *
 949  * There is also no race when direct I/O is issued on the page under migration,
 950  * because then pte is replaced with migration swap entry and direct I/O code
 951  * will wait in the page fault for migration to complete.
 952  */
 953 static int unmap_and_move_huge_page(new_page_t get_new_page,
 954                                 unsigned long private, struct page *hpage,
 955                                 int force, enum migrate_mode mode)
 956 {
 957         int rc = 0;
 958         int *result = NULL;
 959         struct page *new_hpage = get_new_page(hpage, private, &result);
 960         struct anon_vma *anon_vma = NULL;
 961
 962         if (!new_hpage)
 963                 return -ENOMEM;
 964
 965         rc = -EAGAIN;
 966
 967         if (!trylock_page(hpage)) {
 968                 if (!force || mode != MIGRATE_SYNC)
 969                         goto out;
 970                 lock_page(hpage);
 971         }
 972
 973         if (PageAnon(hpage))
 974                 anon_vma = page_get_anon_vma(hpage);
 975
 976         try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
 977
 978         if (!page_mapped(hpage))
 979                 rc = move_to_new_page(new_hpage, hpage, 1, mode);
 980
 981         if (rc)
 982                 remove_migration_ptes(hpage, hpage);
 983
 984         if (anon_vma)
 985                 put_anon_vma(anon_vma);
 986
 987         if (!rc)
 988                 hugetlb_cgroup_migrate(hpage, new_hpage);
 989
 990         unlock_page(hpage);
 991 out:
 992         put_page(new_hpage);
 993         if (result) {
 994                 if (rc)
 995                         *result = rc;
 996                 else
 997                         *result = page_to_nid(new_hpage);
 998         }
 999         return rc;
1000 }
1001
1002 /*
1003  * migrate_pages - migrate the pages specified in a list, to the free pages
1004  *                 supplied as the target for the page migration
1005  *
1006  * @from:               The list of pages to be migrated.
1007  * @get_new_page:       The function used to allocate free pages to be used
1008  *                      as the target of the page migration.
1009  * @private:            Private data to be passed on to get_new_page()
1010  * @mode:               The migration mode that specifies the constraints for
1011  *                      page migration, if any.
1012  * @reason:             The reason for page migration.
1013  *
1014  * The function returns after 10 attempts or if no pages are movable any more
1015  * because the list has become empty or no retryable pages exist any more.
1016  * The caller should call putback_lru_pages() to return pages to the LRU
1017  * or free list only if ret != 0.
1018  *
1019  * Returns the number of pages that were not migrated, or an error code.
1020  */
1021 int migrate_pages(struct list_head *from, new_page_t get_new_page,
1022                 unsigned long private, enum migrate_mode mode, int reason)
1023 {
1024         int retry = 1;
1025         int nr_failed = 0;
1026         int nr_succeeded = 0;
1027         int pass = 0;
1028         struct page *page;
1029         struct page *page2;
1030         int swapwrite = current->flags & PF_SWAPWRITE;
1031         int rc;
1032
1033         if (!swapwrite)
1034                 current->flags |= PF_SWAPWRITE;
1035
1036         for(pass = 0; pass < 10 && retry; pass++) {
1037                 retry = 0;
1038
1039                 list_for_each_entry_safe(page, page2, from, lru) {
1040                         cond_resched();
1041
1042                         rc = unmap_and_move(get_new_page, private,
1043                                                 page, pass > 2, mode);
1044
1045                         switch(rc) {
1046                         case -ENOMEM:
1047                                 goto out;
1048                         case -EAGAIN:
1049                                 retry++;
1050                                 break;
1051                         case MIGRATEPAGE_SUCCESS:
1052                                 nr_succeeded++;
1053                                 break;
1054                         default:
1055                                 /* Permanent failure */
1056                                 nr_failed++;
1057                                 break;
1058                         }
1059                 }
1060         }
1061         rc = nr_failed + retry;
1062 out:
1063         if (nr_succeeded)
1064                 count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
1065         if (nr_failed)
1066                 count_vm_events(PGMIGRATE_FAIL, nr_failed);
1067         trace_mm_migrate_pages(nr_succeeded, nr_failed, mode, reason);
1068
1069         if (!swapwrite)
1070                 current->flags &= ~PF_SWAPWRITE;
1071
1072         return rc;
1073 }
1074
1075 int migrate_huge_page(struct page *hpage, new_page_t get_new_page,
1076                       unsigned long private, enum migrate_mode mode)
1077 {
1078         int pass, rc;
1079
1080         for (pass = 0; pass < 10; pass++) {
1081                 rc = unmap_and_move_huge_page(get_new_page, private,
1082                                                 hpage, pass > 2, mode);
1083                 switch (rc) {
1084                 case -ENOMEM:
1085                         goto out;
1086                 case -EAGAIN:
1087                         /* try again */
1088                         cond_resched();
1089                         break;
1090                 case MIGRATEPAGE_SUCCESS:
1091                         goto out;
1092                 default:
1093                         rc = -EIO;
1094                         goto out;
1095                 }
1096         }
1097 out:
1098         return rc;
1099 }
1100
1101 #ifdef CONFIG_NUMA
/*
 * Move a list of individual pages.
 *
 * One request entry per page. Arrays of these entries are terminated by an
 * entry whose node is MAX_NUMNODES.
 */
struct page_to_node {
        unsigned long addr;     /* virtual address of the page to be moved */
        struct page *page;      /* page found at addr; key used by new_page_node() */
        int node;               /* target node, or MAX_NUMNODES as end marker */
        int status;             /* per-page result: node id or negative errno */
};
1111
1112 static struct page *new_page_node(struct page *p, unsigned long private,
1113                 int **result)
1114 {
1115         struct page_to_node *pm = (struct page_to_node *)private;
1116
1117         while (pm->node != MAX_NUMNODES && pm->page != p)
1118                 pm++;
1119
1120         if (pm->node == MAX_NUMNODES)
1121                 return NULL;
1122
1123         *result = &pm->status;
1124
1125         return alloc_pages_exact_node(pm->node,
1126                                 GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0);
1127 }
1128
1129 /*
1130  * Move a set of pages as indicated in the pm array. The addr
1131  * field must be set to the virtual address of the page to be moved
1132  * and the node number must contain a valid target node.
1133  * The pm array ends with node = MAX_NUMNODES.
1134  */
1135 static int do_move_page_to_node_array(struct mm_struct *mm,
1136                                       struct page_to_node *pm,
1137                                       int migrate_all)
1138 {
1139         int err;
1140         struct page_to_node *pp;
1141         LIST_HEAD(pagelist);
1142
1143         down_read(&mm->mmap_sem);
1144
1145         /*
1146          * Build a list of pages to migrate
1147          */
1148         for (pp = pm; pp->node != MAX_NUMNODES; pp++) {
1149                 struct vm_area_struct *vma;
1150                 struct page *page;
1151
1152                 err = -EFAULT;
1153                 vma = find_vma(mm, pp->addr);
1154                 if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma))
1155                         goto set_status;
1156
1157                 page = follow_page(vma, pp->addr, FOLL_GET|FOLL_SPLIT);
1158
1159                 err = PTR_ERR(page);
1160                 if (IS_ERR(page))
1161                         goto set_status;
1162
1163                 err = -ENOENT;
1164                 if (!page)
1165                         goto set_status;
1166
1167                 /* Use PageReserved to check for zero page */
1168                 if (PageReserved(page))
1169                         goto put_and_set;
1170
1171                 pp->page = page;
1172                 err = page_to_nid(page);
1173
1174                 if (err == pp->node)
1175                         /*
1176                          * Node already in the right place
1177                          */
1178                         goto put_and_set;
1179
1180                 err = -EACCES;
1181                 if (page_mapcount(page) > 1 &&
1182                                 !migrate_all)
1183                         goto put_and_set;
1184
1185                 err = isolate_lru_page(page);
1186                 if (!err) {
1187                         list_add_tail(&page->lru, &pagelist);
1188                         inc_zone_page_state(page, NR_ISOLATED_ANON +
1189                                             page_is_file_cache(page));
1190                 }
1191 put_and_set:
1192                 /*
1193                  * Either remove the duplicate refcount from
1194                  * isolate_lru_page() or drop the page ref if it was
1195                  * not isolated.
1196                  */
1197                 put_page(page);
1198 set_status:
1199                 pp->status = err;
1200         }
1201
1202         err = 0;
1203         if (!list_empty(&pagelist)) {
1204                 err = migrate_pages(&pagelist, new_page_node,
1205                                 (unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL);
1206                 if (err)
1207                         putback_lru_pages(&pagelist);
1208         }
1209
1210         up_read(&mm->mmap_sem);
1211         return err;
1212 }
1213
1214 /*
1215  * Migrate an array of page address onto an array of nodes and fill
1216  * the corresponding array of status.
1217  */
1218 static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
1219                          unsigned long nr_pages,
1220                          const void __user * __user *pages,
1221                          const int __user *nodes,
1222                          int __user *status, int flags)
1223 {
1224         struct page_to_node *pm;
1225         unsigned long chunk_nr_pages;
1226         unsigned long chunk_start;
1227         int err;
1228
1229         err = -ENOMEM;
1230         pm = (struct page_to_node *)__get_free_page(GFP_KERNEL);
1231         if (!pm)
1232                 goto out;
1233
1234         migrate_prep();
1235
1236         /*
1237          * Store a chunk of page_to_node array in a page,
1238          * but keep the last one as a marker
1239          */
1240         chunk_nr_pages = (PAGE_SIZE / sizeof(struct page_to_node)) - 1;
1241
1242         for (chunk_start = 0;
1243              chunk_start < nr_pages;
1244              chunk_start += chunk_nr_pages) {
1245                 int j;
1246
1247                 if (chunk_start + chunk_nr_pages > nr_pages)
1248                         chunk_nr_pages = nr_pages - chunk_start;
1249
1250                 /* fill the chunk pm with addrs and nodes from user-space */
1251                 for (j = 0; j < chunk_nr_pages; j++) {
1252                         const void __user *p;
1253                         int node;
1254
1255                         err = -EFAULT;
1256                         if (get_user(p, pages + j + chunk_start))
1257                                 goto out_pm;
1258                         pm[j].addr = (unsigned long) p;
1259
1260                         if (get_user(node, nodes + j + chunk_start))
1261                                 goto out_pm;
1262
1263                         err = -ENODEV;
1264                         if (node < 0 || node >= MAX_NUMNODES)
1265                                 goto out_pm;
1266
1267                         if (!node_state(node, N_MEMORY))
1268                                 goto out_pm;
1269
1270                         err = -EACCES;
1271                         if (!node_isset(node, task_nodes))
1272                                 goto out_pm;
1273
1274                         pm[j].node = node;
1275                 }
1276
1277                 /* End marker for this chunk */
1278                 pm[chunk_nr_pages].node = MAX_NUMNODES;
1279
1280                 /* Migrate this chunk */
1281                 err = do_move_page_to_node_array(mm, pm,
1282                                                  flags & MPOL_MF_MOVE_ALL);
1283                 if (err < 0)
1284                         goto out_pm;
1285
1286                 /* Return status information */
1287                 for (j = 0; j < chunk_nr_pages; j++)
1288                         if (put_user(pm[j].status, status + j + chunk_start)) {
1289                                 err = -EFAULT;
1290                                 goto out_pm;
1291                         }
1292         }
1293         err = 0;
1294
1295 out_pm:
1296         free_page((unsigned long)pm);
1297 out:
1298         return err;
1299 }
1300
1301 /*
1302  * Determine the nodes of an array of pages and store it in an array of status.
1303  */
1304 static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
1305                                 const void __user **pages, int *status)
1306 {
1307         unsigned long i;
1308
1309         down_read(&mm->mmap_sem);
1310
1311         for (i = 0; i < nr_pages; i++) {
1312                 unsigned long addr = (unsigned long)(*pages);
1313                 struct vm_area_struct *vma;
1314                 struct page *page;
1315                 int err = -EFAULT;
1316
1317                 vma = find_vma(mm, addr);
1318                 if (!vma || addr < vma->vm_start)
1319                         goto set_status;
1320
1321                 page = follow_page(vma, addr, 0);
1322
1323                 err = PTR_ERR(page);
1324                 if (IS_ERR(page))
1325                         goto set_status;
1326
1327                 err = -ENOENT;
1328                 /* Use PageReserved to check for zero page */
1329                 if (!page || PageReserved(page))
1330                         goto set_status;
1331
1332                 err = page_to_nid(page);
1333 set_status:
1334                 *status = err;
1335
1336                 pages++;
1337                 status++;
1338         }
1339
1340         up_read(&mm->mmap_sem);
1341 }
1342
1343 /*
1344  * Determine the nodes of a user array of pages and store it in
1345  * a user array of status.
1346  */
1347 static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
1348                          const void __user * __user *pages,
1349                          int __user *status)
1350 {
1351 #define DO_PAGES_STAT_CHUNK_NR 16
1352         const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR];
1353         int chunk_status[DO_PAGES_STAT_CHUNK_NR];
1354
1355         while (nr_pages) {
1356                 unsigned long chunk_nr;
1357
1358                 chunk_nr = nr_pages;
1359                 if (chunk_nr > DO_PAGES_STAT_CHUNK_NR)
1360                         chunk_nr = DO_PAGES_STAT_CHUNK_NR;
1361
1362                 if (copy_from_user(chunk_pages, pages, chunk_nr * sizeof(*chunk_pages)))
1363                         break;
1364
1365                 do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status);
1366
1367                 if (copy_to_user(status, chunk_status, chunk_nr * sizeof(*status)))
1368                         break;
1369
1370                 pages += chunk_nr;
1371                 status += chunk_nr;
1372                 nr_pages -= chunk_nr;
1373         }
1374         return nr_pages ? -EFAULT : 0;
1375 }
1376
1377 /*
1378  * Move a list of pages in the address space of the currently executing
1379  * process.
1380  */
1381 SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
1382                 const void __user * __user *, pages,
1383                 const int __user *, nodes,
1384                 int __user *, status, int, flags)
1385 {
1386         struct task_struct *task;
1387         struct mm_struct *mm;
1388         int err;
1389         nodemask_t task_nodes;
1390
1391         /* Check flags */
1392         if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
1393                 return -EINVAL;
1394
1395         if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1396                 return -EPERM;
1397
1398         /* Find the mm_struct */
1399         rcu_read_lock();
1400         task = pid ? find_task_by_vpid(pid) : current;
1401         if (!task) {
1402                 rcu_read_unlock();
1403                 return -ESRCH;
1404         }
1405         get_task_struct(task);
1406
1407         /*
1408          * Check if this process has the right to modify the specified
1409          * process. Use the regular "ptrace_may_access()" checks.
1410          */
1411         if (!ptrace_may_access(task, PTRACE_MODE_READ)) {
1412                 rcu_read_unlock();
1413                 err = -EPERM;
1414                 goto out;
1415         }
1416         rcu_read_unlock();
1417
1418         err = security_task_movememory(task);
1419         if (err)
1420                 goto out;
1421
1422         task_nodes = cpuset_mems_allowed(task);
1423         mm = get_task_mm(task);
1424         put_task_struct(task);
1425
1426         if (!mm)
1427                 return -EINVAL;
1428
1429         if (nodes)
1430                 err = do_pages_move(mm, task_nodes, nr_pages, pages,
1431                                     nodes, status, flags);
1432         else
1433                 err = do_pages_stat(mm, nr_pages, pages, status);
1434
1435         mmput(mm);
1436         return err;
1437
1438 out:
1439         put_task_struct(task);
1440         return err;
1441 }
1442
1443 /*
1444  * Call migration functions in the vma_ops that may prepare
1445  * memory in a vm for migration. migration functions may perform
1446  * the migration for vmas that do not have an underlying page struct.
1447  */
1448 int migrate_vmas(struct mm_struct *mm, const nodemask_t *to,
1449         const nodemask_t *from, unsigned long flags)
1450 {
1451         struct vm_area_struct *vma;
1452         int err = 0;
1453
1454         for (vma = mm->mmap; vma && !err; vma = vma->vm_next) {
1455                 if (vma->vm_ops && vma->vm_ops->migrate) {
1456                         err = vma->vm_ops->migrate(vma, to, from, flags);
1457                         if (err)
1458                                 break;
1459                 }
1460         }
1461         return err;
1462 }
1463
1464 #ifdef CONFIG_NUMA_BALANCING
1465 /*
1466  * Returns true if this is a safe migration target node for misplaced NUMA
1467  * pages. Currently it only checks the watermarks which crude
1468  */
1469 static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
1470                                    unsigned long nr_migrate_pages)
1471 {
1472         int z;
1473         for (z = pgdat->nr_zones - 1; z >= 0; z--) {
1474                 struct zone *zone = pgdat->node_zones + z;
1475
1476                 if (!populated_zone(zone))
1477                         continue;
1478
1479                 if (zone->all_unreclaimable)
1480                         continue;
1481
1482                 /* Avoid waking kswapd by allocating pages_to_migrate pages. */
1483                 if (!zone_watermark_ok(zone, 0,
1484                                        high_wmark_pages(zone) +
1485                                        nr_migrate_pages,
1486                                        0, 0))
1487                         continue;
1488                 return true;
1489         }
1490         return false;
1491 }
1492
1493 static struct page *alloc_misplaced_dst_page(struct page *page,
1494                                            unsigned long data,
1495                                            int **result)
1496 {
1497         int nid = (int) data;
1498         struct page *newpage;
1499
1500         newpage = alloc_pages_exact_node(nid,
1501                                          (GFP_HIGHUSER_MOVABLE | GFP_THISNODE |
1502                                           __GFP_NOMEMALLOC | __GFP_NORETRY |
1503                                           __GFP_NOWARN) &
1504                                          ~GFP_IOFS, 0);
1505         if (newpage)
1506                 page_nid_xchg_last(newpage, page_nid_last(page));
1507
1508         return newpage;
1509 }
1510
1511 /*
1512  * page migration rate limiting control.
1513  * Do not migrate more than @pages_to_migrate in a @migrate_interval_millisecs
1514  * window of time. Default here says do not migrate more than 1280M per second.
1515  * If a node is rate-limited then PTE NUMA updates are also rate-limited. However
1516  * as it is faults that reset the window, pte updates will happen unconditionally
1517  * if there has not been a fault since @pteupdate_interval_millisecs after the
1518  * throttle window closed.
1519  */
1520 static unsigned int migrate_interval_millisecs __read_mostly = 100;
1521 static unsigned int pteupdate_interval_millisecs __read_mostly = 1000;
1522 static unsigned int ratelimit_pages __read_mostly = 128 << (20 - PAGE_SHIFT);
1523
1524 /* Returns true if NUMA migration is currently rate limited */
1525 bool migrate_ratelimited(int node)
1526 {
1527         pg_data_t *pgdat = NODE_DATA(node);
1528
1529         if (time_after(jiffies, pgdat->numabalancing_migrate_next_window +
1530                                 msecs_to_jiffies(pteupdate_interval_millisecs)))
1531                 return false;
1532
1533         if (pgdat->numabalancing_migrate_nr_pages < ratelimit_pages)
1534                 return false;
1535
1536         return true;
1537 }
1538
1539 /* Returns true if the node is migrate rate-limited after the update */
1540 bool numamigrate_update_ratelimit(pg_data_t *pgdat, unsigned long nr_pages)
1541 {
1542         bool rate_limited = false;
1543
1544         /*
1545          * Rate-limit the amount of data that is being migrated to a node.
1546          * Optimal placement is no good if the memory bus is saturated and
1547          * all the time is being spent migrating!
1548          */
1549         spin_lock(&pgdat->numabalancing_migrate_lock);
1550         if (time_after(jiffies, pgdat->numabalancing_migrate_next_window)) {
1551                 pgdat->numabalancing_migrate_nr_pages = 0;
1552                 pgdat->numabalancing_migrate_next_window = jiffies +
1553                         msecs_to_jiffies(migrate_interval_millisecs);
1554         }
1555         if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages)
1556                 rate_limited = true;
1557         else
1558                 pgdat->numabalancing_migrate_nr_pages += nr_pages;
1559         spin_unlock(&pgdat->numabalancing_migrate_lock);
1560
1561         return rate_limited;
1562 }
1563
1564 int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
1565 {
1566         int page_lru;
1567
1568         VM_BUG_ON(compound_order(page) && !PageTransHuge(page));
1569
1570         /* Avoid migrating to a node that is nearly full */
1571         if (!migrate_balanced_pgdat(pgdat, 1UL << compound_order(page)))
1572                 return 0;
1573
1574         if (isolate_lru_page(page))
1575                 return 0;
1576
1577         /*
1578          * migrate_misplaced_transhuge_page() skips page migration's usual
1579          * check on page_count(), so we must do it here, now that the page
1580          * has been isolated: a GUP pin, or any other pin, prevents migration.
1581          * The expected page count is 3: 1 for page's mapcount and 1 for the
1582          * caller's pin and 1 for the reference taken by isolate_lru_page().
1583          */
1584         if (PageTransHuge(page) && page_count(page) != 3) {
1585                 putback_lru_page(page);
1586                 return 0;
1587         }
1588
1589         page_lru = page_is_file_cache(page);
1590         mod_zone_page_state(page_zone(page), NR_ISOLATED_ANON + page_lru,
1591                                 hpage_nr_pages(page));
1592
1593         /*
1594          * Isolating the page has taken another reference, so the
1595          * caller's reference can be safely dropped without the page
1596          * disappearing underneath us during migration.
1597          */
1598         put_page(page);
1599         return 1;
1600 }
1601
1602 /*
1603  * Attempt to migrate a misplaced page to the specified destination
1604  * node. Caller is expected to have an elevated reference count on
1605  * the page that will be dropped by this function before returning.
1606  */
1607 int migrate_misplaced_page(struct page *page, int node)
1608 {
1609         pg_data_t *pgdat = NODE_DATA(node);
1610         int isolated;
1611         int nr_remaining;
1612         LIST_HEAD(migratepages);
1613
1614         /*
1615          * Don't migrate pages that are mapped in multiple processes.
1616          * TODO: Handle false sharing detection instead of this hammer
1617          */
1618         if (page_mapcount(page) != 1)
1619                 goto out;
1620
1621         /*
1622          * Rate-limit the amount of data that is being migrated to a node.
1623          * Optimal placement is no good if the memory bus is saturated and
1624          * all the time is being spent migrating!
1625          */
1626         if (numamigrate_update_ratelimit(pgdat, 1))
1627                 goto out;
1628
1629         isolated = numamigrate_isolate_page(pgdat, page);
1630         if (!isolated)
1631                 goto out;
1632
1633         list_add(&page->lru, &migratepages);
1634         nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page,
1635                                      node, MIGRATE_ASYNC, MR_NUMA_MISPLACED);
1636         if (nr_remaining) {
1637                 putback_lru_pages(&migratepages);
1638                 isolated = 0;
1639         } else
1640                 count_vm_numa_event(NUMA_PAGE_MIGRATE);
1641         BUG_ON(!list_empty(&migratepages));
1642         return isolated;
1643
1644 out:
1645         put_page(page);
1646         return 0;
1647 }
1648 #endif /* CONFIG_NUMA_BALANCING */
1649
1650 #if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
1651 /*
1652  * Migrates a THP to a given target node. page must be locked and is unlocked
1653  * before returning.
1654  */
1655 int migrate_misplaced_transhuge_page(struct mm_struct *mm,
1656                                 struct vm_area_struct *vma,
1657                                 pmd_t *pmd, pmd_t entry,
1658                                 unsigned long address,
1659                                 struct page *page, int node)
1660 {
1661         unsigned long haddr = address & HPAGE_PMD_MASK;
1662         pg_data_t *pgdat = NODE_DATA(node);
1663         int isolated = 0;
1664         struct page *new_page = NULL;
1665         struct mem_cgroup *memcg = NULL;
1666         int page_lru = page_is_file_cache(page);
1667
1668         /*
1669          * Don't migrate pages that are mapped in multiple processes.
1670          * TODO: Handle false sharing detection instead of this hammer
1671          */
1672         if (page_mapcount(page) != 1)
1673                 goto out_dropref;
1674
1675         /*
1676          * Rate-limit the amount of data that is being migrated to a node.
1677          * Optimal placement is no good if the memory bus is saturated and
1678          * all the time is being spent migrating!
1679          */
1680         if (numamigrate_update_ratelimit(pgdat, HPAGE_PMD_NR))
1681                 goto out_dropref;
1682
1683         new_page = alloc_pages_node(node,
1684                 (GFP_TRANSHUGE | GFP_THISNODE) & ~__GFP_WAIT, HPAGE_PMD_ORDER);
1685         if (!new_page)
1686                 goto out_fail;
1687
1688         page_nid_xchg_last(new_page, page_nid_last(page));
1689
1690         isolated = numamigrate_isolate_page(pgdat, page);
1691         if (!isolated) {
1692                 put_page(new_page);
1693                 goto out_fail;
1694         }
1695
1696         /* Prepare a page as a migration target */
1697         __set_page_locked(new_page);
1698         SetPageSwapBacked(new_page);
1699
1700         /* anon mapping, we can simply copy page->mapping to the new page: */
1701         new_page->mapping = page->mapping;
1702         new_page->index = page->index;
1703         migrate_page_copy(new_page, page);
1704         WARN_ON(PageLRU(new_page));
1705
1706         /* Recheck the target PMD */
1707         spin_lock(&mm->page_table_lock);
1708         if (unlikely(!pmd_same(*pmd, entry))) {
1709                 spin_unlock(&mm->page_table_lock);
1710
1711                 /* Reverse changes made by migrate_page_copy() */
1712                 if (TestClearPageActive(new_page))
1713                         SetPageActive(page);
1714                 if (TestClearPageUnevictable(new_page))
1715                         SetPageUnevictable(page);
1716                 mlock_migrate_page(page, new_page);
1717
1718                 unlock_page(new_page);
1719                 put_page(new_page);             /* Free it */
1720
1721                 /* Retake the callers reference and putback on LRU */
1722                 get_page(page);
1723                 putback_lru_page(page);
1724                 mod_zone_page_state(page_zone(page),
1725                          NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR);
1726
1727                 goto out_unlock;
1728         }
1729
1730         /*
1731          * Traditional migration needs to prepare the memcg charge
1732          * transaction early to prevent the old page from being
1733          * uncharged when installing migration entries.  Here we can
1734          * save the potential rollback and start the charge transfer
1735          * only when migration is already known to end successfully.
1736          */
1737         mem_cgroup_prepare_migration(page, new_page, &memcg);
1738
1739         entry = mk_pmd(new_page, vma->vm_page_prot);
1740         entry = pmd_mknonnuma(entry);
1741         entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
1742         entry = pmd_mkhuge(entry);
1743
1744         pmdp_clear_flush(vma, haddr, pmd);
1745         set_pmd_at(mm, haddr, pmd, entry);
1746         page_add_new_anon_rmap(new_page, vma, haddr);
1747         update_mmu_cache_pmd(vma, address, &entry);
1748         page_remove_rmap(page);
1749         /*
1750          * Finish the charge transaction under the page table lock to
1751          * prevent split_huge_page() from dividing up the charge
1752          * before it's fully transferred to the new page.
1753          */
1754         mem_cgroup_end_migration(memcg, page, new_page, true);
1755         spin_unlock(&mm->page_table_lock);
1756
1757         unlock_page(new_page);
1758         unlock_page(page);
1759         put_page(page);                 /* Drop the rmap reference */
1760         put_page(page);                 /* Drop the LRU isolation reference */
1761
1762         count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR);
1763         count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR);
1764
1765         mod_zone_page_state(page_zone(page),
1766                         NR_ISOLATED_ANON + page_lru,
1767                         -HPAGE_PMD_NR);
1768         return isolated;
1769
1770 out_fail:
1771         count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
1772 out_dropref:
1773         entry = pmd_mknonnuma(entry);
1774         set_pmd_at(mm, haddr, pmd, entry);
1775         update_mmu_cache_pmd(vma, address, &entry);
1776
1777 out_unlock:
1778         unlock_page(page);
1779         put_page(page);
1780         return 0;
1781 }
1782 #endif /* CONFIG_NUMA_BALANCING */
1783
1784 #endif /* CONFIG_NUMA */