mm/gup.c

   1 #include <linux/kernel.h>
   2 #include <linux/errno.h>
   3 #include <linux/err.h>
   4 #include <linux/spinlock.h>
   5
   6 #include <linux/mm.h>
   7 #include <linux/memremap.h>
   8 #include <linux/pagemap.h>
   9 #include <linux/rmap.h>
  10 #include <linux/swap.h>
  11 #include <linux/swapops.h>
  12
  13 #include <linux/sched/signal.h>
  14 #include <linux/rwsem.h>
  15 #include <linux/hugetlb.h>
  16
  17 #include <asm/mmu_context.h>
  18 #include <asm/pgtable.h>
  19 #include <asm/tlbflush.h>
  20
  21 #include <linux/migrate.h>
  22 #include <linux/mm_inline.h>
  23 #include <linux/mmu_notifier.h>
  24 #include <asm/tlbflush.h>
  25
  26 #include "internal.h"
  27
  28 #ifdef CONFIG_CMA
  29 static struct page *__alloc_nonmovable_userpage(struct page *page,
  30                                 unsigned long private, int **result)
  31 {
  32         return alloc_page(GFP_HIGHUSER);
  33 }
  34
  35 static bool __need_migrate_cma_page(struct page *page,
  36                                 struct vm_area_struct *vma,
  37                                 unsigned long start, unsigned int flags)
  38 {
  39         if (!(flags & FOLL_GET) || !(flags & FOLL_CMA))
  40                 return false;
  41
  42         if (!is_migrate_cma_page(page))
  43                 return false;
  44
  45         if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) ==
  46                                         VM_STACK_INCOMPLETE_SETUP)
  47                 return false;
  48
  49         if (!PageLRU(page)) {
  50                 migrate_prep_local();
  51                 if (WARN_ON(!PageLRU(page))) {
  52                         __dump_page(page, "non-lru cma page");
  53                         return false;
  54                 }
  55         }
  56
  57         return true;
  58 }
  59
  60 static int __isolate_cma_pinpage(struct page *page)
  61 {
  62         struct zone *zone = page_zone(page);
  63         struct lruvec *lruvec;
  64
  65         spin_lock_irq(zone_lru_lock(zone));
  66         if (__isolate_lru_page(page, 0) != 0) {
  67                 spin_unlock_irq(zone_lru_lock(zone));
  68                 dump_page(page, "failed to isolate lru page");
  69                 return -EBUSY;
  70         } else {
  71                 lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
  72                 del_page_from_lru_list(page, lruvec, page_lru(page));
  73         }
  74         spin_unlock_irq(zone_lru_lock(zone));
  75
  76         return 0;
  77 }
  78
  79 static int __migrate_cma_pinpage(struct page *page, struct vm_area_struct *vma)
  80 {
  81         struct list_head migratepages;
  82         int tries = 0;
  83         int ret = 0;
  84
  85         INIT_LIST_HEAD(&migratepages);
  86
  87         list_add(&page->lru, &migratepages);
  88         inc_zone_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page));
  89
  90         while (!list_empty(&migratepages) && tries++ < 5) {
  91                 ret = migrate_pages(&migratepages, __alloc_nonmovable_userpage,
  92                                         NULL, 0, MIGRATE_SYNC, MR_CMA);
  93         }
  94
  95         if (ret < 0) {
  96                 putback_movable_pages(&migratepages);
  97                 pr_err("%s: migration failed %p[%#lx]\n", __func__,
  98                                         page, page_to_pfn(page));
  99                 return -EFAULT;
 100         }
 101
 102         return 0;
 103 }
 104 #else
 105 static bool __need_migrate_cma_page(struct page *page,
 106                                 struct vm_area_struct *vma,
 107                                 unsigned long start, unsigned int flags)
 108 {
 109         return false;
 110 }
 111 static int __migrate_cma_pinpage(struct page *page, struct vm_area_struct *vma)
 112 {
 113         return 0;
 114 }
 115 #endif
 116
 117 static struct page *no_page_table(struct vm_area_struct *vma,
 118                 unsigned int flags)
 119 {
 120         /*
 121          * When core dumping an enormous anonymous area that nobody
 122          * has touched so far, we don't want to allocate unnecessary pages or
 123          * page tables.  Return error instead of NULL to skip handle_mm_fault,
 124          * then get_dump_page() will return NULL to leave a hole in the dump.
 125          * But we can only make this optimization where a hole would surely
 126          * be zero-filled if handle_mm_fault() actually did handle it.
 127          */
 128         if ((flags & FOLL_DUMP) && (!vma->vm_ops || !vma->vm_ops->fault))
 129                 return ERR_PTR(-EFAULT);
 130         return NULL;
 131 }
 132
 133 static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address,
 134                 pte_t *pte, unsigned int flags)
 135 {
 136         /* No page to get reference */
 137         if (flags & FOLL_GET)
 138                 return -EFAULT;
 139
 140         if (flags & FOLL_TOUCH) {
 141                 pte_t entry = *pte;
 142
 143                 if (flags & FOLL_WRITE)
 144                         entry = pte_mkdirty(entry);
 145                 entry = pte_mkyoung(entry);
 146
 147                 if (!pte_same(*pte, entry)) {
 148                         set_pte_at(vma->vm_mm, address, pte, entry);
 149                         update_mmu_cache(vma, address, pte);
 150                 }
 151         }
 152
 153         /* Proper page table entry exists, but no corresponding struct page */
 154         return -EEXIST;
 155 }
 156
 157 /*
 158  * FOLL_FORCE can write to even unwritable pte's, but only
 159  * after we've gone through a COW cycle and they are dirty.
 160  */
 161 static inline bool can_follow_write_pte(pte_t pte, unsigned int flags)
 162 {
 163         return pte_write(pte) ||
 164                 ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pte_dirty(pte));
 165 }
 166
 167 static struct page *follow_page_pte(struct vm_area_struct *vma,
 168                 unsigned long address, pmd_t *pmd, unsigned int flags)
 169 {
 170         struct mm_struct *mm = vma->vm_mm;
 171         struct dev_pagemap *pgmap = NULL;
 172         struct page *page;
 173         spinlock_t *ptl;
 174         pte_t *ptep, pte;
 175
 176 retry:
 177         if (unlikely(pmd_bad(*pmd)))
 178                 return no_page_table(vma, flags);
 179
 180         ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
 181         pte = *ptep;
 182         if (!pte_present(pte)) {
 183                 swp_entry_t entry;
 184                 /*
 185                  * KSM's break_ksm() relies upon recognizing a ksm page
 186                  * even while it is being migrated, so for that case we
 187                  * need migration_entry_wait().
 188                  */
 189                 if (likely(!(flags & FOLL_MIGRATION)))
 190                         goto no_page;
 191                 if (pte_none(pte))
 192                         goto no_page;
 193                 entry = pte_to_swp_entry(pte);
 194                 if (!is_migration_entry(entry))
 195                         goto no_page;
 196                 pte_unmap_unlock(ptep, ptl);
 197                 migration_entry_wait(mm, pmd, address);
 198                 goto retry;
 199         }
 200         if ((flags & FOLL_NUMA) && pte_protnone(pte))
 201                 goto no_page;
 202         if ((flags & FOLL_WRITE) && !can_follow_write_pte(pte, flags)) {
 203                 pte_unmap_unlock(ptep, ptl);
 204                 return NULL;
 205         }
 206
 207         page = vm_normal_page(vma, address, pte);
 208         if (!page && pte_devmap(pte) && (flags & FOLL_GET)) {
 209                 /*
 210                  * Only return device mapping pages in the FOLL_GET case since
 211                  * they are only valid while holding the pgmap reference.
 212                  */
 213                 pgmap = get_dev_pagemap(pte_pfn(pte), NULL);
 214                 if (pgmap)
 215                         page = pte_page(pte);
 216                 else
 217                         goto no_page;
 218         } else if (unlikely(!page)) {
 219                 if (flags & FOLL_DUMP) {
 220                         /* Avoid special (like zero) pages in core dumps */
 221                         page = ERR_PTR(-EFAULT);
 222                         goto out;
 223                 }
 224
 225                 if (is_zero_pfn(pte_pfn(pte))) {
 226                         page = pte_page(pte);
 227                 } else {
 228                         int ret;
 229
 230                         ret = follow_pfn_pte(vma, address, ptep, flags);
 231                         page = ERR_PTR(ret);
 232                         goto out;
 233                 }
 234         }
 235
 236         if (__need_migrate_cma_page(page, vma, address, flags)) {
 237                 if (__isolate_cma_pinpage(page)) {
 238                         pr_warn("%s: Failed to migrate a cma page\n", __func__);
 239                         pr_warn("because of racing with compaction.\n");
 240                         WARN(1, "Please try again get_user_pages()\n");
 241                         page = ERR_PTR(-EBUSY);
 242                         goto out;
 243                 }
 244                 pte_unmap_unlock(ptep, ptl);
 245                 if (__migrate_cma_pinpage(page, vma)) {
 246                         ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
 247                 } else {
 248                         struct page *old_page = page;
 249
 250                         migration_entry_wait(mm, pmd, address);
 251                         ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
 252                         update_mmu_cache(vma, address, ptep);
 253                         pte = *ptep;
 254                         set_pte_at_notify(mm, address, ptep, pte);
 255                         page = vm_normal_page(vma, address, pte);
 256                         BUG_ON(!page);
 257
 258                         pr_debug("cma: cma page %p[%#lx] migrated to new "
 259                                         "page %p[%#lx]\n", old_page,
 260                                         page_to_pfn(old_page),
 261                                         page, page_to_pfn(page));
 262                 }
 263         }
 264
 265         if (flags & FOLL_SPLIT && PageTransCompound(page)) {
 266                 int ret;
 267                 get_page(page);
 268                 pte_unmap_unlock(ptep, ptl);
 269                 lock_page(page);
 270                 ret = split_huge_page(page);
 271                 unlock_page(page);
 272                 put_page(page);
 273                 if (ret)
 274                         return ERR_PTR(ret);
 275                 goto retry;
 276         }
 277
 278         if (flags & FOLL_GET) {
 279                 get_page(page);
 280
 281                 /* drop the pgmap reference now that we hold the page */
 282                 if (pgmap) {
 283                         put_dev_pagemap(pgmap);
 284                         pgmap = NULL;
 285                 }
 286         }
 287         if (flags & FOLL_TOUCH) {
 288                 if ((flags & FOLL_WRITE) &&
 289                     !pte_dirty(pte) && !PageDirty(page))
 290                         set_page_dirty(page);
 291                 /*
 292                  * pte_mkyoung() would be more correct here, but atomic care
 293                  * is needed to avoid losing the dirty bit: it is easier to use
 294                  * mark_page_accessed().
 295                  */
 296                 mark_page_accessed(page);
 297         }
 298         if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
 299                 /* Do not mlock pte-mapped THP */
 300                 if (PageTransCompound(page))
 301                         goto out;
 302
 303                 /*
 304                  * The preliminary mapping check is mainly to avoid the
 305                  * pointless overhead of lock_page on the ZERO_PAGE
 306                  * which might bounce very badly if there is contention.
 307                  *
 308                  * If the page is already locked, we don't need to
 309                  * handle it now - vmscan will handle it later if and
 310                  * when it attempts to reclaim the page.
 311                  */
 312                 if (page->mapping && trylock_page(page)) {
 313                         lru_add_drain();  /* push cached pages to LRU */
 314                         /*
 315                          * Because we lock page here, and migration is
 316                          * blocked by the pte's page reference, and we
 317                          * know the page is still mapped, we don't even
 318                          * need to check for file-cache page truncation.
 319                          */
 320                         mlock_vma_page(page);
 321                         unlock_page(page);
 322                 }
 323         }
 324 out:
 325         pte_unmap_unlock(ptep, ptl);
 326         return page;
 327 no_page:
 328         pte_unmap_unlock(ptep, ptl);
 329         if (!pte_none(pte))
 330                 return NULL;
 331         return no_page_table(vma, flags);
 332 }
 333
 334 static struct page *follow_pmd_mask(struct vm_area_struct *vma,
 335                                     unsigned long address, pud_t *pudp,
 336                                     unsigned int flags, unsigned int *page_mask)
 337 {
 338         pmd_t *pmd;
 339         spinlock_t *ptl;
 340         struct page *page;
 341         struct mm_struct *mm = vma->vm_mm;
 342
 343         pmd = pmd_offset(pudp, address);
 344         if (pmd_none(*pmd))
 345                 return no_page_table(vma, flags);
 346         if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) {
 347                 page = follow_huge_pmd(mm, address, pmd, flags);
 348                 if (page)
 349                         return page;
 350                 return no_page_table(vma, flags);
 351         }
 352         if (is_hugepd(__hugepd(pmd_val(*pmd)))) {
 353                 page = follow_huge_pd(vma, address,
 354                                       __hugepd(pmd_val(*pmd)), flags,
 355                                       PMD_SHIFT);
 356                 if (page)
 357                         return page;
 358                 return no_page_table(vma, flags);
 359         }
 360 retry:
 361         if (!pmd_present(*pmd)) {
 362                 if (likely(!(flags & FOLL_MIGRATION)))
 363                         return no_page_table(vma, flags);
 364                 VM_BUG_ON(thp_migration_supported() &&
 365                                   !is_pmd_migration_entry(*pmd));
 366                 if (is_pmd_migration_entry(*pmd))
 367                         pmd_migration_entry_wait(mm, pmd);
 368                 goto retry;
 369         }
 370         if (pmd_devmap(*pmd)) {
 371                 ptl = pmd_lock(mm, pmd);
 372                 page = follow_devmap_pmd(vma, address, pmd, flags);
 373                 spin_unlock(ptl);
 374                 if (page)
 375                         return page;
 376         }
 377         if (likely(!pmd_trans_huge(*pmd)))
 378                 return follow_page_pte(vma, address, pmd, flags);
 379
 380         if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
 381                 return no_page_table(vma, flags);
 382
 383 retry_locked:
 384         ptl = pmd_lock(mm, pmd);
 385         if (unlikely(!pmd_present(*pmd))) {
 386                 spin_unlock(ptl);
 387                 if (likely(!(flags & FOLL_MIGRATION)))
 388                         return no_page_table(vma, flags);
 389                 pmd_migration_entry_wait(mm, pmd);
 390                 goto retry_locked;
 391         }
 392         if (unlikely(!pmd_trans_huge(*pmd))) {
 393                 spin_unlock(ptl);
 394                 return follow_page_pte(vma, address, pmd, flags);
 395         }
 396         if (flags & FOLL_SPLIT) {
 397                 int ret;
 398                 page = pmd_page(*pmd);
 399                 if (is_huge_zero_page(page)) {
 400                         spin_unlock(ptl);
 401                         ret = 0;
 402                         split_huge_pmd(vma, pmd, address);
 403                         if (pmd_trans_unstable(pmd))
 404                                 ret = -EBUSY;
 405                 } else {
 406                         get_page(page);
 407                         spin_unlock(ptl);
 408                         lock_page(page);
 409                         ret = split_huge_page(page);
 410                         unlock_page(page);
 411                         put_page(page);
 412                         if (pmd_none(*pmd))
 413                                 return no_page_table(vma, flags);
 414                 }
 415
 416                 return ret ? ERR_PTR(ret) :
 417                         follow_page_pte(vma, address, pmd, flags);
 418         }
 419         page = follow_trans_huge_pmd(vma, address, pmd, flags);
 420         spin_unlock(ptl);
 421         *page_mask = HPAGE_PMD_NR - 1;
 422         return page;
 423 }
 424
 425
 426 static struct page *follow_pud_mask(struct vm_area_struct *vma,
 427                                     unsigned long address, p4d_t *p4dp,
 428                                     unsigned int flags, unsigned int *page_mask)
 429 {
 430         pud_t *pud;
 431         spinlock_t *ptl;
 432         struct page *page;
 433         struct mm_struct *mm = vma->vm_mm;
 434
 435         pud = pud_offset(p4dp, address);
 436         if (pud_none(*pud))
 437                 return no_page_table(vma, flags);
 438         if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
 439                 page = follow_huge_pud(mm, address, pud, flags);
 440                 if (page)
 441                         return page;
 442                 return no_page_table(vma, flags);
 443         }
 444         if (is_hugepd(__hugepd(pud_val(*pud)))) {
 445                 page = follow_huge_pd(vma, address,
 446                                       __hugepd(pud_val(*pud)), flags,
 447                                       PUD_SHIFT);
 448                 if (page)
 449                         return page;
 450                 return no_page_table(vma, flags);
 451         }
 452         if (pud_devmap(*pud)) {
 453                 ptl = pud_lock(mm, pud);
 454                 page = follow_devmap_pud(vma, address, pud, flags);
 455                 spin_unlock(ptl);
 456                 if (page)
 457                         return page;
 458         }
 459         if (unlikely(pud_bad(*pud)))
 460                 return no_page_table(vma, flags);
 461
 462         return follow_pmd_mask(vma, address, pud, flags, page_mask);
 463 }
 464
 465
 466 static struct page *follow_p4d_mask(struct vm_area_struct *vma,
 467                                     unsigned long address, pgd_t *pgdp,
 468                                     unsigned int flags, unsigned int *page_mask)
 469 {
 470         p4d_t *p4d;
 471         struct page *page;
 472
 473         p4d = p4d_offset(pgdp, address);
 474         if (p4d_none(*p4d))
 475                 return no_page_table(vma, flags);
 476         BUILD_BUG_ON(p4d_huge(*p4d));
 477         if (unlikely(p4d_bad(*p4d)))
 478                 return no_page_table(vma, flags);
 479
 480         if (is_hugepd(__hugepd(p4d_val(*p4d)))) {
 481                 page = follow_huge_pd(vma, address,
 482                                       __hugepd(p4d_val(*p4d)), flags,
 483                                       P4D_SHIFT);
 484                 if (page)
 485                         return page;
 486                 return no_page_table(vma, flags);
 487         }
 488         return follow_pud_mask(vma, address, p4d, flags, page_mask);
 489 }
 490
 491 /**
 492  * follow_page_mask - look up a page descriptor from a user-virtual address
 493  * @vma: vm_area_struct mapping @address
 494  * @address: virtual address to look up
 495  * @flags: flags modifying lookup behaviour
 496  * @page_mask: on output, *page_mask is set according to the size of the page
 497  *
 498  * @flags can have FOLL_ flags set, defined in <linux/mm.h>
 499  *
 500  * Returns the mapped (struct page *), %NULL if no mapping exists, or
 501  * an error pointer if there is a mapping to something not represented
 502  * by a page descriptor (see also vm_normal_page()).
 503  */
 504 struct page *follow_page_mask(struct vm_area_struct *vma,
 505                               unsigned long address, unsigned int flags,
 506                               unsigned int *page_mask)
 507 {
 508         pgd_t *pgd;
 509         struct page *page;
 510         struct mm_struct *mm = vma->vm_mm;
 511
 512         *page_mask = 0;
 513
 514         /* make this handle hugepd */
 515         page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
 516         if (!IS_ERR(page)) {
 517                 BUG_ON(flags & FOLL_GET);
 518                 return page;
 519         }
 520
 521         pgd = pgd_offset(mm, address);
 522
 523         if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
 524                 return no_page_table(vma, flags);
 525
 526         if (pgd_huge(*pgd)) {
 527                 page = follow_huge_pgd(mm, address, pgd, flags);
 528                 if (page)
 529                         return page;
 530                 return no_page_table(vma, flags);
 531         }
 532         if (is_hugepd(__hugepd(pgd_val(*pgd)))) {
 533                 page = follow_huge_pd(vma, address,
 534                                       __hugepd(pgd_val(*pgd)), flags,
 535                                       PGDIR_SHIFT);
 536                 if (page)
 537                         return page;
 538                 return no_page_table(vma, flags);
 539         }
 540
 541         return follow_p4d_mask(vma, address, pgd, flags, page_mask);
 542 }
 543
 544 static int get_gate_page(struct mm_struct *mm, unsigned long address,
 545                 unsigned int gup_flags, struct vm_area_struct **vma,
 546                 struct page **page)
 547 {
 548         pgd_t *pgd;
 549         p4d_t *p4d;
 550         pud_t *pud;
 551         pmd_t *pmd;
 552         pte_t *pte;
 553         int ret = -EFAULT;
 554
 555         /* user gate pages are read-only */
 556         if (gup_flags & FOLL_WRITE)
 557                 return -EFAULT;
 558         if (address > TASK_SIZE)
 559                 pgd = pgd_offset_k(address);
 560         else
 561                 pgd = pgd_offset_gate(mm, address);
 562         BUG_ON(pgd_none(*pgd));
 563         p4d = p4d_offset(pgd, address);
 564         BUG_ON(p4d_none(*p4d));
 565         pud = pud_offset(p4d, address);
 566         BUG_ON(pud_none(*pud));
 567         pmd = pmd_offset(pud, address);
 568         if (!pmd_present(*pmd))
 569                 return -EFAULT;
 570         VM_BUG_ON(pmd_trans_huge(*pmd));
 571         pte = pte_offset_map(pmd, address);
 572         if (pte_none(*pte))
 573                 goto unmap;
 574         *vma = get_gate_vma(mm);
 575         if (!page)
 576                 goto out;
 577         *page = vm_normal_page(*vma, address, *pte);
 578         if (!*page) {
 579                 if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(*pte)))
 580                         goto unmap;
 581                 *page = pte_page(*pte);
 582
 583                 /*
 584                  * This should never happen (a device public page in the gate
 585                  * area).
 586                  */
 587                 if (is_device_public_page(*page))
 588                         goto unmap;
 589         }
 590         get_page(*page);
 591 out:
 592         ret = 0;
 593 unmap:
 594         pte_unmap(pte);
 595         return ret;
 596 }
 597
 598 /*
 599  * mmap_sem must be held on entry.  If @nonblocking != NULL and
 600  * *@flags does not include FOLL_NOWAIT, the mmap_sem may be released.
 601  * If it is, *@nonblocking will be set to 0 and -EBUSY returned.
 602  */
 603 static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
 604                 unsigned long address, unsigned int *flags, int *nonblocking)
 605 {
 606         unsigned int fault_flags = 0;
 607         int ret;
 608
 609         /* mlock all present pages, but do not fault in new pages */
 610         if ((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK)
 611                 return -ENOENT;
 612         if (*flags & FOLL_WRITE)
 613                 fault_flags |= FAULT_FLAG_WRITE;
 614         if (*flags & FOLL_REMOTE)
 615                 fault_flags |= FAULT_FLAG_REMOTE;
 616         if (nonblocking)
 617                 fault_flags |= FAULT_FLAG_ALLOW_RETRY;
 618         if (*flags & FOLL_NOWAIT)
 619                 fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT;
 620         if (*flags & FOLL_TRIED) {
 621                 VM_WARN_ON_ONCE(fault_flags & FAULT_FLAG_ALLOW_RETRY);
 622                 fault_flags |= FAULT_FLAG_TRIED;
 623         }
 624
 625         ret = handle_mm_fault(vma, address, fault_flags);
 626         if (ret & VM_FAULT_ERROR) {
 627                 int err = vm_fault_to_errno(ret, *flags);
 628
 629                 if (err)
 630                         return err;
 631                 BUG();
 632         }
 633
 634         if (tsk) {
 635                 if (ret & VM_FAULT_MAJOR)
 636                         tsk->maj_flt++;
 637                 else
 638                         tsk->min_flt++;
 639         }
 640
 641         if (ret & VM_FAULT_RETRY) {
 642                 if (nonblocking)
 643                         *nonblocking = 0;
 644                 return -EBUSY;
 645         }
 646
 647         /*
 648          * The VM_FAULT_WRITE bit tells us that do_wp_page has broken COW when
 649          * necessary, even if maybe_mkwrite decided not to set pte_write. We
 650          * can thus safely do subsequent page lookups as if they were reads.
 651          * But only do so when looping for pte_write is futile: in some cases
 652          * userspace may also be wanting to write to the gotten user page,
 653          * which a read fault here might prevent (a readonly page might get
 654          * reCOWed by userspace write).
 655          */
 656         if ((ret & VM_FAULT_WRITE) && !(vma->vm_flags & VM_WRITE))
 657                 *flags |= FOLL_COW;
 658         return 0;
 659 }
 660
 661 static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
 662 {
 663         vm_flags_t vm_flags = vma->vm_flags;
 664         int write = (gup_flags & FOLL_WRITE);
 665         int foreign = (gup_flags & FOLL_REMOTE);
 666
 667         if (vm_flags & (VM_IO | VM_PFNMAP))
 668                 return -EFAULT;
 669
 670         if (gup_flags & FOLL_ANON && !vma_is_anonymous(vma))
 671                 return -EFAULT;
 672
 673         if (write) {
 674                 if (!(vm_flags & VM_WRITE)) {
 675                         if (!(gup_flags & FOLL_FORCE))
 676                                 return -EFAULT;
 677                         /*
 678                          * We used to let the write,force case do COW in a
 679                          * VM_MAYWRITE VM_SHARED !VM_WRITE vma, so ptrace could
 680                          * set a breakpoint in a read-only mapping of an
 681                          * executable, without corrupting the file (yet only
 682                          * when that file had been opened for writing!).
 683                          * Anon pages in shared mappings are surprising: now
 684                          * just reject it.
 685                          */
 686                         if (!is_cow_mapping(vm_flags))
 687                                 return -EFAULT;
 688                 }
 689         } else if (!(vm_flags & VM_READ)) {
 690                 if (!(gup_flags & FOLL_FORCE))
 691                         return -EFAULT;
 692                 /*
 693                  * Is there actually any vma we can reach here which does not
 694                  * have VM_MAYREAD set?
 695                  */
 696                 if (!(vm_flags & VM_MAYREAD))
 697                         return -EFAULT;
 698         }
 699         /*
 700          * gups are always data accesses, not instruction
 701          * fetches, so execute=false here
 702          */
 703         if (!arch_vma_access_permitted(vma, write, false, foreign))
 704                 return -EFAULT;
 705         return 0;
 706 }
 707
 708 /**
 709  * __get_user_pages() - pin user pages in memory
 710  * @tsk:        task_struct of target task
 711  * @mm:         mm_struct of target mm
 712  * @start:      starting user address
 713  * @nr_pages:   number of pages from start to pin
 714  * @gup_flags:  flags modifying pin behaviour
 715  * @pages:      array that receives pointers to the pages pinned.
 716  *              Should be at least nr_pages long. Or NULL, if caller
 717  *              only intends to ensure the pages are faulted in.
 718  * @vmas:       array of pointers to vmas corresponding to each page.
 719  *              Or NULL if the caller does not require them.
 720  * @nonblocking: whether waiting for disk IO or mmap_sem contention
 721  *
 722  * Returns number of pages pinned. This may be fewer than the number
 723  * requested. If nr_pages is 0 or negative, returns 0. If no pages
 724  * were pinned, returns -errno. Each page returned must be released
 725  * with a put_page() call when it is finished with. vmas will only
 726  * remain valid while mmap_sem is held.
 727  *
 728  * Must be called with mmap_sem held.  It may be released.  See below.
 729  *
 730  * __get_user_pages walks a process's page tables and takes a reference to
 731  * each struct page that each user address corresponds to at a given
 732  * instant. That is, it takes the page that would be accessed if a user
 733  * thread accesses the given user virtual address at that instant.
 734  *
 735  * This does not guarantee that the page exists in the user mappings when
 736  * __get_user_pages returns, and there may even be a completely different
 737  * page there in some cases (eg. if mmapped pagecache has been invalidated
 738  * and subsequently re faulted). However it does guarantee that the page
 739  * won't be freed completely. And mostly callers simply care that the page
 740  * contains data that was valid *at some point in time*. Typically, an IO
 741  * or similar operation cannot guarantee anything stronger anyway because
 742  * locks can't be held over the syscall boundary.
 743  *
 744  * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If
 745  * the page is written to, set_page_dirty (or set_page_dirty_lock, as
 746  * appropriate) must be called after the page is finished with, and
 747  * before put_page is called.
 748  *
 749  * If @nonblocking != NULL, __get_user_pages will not wait for disk IO
 750  * or mmap_sem contention, and if waiting is needed to pin all pages,
 751  * *@nonblocking will be set to 0.  Further, if @gup_flags does not
 752  * include FOLL_NOWAIT, the mmap_sem will be released via up_read() in
 753  * this case.
 754  *
 755  * A caller using such a combination of @nonblocking and @gup_flags
 756  * must therefore hold the mmap_sem for reading only, and recognize
 757  * when it's been released.  Otherwise, it must be held for either
 758  * reading or writing and will not be released.
 759  *
 760  * In most cases, get_user_pages or get_user_pages_fast should be used
 761  * instead of __get_user_pages. __get_user_pages should be used only if
 762  * you need some special @gup_flags.
 763  */
 764 static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 765                 unsigned long start, unsigned long nr_pages,
 766                 unsigned int gup_flags, struct page **pages,
 767                 struct vm_area_struct **vmas, int *nonblocking)
 768 {
 769         long i = 0;
 770         unsigned int page_mask;
 771         struct vm_area_struct *vma = NULL;
 772
 773         if (!nr_pages)
 774                 return 0;
 775
 776         VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));
 777
 778         /*
 779          * If FOLL_FORCE is set then do not force a full fault as the hinting
 780          * fault information is unrelated to the reference behaviour of a task
 781          * using the address space
 782          */
 783         if (!(gup_flags & FOLL_FORCE))
 784                 gup_flags |= FOLL_NUMA;
 785
 786         if ((gup_flags & FOLL_CMA) != 0)
 787                 migrate_prep();
 788
 789         do {
 790                 struct page *page;
 791                 unsigned int foll_flags = gup_flags;
 792                 unsigned int page_increm;
 793
 794                 /* first iteration or cross vma bound */
 795                 if (!vma || start >= vma->vm_end) {
 796                         vma = find_extend_vma(mm, start);
 797                         if (!vma && in_gate_area(mm, start)) {
 798                                 int ret;
 799                                 ret = get_gate_page(mm, start & PAGE_MASK,
 800                                                 gup_flags, &vma,
 801                                                 pages ? &pages[i] : NULL);
 802                                 if (ret)
 803                                         return i ? : ret;
 804                                 page_mask = 0;
 805                                 goto next_page;
 806                         }
 807
 808                         if (!vma || check_vma_flags(vma, gup_flags))
 809                                 return i ? : -EFAULT;
 810                         if (is_vm_hugetlb_page(vma)) {
 811                                 i = follow_hugetlb_page(mm, vma, pages, vmas,
 812                                                 &start, &nr_pages, i,
 813                                                 gup_flags, nonblocking);
 814                                 continue;
 815                         }
 816                 }
 817 retry:
 818                 /*
 819                  * If we have a pending SIGKILL, don't keep faulting pages and
 820                  * potentially allocating memory.
 821                  */
 822                 if (unlikely(fatal_signal_pending(current)))
 823                         return i ? i : -ERESTARTSYS;
 824                 cond_resched();
 825                 page = follow_page_mask(vma, start, foll_flags, &page_mask);
 826                 if (!page) {
 827                         int ret;
 828                         ret = faultin_page(tsk, vma, start, &foll_flags,
 829                                         nonblocking);
 830                         switch (ret) {
 831                         case 0:
 832                                 goto retry;
 833                         case -EFAULT:
 834                         case -ENOMEM:
 835                         case -EHWPOISON:
 836                                 return i ? i : ret;
 837                         case -EBUSY:
 838                                 return i;
 839                         case -ENOENT:
 840                                 goto next_page;
 841                         }
 842                         BUG();
 843                 } else if (PTR_ERR(page) == -EEXIST) {
 844                         /*
 845                          * Proper page table entry exists, but no corresponding
 846                          * struct page.
 847                          */
 848                         goto next_page;
 849                 } else if (IS_ERR(page)) {
 850                         return i ? i : PTR_ERR(page);
 851                 }
 852                 if (pages) {
 853                         pages[i] = page;
 854                         flush_anon_page(vma, page, start);
 855                         flush_dcache_page(page);
 856                         page_mask = 0;
 857                 }
 858 next_page:
 859                 if (vmas) {
 860                         vmas[i] = vma;
 861                         page_mask = 0;
 862                 }
 863                 page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);
 864                 if (page_increm > nr_pages)
 865                         page_increm = nr_pages;
 866                 i += page_increm;
 867                 start += page_increm * PAGE_SIZE;
 868                 nr_pages -= page_increm;
 869         } while (nr_pages);
 870         return i;
 871 }
 872
 873 static bool vma_permits_fault(struct vm_area_struct *vma,
 874                               unsigned int fault_flags)
 875 {
 876         bool write   = !!(fault_flags & FAULT_FLAG_WRITE);
 877         bool foreign = !!(fault_flags & FAULT_FLAG_REMOTE);
 878         vm_flags_t vm_flags = write ? VM_WRITE : VM_READ;
 879
 880         if (!(vm_flags & vma->vm_flags))
 881                 return false;
 882
 883         /*
 884          * The architecture might have a hardware protection
 885          * mechanism other than read/write that can deny access.
 886          *
 887          * gup always represents data access, not instruction
 888          * fetches, so execute=false here:
 889          */
 890         if (!arch_vma_access_permitted(vma, write, false, foreign))
 891                 return false;
 892
 893         return true;
 894 }
 895
 896 /*
 897  * fixup_user_fault() - manually resolve a user page fault
 898  * @tsk:        the task_struct to use for page fault accounting, or
 899  *              NULL if faults are not to be recorded.
 900  * @mm:         mm_struct of target mm
 901  * @address:    user address
 902  * @fault_flags:flags to pass down to handle_mm_fault()
 903  * @unlocked:   did we unlock the mmap_sem while retrying, maybe NULL if caller
 904  *              does not allow retry
 905  *
 906  * This is meant to be called in the specific scenario where for locking reasons
 907  * we try to access user memory in atomic context (within a pagefault_disable()
 908  * section), this returns -EFAULT, and we want to resolve the user fault before
 909  * trying again.
 910  *
 911  * Typically this is meant to be used by the futex code.
 912  *
 913  * The main difference with get_user_pages() is that this function will
 914  * unconditionally call handle_mm_fault() which will in turn perform all the
 915  * necessary SW fixup of the dirty and young bits in the PTE, while
 916  * get_user_pages() only guarantees to update these in the struct page.
 917  *
 918  * This is important for some architectures where those bits also gate the
 919  * access permission to the page because they are maintained in software.  On
 920  * such architectures, gup() will not be enough to make a subsequent access
 921  * succeed.
 922  *
 923  * This function will not return with an unlocked mmap_sem. So it has not the
 924  * same semantics wrt the @mm->mmap_sem as does filemap_fault().
 925  */
 926 int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
 927                      unsigned long address, unsigned int fault_flags,
 928                      bool *unlocked)
 929 {
 930         struct vm_area_struct *vma;
 931         int ret, major = 0;
 932
 933         if (unlocked)
 934                 fault_flags |= FAULT_FLAG_ALLOW_RETRY;
 935
 936 retry:
 937         vma = find_extend_vma(mm, address);
 938         if (!vma || address < vma->vm_start)
 939                 return -EFAULT;
 940
 941         if (!vma_permits_fault(vma, fault_flags))
 942                 return -EFAULT;
 943
 944         ret = handle_mm_fault(vma, address, fault_flags);
 945         major |= ret & VM_FAULT_MAJOR;
 946         if (ret & VM_FAULT_ERROR) {
 947                 int err = vm_fault_to_errno(ret, 0);
 948
 949                 if (err)
 950                         return err;
 951                 BUG();
 952         }
 953
 954         if (ret & VM_FAULT_RETRY) {
 955                 down_read(&mm->mmap_sem);
 956                 if (!(fault_flags & FAULT_FLAG_TRIED)) {
 957                         *unlocked = true;
 958                         fault_flags &= ~FAULT_FLAG_ALLOW_RETRY;
 959                         fault_flags |= FAULT_FLAG_TRIED;
 960                         goto retry;
 961                 }
 962         }
 963
 964         if (tsk) {
 965                 if (major)
 966                         tsk->maj_flt++;
 967                 else
 968                         tsk->min_flt++;
 969         }
 970         return 0;
 971 }
 972 EXPORT_SYMBOL_GPL(fixup_user_fault);
 973
 974 static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
 975                                                 struct mm_struct *mm,
 976                                                 unsigned long start,
 977                                                 unsigned long nr_pages,
 978                                                 struct page **pages,
 979                                                 struct vm_area_struct **vmas,
 980                                                 int *locked, bool notify_drop,
 981                                                 unsigned int flags)
 982 {
 983         long ret, pages_done;
 984         bool lock_dropped;
 985
 986         if (locked) {
 987                 /* if VM_FAULT_RETRY can be returned, vmas become invalid */
 988                 BUG_ON(vmas);
 989                 /* check caller initialized locked */
 990                 BUG_ON(*locked != 1);
 991         }
 992
 993         if (pages)
 994                 flags |= FOLL_GET;
 995
 996         pages_done = 0;
 997         lock_dropped = false;
 998         for (;;) {
 999                 ret = __get_user_pages(tsk, mm, start, nr_pages, flags, pages,
1000                                        vmas, locked);
1001                 if (!locked)
1002                         /* VM_FAULT_RETRY couldn't trigger, bypass */
1003                         return ret;
1004
1005                 /* VM_FAULT_RETRY cannot return errors */
1006                 if (!*locked) {
1007                         BUG_ON(ret < 0);
1008                         BUG_ON(ret >= nr_pages);
1009                 }
1010
1011                 if (!pages)
1012                         /* If it's a prefault don't insist harder */
1013                         return ret;
1014
1015                 if (ret > 0) {
1016                         nr_pages -= ret;
1017                         pages_done += ret;
1018                         if (!nr_pages)
1019                                 break;
1020                 }
1021                 if (*locked) {
1022                         /* VM_FAULT_RETRY didn't trigger */
1023                         if (!pages_done)
1024                                 pages_done = ret;
1025                         break;
1026                 }
1027                 /* VM_FAULT_RETRY triggered, so seek to the faulting offset */
1028                 pages += ret;
1029                 start += ret << PAGE_SHIFT;
1030
1031                 /*
1032                  * Repeat on the address that fired VM_FAULT_RETRY
1033                  * without FAULT_FLAG_ALLOW_RETRY but with
1034                  * FAULT_FLAG_TRIED.
1035                  */
1036                 *locked = 1;
1037                 lock_dropped = true;
1038                 down_read(&mm->mmap_sem);
1039                 ret = __get_user_pages(tsk, mm, start, 1, flags | FOLL_TRIED,
1040                                        pages, NULL, NULL);
1041                 if (ret != 1) {
1042                         BUG_ON(ret > 1);
1043                         if (!pages_done)
1044                                 pages_done = ret;
1045                         break;
1046                 }
1047                 nr_pages--;
1048                 pages_done++;
1049                 if (!nr_pages)
1050                         break;
1051                 pages++;
1052                 start += PAGE_SIZE;
1053         }
1054         if (notify_drop && lock_dropped && *locked) {
1055                 /*
1056                  * We must let the caller know we temporarily dropped the lock
1057                  * and so the critical section protected by it was lost.
1058                  */
1059                 up_read(&mm->mmap_sem);
1060                 *locked = 0;
1061         }
1062         return pages_done;
1063 }
1064
1065 /*
1066  * We can leverage the VM_FAULT_RETRY functionality in the page fault
1067  * paths better by using either get_user_pages_locked() or
1068  * get_user_pages_unlocked().
1069  *
1070  * get_user_pages_locked() is suitable to replace the form:
1071  *
1072  *      down_read(&mm->mmap_sem);
1073  *      do_something()
1074  *      get_user_pages(tsk, mm, ..., pages, NULL);
1075  *      up_read(&mm->mmap_sem);
1076  *
1077  *  to:
1078  *
1079  *      int locked = 1;
1080  *      down_read(&mm->mmap_sem);
1081  *      do_something()
1082  *      get_user_pages_locked(tsk, mm, ..., pages, &locked);
1083  *      if (locked)
1084  *          up_read(&mm->mmap_sem);
1085  */
1086 long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
1087                            unsigned int gup_flags, struct page **pages,
1088                            int *locked)
1089 {
1090         return __get_user_pages_locked(current, current->mm, start, nr_pages,
1091                                        pages, NULL, locked, true,
1092                                        gup_flags | FOLL_TOUCH);
1093 }
1094 EXPORT_SYMBOL(get_user_pages_locked);
1095
1096 /*
1097  * Same as get_user_pages_unlocked(...., FOLL_TOUCH) but it allows for
1098  * tsk, mm to be specified.
1099  *
1100  * NOTE: here FOLL_TOUCH is not set implicitly and must be set by the
1101  * caller if required (just like with __get_user_pages). "FOLL_GET"
1102  * is set implicitly if "pages" is non-NULL.
1103  */
1104 static __always_inline long __get_user_pages_unlocked(struct task_struct *tsk,
1105                 struct mm_struct *mm, unsigned long start,
1106                 unsigned long nr_pages, struct page **pages,
1107                 unsigned int gup_flags)
1108 {
1109         long ret;
1110         int locked = 1;
1111
1112         down_read(&mm->mmap_sem);
1113         ret = __get_user_pages_locked(tsk, mm, start, nr_pages, pages, NULL,
1114                                       &locked, false, gup_flags);
1115         if (locked)
1116                 up_read(&mm->mmap_sem);
1117         return ret;
1118 }
1119
1120 /*
1121  * get_user_pages_unlocked() is suitable to replace the form:
1122  *
1123  *      down_read(&mm->mmap_sem);
1124  *      get_user_pages(tsk, mm, ..., pages, NULL);
1125  *      up_read(&mm->mmap_sem);
1126  *
1127  *  with:
1128  *
1129  *      get_user_pages_unlocked(tsk, mm, ..., pages);
1130  *
1131  * It is functionally equivalent to get_user_pages_fast so
1132  * get_user_pages_fast should be used instead if specific gup_flags
1133  * (e.g. FOLL_FORCE) are not required.
1134  */
1135 long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
1136                              struct page **pages, unsigned int gup_flags)
1137 {
1138         return __get_user_pages_unlocked(current, current->mm, start, nr_pages,
1139                                          pages, gup_flags | FOLL_TOUCH);
1140 }
1141 EXPORT_SYMBOL(get_user_pages_unlocked);
1142
1143 /*
1144  * get_user_pages_remote() - pin user pages in memory
1145  * @tsk:        the task_struct to use for page fault accounting, or
1146  *              NULL if faults are not to be recorded.
1147  * @mm:         mm_struct of target mm
1148  * @start:      starting user address
1149  * @nr_pages:   number of pages from start to pin
1150  * @gup_flags:  flags modifying lookup behaviour
1151  * @pages:      array that receives pointers to the pages pinned.
1152  *              Should be at least nr_pages long. Or NULL, if caller
1153  *              only intends to ensure the pages are faulted in.
1154  * @vmas:       array of pointers to vmas corresponding to each page.
1155  *              Or NULL if the caller does not require them.
1156  * @locked:     pointer to lock flag indicating whether lock is held and
1157  *              subsequently whether VM_FAULT_RETRY functionality can be
1158  *              utilised. Lock must initially be held.
1159  *
1160  * Returns number of pages pinned. This may be fewer than the number
1161  * requested. If nr_pages is 0 or negative, returns 0. If no pages
1162  * were pinned, returns -errno. Each page returned must be released
1163  * with a put_page() call when it is finished with. vmas will only
1164  * remain valid while mmap_sem is held.
1165  *
1166  * Must be called with mmap_sem held for read or write.
1167  *
1168  * get_user_pages walks a process's page tables and takes a reference to
1169  * each struct page that each user address corresponds to at a given
1170  * instant. That is, it takes the page that would be accessed if a user
1171  * thread accesses the given user virtual address at that instant.
1172  *
1173  * This does not guarantee that the page exists in the user mappings when
1174  * get_user_pages returns, and there may even be a completely different
1175  * page there in some cases (eg. if mmapped pagecache has been invalidated
1176  * and subsequently re faulted). However it does guarantee that the page
1177  * won't be freed completely. And mostly callers simply care that the page
1178  * contains data that was valid *at some point in time*. Typically, an IO
1179  * or similar operation cannot guarantee anything stronger anyway because
1180  * locks can't be held over the syscall boundary.
1181  *
1182  * If gup_flags & FOLL_WRITE == 0, the page must not be written to. If the page
1183  * is written to, set_page_dirty (or set_page_dirty_lock, as appropriate) must
1184  * be called after the page is finished with, and before put_page is called.
1185  *
1186  * get_user_pages is typically used for fewer-copy IO operations, to get a
1187  * handle on the memory by some means other than accesses via the user virtual
1188  * addresses. The pages may be submitted for DMA to devices or accessed via
1189  * their kernel linear mapping (via the kmap APIs). Care should be taken to
1190  * use the correct cache flushing APIs.
1191  *
1192  * See also get_user_pages_fast, for performance critical applications.
1193  *
1194  * get_user_pages should be phased out in favor of
1195  * get_user_pages_locked|unlocked or get_user_pages_fast. Nothing
1196  * should use get_user_pages because it cannot pass
1197  * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault.
1198  */
1199 long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
1200                 unsigned long start, unsigned long nr_pages,
1201                 unsigned int gup_flags, struct page **pages,
1202                 struct vm_area_struct **vmas, int *locked)
1203 {
1204         return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas,
1205                                        locked, true,
1206                                        gup_flags | FOLL_TOUCH | FOLL_REMOTE);
1207 }
1208 EXPORT_SYMBOL(get_user_pages_remote);
1209
1210 /*
1211  * This is the same as get_user_pages_remote(), just with a
1212  * less-flexible calling convention where we assume that the task
1213  * and mm being operated on are the current task's and don't allow
1214  * passing of a locked parameter.  We also obviously don't pass
1215  * FOLL_REMOTE in here.
1216  */
1217 long get_user_pages(unsigned long start, unsigned long nr_pages,
1218                 unsigned int gup_flags, struct page **pages,
1219                 struct vm_area_struct **vmas)
1220 {
1221         return __get_user_pages_locked(current, current->mm, start, nr_pages,
1222                                        pages, vmas, NULL, false,
1223                                        gup_flags | FOLL_TOUCH);
1224 }
1225 EXPORT_SYMBOL(get_user_pages);
1226
1227 #ifdef CONFIG_FS_DAX
1228 /*
1229  * This is the same as get_user_pages() in that it assumes we are
1230  * operating on the current task's mm, but it goes further to validate
1231  * that the vmas associated with the address range are suitable for
1232  * longterm elevated page reference counts. For example, filesystem-dax
1233  * mappings are subject to the lifetime enforced by the filesystem and
1234  * we need guarantees that longterm users like RDMA and V4L2 only
1235  * establish mappings that have a kernel enforced revocation mechanism.
1236  *
1237  * "longterm" == userspace controlled elevated page count lifetime.
1238  * Contrast this to iov_iter_get_pages() usages which are transient.
1239  */
1240 long get_user_pages_longterm(unsigned long start, unsigned long nr_pages,
1241                 unsigned int gup_flags, struct page **pages,
1242                 struct vm_area_struct **vmas_arg)
1243 {
1244         struct vm_area_struct **vmas = vmas_arg;
1245         struct vm_area_struct *vma_prev = NULL;
1246         long rc, i;
1247
1248         if (!pages)
1249                 return -EINVAL;
1250
1251         if (!vmas) {
1252                 vmas = kcalloc(nr_pages, sizeof(struct vm_area_struct *),
1253                                GFP_KERNEL);
1254                 if (!vmas)
1255                         return -ENOMEM;
1256         }
1257
1258         rc = get_user_pages(start, nr_pages, gup_flags, pages, vmas);
1259
1260         for (i = 0; i < rc; i++) {
1261                 struct vm_area_struct *vma = vmas[i];
1262
1263                 if (vma == vma_prev)
1264                         continue;
1265
1266                 vma_prev = vma;
1267
1268                 if (vma_is_fsdax(vma))
1269                         break;
1270         }
1271
1272         /*
1273          * Either get_user_pages() failed, or the vma validation
1274          * succeeded, in either case we don't need to put_page() before
1275          * returning.
1276          */
1277         if (i >= rc)
1278                 goto out;
1279
1280         for (i = 0; i < rc; i++)
1281                 put_page(pages[i]);
1282         rc = -EOPNOTSUPP;
1283 out:
1284         if (vmas != vmas_arg)
1285                 kfree(vmas);
1286         return rc;
1287 }
1288 EXPORT_SYMBOL(get_user_pages_longterm);
1289 #endif /* CONFIG_FS_DAX */
1290
1291 /**
1292  * populate_vma_page_range() -  populate a range of pages in the vma.
1293  * @vma:   target vma
1294  * @start: start address
1295  * @end:   end address
1296  * @nonblocking:
1297  *
1298  * This takes care of mlocking the pages too if VM_LOCKED is set.
1299  *
1300  * return 0 on success, negative error code on error.
1301  *
1302  * vma->vm_mm->mmap_sem must be held.
1303  *
1304  * If @nonblocking is NULL, it may be held for read or write and will
1305  * be unperturbed.
1306  *
1307  * If @nonblocking is non-NULL, it must held for read only and may be
1308  * released.  If it's released, *@nonblocking will be set to 0.
1309  */
1310 long populate_vma_page_range(struct vm_area_struct *vma,
1311                 unsigned long start, unsigned long end, int *nonblocking)
1312 {
1313         struct mm_struct *mm = vma->vm_mm;
1314         unsigned long nr_pages = (end - start) / PAGE_SIZE;
1315         int gup_flags;
1316
1317         VM_BUG_ON(start & ~PAGE_MASK);
1318         VM_BUG_ON(end   & ~PAGE_MASK);
1319         VM_BUG_ON_VMA(start < vma->vm_start, vma);
1320         VM_BUG_ON_VMA(end   > vma->vm_end, vma);
1321         VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm);
1322
1323         gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK;
1324         if (vma->vm_flags & VM_LOCKONFAULT)
1325                 gup_flags &= ~FOLL_POPULATE;
1326         /*
1327          * We want to touch writable mappings with a write fault in order
1328          * to break COW, except for shared mappings because these don't COW
1329          * and we would not want to dirty them for nothing.
1330          */
1331         if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
1332                 gup_flags |= FOLL_WRITE;
1333
1334         /*
1335          * We want mlock to succeed for regions that have any permissions
1336          * other than PROT_NONE.
1337          */
1338         if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC))
1339                 gup_flags |= FOLL_FORCE;
1340
1341         /*
1342          * We made sure addr is within a VMA, so the following will
1343          * not result in a stack expansion that recurses back here.
1344          */
1345         return __get_user_pages(current, mm, start, nr_pages, gup_flags,
1346                                 NULL, NULL, nonblocking);
1347 }
1348
1349 /*
1350  * __mm_populate - populate and/or mlock pages within a range of address space.
1351  *
1352  * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap
1353  * flags. VMAs must be already marked with the desired vm_flags, and
1354  * mmap_sem must not be held.
1355  */
1356 int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
1357 {
1358         struct mm_struct *mm = current->mm;
1359         unsigned long end, nstart, nend;
1360         struct vm_area_struct *vma = NULL;
1361         int locked = 0;
1362         long ret = 0;
1363
1364         end = start + len;
1365
1366         for (nstart = start; nstart < end; nstart = nend) {
1367                 /*
1368                  * We want to fault in pages for [nstart; end) address range.
1369                  * Find first corresponding VMA.
1370                  */
1371                 if (!locked) {
1372                         locked = 1;
1373                         down_read(&mm->mmap_sem);
1374                         vma = find_vma(mm, nstart);
1375                 } else if (nstart >= vma->vm_end)
1376                         vma = vma->vm_next;
1377                 if (!vma || vma->vm_start >= end)
1378                         break;
1379                 /*
1380                  * Set [nstart; nend) to intersection of desired address
1381                  * range with the first VMA. Also, skip undesirable VMA types.
1382                  */
1383                 nend = min(end, vma->vm_end);
1384                 if (vma->vm_flags & (VM_IO | VM_PFNMAP))
1385                         continue;
1386                 if (nstart < vma->vm_start)
1387                         nstart = vma->vm_start;
1388                 /*
1389                  * Now fault in a range of pages. populate_vma_page_range()
1390                  * double checks the vma flags, so that it won't mlock pages
1391                  * if the vma was already munlocked.
1392                  */
1393                 ret = populate_vma_page_range(vma, nstart, nend, &locked);
1394                 if (ret < 0) {
1395                         if (ignore_errors) {
1396                                 ret = 0;
1397                                 continue;       /* continue at next VMA */
1398                         }
1399                         break;
1400                 }
1401                 nend = nstart + ret * PAGE_SIZE;
1402                 ret = 0;
1403         }
1404         if (locked)
1405                 up_read(&mm->mmap_sem);
1406         return ret;     /* 0 or negative error code */
1407 }
1408
/**
 * get_dump_page() - pin user page in memory while writing it to core dump
 * @addr: user address
 *
 * Returns the struct page of the user page pinned for the dump; the caller
 * releases it afterwards with put_page().
 *
 * Returns NULL on any kind of failure - a hole must then be inserted into
 * the corefile, to preserve alignment with its headers.  NULL is also
 * returned wherever the ZERO_PAGE, or an anonymous pte_none, has been found,
 * which allows a hole to be left in the corefile to save diskspace.
 *
 * Called without mmap_sem, but after all other threads have been killed.
 */
#ifdef CONFIG_ELF_CORE
struct page *get_dump_page(unsigned long addr)
{
        struct vm_area_struct *vma;
        struct page *page;
        long pinned;

        pinned = __get_user_pages(current, current->mm, addr, 1,
                                  FOLL_FORCE | FOLL_DUMP | FOLL_GET,
                                  &page, &vma, NULL);
        if (pinned < 1)
                return NULL;

        flush_cache_page(vma, addr, page_to_pfn(page));
        return page;
}
#endif /* CONFIG_ELF_CORE */
1437
1438 /*
1439  * Generic Fast GUP
1440  *
1441  * get_user_pages_fast attempts to pin user pages by walking the page
1442  * tables directly and avoids taking locks. Thus the walker needs to be
1443  * protected from page table pages being freed from under it, and should
1444  * block any THP splits.
1445  *
1446  * One way to achieve this is to have the walker disable interrupts, and
1447  * rely on IPIs from the TLB flushing code blocking before the page table
1448  * pages are freed. This is unsuitable for architectures that do not need
1449  * to broadcast an IPI when invalidating TLBs.
1450  *
1451  * Another way to achieve this is to batch up page table containing pages
1452  * belonging to more than one mm_user, then rcu_sched a callback to free those
1453  * pages. Disabling interrupts will allow the fast_gup walker to both block
1454  * the rcu_sched callback, and an IPI that we broadcast for splitting THPs
1455  * (which is a relatively rare event). The code below adopts this strategy.
1456  *
1457  * Before activating this code, please be aware that the following assumptions
1458  * are currently made:
1459  *
1460  *  *) Either HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table() is used to
1461  *  free pages containing page tables or TLB flushing requires IPI broadcast.
1462  *
1463  *  *) ptes can be read atomically by the architecture.
1464  *
1465  *  *) access_ok is sufficient to validate userspace address ranges.
1466  *
1467  * The last two assumptions can be relaxed by the addition of helper functions.
1468  *
1469  * This code is based heavily on the PowerPC implementation by Nick Piggin.
1470  */
1471 #ifdef CONFIG_HAVE_GENERIC_GUP
1472
1473 #ifndef gup_get_pte
1474 /*
1475  * We assume that the PTE can be read atomically. If this is not the case for
1476  * your architecture, please provide the helper.
1477  */
1478 static inline pte_t gup_get_pte(pte_t *ptep)
1479 {
1480         return READ_ONCE(*ptep);
1481 }
1482 #endif
1483
/*
 * Roll *@nr back to @nr_start: for every entry of @pages recorded above
 * @nr_start, newest first, clear PageReferenced and drop the page reference.
 */
static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages)
{
        while (*nr != nr_start) {
                struct page *page;

                *nr -= 1;
                page = pages[*nr];

                ClearPageReferenced(page);
                put_page(page);
        }
}
1493
1494 #ifdef __HAVE_ARCH_PTE_SPECIAL
1495 static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
1496                          int write, struct page **pages, int *nr)
1497 {
1498         struct dev_pagemap *pgmap = NULL;
1499         int nr_start = *nr, ret = 0;
1500         pte_t *ptep, *ptem;
1501
1502         ptem = ptep = pte_offset_map(&pmd, addr);
1503         do {
1504                 pte_t pte = gup_get_pte(ptep);
1505                 struct page *head, *page;
1506
1507                 /*
1508                  * Similar to the PMD case below, NUMA hinting must take slow
1509                  * path using the pte_protnone check.
1510                  */
1511                 if (pte_protnone(pte))
1512                         goto pte_unmap;
1513
1514                 if (!pte_access_permitted(pte, write))
1515                         goto pte_unmap;
1516
1517                 if (pte_devmap(pte)) {
1518                         pgmap = get_dev_pagemap(pte_pfn(pte), pgmap);
1519                         if (unlikely(!pgmap)) {
1520                                 undo_dev_pagemap(nr, nr_start, pages);
1521                                 goto pte_unmap;
1522                         }
1523                 } else if (pte_special(pte))
1524                         goto pte_unmap;
1525
1526                 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
1527                 page = pte_page(pte);
1528                 head = compound_head(page);
1529
1530                 if (!page_cache_get_speculative(head))
1531                         goto pte_unmap;
1532
1533                 if (unlikely(pte_val(pte) != pte_val(*ptep))) {
1534                         put_page(head);
1535                         goto pte_unmap;
1536                 }
1537
1538                 VM_BUG_ON_PAGE(compound_head(page) != head, page);
1539
1540                 put_dev_pagemap(pgmap);
1541                 SetPageReferenced(page);
1542                 pages[*nr] = page;
1543                 (*nr)++;
1544
1545         } while (ptep++, addr += PAGE_SIZE, addr != end);
1546
1547         ret = 1;
1548
1549 pte_unmap:
1550         pte_unmap(ptem);
1551         return ret;
1552 }
1553 #else
1554
1555 /*
1556  * If we can't determine whether or not a pte is special, then fail immediately
1557  * for ptes. Note, we can still pin HugeTLB and THP as these are guaranteed not
1558  * to be special.
1559  *
1560  * For a futex to be placed on a THP tail page, get_futex_key requires a
1561  * __get_user_pages_fast implementation that can pin pages. Thus it's still
1562  * useful to have gup_huge_pmd even if we can't operate on ptes.
1563  */
1564 static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
1565                          int write, struct page **pages, int *nr)
1566 {
1567         return 0;
1568 }
1569 #endif /* __HAVE_ARCH_PTE_SPECIAL */
1570
1571 #if defined(__HAVE_ARCH_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
1572 static int __gup_device_huge(unsigned long pfn, unsigned long addr,
1573                 unsigned long end, struct page **pages, int *nr)
1574 {
1575         int nr_start = *nr;
1576         struct dev_pagemap *pgmap = NULL;
1577
1578         do {
1579                 struct page *page = pfn_to_page(pfn);
1580
1581                 pgmap = get_dev_pagemap(pfn, pgmap);
1582                 if (unlikely(!pgmap)) {
1583                         undo_dev_pagemap(nr, nr_start, pages);
1584                         return 0;
1585                 }
1586                 SetPageReferenced(page);
1587                 pages[*nr] = page;
1588                 get_page(page);
1589                 put_dev_pagemap(pgmap);
1590                 (*nr)++;
1591                 pfn++;
1592         } while (addr += PAGE_SIZE, addr != end);
1593         return 1;
1594 }
1595
1596 static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
1597                 unsigned long end, struct page **pages, int *nr)
1598 {
1599         unsigned long fault_pfn;
1600         int nr_start = *nr;
1601
1602         fault_pfn = pmd_pfn(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
1603         if (!__gup_device_huge(fault_pfn, addr, end, pages, nr))
1604                 return 0;
1605
1606         if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
1607                 undo_dev_pagemap(nr, nr_start, pages);
1608                 return 0;
1609         }
1610         return 1;
1611 }
1612
1613 static int __gup_device_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
1614                 unsigned long end, struct page **pages, int *nr)
1615 {
1616         unsigned long fault_pfn;
1617         int nr_start = *nr;
1618
1619         fault_pfn = pud_pfn(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
1620         if (!__gup_device_huge(fault_pfn, addr, end, pages, nr))
1621                 return 0;
1622
1623         if (unlikely(pud_val(orig) != pud_val(*pudp))) {
1624                 undo_dev_pagemap(nr, nr_start, pages);
1625                 return 0;
1626         }
1627         return 1;
1628 }
1629 #else
1630 static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
1631                 unsigned long end, struct page **pages, int *nr)
1632 {
1633         BUILD_BUG();
1634         return 0;
1635 }
1636
1637 static int __gup_device_huge_pud(pud_t pud, pud_t *pudp, unsigned long addr,
1638                 unsigned long end, struct page **pages, int *nr)
1639 {
1640         BUILD_BUG();
1641         return 0;
1642 }
1643 #endif
1644
1645 static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
1646                 unsigned long end, int write, struct page **pages, int *nr)
1647 {
1648         struct page *head, *page;
1649         int refs;
1650
1651         if (!pmd_access_permitted(orig, write))
1652                 return 0;
1653
1654         if (pmd_devmap(orig))
1655                 return __gup_device_huge_pmd(orig, pmdp, addr, end, pages, nr);
1656
1657         refs = 0;
1658         page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
1659         do {
1660                 pages[*nr] = page;
1661                 (*nr)++;
1662                 page++;
1663                 refs++;
1664         } while (addr += PAGE_SIZE, addr != end);
1665
1666         head = compound_head(pmd_page(orig));
1667         if (!page_cache_add_speculative(head, refs)) {
1668                 *nr -= refs;
1669                 return 0;
1670         }
1671
1672         if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
1673                 *nr -= refs;
1674                 while (refs--)
1675                         put_page(head);
1676                 return 0;
1677         }
1678
1679         SetPageReferenced(head);
1680         return 1;
1681 }
1682
1683 static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
1684                 unsigned long end, int write, struct page **pages, int *nr)
1685 {
1686         struct page *head, *page;
1687         int refs;
1688
1689         if (!pud_access_permitted(orig, write))
1690                 return 0;
1691
1692         if (pud_devmap(orig))
1693                 return __gup_device_huge_pud(orig, pudp, addr, end, pages, nr);
1694
1695         refs = 0;
1696         page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
1697         do {
1698                 pages[*nr] = page;
1699                 (*nr)++;
1700                 page++;
1701                 refs++;
1702         } while (addr += PAGE_SIZE, addr != end);
1703
1704         head = compound_head(pud_page(orig));
1705         if (!page_cache_add_speculative(head, refs)) {
1706                 *nr -= refs;
1707                 return 0;
1708         }
1709
1710         if (unlikely(pud_val(orig) != pud_val(*pudp))) {
1711                 *nr -= refs;
1712                 while (refs--)
1713                         put_page(head);
1714                 return 0;
1715         }
1716
1717         SetPageReferenced(head);
1718         return 1;
1719 }
1720
1721 static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
1722                         unsigned long end, int write,
1723                         struct page **pages, int *nr)
1724 {
1725         int refs;
1726         struct page *head, *page;
1727
1728         if (!pgd_access_permitted(orig, write))
1729                 return 0;
1730
1731         BUILD_BUG_ON(pgd_devmap(orig));
1732         refs = 0;
1733         page = pgd_page(orig) + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT);
1734         do {
1735                 pages[*nr] = page;
1736                 (*nr)++;
1737                 page++;
1738                 refs++;
1739         } while (addr += PAGE_SIZE, addr != end);
1740
1741         head = compound_head(pgd_page(orig));
1742         if (!page_cache_add_speculative(head, refs)) {
1743                 *nr -= refs;
1744                 return 0;
1745         }
1746
1747         if (unlikely(pgd_val(orig) != pgd_val(*pgdp))) {
1748                 *nr -= refs;
1749                 while (refs--)
1750                         put_page(head);
1751                 return 0;
1752         }
1753
1754         SetPageReferenced(head);
1755         return 1;
1756 }
1757
1758 static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
1759                 int write, struct page **pages, int *nr)
1760 {
1761         unsigned long next;
1762         pmd_t *pmdp;
1763
1764         pmdp = pmd_offset(&pud, addr);
1765         do {
1766                 pmd_t pmd = READ_ONCE(*pmdp);
1767
1768                 next = pmd_addr_end(addr, end);
1769                 if (!pmd_present(pmd))
1770                         return 0;
1771
1772                 if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd) ||
1773                              pmd_devmap(pmd))) {
1774                         /*
1775                          * NUMA hinting faults need to be handled in the GUP
1776                          * slowpath for accounting purposes and so that they
1777                          * can be serialised against THP migration.
1778                          */
1779                         if (pmd_protnone(pmd))
1780                                 return 0;
1781
1782                         if (!gup_huge_pmd(pmd, pmdp, addr, next, write,
1783                                 pages, nr))
1784                                 return 0;
1785
1786                 } else if (unlikely(is_hugepd(__hugepd(pmd_val(pmd))))) {
1787                         /*
1788                          * architecture have different format for hugetlbfs
1789                          * pmd format and THP pmd format
1790                          */
1791                         if (!gup_huge_pd(__hugepd(pmd_val(pmd)), addr,
1792                                          PMD_SHIFT, next, write, pages, nr))
1793                                 return 0;
1794                 } else if (!gup_pte_range(pmd, addr, next, write, pages, nr))
1795                                 return 0;
1796         } while (pmdp++, addr = next, addr != end);
1797
1798         return 1;
1799 }
1800
1801 static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end,
1802                          int write, struct page **pages, int *nr)
1803 {
1804         unsigned long next;
1805         pud_t *pudp;
1806
1807         pudp = pud_offset(&p4d, addr);
1808         do {
1809                 pud_t pud = READ_ONCE(*pudp);
1810
1811                 next = pud_addr_end(addr, end);
1812                 if (pud_none(pud))
1813                         return 0;
1814                 if (unlikely(pud_huge(pud))) {
1815                         if (!gup_huge_pud(pud, pudp, addr, next, write,
1816                                           pages, nr))
1817                                 return 0;
1818                 } else if (unlikely(is_hugepd(__hugepd(pud_val(pud))))) {
1819                         if (!gup_huge_pd(__hugepd(pud_val(pud)), addr,
1820                                          PUD_SHIFT, next, write, pages, nr))
1821                                 return 0;
1822                 } else if (!gup_pmd_range(pud, addr, next, write, pages, nr))
1823                         return 0;
1824         } while (pudp++, addr = next, addr != end);
1825
1826         return 1;
1827 }
1828
1829 static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end,
1830                          int write, struct page **pages, int *nr)
1831 {
1832         unsigned long next;
1833         p4d_t *p4dp;
1834
1835         p4dp = p4d_offset(&pgd, addr);
1836         do {
1837                 p4d_t p4d = READ_ONCE(*p4dp);
1838
1839                 next = p4d_addr_end(addr, end);
1840                 if (p4d_none(p4d))
1841                         return 0;
1842                 BUILD_BUG_ON(p4d_huge(p4d));
1843                 if (unlikely(is_hugepd(__hugepd(p4d_val(p4d))))) {
1844                         if (!gup_huge_pd(__hugepd(p4d_val(p4d)), addr,
1845                                          P4D_SHIFT, next, write, pages, nr))
1846                                 return 0;
1847                 } else if (!gup_pud_range(p4d, addr, next, write, pages, nr))
1848                         return 0;
1849         } while (p4dp++, addr = next, addr != end);
1850
1851         return 1;
1852 }
1853
1854 static void gup_pgd_range(unsigned long addr, unsigned long end,
1855                 int write, struct page **pages, int *nr)
1856 {
1857         unsigned long next;
1858         pgd_t *pgdp;
1859
1860         pgdp = pgd_offset(current->mm, addr);
1861         do {
1862                 pgd_t pgd = READ_ONCE(*pgdp);
1863
1864                 next = pgd_addr_end(addr, end);
1865                 if (pgd_none(pgd))
1866                         return;
1867                 if (unlikely(pgd_huge(pgd))) {
1868                         if (!gup_huge_pgd(pgd, pgdp, addr, next, write,
1869                                           pages, nr))
1870                                 return;
1871                 } else if (unlikely(is_hugepd(__hugepd(pgd_val(pgd))))) {
1872                         if (!gup_huge_pd(__hugepd(pgd_val(pgd)), addr,
1873                                          PGDIR_SHIFT, next, write, pages, nr))
1874                                 return;
1875                 } else if (!gup_p4d_range(pgd, addr, next, write, pages, nr))
1876                         return;
1877         } while (pgdp++, addr = next, addr != end);
1878 }
1879
1880 #ifndef gup_fast_permitted
1881 /*
1882  * Check if it's allowed to use __get_user_pages_fast() for the range, or
1883  * we need to fall back to the slow version:
1884  */
1885 bool gup_fast_permitted(unsigned long start, int nr_pages, int write)
1886 {
1887         unsigned long len, end;
1888
1889         len = (unsigned long) nr_pages << PAGE_SHIFT;
1890         end = start + len;
1891         return end >= start;
1892 }
1893 #endif
1894
1895 /*
1896  * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to
1897  * the regular GUP. It will only return non-negative values.
1898  */
1899 int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
1900                           struct page **pages)
1901 {
1902         unsigned long addr, len, end;
1903         unsigned long flags;
1904         int nr = 0;
1905
1906         start &= PAGE_MASK;
1907         addr = start;
1908         len = (unsigned long) nr_pages << PAGE_SHIFT;
1909         end = start + len;
1910
1911         if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
1912                                         (void __user *)start, len)))
1913                 return 0;
1914
1915         /*
1916          * Disable interrupts.  We use the nested form as we can already have
1917          * interrupts disabled by get_futex_key.
1918          *
1919          * With interrupts disabled, we block page table pages from being
1920          * freed from under us. See mmu_gather_tlb in asm-generic/tlb.h
1921          * for more details.
1922          *
1923          * We do not adopt an rcu_read_lock(.) here as we also want to
1924          * block IPIs that come from THPs splitting.
1925          */
1926
1927         if (gup_fast_permitted(start, nr_pages, write)) {
1928                 local_irq_save(flags);
1929                 gup_pgd_range(addr, end, write, pages, &nr);
1930                 local_irq_restore(flags);
1931         }
1932
1933         return nr;
1934 }
1935
1936 /**
1937  * get_user_pages_fast() - pin user pages in memory
1938  * @start:      starting user address
1939  * @nr_pages:   number of pages from start to pin
1940  * @write:      whether pages will be written to
1941  * @pages:      array that receives pointers to the pages pinned.
1942  *              Should be at least nr_pages long.
1943  *
1944  * Attempt to pin user pages in memory without taking mm->mmap_sem.
1945  * If not successful, it will fall back to taking the lock and
1946  * calling get_user_pages().
1947  *
1948  * Returns number of pages pinned. This may be fewer than the number
1949  * requested. If nr_pages is 0 or negative, returns 0. If no pages
1950  * were pinned, returns -errno.
1951  */
1952 int get_user_pages_fast(unsigned long start, int nr_pages, int write,
1953                         struct page **pages)
1954 {
1955         unsigned long addr, len, end;
1956         int nr = 0, ret = 0;
1957
1958         start &= PAGE_MASK;
1959         addr = start;
1960         len = (unsigned long) nr_pages << PAGE_SHIFT;
1961         end = start + len;
1962
1963         if (nr_pages <= 0)
1964                 return 0;
1965
1966         if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
1967                                         (void __user *)start, len)))
1968                 return -EFAULT;
1969
1970         if (gup_fast_permitted(start, nr_pages, write)) {
1971                 local_irq_disable();
1972                 gup_pgd_range(addr, end, write, pages, &nr);
1973                 local_irq_enable();
1974                 ret = nr;
1975         }
1976
1977         if (nr < nr_pages) {
1978                 /* Try to get the remaining pages with get_user_pages */
1979                 start += nr << PAGE_SHIFT;
1980                 pages += nr;
1981
1982                 ret = get_user_pages_unlocked(start, nr_pages - nr, pages,
1983                                 write ? FOLL_WRITE : 0);
1984
1985                 /* Have to be a bit careful with return values */
1986                 if (nr > 0) {
1987                         if (ret < 0)
1988                                 ret = nr;
1989                         else
1990                                 ret += nr;
1991                 }
1992         }
1993
1994         return ret;
1995 }
1996
1997 #endif /* CONFIG_HAVE_GENERIC_GUP */