mm/hmm/mirror: device page fault handler
author	Jérôme Glisse <jglisse@redhat.com>
Fri, 8 Sep 2017 23:11:35 +0000 (16:11 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
Sat, 9 Sep 2017 01:26:46 +0000 (18:26 -0700)
This handles page faults on behalf of a device driver. Unlike
handle_mm_fault(), it does not trigger migration of device memory back to
system memory.
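
The expected call pattern from a device driver, spelled out in full in the
mm/hmm.c documentation added below, looks roughly like the following sketch
(the vma/range/pfns setup, the write/block choice and the driver_* locking
helpers are illustrative only):

	struct hmm_range range;
	int ret;

retry:
	down_read(&mm->mmap_sem);
	/* look up the vma and initialize the hmm_pfn_t array for [start, end) */
	ret = hmm_vma_fault(vma, &range, start, end, pfns, write, false);
	if (ret == -EAGAIN)
		goto retry;	/* mmap_sem was dropped */
	if (ret) {
		up_read(&mm->mmap_sem);
		return ret;
	}
	/* serialize against concurrent device page table updates */
	driver_lock_device_page_table_update();
	hmm_vma_range_done(vma, &range);
	/* commit the pfns snapshot to the device page table here */
	driver_unlock_device_page_table_update();
	up_read(&mm->mmap_sem);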

Link: http://lkml.kernel.org/r/20170817000548.32038-6-jglisse@redhat.com
Signed-off-by: Jérôme Glisse <jglisse@redhat.com>
Signed-off-by: Evgeny Baskakov <ebaskakov@nvidia.com>
Signed-off-by: John Hubbard <jhubbard@nvidia.com>
Signed-off-by: Mark Hairgrove <mhairgrove@nvidia.com>
Signed-off-by: Sherry Cheung <SCheung@nvidia.com>
Signed-off-by: Subhash Gutti <sgutti@nvidia.com>
Cc: Aneesh Kumar <aneesh.kumar@linux.vnet.ibm.com>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: David Nellans <dnellans@nvidia.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Bob Liu <liubo95@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
include/linux/hmm.h
mm/hmm.c

index 62899c9829c98ecc53a33aab07b86704ddf51b54..61a6535fe438bef862c6df2d72f3fc17a4b5c60b 100644 (file)
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -292,6 +292,33 @@ int hmm_vma_get_pfns(struct vm_area_struct *vma,
                     unsigned long end,
                     hmm_pfn_t *pfns);
 bool hmm_vma_range_done(struct vm_area_struct *vma, struct hmm_range *range);
+
+
+/*
+ * Fault memory on behalf of a device driver. Unlike handle_mm_fault(), this
+ * will not migrate any device memory back to system memory. The hmm_pfn_t
+ * array will be updated with the fault result and the current snapshot of the
+ * CPU page table for the range.
+ *
+ * The mmap_sem must be taken in read mode before entering and it might be
+ * dropped by the function if the block argument is false. In that case, the
+ * function returns -EAGAIN.
+ *
+ * The return value does not reflect whether the fault was successful for
+ * every single address. The caller must inspect the hmm_pfn_t array to
+ * determine the fault status of each address.
+ *
+ * Trying to fault a range that is not fully covered by the vma results in
+ * -EINVAL.
+ *
+ * See the function description in mm/hmm.c for further documentation.
+ */
+int hmm_vma_fault(struct vm_area_struct *vma,
+                 struct hmm_range *range,
+                 unsigned long start,
+                 unsigned long end,
+                 hmm_pfn_t *pfns,
+                 bool write,
+                 bool block);
 #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */
 
 
index 172984848d51910a5fda8dbffe38898ccd4b87aa..f6c745b9a25ae1f4a31dd45fc5b9cb895832f48d 100644 (file)
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -221,6 +221,36 @@ void hmm_mirror_unregister(struct hmm_mirror *mirror)
 }
 EXPORT_SYMBOL(hmm_mirror_unregister);
 
+struct hmm_vma_walk {
+       struct hmm_range        *range;
+       unsigned long           last;
+       bool                    fault;
+       bool                    block;
+       bool                    write;
+};
+
+static int hmm_vma_do_fault(struct mm_walk *walk,
+                           unsigned long addr,
+                           hmm_pfn_t *pfn)
+{
+       unsigned int flags = FAULT_FLAG_REMOTE;
+       struct hmm_vma_walk *hmm_vma_walk = walk->private;
+       struct vm_area_struct *vma = walk->vma;
+       int r;
+
+       flags |= hmm_vma_walk->block ? 0 : FAULT_FLAG_ALLOW_RETRY;
+       flags |= hmm_vma_walk->write ? FAULT_FLAG_WRITE : 0;
+       r = handle_mm_fault(vma, addr, flags);
+       if (r & VM_FAULT_RETRY)
+               return -EBUSY;
+       if (r & VM_FAULT_ERROR) {
+               *pfn = HMM_PFN_ERROR;
+               return -EFAULT;
+       }
+
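+       /* -EAGAIN: fault was handled, the walk is restarted by the caller */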
+       return -EAGAIN;
+}
+
 static void hmm_pfns_special(hmm_pfn_t *pfns,
                             unsigned long addr,
                             unsigned long end)
@@ -244,34 +274,62 @@ static int hmm_pfns_bad(unsigned long addr,
        return 0;
 }
 
+static void hmm_pfns_clear(hmm_pfn_t *pfns,
+                          unsigned long addr,
+                          unsigned long end)
+{
+       for (; addr < end; addr += PAGE_SIZE, pfns++)
+               *pfns = 0;
+}
+
 static int hmm_vma_walk_hole(unsigned long addr,
                             unsigned long end,
                             struct mm_walk *walk)
 {
-       struct hmm_range *range = walk->private;
+       struct hmm_vma_walk *hmm_vma_walk = walk->private;
+       struct hmm_range *range = hmm_vma_walk->range;
        hmm_pfn_t *pfns = range->pfns;
        unsigned long i;
 
+       hmm_vma_walk->last = addr;
        i = (addr - range->start) >> PAGE_SHIFT;
-       for (; addr < end; addr += PAGE_SIZE, i++)
+       for (; addr < end; addr += PAGE_SIZE, i++) {
                pfns[i] = HMM_PFN_EMPTY;
+               if (hmm_vma_walk->fault) {
+                       int ret;
 
-       return 0;
+                       ret = hmm_vma_do_fault(walk, addr, &pfns[i]);
+                       if (ret != -EAGAIN)
+                               return ret;
+               }
+       }
+
+       return hmm_vma_walk->fault ? -EAGAIN : 0;
 }
 
 static int hmm_vma_walk_clear(unsigned long addr,
                              unsigned long end,
                              struct mm_walk *walk)
 {
-       struct hmm_range *range = walk->private;
+       struct hmm_vma_walk *hmm_vma_walk = walk->private;
+       struct hmm_range *range = hmm_vma_walk->range;
        hmm_pfn_t *pfns = range->pfns;
        unsigned long i;
 
+       hmm_vma_walk->last = addr;
        i = (addr - range->start) >> PAGE_SHIFT;
-       for (; addr < end; addr += PAGE_SIZE, i++)
+       for (; addr < end; addr += PAGE_SIZE, i++) {
                pfns[i] = 0;
+               if (hmm_vma_walk->fault) {
+                       int ret;
 
-       return 0;
+                       ret = hmm_vma_do_fault(walk, addr, &pfns[i]);
+                       if (ret != -EAGAIN)
+                               return ret;
+               }
+       }
+
+       return hmm_vma_walk->fault ? -EAGAIN : 0;
 }
 
 static int hmm_vma_walk_pmd(pmd_t *pmdp,
@@ -279,15 +337,18 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
                            unsigned long end,
                            struct mm_walk *walk)
 {
-       struct hmm_range *range = walk->private;
+       struct hmm_vma_walk *hmm_vma_walk = walk->private;
+       struct hmm_range *range = hmm_vma_walk->range;
        struct vm_area_struct *vma = walk->vma;
        hmm_pfn_t *pfns = range->pfns;
        unsigned long addr = start, i;
+       bool write_fault;
        hmm_pfn_t flag;
        pte_t *ptep;
 
        i = (addr - range->start) >> PAGE_SHIFT;
        flag = vma->vm_flags & VM_READ ? HMM_PFN_READ : 0;
+       write_fault = hmm_vma_walk->fault && hmm_vma_walk->write;
 
 again:
        if (pmd_none(*pmdp))
@@ -316,6 +377,9 @@ again:
                if (pmd_protnone(pmd))
                        return hmm_vma_walk_clear(start, end, walk);
 
+               if (write_fault && !pmd_write(pmd))
+                       return hmm_vma_walk_clear(start, end, walk);
+
                pfn = pmd_pfn(pmd) + pte_index(addr);
                flag |= pmd_write(pmd) ? HMM_PFN_WRITE : 0;
                for (; addr < end; addr += PAGE_SIZE, i++, pfn++)
@@ -332,13 +396,55 @@ again:
 
                pfns[i] = 0;
 
-               if (pte_none(pte) || !pte_present(pte)) {
+               if (pte_none(pte)) {
                        pfns[i] = HMM_PFN_EMPTY;
+                       if (hmm_vma_walk->fault)
+                               goto fault;
                        continue;
                }
 
+               if (!pte_present(pte)) {
+                       swp_entry_t entry = pte_to_swp_entry(pte);
+
+                       if (!non_swap_entry(entry)) {
+                               if (hmm_vma_walk->fault)
+                                       goto fault;
+                               continue;
+                       }
+
+                       /*
+                        * This is a special swap entry, ignore migration, use
+                        * device and report anything else as error.
+                        */
+                       if (is_migration_entry(entry)) {
+                               if (hmm_vma_walk->fault) {
+                                       pte_unmap(ptep);
+                                       hmm_vma_walk->last = addr;
+                                       migration_entry_wait(vma->vm_mm,
+                                                            pmdp, addr);
+                                       return -EAGAIN;
+                               }
+                               continue;
+                       } else {
+                               /* Report error for everything else */
+                               pfns[i] = HMM_PFN_ERROR;
+                       }
+                       continue;
+               }
+
+               if (write_fault && !pte_write(pte))
+                       goto fault;
+
                pfns[i] = hmm_pfn_t_from_pfn(pte_pfn(pte)) | flag;
                pfns[i] |= pte_write(pte) ? HMM_PFN_WRITE : 0;
+               continue;
+
+fault:
+               pte_unmap(ptep);
+               /* Fault all pages in range */
+               return hmm_vma_walk_clear(start, end, walk);
        }
        pte_unmap(ptep - 1);
 
@@ -371,6 +477,7 @@ int hmm_vma_get_pfns(struct vm_area_struct *vma,
                     unsigned long end,
                     hmm_pfn_t *pfns)
 {
+       struct hmm_vma_walk hmm_vma_walk;
        struct mm_walk mm_walk;
        struct hmm *hmm;
 
@@ -402,9 +509,12 @@ int hmm_vma_get_pfns(struct vm_area_struct *vma,
        list_add_rcu(&range->list, &hmm->ranges);
        spin_unlock(&hmm->lock);
 
+       hmm_vma_walk.fault = false;
+       hmm_vma_walk.range = range;
+       mm_walk.private = &hmm_vma_walk;
+
        mm_walk.vma = vma;
        mm_walk.mm = vma->vm_mm;
-       mm_walk.private = range;
        mm_walk.pte_entry = NULL;
        mm_walk.test_walk = NULL;
        mm_walk.hugetlb_entry = NULL;
@@ -412,7 +522,6 @@ int hmm_vma_get_pfns(struct vm_area_struct *vma,
        mm_walk.pte_hole = hmm_vma_walk_hole;
 
        walk_page_range(start, end, &mm_walk);
-
        return 0;
 }
 EXPORT_SYMBOL(hmm_vma_get_pfns);
@@ -439,7 +548,7 @@ EXPORT_SYMBOL(hmm_vma_get_pfns);
  *
  * There are two ways to use this :
  * again:
- *   hmm_vma_get_pfns(vma, range, start, end, pfns);
+ *   hmm_vma_get_pfns(vma, range, start, end, pfns); or hmm_vma_fault(...);
  *   trans = device_build_page_table_update_transaction(pfns);
  *   device_page_table_lock();
  *   if (!hmm_vma_range_done(vma, range)) {
@@ -450,7 +559,7 @@ EXPORT_SYMBOL(hmm_vma_get_pfns);
  *   device_page_table_unlock();
  *
  * Or:
- *   hmm_vma_get_pfns(vma, range, start, end, pfns);
+ *   hmm_vma_get_pfns(vma, range, start, end, pfns); or hmm_vma_fault(...);
  *   device_page_table_lock();
  *   hmm_vma_range_done(vma, range);
  *   device_update_page_table(pfns);
@@ -479,4 +588,127 @@ bool hmm_vma_range_done(struct vm_area_struct *vma, struct hmm_range *range)
        return range->valid;
 }
 EXPORT_SYMBOL(hmm_vma_range_done);
+
+/*
+ * hmm_vma_fault() - try to fault some address in a virtual address range
+ * @vma: virtual memory area containing the virtual address range
+ * @range: used to track the validity of the pfns array content
+ * @start: fault range virtual start address (inclusive)
+ * @end: fault range virtual end address (exclusive)
+ * @pfns: array of hmm_pfn_t, only entries with the fault flag set are faulted
+ * @write: is it a write fault
+ * @block: allow blocking on fault (if true it sleeps and does not drop mmap_sem)
+ * Returns: 0 on success, error otherwise (-EAGAIN means mmap_sem was dropped)
+ *
+ * This is similar to a regular CPU page fault except that it will not trigger
+ * any memory migration if the memory being faulted is not accessible by CPUs.
+ *
+ * On error for any virtual address in the range, the function sets the
+ * hmm_pfn_t error flag for the corresponding pfn entry.
+ *
+ * Expected use pattern:
+ * retry:
+ *   down_read(&mm->mmap_sem);
+ *   // Find vma and address device wants to fault, initialize hmm_pfn_t
+ *   // array accordingly
+ *   ret = hmm_vma_fault(vma, range, start, end, pfns, write, block);
+ *   switch (ret) {
+ *   case -EAGAIN:
+ *     hmm_vma_range_done(vma, range);
+ *     // You might want to rate limit or yield to play nicely; you may also
+ *     // commit any valid pfns in the array, assuming hmm_vma_range_done()
+ *     // above returned true
+ *     goto retry;
+ *   case 0:
+ *     break;
+ *   default:
+ *     // Handle error !
+ *     up_read(&mm->mmap_sem);
+ *     return;
+ *   }
+ *   // Take device driver lock that serialize device page table update
+ *   driver_lock_device_page_table_update();
+ *   hmm_vma_range_done(vma, range);
+ *   // Commit pfns we got from hmm_vma_fault()
+ *   driver_unlock_device_page_table_update();
+ *   up_read(&mm->mmap_sem);
+ *
+ * YOU MUST CALL hmm_vma_range_done() AFTER THIS FUNCTION RETURNS SUCCESS (0)
+ * BEFORE FREEING THE range struct OR YOU WILL HAVE SERIOUS MEMORY CORRUPTION !
+ *
+ * YOU HAVE BEEN WARNED !
+ */
+int hmm_vma_fault(struct vm_area_struct *vma,
+                 struct hmm_range *range,
+                 unsigned long start,
+                 unsigned long end,
+                 hmm_pfn_t *pfns,
+                 bool write,
+                 bool block)
+{
+       struct hmm_vma_walk hmm_vma_walk;
+       struct mm_walk mm_walk;
+       struct hmm *hmm;
+       int ret;
+
+       /* Sanity check, this really should not happen ! */
+       if (start < vma->vm_start || start >= vma->vm_end)
+               return -EINVAL;
+       if (end < vma->vm_start || end > vma->vm_end)
+               return -EINVAL;
+
+       hmm = hmm_register(vma->vm_mm);
+       if (!hmm) {
+               hmm_pfns_clear(pfns, start, end);
+               return -ENOMEM;
+       }
+       /* Caller must have registered a mirror using hmm_mirror_register() */
+       if (!hmm->mmu_notifier.ops)
+               return -EINVAL;
+
+       /* Initialize range to track CPU page table update */
+       range->start = start;
+       range->pfns = pfns;
+       range->end = end;
+       spin_lock(&hmm->lock);
+       range->valid = true;
+       list_add_rcu(&range->list, &hmm->ranges);
+       spin_unlock(&hmm->lock);
+
+       /* FIXME support hugetlb fs */
+       if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) {
+               hmm_pfns_special(pfns, start, end);
+               return 0;
+       }
+
+       hmm_vma_walk.fault = true;
+       hmm_vma_walk.write = write;
+       hmm_vma_walk.block = block;
+       hmm_vma_walk.range = range;
+       mm_walk.private = &hmm_vma_walk;
+       hmm_vma_walk.last = range->start;
+
+       mm_walk.vma = vma;
+       mm_walk.mm = vma->vm_mm;
+       mm_walk.pte_entry = NULL;
+       mm_walk.test_walk = NULL;
+       mm_walk.hugetlb_entry = NULL;
+       mm_walk.pmd_entry = hmm_vma_walk_pmd;
+       mm_walk.pte_hole = hmm_vma_walk_hole;
+
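+       /* Walkers return -EAGAIN after handling a fault; resume from ->last */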
+       do {
+               ret = walk_page_range(start, end, &mm_walk);
+               start = hmm_vma_walk.last;
+       } while (ret == -EAGAIN);
+
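+       /* On error, clear the pfns not walked and stop tracking the range */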
+       if (ret) {
+               unsigned long i;
+
+               i = (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
+               hmm_pfns_clear(&pfns[i], hmm_vma_walk.last, end);
+               hmm_vma_range_done(vma, range);
+       }
+       return ret;
+}
+EXPORT_SYMBOL(hmm_vma_fault);
 #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */