mm,fs,dax: change ->pmd_fault to ->huge_fault
authorDave Jiang <dave.jiang@intel.com>
Fri, 24 Feb 2017 22:56:59 +0000 (14:56 -0800)
committerLinus Torvalds <torvalds@linux-foundation.org>
Sat, 25 Feb 2017 01:46:54 +0000 (17:46 -0800)
Patch series "1G transparent hugepage support for device dax", v2.

The following series implements support for 1G transparent hugepage on
x86 for device dax.  The bulk of the code was written by Matthew Wilcox a
while back supporting transparent 1G hugepage for fs DAX.  I have
forward ported the relevant bits to 4.10-rc.  The current submission has
only the necessary code to support device DAX.

Comments from Dan Williams: So the motivation and intended user of this
functionality mirrors the motivation and users of 1GB page support in
hugetlbfs.  Given expected capacities of persistent memory devices an
in-memory database may want to reduce tlb pressure beyond what they can
already achieve with 2MB mappings of a device-dax file.  We have
customer feedback to that effect as Willy mentioned in his previous
version of these patches [1].

[1]: https://lkml.org/lkml/2016/1/31/52

Comments from Nilesh @ Oracle:

There are applications which have a process model; and if you assume
10,000 processes attempting to mmap all the 6TB memory available on a
server; we are looking at the following:

processes         : 10,000
memory            :    6TB
pte @ 4k page size: 8 bytes / 4K of memory * #processes = 6TB / 4k * 8 * 10000 = 1.5GB * 80000 = 120,000GB
pmd @ 2M page size: 120,000 / 512 = ~240GB
pud @ 1G page size: 240GB / 512 = ~480MB

As you can see with 2M pages, this system will use up an exorbitant
amount of DRAM to hold the page tables; but the 1G pages finally bring
it down to a reasonable level.  Memory sizes will keep increasing; so
this number will keep increasing.

An argument can be made to convert the applications from process model
to thread model, but in the real world that may not be always practical.
Hopefully this helps explain the use case where this is valuable.

This patch (of 3):

In preparation for adding the ability to handle PUD pages, convert
vm_operations_struct.pmd_fault to vm_operations_struct.huge_fault.  The
vm_fault structure is extended with a pud pointer alongside the existing
pmd pointer, and three flag bits are reserved to indicate which page
table level the fault is being handled at.

[ross.zwisler@linux.intel.com: remove unused function ext4_dax_huge_fault()]
Link: http://lkml.kernel.org/r/1485813172-7284-1-git-send-email-ross.zwisler@linux.intel.com
[dave.jiang@intel.com: clear PMD or PUD size flags when in fall through path]
Link: http://lkml.kernel.org/r/148589842696.5820.16078080610311444794.stgit@djiang5-desk3.ch.intel.com
Link: http://lkml.kernel.org/r/148545058784.17912.6353162518188733642.stgit@djiang5-desk3.ch.intel.com
Signed-off-by: Matthew Wilcox <mawilcox@microsoft.com>
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Jan Kara <jack@suse.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Nilesh Choudhury <nilesh.choudhury@oracle.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Dave Jiang <dave.jiang@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
drivers/dax/dax.c
fs/dax.c
fs/ext2/file.c
fs/ext4/file.c
fs/xfs/xfs_file.c
fs/xfs/xfs_trace.h
include/linux/dax.h
include/linux/mm.h
mm/memory.c

index 0261f332bf3ec88f839849eb4574826a2ef4ff2c..922ec461dcaa323b35f7b09191c4eeda28e249c0 100644 (file)
@@ -419,7 +419,7 @@ static phys_addr_t pgoff_to_phys(struct dax_dev *dax_dev, pgoff_t pgoff,
        return -1;
 }
 
-static int __dax_dev_fault(struct dax_dev *dax_dev, struct vm_fault *vmf)
+static int __dax_dev_pte_fault(struct dax_dev *dax_dev, struct vm_fault *vmf)
 {
        struct device *dev = &dax_dev->dev;
        struct dax_region *dax_region;
@@ -455,23 +455,6 @@ static int __dax_dev_fault(struct dax_dev *dax_dev, struct vm_fault *vmf)
        return VM_FAULT_NOPAGE;
 }
 
-static int dax_dev_fault(struct vm_fault *vmf)
-{
-       struct vm_area_struct *vma = vmf->vma;
-       int rc;
-       struct file *filp = vma->vm_file;
-       struct dax_dev *dax_dev = filp->private_data;
-
-       dev_dbg(&dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__,
-                       current->comm, (vmf->flags & FAULT_FLAG_WRITE)
-                       ? "write" : "read", vma->vm_start, vma->vm_end);
-       rcu_read_lock();
-       rc = __dax_dev_fault(dax_dev, vmf);
-       rcu_read_unlock();
-
-       return rc;
-}
-
 static int __dax_dev_pmd_fault(struct dax_dev *dax_dev, struct vm_fault *vmf)
 {
        unsigned long pmd_addr = vmf->address & PMD_MASK;
@@ -510,7 +493,7 @@ static int __dax_dev_pmd_fault(struct dax_dev *dax_dev, struct vm_fault *vmf)
                        vmf->flags & FAULT_FLAG_WRITE);
 }
 
-static int dax_dev_pmd_fault(struct vm_fault *vmf)
+static int dax_dev_fault(struct vm_fault *vmf)
 {
        int rc;
        struct file *filp = vmf->vma->vm_file;
@@ -522,7 +505,16 @@ static int dax_dev_pmd_fault(struct vm_fault *vmf)
                        vmf->vma->vm_start, vmf->vma->vm_end);
 
        rcu_read_lock();
-       rc = __dax_dev_pmd_fault(dax_dev, vmf);
+       switch (vmf->flags & FAULT_FLAG_SIZE_MASK) {
+       case FAULT_FLAG_SIZE_PTE:
+               rc = __dax_dev_pte_fault(dax_dev, vmf);
+               break;
+       case FAULT_FLAG_SIZE_PMD:
+               rc = __dax_dev_pmd_fault(dax_dev, vmf);
+               break;
+       default:
+               return VM_FAULT_FALLBACK;
+       }
        rcu_read_unlock();
 
        return rc;
@@ -530,7 +522,7 @@ static int dax_dev_pmd_fault(struct vm_fault *vmf)
 
 static const struct vm_operations_struct dax_dev_vm_ops = {
        .fault = dax_dev_fault,
-       .pmd_fault = dax_dev_pmd_fault,
+       .huge_fault = dax_dev_fault,
 };
 
 static int dax_mmap(struct file *filp, struct vm_area_struct *vma)
index f955c0df33bbf29530f331008c1cfa9f1a0b6932..c3c29fbf64be09dc60a4f517c97745068feb9ac5 100644 (file)
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1118,16 +1118,8 @@ static int dax_fault_return(int error)
        return VM_FAULT_SIGBUS;
 }
 
-/**
- * dax_iomap_fault - handle a page fault on a DAX file
- * @vmf: The description of the fault
- * @ops: iomap ops passed from the file system
- *
- * When a page fault occurs, filesystems may call this helper in their fault
- * or mkwrite handler for DAX files. Assumes the caller has done all the
- * necessary locking for the page fault to proceed successfully.
- */
-int dax_iomap_fault(struct vm_fault *vmf, const struct iomap_ops *ops)
+static int dax_iomap_pte_fault(struct vm_fault *vmf,
+                              const struct iomap_ops *ops)
 {
        struct address_space *mapping = vmf->vma->vm_file->f_mapping;
        struct inode *inode = mapping->host;
@@ -1244,7 +1236,6 @@ int dax_iomap_fault(struct vm_fault *vmf, const struct iomap_ops *ops)
        }
        return vmf_ret;
 }
-EXPORT_SYMBOL_GPL(dax_iomap_fault);
 
 #ifdef CONFIG_FS_DAX_PMD
 /*
@@ -1335,7 +1326,8 @@ fallback:
        return VM_FAULT_FALLBACK;
 }
 
-int dax_iomap_pmd_fault(struct vm_fault *vmf, const struct iomap_ops *ops)
+static int dax_iomap_pmd_fault(struct vm_fault *vmf,
+                              const struct iomap_ops *ops)
 {
        struct vm_area_struct *vma = vmf->vma;
        struct address_space *mapping = vma->vm_file->f_mapping;
@@ -1443,5 +1435,32 @@ out:
        trace_dax_pmd_fault_done(inode, vmf, max_pgoff, result);
        return result;
 }
-EXPORT_SYMBOL_GPL(dax_iomap_pmd_fault);
+#else
+static int dax_iomap_pmd_fault(struct vm_fault *vmf, const struct iomap_ops *ops)
+{
+       return VM_FAULT_FALLBACK; /* !CONFIG_FS_DAX_PMD: no PMD faults */
+}
 #endif /* CONFIG_FS_DAX_PMD */
+
+/**
+ * dax_iomap_fault - handle a page fault on a DAX file
+ * @vmf: The description of the fault
+ * @ops: iomap ops passed from the file system
+ *
+ * Filesystems may call this helper from their DAX fault handlers. It
+ * dispatches on the FAULT_FLAG_SIZE_* bits in @vmf->flags to the PTE or PMD
+ * handler and returns VM_FAULT_FALLBACK for any other size. Assumes the
+ * caller has done all the necessary locking for the fault to proceed.
+ */
+int dax_iomap_fault(struct vm_fault *vmf, const struct iomap_ops *ops)
+{
+       switch (vmf->flags & FAULT_FLAG_SIZE_MASK) {
+       case FAULT_FLAG_SIZE_PTE:
+               return dax_iomap_pte_fault(vmf, ops);
+       case FAULT_FLAG_SIZE_PMD:
+               return dax_iomap_pmd_fault(vmf, ops);
+       default:
+               return VM_FAULT_FALLBACK;
+       }
+}
+EXPORT_SYMBOL_GPL(dax_iomap_fault);
index 0bf0d971205a5693334c448ac36dba907c9de8e1..68738832beda615dff8f926882d32692b5bc4a70 100644 (file)
@@ -133,7 +133,7 @@ static int ext2_dax_pfn_mkwrite(struct vm_fault *vmf)
 static const struct vm_operations_struct ext2_dax_vm_ops = {
        .fault          = ext2_dax_fault,
        /*
-        * .pmd_fault is not supported for DAX because allocation in ext2
+        * .huge_fault is not supported for DAX because allocation in ext2
         * cannot be reliably aligned to huge page sizes and so pmd faults
         * will always fail and fail back to regular faults.
         */
index 21e1f17fe36da1a611e2e1ba0552245a6dd7d889..502d2d07d19116cbf5283d5b591187778c5f8707 100644 (file)
@@ -273,27 +273,6 @@ static int ext4_dax_fault(struct vm_fault *vmf)
        return result;
 }
 
-static int
-ext4_dax_pmd_fault(struct vm_fault *vmf)
-{
-       int result;
-       struct inode *inode = file_inode(vmf->vma->vm_file);
-       struct super_block *sb = inode->i_sb;
-       bool write = vmf->flags & FAULT_FLAG_WRITE;
-
-       if (write) {
-               sb_start_pagefault(sb);
-               file_update_time(vmf->vma->vm_file);
-       }
-       down_read(&EXT4_I(inode)->i_mmap_sem);
-       result = dax_iomap_pmd_fault(vmf, &ext4_iomap_ops);
-       up_read(&EXT4_I(inode)->i_mmap_sem);
-       if (write)
-               sb_end_pagefault(sb);
-
-       return result;
-}
-
 /*
  * Handle write fault for VM_MIXEDMAP mappings. Similarly to ext4_dax_fault()
  * handler we check for races agaist truncate. Note that since we cycle through
@@ -326,7 +305,7 @@ static int ext4_dax_pfn_mkwrite(struct vm_fault *vmf)
 
 static const struct vm_operations_struct ext4_dax_vm_ops = {
        .fault          = ext4_dax_fault,
-       .pmd_fault      = ext4_dax_pmd_fault,
+       .huge_fault     = ext4_dax_fault,
        .page_mkwrite   = ext4_dax_fault,
        .pfn_mkwrite    = ext4_dax_pfn_mkwrite,
 };
index 9cc10136ba0b9838258ecf71a065262954dcd3e8..990e0381937072b5bcf9fb515ce3ab99e4e64a95 100644 (file)
@@ -1429,12 +1429,12 @@ xfs_filemap_fault(
 /*
  * Similar to xfs_filemap_fault(), the DAX fault path can call into here on
  * both read and write faults. Hence we need to handle both cases. There is no
- * ->pmd_mkwrite callout for huge pages, so we have a single function here to
+ * ->huge_mkwrite callout for huge pages, so we have a single function here to
  * handle both cases here. @flags carries the information on the type of fault
  * occuring.
  */
 STATIC int
-xfs_filemap_pmd_fault(
+xfs_filemap_huge_fault(
        struct vm_fault         *vmf)
 {
        struct inode            *inode = file_inode(vmf->vma->vm_file);
@@ -1444,7 +1444,7 @@ xfs_filemap_pmd_fault(
        if (!IS_DAX(inode))
                return VM_FAULT_FALLBACK;
 
-       trace_xfs_filemap_pmd_fault(ip);
+       trace_xfs_filemap_huge_fault(ip);
 
        if (vmf->flags & FAULT_FLAG_WRITE) {
                sb_start_pagefault(inode->i_sb);
@@ -1452,7 +1452,7 @@ xfs_filemap_pmd_fault(
        }
 
        xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
-       ret = dax_iomap_pmd_fault(vmf, &xfs_iomap_ops);
+       ret = dax_iomap_fault(vmf, &xfs_iomap_ops);
        xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 
        if (vmf->flags & FAULT_FLAG_WRITE)
@@ -1497,7 +1497,7 @@ xfs_filemap_pfn_mkwrite(
 
 static const struct vm_operations_struct xfs_file_vm_ops = {
        .fault          = xfs_filemap_fault,
-       .pmd_fault      = xfs_filemap_pmd_fault,
+       .huge_fault     = xfs_filemap_huge_fault,
        .map_pages      = filemap_map_pages,
        .page_mkwrite   = xfs_filemap_page_mkwrite,
        .pfn_mkwrite    = xfs_filemap_pfn_mkwrite,
index fb7555e73a62ef586a13979d1296df5eb95cb905..383ac227ce2c324cee29b5e8a209fa6b98583192 100644 (file)
@@ -687,7 +687,7 @@ DEFINE_INODE_EVENT(xfs_inode_clear_cowblocks_tag);
 DEFINE_INODE_EVENT(xfs_inode_free_cowblocks_invalid);
 
 DEFINE_INODE_EVENT(xfs_filemap_fault);
-DEFINE_INODE_EVENT(xfs_filemap_pmd_fault);
+DEFINE_INODE_EVENT(xfs_filemap_huge_fault);
 DEFINE_INODE_EVENT(xfs_filemap_page_mkwrite);
 DEFINE_INODE_EVENT(xfs_filemap_pfn_mkwrite);
 
index eeb02421c848ccb21ef81914a425da4427b53adf..cf9af225962b1bb937ec963bf623a984ff7578aa 100644 (file)
@@ -70,17 +70,11 @@ static inline unsigned int dax_radix_order(void *entry)
                return PMD_SHIFT - PAGE_SHIFT;
        return 0;
 }
-int dax_iomap_pmd_fault(struct vm_fault *vmf, const struct iomap_ops *ops);
 #else
 static inline unsigned int dax_radix_order(void *entry)
 {
        return 0;
 }
-static inline int dax_iomap_pmd_fault(struct vm_fault *vmf,
-               const struct iomap_ops *ops)
-{
-       return VM_FAULT_FALLBACK;
-}
 #endif
 int dax_pfn_mkwrite(struct vm_fault *vmf);
 
index 3dd80ba6568a0a9235beba71468aa3b814a1d608..035a688e5472b01e925e3d8c7a1e440fb4294b5b 100644 (file)
@@ -285,6 +285,11 @@ extern pgprot_t protection_map[16];
 #define FAULT_FLAG_REMOTE      0x80    /* faulting for non current tsk/mm */
 #define FAULT_FLAG_INSTRUCTION  0x100  /* The fault was during an instruction fetch */
 
+#define FAULT_FLAG_SIZE_MASK   0x7000  /* Support up to 8-level page tables */
+#define FAULT_FLAG_SIZE_PTE    0x0000  /* First level (eg 4k) */
+#define FAULT_FLAG_SIZE_PMD    0x1000  /* Second level (eg 2MB) */
+#define FAULT_FLAG_SIZE_PUD    0x2000  /* Third level (eg 1GB) */
+
 #define FAULT_FLAG_TRACE \
        { FAULT_FLAG_WRITE,             "WRITE" }, \
        { FAULT_FLAG_MKWRITE,           "MKWRITE" }, \
@@ -314,6 +319,9 @@ struct vm_fault {
        unsigned long address;          /* Faulting virtual address */
        pmd_t *pmd;                     /* Pointer to pmd entry matching
                                         * the 'address' */
+       pud_t *pud;                     /* Pointer to pud entry matching
+                                        * the 'address'
+                                        */
        pte_t orig_pte;                 /* Value of PTE at the time of fault */
 
        struct page *cow_page;          /* Page handler may use for COW fault */
@@ -351,7 +359,7 @@ struct vm_operations_struct {
        void (*close)(struct vm_area_struct * area);
        int (*mremap)(struct vm_area_struct * area);
        int (*fault)(struct vm_fault *vmf);
-       int (*pmd_fault)(struct vm_fault *vmf);
+       int (*huge_fault)(struct vm_fault *vmf);
        void (*map_pages)(struct vm_fault *vmf,
                        pgoff_t start_pgoff, pgoff_t end_pgoff);
 
index cf97d88158cd38607ff7917d05a91e9524207358..e721e8eba5703382de135084d760cc96ed1ccb88 100644 (file)
@@ -3466,8 +3466,8 @@ static int create_huge_pmd(struct vm_fault *vmf)
 {
        if (vma_is_anonymous(vmf->vma))
                return do_huge_pmd_anonymous_page(vmf);
-       if (vmf->vma->vm_ops->pmd_fault)
-               return vmf->vma->vm_ops->pmd_fault(vmf);
+       if (vmf->vma->vm_ops->huge_fault)
+               return vmf->vma->vm_ops->huge_fault(vmf);
        return VM_FAULT_FALLBACK;
 }
 
@@ -3475,8 +3475,8 @@ static int wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd)
 {
        if (vma_is_anonymous(vmf->vma))
                return do_huge_pmd_wp_page(vmf, orig_pmd);
-       if (vmf->vma->vm_ops->pmd_fault)
-               return vmf->vma->vm_ops->pmd_fault(vmf);
+       if (vmf->vma->vm_ops->huge_fault)
+               return vmf->vma->vm_ops->huge_fault(vmf);
 
        /* COW handled on pte level: split pmd */
        VM_BUG_ON_VMA(vmf->vma->vm_flags & VM_SHARED, vmf->vma);
@@ -3606,6 +3606,7 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
        struct mm_struct *mm = vma->vm_mm;
        pgd_t *pgd;
        pud_t *pud;
+       int ret;
 
        pgd = pgd_offset(mm, address);
        pud = pud_alloc(mm, pgd, address);
@@ -3615,15 +3616,18 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
        if (!vmf.pmd)
                return VM_FAULT_OOM;
        if (pmd_none(*vmf.pmd) && transparent_hugepage_enabled(vma)) {
-               int ret = create_huge_pmd(&vmf);
+               vmf.flags |= FAULT_FLAG_SIZE_PMD;
+               ret = create_huge_pmd(&vmf);
                if (!(ret & VM_FAULT_FALLBACK))
                        return ret;
+               /* fall through path, remove PMD flag */
+               vmf.flags &= ~FAULT_FLAG_SIZE_PMD;
        } else {
                pmd_t orig_pmd = *vmf.pmd;
-               int ret;
 
                barrier();
                if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
+                       vmf.flags |= FAULT_FLAG_SIZE_PMD;
                        if (pmd_protnone(orig_pmd) && vma_is_accessible(vma))
                                return do_huge_pmd_numa_page(&vmf, orig_pmd);
 
@@ -3632,6 +3636,8 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
                                ret = wp_huge_pmd(&vmf, orig_pmd);
                                if (!(ret & VM_FAULT_FALLBACK))
                                        return ret;
+                               /* fall through path, remove PMD flag */
+                               vmf.flags &= ~FAULT_FLAG_SIZE_PMD;
                        } else {
                                huge_pmd_set_accessed(&vmf, orig_pmd);
                                return 0;