From a2d581675d485eb7188f521f36efc114639a3096 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 24 Feb 2017 14:56:59 -0800 Subject: [PATCH] mm,fs,dax: change ->pmd_fault to ->huge_fault Patch series "1G transparent hugepage support for device dax", v2. The following series implements support for 1G transparent hugepage on x86 for device dax. The bulk of the code was written by Matthew Wilcox a while back supporting transparent 1G hugepage for fs DAX. I have forward ported the relevant bits to 4.10-rc. The current submission has only the necessary code to support device DAX. Comments from Dan Williams: So the motivation and intended user of this functionality mirrors the motivation and users of 1GB page support in hugetlbfs. Given expected capacities of persistent memory devices an in-memory database may want to reduce TLB pressure beyond what they can already achieve with 2MB mappings of a device-dax file. We have customer feedback to that effect as Willy mentioned in his previous version of these patches [1]. [1]: https://lkml.org/lkml/2016/1/31/52 Comments from Nilesh @ Oracle: There are applications which have a process model; and if you assume 10,000 processes attempting to mmap all the 6TB memory available on a server; we are looking at the following: processes : 10,000 memory : 6TB pte @ 4k page size: 8 bytes / 4K of memory * #processes = 6TB / 4k * 8 * 10000 = 1.5GB * 80000 = 120,000GB pmd @ 2M page size: 120,000 / 512 = ~240GB pud @ 1G page size: 240GB / 512 = ~480MB As you can see with 2M pages, this system will use up an exorbitant amount of DRAM to hold the page tables; but the 1G pages finally bring it down to a reasonable level. Memory sizes will keep increasing; so this number will keep increasing. An argument can be made to convert the applications from process model to thread model, but in the real world that may not always be practical. Hopefully this helps explain the use case where this is valuable. 
This patch (of 3): In preparation for adding the ability to handle PUD pages, convert vm_operations_struct.pmd_fault to vm_operations_struct.huge_fault. The vm_fault structure is extended with a pointer to the PUD entry alongside the existing PMD pointer, and three flag bits (FAULT_FLAG_SIZE_MASK) are reserved to indicate the page table level at which the fault is being handled. [ross.zwisler@linux.intel.com: remove unused function ext4_dax_huge_fault()] Link: http://lkml.kernel.org/r/1485813172-7284-1-git-send-email-ross.zwisler@linux.intel.com [dave.jiang@intel.com: clear PMD or PUD size flags when in fall through path] Link: http://lkml.kernel.org/r/148589842696.5820.16078080610311444794.stgit@djiang5-desk3.ch.intel.com Link: http://lkml.kernel.org/r/148545058784.17912.6353162518188733642.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Matthew Wilcox Signed-off-by: Dave Jiang Signed-off-by: Ross Zwisler Cc: Dave Hansen Cc: Vlastimil Babka Cc: Jan Kara Cc: Dan Williams Cc: Kirill A. Shutemov Cc: Nilesh Choudhury Cc: Ingo Molnar Cc: "H. 
Peter Anvin" Cc: Thomas Gleixner Cc: Dave Jiang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/dax/dax.c | 34 +++++++++++++--------------------- fs/dax.c | 45 ++++++++++++++++++++++++++++++++------------- fs/ext2/file.c | 2 +- fs/ext4/file.c | 23 +---------------------- fs/xfs/xfs_file.c | 10 +++++----- fs/xfs/xfs_trace.h | 2 +- include/linux/dax.h | 6 ------ include/linux/mm.h | 10 +++++++++- mm/memory.c | 18 ++++++++++++------ 9 files changed, 74 insertions(+), 76 deletions(-) diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c index 0261f332bf3e..922ec461dcaa 100644 --- a/drivers/dax/dax.c +++ b/drivers/dax/dax.c @@ -419,7 +419,7 @@ static phys_addr_t pgoff_to_phys(struct dax_dev *dax_dev, pgoff_t pgoff, return -1; } -static int __dax_dev_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) +static int __dax_dev_pte_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) { struct device *dev = &dax_dev->dev; struct dax_region *dax_region; @@ -455,23 +455,6 @@ static int __dax_dev_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) return VM_FAULT_NOPAGE; } -static int dax_dev_fault(struct vm_fault *vmf) -{ - struct vm_area_struct *vma = vmf->vma; - int rc; - struct file *filp = vma->vm_file; - struct dax_dev *dax_dev = filp->private_data; - - dev_dbg(&dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__, - current->comm, (vmf->flags & FAULT_FLAG_WRITE) - ? 
"write" : "read", vma->vm_start, vma->vm_end); - rcu_read_lock(); - rc = __dax_dev_fault(dax_dev, vmf); - rcu_read_unlock(); - - return rc; -} - static int __dax_dev_pmd_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) { unsigned long pmd_addr = vmf->address & PMD_MASK; @@ -510,7 +493,7 @@ static int __dax_dev_pmd_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) vmf->flags & FAULT_FLAG_WRITE); } -static int dax_dev_pmd_fault(struct vm_fault *vmf) +static int dax_dev_fault(struct vm_fault *vmf) { int rc; struct file *filp = vmf->vma->vm_file; @@ -522,7 +505,16 @@ static int dax_dev_pmd_fault(struct vm_fault *vmf) vmf->vma->vm_start, vmf->vma->vm_end); rcu_read_lock(); - rc = __dax_dev_pmd_fault(dax_dev, vmf); + switch (vmf->flags & FAULT_FLAG_SIZE_MASK) { + case FAULT_FLAG_SIZE_PTE: + rc = __dax_dev_pte_fault(dax_dev, vmf); + break; + case FAULT_FLAG_SIZE_PMD: + rc = __dax_dev_pmd_fault(dax_dev, vmf); + break; + default: + rc = VM_FAULT_FALLBACK; + } rcu_read_unlock(); return rc; @@ -530,7 +522,7 @@ static int dax_dev_pmd_fault(struct vm_fault *vmf) static const struct vm_operations_struct dax_dev_vm_ops = { .fault = dax_dev_fault, - .pmd_fault = dax_dev_pmd_fault, + .huge_fault = dax_dev_fault, }; static int dax_mmap(struct file *filp, struct vm_area_struct *vma) diff --git a/fs/dax.c b/fs/dax.c index f955c0df33bb..c3c29fbf64be 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1118,16 +1118,8 @@ static int dax_fault_return(int error) return VM_FAULT_SIGBUS; } -/** - * dax_iomap_fault - handle a page fault on a DAX file - * @vmf: The description of the fault - * @ops: iomap ops passed from the file system - * - * When a page fault occurs, filesystems may call this helper in their fault - * or mkwrite handler for DAX files. Assumes the caller has done all the - * necessary locking for the page fault to proceed successfully. 
- */ -int dax_iomap_fault(struct vm_fault *vmf, const struct iomap_ops *ops) +static int dax_iomap_pte_fault(struct vm_fault *vmf, + const struct iomap_ops *ops) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; struct inode *inode = mapping->host; @@ -1244,7 +1236,6 @@ int dax_iomap_fault(struct vm_fault *vmf, const struct iomap_ops *ops) } return vmf_ret; } -EXPORT_SYMBOL_GPL(dax_iomap_fault); #ifdef CONFIG_FS_DAX_PMD /* @@ -1335,7 +1326,8 @@ fallback: return VM_FAULT_FALLBACK; } -int dax_iomap_pmd_fault(struct vm_fault *vmf, const struct iomap_ops *ops) +static int dax_iomap_pmd_fault(struct vm_fault *vmf, + const struct iomap_ops *ops) { struct vm_area_struct *vma = vmf->vma; struct address_space *mapping = vma->vm_file->f_mapping; @@ -1443,5 +1435,32 @@ out: trace_dax_pmd_fault_done(inode, vmf, max_pgoff, result); return result; } -EXPORT_SYMBOL_GPL(dax_iomap_pmd_fault); +#else +static int dax_iomap_pmd_fault(struct vm_fault *vmf, const struct iomap_ops *ops) +{ + return VM_FAULT_FALLBACK; +} #endif /* CONFIG_FS_DAX_PMD */ + +/** + * dax_iomap_fault - handle a page fault on a DAX file + * @vmf: The description of the fault + * @ops: iomap ops passed from the file system + * + * When a page fault occurs, filesystems may call this helper in + * their fault handler for DAX files. dax_iomap_fault() assumes the caller + * has done all the necessary locking for page fault to proceed + * successfully. 
+ */ +int dax_iomap_fault(struct vm_fault *vmf, const struct iomap_ops *ops) +{ + switch (vmf->flags & FAULT_FLAG_SIZE_MASK) { + case FAULT_FLAG_SIZE_PTE: + return dax_iomap_pte_fault(vmf, ops); + case FAULT_FLAG_SIZE_PMD: + return dax_iomap_pmd_fault(vmf, ops); + default: + return VM_FAULT_FALLBACK; + } +} +EXPORT_SYMBOL_GPL(dax_iomap_fault); diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 0bf0d971205a..68738832beda 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -133,7 +133,7 @@ static int ext2_dax_pfn_mkwrite(struct vm_fault *vmf) static const struct vm_operations_struct ext2_dax_vm_ops = { .fault = ext2_dax_fault, /* - * .pmd_fault is not supported for DAX because allocation in ext2 + * .huge_fault is not supported for DAX because allocation in ext2 * cannot be reliably aligned to huge page sizes and so pmd faults * will always fail and fail back to regular faults. */ diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 21e1f17fe36d..502d2d07d191 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -273,27 +273,6 @@ static int ext4_dax_fault(struct vm_fault *vmf) return result; } -static int -ext4_dax_pmd_fault(struct vm_fault *vmf) -{ - int result; - struct inode *inode = file_inode(vmf->vma->vm_file); - struct super_block *sb = inode->i_sb; - bool write = vmf->flags & FAULT_FLAG_WRITE; - - if (write) { - sb_start_pagefault(sb); - file_update_time(vmf->vma->vm_file); - } - down_read(&EXT4_I(inode)->i_mmap_sem); - result = dax_iomap_pmd_fault(vmf, &ext4_iomap_ops); - up_read(&EXT4_I(inode)->i_mmap_sem); - if (write) - sb_end_pagefault(sb); - - return result; -} - /* * Handle write fault for VM_MIXEDMAP mappings. Similarly to ext4_dax_fault() * handler we check for races agaist truncate. 
Note that since we cycle through @@ -326,7 +305,7 @@ static int ext4_dax_pfn_mkwrite(struct vm_fault *vmf) static const struct vm_operations_struct ext4_dax_vm_ops = { .fault = ext4_dax_fault, - .pmd_fault = ext4_dax_pmd_fault, + .huge_fault = ext4_dax_fault, .page_mkwrite = ext4_dax_fault, .pfn_mkwrite = ext4_dax_pfn_mkwrite, }; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 9cc10136ba0b..990e03819370 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1429,12 +1429,12 @@ xfs_filemap_fault( /* * Similar to xfs_filemap_fault(), the DAX fault path can call into here on * both read and write faults. Hence we need to handle both cases. There is no - * ->pmd_mkwrite callout for huge pages, so we have a single function here to + * ->huge_mkwrite callout for huge pages, so we have a single function here to * handle both cases here. @flags carries the information on the type of fault * occuring. */ STATIC int -xfs_filemap_pmd_fault( +xfs_filemap_huge_fault( struct vm_fault *vmf) { struct inode *inode = file_inode(vmf->vma->vm_file); @@ -1444,7 +1444,7 @@ xfs_filemap_pmd_fault( if (!IS_DAX(inode)) return VM_FAULT_FALLBACK; - trace_xfs_filemap_pmd_fault(ip); + trace_xfs_filemap_huge_fault(ip); if (vmf->flags & FAULT_FLAG_WRITE) { sb_start_pagefault(inode->i_sb); @@ -1452,7 +1452,7 @@ xfs_filemap_pmd_fault( } xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - ret = dax_iomap_pmd_fault(vmf, &xfs_iomap_ops); + ret = dax_iomap_fault(vmf, &xfs_iomap_ops); xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); if (vmf->flags & FAULT_FLAG_WRITE) @@ -1497,7 +1497,7 @@ xfs_filemap_pfn_mkwrite( static const struct vm_operations_struct xfs_file_vm_ops = { .fault = xfs_filemap_fault, - .pmd_fault = xfs_filemap_pmd_fault, + .huge_fault = xfs_filemap_huge_fault, .map_pages = filemap_map_pages, .page_mkwrite = xfs_filemap_page_mkwrite, .pfn_mkwrite = xfs_filemap_pfn_mkwrite, diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index fb7555e73a62..383ac227ce2c 100644 --- 
a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -687,7 +687,7 @@ DEFINE_INODE_EVENT(xfs_inode_clear_cowblocks_tag); DEFINE_INODE_EVENT(xfs_inode_free_cowblocks_invalid); DEFINE_INODE_EVENT(xfs_filemap_fault); -DEFINE_INODE_EVENT(xfs_filemap_pmd_fault); +DEFINE_INODE_EVENT(xfs_filemap_huge_fault); DEFINE_INODE_EVENT(xfs_filemap_page_mkwrite); DEFINE_INODE_EVENT(xfs_filemap_pfn_mkwrite); diff --git a/include/linux/dax.h b/include/linux/dax.h index eeb02421c848..cf9af225962b 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -70,17 +70,11 @@ static inline unsigned int dax_radix_order(void *entry) return PMD_SHIFT - PAGE_SHIFT; return 0; } -int dax_iomap_pmd_fault(struct vm_fault *vmf, const struct iomap_ops *ops); #else static inline unsigned int dax_radix_order(void *entry) { return 0; } -static inline int dax_iomap_pmd_fault(struct vm_fault *vmf, - const struct iomap_ops *ops) -{ - return VM_FAULT_FALLBACK; -} #endif int dax_pfn_mkwrite(struct vm_fault *vmf); diff --git a/include/linux/mm.h b/include/linux/mm.h index 3dd80ba6568a..035a688e5472 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -285,6 +285,11 @@ extern pgprot_t protection_map[16]; #define FAULT_FLAG_REMOTE 0x80 /* faulting for non current tsk/mm */ #define FAULT_FLAG_INSTRUCTION 0x100 /* The fault was during an instruction fetch */ +#define FAULT_FLAG_SIZE_MASK 0x7000 /* Support up to 8-level page tables */ +#define FAULT_FLAG_SIZE_PTE 0x0000 /* First level (eg 4k) */ +#define FAULT_FLAG_SIZE_PMD 0x1000 /* Second level (eg 2MB) */ +#define FAULT_FLAG_SIZE_PUD 0x2000 /* Third level (eg 1GB) */ + #define FAULT_FLAG_TRACE \ { FAULT_FLAG_WRITE, "WRITE" }, \ { FAULT_FLAG_MKWRITE, "MKWRITE" }, \ @@ -314,6 +319,9 @@ struct vm_fault { unsigned long address; /* Faulting virtual address */ pmd_t *pmd; /* Pointer to pmd entry matching * the 'address' */ + pud_t *pud; /* Pointer to pud entry matching + * the 'address' + */ pte_t orig_pte; /* Value of PTE at the time of fault */ struct page 
*cow_page; /* Page handler may use for COW fault */ @@ -351,7 +359,7 @@ struct vm_operations_struct { void (*close)(struct vm_area_struct * area); int (*mremap)(struct vm_area_struct * area); int (*fault)(struct vm_fault *vmf); - int (*pmd_fault)(struct vm_fault *vmf); + int (*huge_fault)(struct vm_fault *vmf); void (*map_pages)(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff); diff --git a/mm/memory.c b/mm/memory.c index cf97d88158cd..e721e8eba570 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3466,8 +3466,8 @@ static int create_huge_pmd(struct vm_fault *vmf) { if (vma_is_anonymous(vmf->vma)) return do_huge_pmd_anonymous_page(vmf); - if (vmf->vma->vm_ops->pmd_fault) - return vmf->vma->vm_ops->pmd_fault(vmf); + if (vmf->vma->vm_ops->huge_fault) + return vmf->vma->vm_ops->huge_fault(vmf); return VM_FAULT_FALLBACK; } @@ -3475,8 +3475,8 @@ static int wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd) { if (vma_is_anonymous(vmf->vma)) return do_huge_pmd_wp_page(vmf, orig_pmd); - if (vmf->vma->vm_ops->pmd_fault) - return vmf->vma->vm_ops->pmd_fault(vmf); + if (vmf->vma->vm_ops->huge_fault) + return vmf->vma->vm_ops->huge_fault(vmf); /* COW handled on pte level: split pmd */ VM_BUG_ON_VMA(vmf->vma->vm_flags & VM_SHARED, vmf->vma); @@ -3606,6 +3606,7 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, struct mm_struct *mm = vma->vm_mm; pgd_t *pgd; pud_t *pud; + int ret; pgd = pgd_offset(mm, address); pud = pud_alloc(mm, pgd, address); @@ -3615,15 +3616,18 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, if (!vmf.pmd) return VM_FAULT_OOM; if (pmd_none(*vmf.pmd) && transparent_hugepage_enabled(vma)) { - int ret = create_huge_pmd(&vmf); + vmf.flags |= FAULT_FLAG_SIZE_PMD; + ret = create_huge_pmd(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; + /* fall through path, remove PMD flag */ + vmf.flags &= ~FAULT_FLAG_SIZE_PMD; } else { pmd_t orig_pmd = *vmf.pmd; - int ret; barrier(); if 
(pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) { + vmf.flags |= FAULT_FLAG_SIZE_PMD; if (pmd_protnone(orig_pmd) && vma_is_accessible(vma)) return do_huge_pmd_numa_page(&vmf, orig_pmd); @@ -3632,6 +3636,8 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, ret = wp_huge_pmd(&vmf, orig_pmd); if (!(ret & VM_FAULT_FALLBACK)) return ret; + /* fall through path, remove PMD flag */ + vmf.flags &= ~FAULT_FLAG_SIZE_PMD; } else { huge_pmd_set_accessed(&vmf, orig_pmd); return 0; -- 2.20.1