#include <linux/bootmem.h>
#include <linux/sysfs.h>
#include <linux/slab.h>
+#include <linux/mmdebug.h>
#include <linux/sched/signal.h>
#include <linux/rmap.h>
#include <linux/string_helpers.h>
*/
if (hstate_is_gigantic(h))
adjust_managed_page_count(page, 1 << h->order);
+ cond_resched();
}
}
}
}
+static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr)
+{
+ if (addr & ~(huge_page_mask(hstate_vma(vma))))
+ return -EINVAL;
+ return 0;
+}
+
/*
* We cannot handle pagefaults against hugetlb pages at all. They cause
* handle_mm_fault() to try to instantiate regular-sized pages in the
.fault = hugetlb_vm_op_fault,
.open = hugetlb_vm_op_open,
.close = hugetlb_vm_op_close,
+ .split = hugetlb_vm_op_split,
};
static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
struct vm_area_struct *vma)
{
- pte_t *src_pte, *dst_pte, entry;
+ pte_t *src_pte, *dst_pte, entry, dst_entry;
struct page *ptepage;
unsigned long addr;
int cow;
break;
}
- /* If the pagetables are shared don't copy or take references */
- if (dst_pte == src_pte)
+ /*
+ * If the pagetables are shared don't copy or take references.
+ * dst_pte == src_pte is the common case of src/dest sharing.
+ *
+ * However, src could have 'unshared' and dst shares with
+ * another vma. If dst_pte !none, this implies sharing.
+ * Check here before taking page table lock, and once again
+ * after taking the lock below.
+ */
+ dst_entry = huge_ptep_get(dst_pte);
+ if ((dst_pte == src_pte) || !huge_pte_none(dst_entry))
continue;
dst_ptl = huge_pte_lock(h, dst, dst_pte);
src_ptl = huge_pte_lockptr(h, src, src_pte);
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
entry = huge_ptep_get(src_pte);
- if (huge_pte_none(entry)) { /* skip none entry */
+ dst_entry = huge_ptep_get(dst_pte);
+ if (huge_pte_none(entry) || !huge_pte_none(dst_entry)) {
+ /*
+ * Skip if src entry none. Also, skip in the
+ * unlikely case dst entry !none as this implies
+ * sharing with another vma.
+ */
;
} else if (unlikely(is_hugetlb_entry_migration(entry) ||
is_hugetlb_entry_hwpoisoned(entry))) {
copy_user_huge_page(new_page, old_page, address, vma,
pages_per_huge_page(h));
__SetPageUptodate(new_page);
- set_page_huge_active(new_page);
mmun_start = address & huge_page_mask(h);
mmun_end = mmun_start + huge_page_size(h);
make_huge_pte(vma, new_page, 1));
page_remove_rmap(old_page, true);
hugepage_add_new_anon_rmap(new_page, vma, address);
+ set_page_huge_active(new_page);
/* Make the old page be freed below */
new_page = old_page;
}
return err;
ClearPagePrivate(page);
+ /*
+ * set page dirty so that it will not be removed from cache/file
+ * by non-hugetlbfs specific code paths.
+ */
+ set_page_dirty(page);
+
spin_lock(&inode->i_lock);
inode->i_blocks += blocks_per_huge_page(h);
spin_unlock(&inode->i_lock);
struct page *page;
pte_t new_pte;
spinlock_t *ptl;
+ bool new_page = false;
/*
* Currently, we are forced to kill the process in the event the
}
clear_huge_page(page, address, pages_per_huge_page(h));
__SetPageUptodate(page);
- set_page_huge_active(page);
+ new_page = true;
if (vma->vm_flags & VM_MAYSHARE) {
int err = huge_add_to_page_cache(page, mapping, idx);
}
spin_unlock(ptl);
+
+ /*
+ * Only make newly allocated pages active. Existing pages found
+ * in the pagecache could be !page_huge_active() if they have been
+ * isolated for migration.
+ */
+ if (new_page)
+ set_page_huge_active(page);
+
unlock_page(page);
out:
return ret;
unsigned long src_addr,
struct page **pagep)
{
+ struct address_space *mapping;
+ pgoff_t idx;
+ unsigned long size;
int vm_shared = dst_vma->vm_flags & VM_SHARED;
struct hstate *h = hstate_vma(dst_vma);
pte_t _dst_pte;
/* fallback to copy_from_user outside mmap_sem */
if (unlikely(ret)) {
- ret = -EFAULT;
+ ret = -ENOENT;
*pagep = page;
/* don't free the page */
goto out;
* the set_pte_at() write.
*/
__SetPageUptodate(page);
- set_page_huge_active(page);
+
+ mapping = dst_vma->vm_file->f_mapping;
+ idx = vma_hugecache_offset(h, dst_vma, dst_addr);
/*
* If shared, add to page cache
*/
if (vm_shared) {
- struct address_space *mapping = dst_vma->vm_file->f_mapping;
- pgoff_t idx = vma_hugecache_offset(h, dst_vma, dst_addr);
+ size = i_size_read(mapping->host) >> huge_page_shift(h);
+ ret = -EFAULT;
+ if (idx >= size)
+ goto out_release_nounlock;
+ /*
+ * Serialization between remove_inode_hugepages() and
+ * huge_add_to_page_cache() below happens through the
+ * hugetlb_fault_mutex_table that here must be hold by
+ * the caller.
+ */
ret = huge_add_to_page_cache(page, mapping, idx);
if (ret)
goto out_release_nounlock;
ptl = huge_pte_lockptr(h, dst_mm, dst_pte);
spin_lock(ptl);
+ /*
+ * Recheck the i_size after holding PT lock to make sure not
+ * to leave any page mapped (as page_mapped()) beyond the end
+ * of the i_size (remove_inode_hugepages() is strict about
+ * enforcing that). If we bail out here, we'll also leave a
+ * page in the radix tree in the vm_shared case beyond the end
+ * of the i_size, but remove_inode_hugepages() will take care
+ * of it as soon as we drop the hugetlb_fault_mutex_table.
+ */
+ size = i_size_read(mapping->host) >> huge_page_shift(h);
+ ret = -EFAULT;
+ if (idx >= size)
+ goto out_release_unlock;
+
ret = -EEXIST;
if (!huge_pte_none(huge_ptep_get(dst_pte)))
goto out_release_unlock;
update_mmu_cache(dst_vma, dst_addr, dst_pte);
spin_unlock(ptl);
+ set_page_huge_active(page);
if (vm_shared)
unlock_page(page);
ret = 0;
struct resv_map *resv_map;
long gbl_reserve;
+ /* This should never happen */
+ if (from > to) {
+ VM_WARN(1, "%s called with a negative range\n", __func__);
+ return -EINVAL;
+ }
+
/*
* Only apply hugepage reservation if asked. At fault time, an
* attempt will be made for VM_NORESERVE to allocate a page
/*
* check on proper vm_flags and page table alignment
*/
- if (vma->vm_flags & VM_MAYSHARE &&
- vma->vm_start <= base && end <= vma->vm_end)
+ if (vma->vm_flags & VM_MAYSHARE && range_in_vma(vma, base, end))
return true;
return false;
}
+/*
+ * Determine if start,end range within vma could be mapped by shared pmd.
+ * If yes, adjust start and end to cover range associated with possible
+ * shared pmd mappings.
+ */
+void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
+ unsigned long *start, unsigned long *end)
+{
+ unsigned long check_addr = *start;
+
+ if (!(vma->vm_flags & VM_MAYSHARE))
+ return;
+
+ for (check_addr = *start; check_addr < *end; check_addr += PUD_SIZE) {
+ unsigned long a_start = check_addr & PUD_MASK;
+ unsigned long a_end = a_start + PUD_SIZE;
+
+ /*
+ * If sharing is possible, adjust start/end if necessary.
+ */
+ if (range_in_vma(vma, a_start, a_end)) {
+ if (a_start < *start)
+ *start = a_start;
+ if (a_end > *end)
+ *end = a_end;
+ }
+ }
+}
+
/*
* Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
* and returns the corresponding pte. While this is not necessary for the
{
return 0;
}
+
+void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
+ unsigned long *start, unsigned long *end)
+{
+}
#define want_pmd_share() (0)
#endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
pte_t *pte = NULL;
pgd = pgd_offset(mm, addr);
- p4d = p4d_offset(pgd, addr);
+ p4d = p4d_alloc(mm, pgd, addr);
+ if (!p4d)
+ return NULL;
pud = pud_alloc(mm, p4d, addr);
if (pud) {
if (sz == PUD_SIZE) {