static int dax_is_zero_entry(void *entry)
{
- return (unsigned long)entry & RADIX_DAX_HZP;
+ return (unsigned long)entry & RADIX_DAX_ZERO_PAGE;
}
static int dax_is_empty_entry(void *entry)
for (;;) {
entry = __radix_tree_lookup(&mapping->page_tree, index, NULL,
&slot);
- if (!entry || !radix_tree_exceptional_entry(entry) ||
+ if (!entry ||
+ WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)) ||
!slot_locked(mapping, slot)) {
if (slotp)
*slotp = slot;
}
static void put_locked_mapping_entry(struct address_space *mapping,
- pgoff_t index, void *entry)
+ pgoff_t index)
{
- if (!radix_tree_exceptional_entry(entry)) {
- unlock_page(entry);
- put_page(entry);
- } else {
- dax_unlock_mapping_entry(mapping, index);
- }
+ dax_unlock_mapping_entry(mapping, index);
}
/*
static void put_unlocked_mapping_entry(struct address_space *mapping,
pgoff_t index, void *entry)
{
- if (!radix_tree_exceptional_entry(entry))
+ if (!entry)
return;
/* We have to wake up next waiter for the radix tree entry lock */
}
/*
- * Find radix tree entry at given index. If it points to a page, return with
- * the page locked. If it points to the exceptional entry, return with the
- * radix tree entry locked. If the radix tree doesn't contain given index,
- * create empty exceptional entry for the index and return with it locked.
+ * Find radix tree entry at given index. If it points to an exceptional entry,
+ * return it with the radix tree entry locked. If the radix tree doesn't
+ * contain given index, create an empty exceptional entry for the index and
+ * return with it locked.
*
* When requesting an entry with size RADIX_DAX_PMD, grab_mapping_entry() will
* either return that locked entry or will return an error. This error will
- * happen if there are any 4k entries (either zero pages or DAX entries)
- * within the 2MiB range that we are requesting.
+ * happen if there are any 4k entries within the 2MiB range that we are
+ * requesting.
*
* We always favor 4k entries over 2MiB entries. There isn't a flow where we
* evict 4k entries in order to 'upgrade' them to a 2MiB entry. A 2MiB
spin_lock_irq(&mapping->tree_lock);
entry = get_unlocked_mapping_entry(mapping, index, &slot);
+ if (WARN_ON_ONCE(entry && !radix_tree_exceptional_entry(entry))) {
+ entry = ERR_PTR(-EIO);
+ goto out_unlock;
+ }
+
if (entry) {
if (size_flag & RADIX_DAX_PMD) {
- if (!radix_tree_exceptional_entry(entry) ||
- dax_is_pte_entry(entry)) {
+ if (dax_is_pte_entry(entry)) {
put_unlocked_mapping_entry(mapping, index,
entry);
entry = ERR_PTR(-EEXIST);
goto out_unlock;
}
} else { /* trying to grab a PTE entry */
- if (radix_tree_exceptional_entry(entry) &&
- dax_is_pmd_entry(entry) &&
+ if (dax_is_pmd_entry(entry) &&
(dax_is_zero_entry(entry) ||
dax_is_empty_entry(entry))) {
pmd_downgrade = true;
mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM);
if (err) {
if (pmd_downgrade)
- put_locked_mapping_entry(mapping, index, entry);
+ put_locked_mapping_entry(mapping, index);
return ERR_PTR(err);
}
spin_lock_irq(&mapping->tree_lock);
spin_unlock_irq(&mapping->tree_lock);
return entry;
}
- /* Normal page in radix tree? */
- if (!radix_tree_exceptional_entry(entry)) {
- struct page *page = entry;
-
- get_page(page);
- spin_unlock_irq(&mapping->tree_lock);
- lock_page(page);
- /* Page got truncated? Retry... */
- if (unlikely(page->mapping != mapping)) {
- unlock_page(page);
- put_page(page);
- goto restart;
- }
- return page;
- }
entry = lock_slot(mapping, slot);
out_unlock:
spin_unlock_irq(&mapping->tree_lock);
spin_lock_irq(&mapping->tree_lock);
entry = get_unlocked_mapping_entry(mapping, index, NULL);
- if (!entry || !radix_tree_exceptional_entry(entry))
+ if (!entry || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)))
goto out;
if (!trunc &&
(radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) ||
unsigned long flags)
{
struct radix_tree_root *page_tree = &mapping->page_tree;
- int error = 0;
- bool hole_fill = false;
void *new_entry;
pgoff_t index = vmf->pgoff;
if (vmf->flags & FAULT_FLAG_WRITE)
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
- /* Replacing hole page with block mapping? */
- if (!radix_tree_exceptional_entry(entry)) {
- hole_fill = true;
- /*
- * Unmap the page now before we remove it from page cache below.
- * The page is locked so it cannot be faulted in again.
- */
- unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
- PAGE_SIZE, 0);
- error = radix_tree_preload(vmf->gfp_mask & ~__GFP_HIGHMEM);
- if (error)
- return ERR_PTR(error);
- } else if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_HZP)) {
- /* replacing huge zero page with PMD block mapping */
- unmap_mapping_range(mapping,
- (vmf->pgoff << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0);
+ if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_ZERO_PAGE)) {
+ /* we are replacing a zero page with a block mapping */
+ if (dax_is_pmd_entry(entry))
+ unmap_mapping_range(mapping,
+ (vmf->pgoff << PAGE_SHIFT) & PMD_MASK,
+ PMD_SIZE, 0);
+ else /* pte entry */
+ unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
+ PAGE_SIZE, 0);
}
spin_lock_irq(&mapping->tree_lock);
new_entry = dax_radix_locked_entry(sector, flags);
- if (hole_fill) {
- __delete_from_page_cache(entry, NULL);
- /* Drop pagecache reference */
- put_page(entry);
- error = __radix_tree_insert(page_tree, index,
- dax_radix_order(new_entry), new_entry);
- if (error) {
- new_entry = ERR_PTR(error);
- goto unlock;
- }
- mapping->nrexceptional++;
- } else if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
+ if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
/*
* Only swap our new entry into the radix tree if the current
* entry is a zero page or an empty entry. If a normal PTE or
WARN_ON_ONCE(ret != entry);
__radix_tree_replace(page_tree, node, slot,
new_entry, NULL, NULL);
+ entry = new_entry;
}
+
if (vmf->flags & FAULT_FLAG_WRITE)
radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
- unlock:
+
spin_unlock_irq(&mapping->tree_lock);
- if (hole_fill) {
- radix_tree_preload_end();
- /*
- * We don't need hole page anymore, it has been replaced with
- * locked radix tree entry now.
- */
- if (mapping->a_ops->freepage)
- mapping->a_ops->freepage(entry);
- unlock_page(entry);
- put_page(entry);
- }
- return new_entry;
+ return entry;
}
static inline unsigned long
spin_lock_irq(&mapping->tree_lock);
entry2 = get_unlocked_mapping_entry(mapping, index, &slot);
/* Entry got punched out / reallocated? */
- if (!entry2 || !radix_tree_exceptional_entry(entry2))
+ if (!entry2 || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry2)))
goto put_unlocked;
/*
* Entry got reallocated elsewhere? No need to writeback. We have to
trace_dax_writeback_one(mapping->host, index, size >> PAGE_SHIFT);
dax_unlock:
dax_read_unlock(id);
- put_locked_mapping_entry(mapping, index, entry);
+ put_locked_mapping_entry(mapping, index);
return ret;
put_unlocked:
static int dax_insert_mapping(struct address_space *mapping,
struct block_device *bdev, struct dax_device *dax_dev,
- sector_t sector, size_t size, void **entryp,
+ sector_t sector, size_t size, void *entry,
struct vm_area_struct *vma, struct vm_fault *vmf)
{
unsigned long vaddr = vmf->address;
- void *entry = *entryp;
void *ret, *kaddr;
pgoff_t pgoff;
int id, rc;
ret = dax_insert_mapping_entry(mapping, vmf, entry, sector, 0);
if (IS_ERR(ret))
return PTR_ERR(ret);
- *entryp = ret;
trace_dax_insert_mapping(mapping->host, vmf, ret);
- return vm_insert_mixed(vma, vaddr, pfn);
-}
-
-/**
- * dax_pfn_mkwrite - handle first write to DAX page
- * @vmf: The description of the fault
- */
-int dax_pfn_mkwrite(struct vm_fault *vmf)
-{
- struct file *file = vmf->vma->vm_file;
- struct address_space *mapping = file->f_mapping;
- struct inode *inode = mapping->host;
- void *entry, **slot;
- pgoff_t index = vmf->pgoff;
-
- spin_lock_irq(&mapping->tree_lock);
- entry = get_unlocked_mapping_entry(mapping, index, &slot);
- if (!entry || !radix_tree_exceptional_entry(entry)) {
- if (entry)
- put_unlocked_mapping_entry(mapping, index, entry);
- spin_unlock_irq(&mapping->tree_lock);
- trace_dax_pfn_mkwrite_no_entry(inode, vmf, VM_FAULT_NOPAGE);
- return VM_FAULT_NOPAGE;
- }
- radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY);
- entry = lock_slot(mapping, slot);
- spin_unlock_irq(&mapping->tree_lock);
- /*
- * If we race with somebody updating the PTE and finish_mkwrite_fault()
- * fails, we don't care. We need to return VM_FAULT_NOPAGE and retry
- * the fault in either case.
- */
- finish_mkwrite_fault(vmf);
- put_locked_mapping_entry(mapping, index, entry);
- trace_dax_pfn_mkwrite(inode, vmf, VM_FAULT_NOPAGE);
- return VM_FAULT_NOPAGE;
+ if (vmf->flags & FAULT_FLAG_WRITE)
+ return vm_insert_mixed_mkwrite(vma, vaddr, pfn);
+ else
+ return vm_insert_mixed(vma, vaddr, pfn);
}
-EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
/*
- * The user has performed a load from a hole in the file. Allocating
- * a new page in the file would cause excessive storage usage for
- * workloads with sparse files. We allocate a page cache page instead.
- * We'll kick it out of the page cache if it's ever written to,
- * otherwise it will simply fall out of the page cache under memory
- * pressure without ever having been dirtied.
+ * The user has performed a load from a hole in the file. Allocating a new
+ * page in the file would cause excessive storage usage for workloads with
+ * sparse files. Instead we insert a read-only mapping of the 4k zero page.
+ * If this page is ever written to we will re-fault and change the mapping to
+ * point to real DAX storage instead.
*/
-static int dax_load_hole(struct address_space *mapping, void **entry,
+static int dax_load_hole(struct address_space *mapping, void *entry,
struct vm_fault *vmf)
{
struct inode *inode = mapping->host;
- struct page *page;
- int ret;
-
- /* Hole page already exists? Return it... */
- if (!radix_tree_exceptional_entry(*entry)) {
- page = *entry;
- goto finish_fault;
- }
+ unsigned long vaddr = vmf->address;
+ int ret = VM_FAULT_NOPAGE;
+ struct page *zero_page;
+ void *entry2;
- /* This will replace locked radix tree entry with a hole page */
- page = find_or_create_page(mapping, vmf->pgoff,
- vmf->gfp_mask | __GFP_ZERO);
- if (!page) {
+ zero_page = ZERO_PAGE(0);
+ if (unlikely(!zero_page)) {
ret = VM_FAULT_OOM;
goto out;
}
-finish_fault:
- vmf->page = page;
- ret = finish_fault(vmf);
- vmf->page = NULL;
- *entry = page;
- if (!ret) {
- /* Grab reference for PTE that is now referencing the page */
- get_page(page);
- ret = VM_FAULT_NOPAGE;
+ entry2 = dax_insert_mapping_entry(mapping, vmf, entry, 0,
+ RADIX_DAX_ZERO_PAGE);
+ if (IS_ERR(entry2)) {
+ ret = VM_FAULT_SIGBUS;
+ goto out;
}
+
+ vm_insert_mixed(vmf->vma, vaddr, page_to_pfn_t(zero_page));
out:
trace_dax_load_hole(inode, vmf, ret);
return ret;
major = VM_FAULT_MAJOR;
}
error = dax_insert_mapping(mapping, iomap.bdev, iomap.dax_dev,
- sector, PAGE_SIZE, &entry, vmf->vma, vmf);
+ sector, PAGE_SIZE, entry, vmf->vma, vmf);
/* -EBUSY is fine, somebody else faulted on the same PTE */
if (error == -EBUSY)
error = 0;
case IOMAP_UNWRITTEN:
case IOMAP_HOLE:
if (!(vmf->flags & FAULT_FLAG_WRITE)) {
- vmf_ret = dax_load_hole(mapping, &entry, vmf);
+ vmf_ret = dax_load_hole(mapping, entry, vmf);
goto finish_iomap;
}
/*FALLTHRU*/
ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap);
}
unlock_entry:
- put_locked_mapping_entry(mapping, vmf->pgoff, entry);
+ put_locked_mapping_entry(mapping, vmf->pgoff);
out:
trace_dax_pte_fault_done(inode, vmf, vmf_ret);
return vmf_ret;
#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1)
static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap,
- loff_t pos, void **entryp)
+ loff_t pos, void *entry)
{
struct address_space *mapping = vmf->vma->vm_file->f_mapping;
const sector_t sector = dax_iomap_sector(iomap, pos);
goto unlock_fallback;
dax_read_unlock(id);
- ret = dax_insert_mapping_entry(mapping, vmf, *entryp, sector,
+ ret = dax_insert_mapping_entry(mapping, vmf, entry, sector,
RADIX_DAX_PMD);
if (IS_ERR(ret))
goto fallback;
- *entryp = ret;
trace_dax_pmd_insert_mapping(inode, vmf, length, pfn, ret);
return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd,
}
static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
- void **entryp)
+ void *entry)
{
struct address_space *mapping = vmf->vma->vm_file->f_mapping;
unsigned long pmd_addr = vmf->address & PMD_MASK;
if (unlikely(!zero_page))
goto fallback;
- ret = dax_insert_mapping_entry(mapping, vmf, *entryp, 0,
- RADIX_DAX_PMD | RADIX_DAX_HZP);
+ ret = dax_insert_mapping_entry(mapping, vmf, entry, 0,
+ RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE);
if (IS_ERR(ret))
goto fallback;
- *entryp = ret;
ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
if (!pmd_none(*(vmf->pmd))) {
goto fallback;
/*
- * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX
- * PMD or a HZP entry. If it can't (because a 4k page is already in
- * the tree, for instance), it will return -EEXIST and we just fall
- * back to 4k entries.
+ * grab_mapping_entry() will make sure we get a 2MiB empty entry, a
+ * 2MiB zero page entry or a DAX PMD. If it can't (because a 4k entry
+ * is already in the tree, for instance), it will return -EEXIST and
+ * we just fall back to 4k entries.
*/
entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
if (IS_ERR(entry))
switch (iomap.type) {
case IOMAP_MAPPED:
- result = dax_pmd_insert_mapping(vmf, &iomap, pos, &entry);
+ result = dax_pmd_insert_mapping(vmf, &iomap, pos, entry);
break;
case IOMAP_UNWRITTEN:
case IOMAP_HOLE:
if (WARN_ON_ONCE(write))
break;
- result = dax_pmd_load_hole(vmf, &iomap, &entry);
+ result = dax_pmd_load_hole(vmf, &iomap, entry);
break;
default:
WARN_ON_ONCE(1);
&iomap);
}
unlock_entry:
- put_locked_mapping_entry(mapping, pgoff, entry);
+ put_locked_mapping_entry(mapping, pgoff);
fallback:
if (result == VM_FAULT_FALLBACK) {
split_huge_pmd(vma, vmf->pmd, vmf->address);