From 1998cc048901109a29924380b8e91bc049b32951 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 22 Feb 2013 16:32:31 -0800 Subject: [PATCH] mm: make madvise(MADV_WILLNEED) support swap file prefetch Make madvise(MADV_WILLNEED) support swap file prefetch. If memory is swapout, this syscall can do swapin prefetch. It has no impact if the memory isn't swapout. [akpm@linux-foundation.org: fix CONFIG_SWAP=n build] [sasha.levin@oracle.com: fix BUG on madvise early failure] Signed-off-by: Shaohua Li Cc: Hugh Dickins Cc: Rik van Riel Signed-off-by: Sasha Levin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/madvise.c | 105 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 101 insertions(+), 4 deletions(-) diff --git a/mm/madvise.c b/mm/madvise.c index 03dfa5c7adb3..c58c94b56c3d 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -16,6 +16,9 @@ #include #include #include +#include +#include +#include /* * Any behaviour which results in changes to the vma->vm_flags needs to @@ -131,6 +134,84 @@ out: return error; } +#ifdef CONFIG_SWAP +static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start, + unsigned long end, struct mm_walk *walk) +{ + pte_t *orig_pte; + struct vm_area_struct *vma = walk->private; + unsigned long index; + + if (pmd_none_or_trans_huge_or_clear_bad(pmd)) + return 0; + + for (index = start; index != end; index += PAGE_SIZE) { + pte_t pte; + swp_entry_t entry; + struct page *page; + spinlock_t *ptl; + + orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl); + pte = *(orig_pte + ((index - start) / PAGE_SIZE)); + pte_unmap_unlock(orig_pte, ptl); + + if (pte_present(pte) || pte_none(pte) || pte_file(pte)) + continue; + entry = pte_to_swp_entry(pte); + if (unlikely(non_swap_entry(entry))) + continue; + + page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE, + vma, index); + if (page) + page_cache_release(page); + } + + return 0; +} + +static void force_swapin_readahead(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + struct mm_walk walk = { + .mm = vma->vm_mm, + .pmd_entry = swapin_walk_pmd_entry, + .private = vma, + }; + + walk_page_range(start, end, &walk); + + lru_add_drain(); /* Push any new pages onto the LRU now */ +} + +static void force_shm_swapin_readahead(struct vm_area_struct *vma, + unsigned long start, unsigned long end, + struct address_space *mapping) +{ + pgoff_t index; + struct page *page; + swp_entry_t swap; + + for (; start < end; start += PAGE_SIZE) { + index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + + page = find_get_page(mapping, index); + if (!radix_tree_exceptional_entry(page)) { + if (page) + page_cache_release(page); + continue; + } + swap = radix_to_swp_entry(page); + page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE, + NULL, 0); + if (page) + page_cache_release(page); + } + + lru_add_drain(); /* Push any new pages onto the LRU now */ +} +#endif /* CONFIG_SWAP */ + /* * Schedule all required I/O operations. Do not wait for completion. */ @@ -140,6 +221,18 @@ static long madvise_willneed(struct vm_area_struct * vma, { struct file *file = vma->vm_file; +#ifdef CONFIG_SWAP + if (!file || mapping_cap_swap_backed(file->f_mapping)) { + *prev = vma; + if (!file) + force_swapin_readahead(vma, start, end); + else + force_shm_swapin_readahead(vma, start, end, + file->f_mapping); + return 0; + } +#endif + if (!file) return -EBADF; @@ -371,6 +464,7 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) int error = -EINVAL; int write; size_t len; + struct blk_plug plug; #ifdef CONFIG_MEMORY_FAILURE if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE) @@ -410,18 +504,19 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) if (vma && start > vma->vm_start) prev = vma; + blk_start_plug(&plug); for (;;) { /* Still start < end. */ error = -ENOMEM; if (!vma) - goto out; + goto out_plug; /* Here start < (end|vma->vm_end). */ if (start < vma->vm_start) { unmapped_error = -ENOMEM; start = vma->vm_start; if (start >= end) - goto out; + goto out_plug; } /* Here vma->vm_start <= start < (end|vma->vm_end) */ @@ -432,18 +527,20 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */ error = madvise_vma(vma, &prev, start, tmp, behavior); if (error) - goto out; + goto out_plug; start = tmp; if (prev && start < prev->vm_end) start = prev->vm_end; error = unmapped_error; if (start >= end) - goto out; + goto out_plug; if (prev) vma = prev->vm_next; else /* madvise_remove dropped mmap_sem */ vma = find_vma(current->mm, start); } +out_plug: + blk_finish_plug(&plug); out: if (write) up_write(¤t->mm->mmap_sem); -- 2.20.1