mm: munlock: batch non-THP page isolation and munlock+putback using pagevec
author Vlastimil Babka <vbabka@suse.cz>
Wed, 11 Sep 2013 21:22:29 +0000 (14:22 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Wed, 11 Sep 2013 22:58:00 +0000 (15:58 -0700)
Currently, munlock_vma_pages_range() calls munlock_vma_page() on each page
in a loop, which results in repeated taking and releasing of the lru_lock
spinlock for isolating pages one by one.  This patch batches the munlock
operations using an on-stack pagevec, so that isolation is done under a
single lru_lock.  For THP pages, the old behavior is preserved as they
might be split while putting them into the pagevec.  After this patch, a
9% speedup was measured for munlocking a 56GB large memory area with THP
disabled.

A new function __munlock_pagevec() is introduced that takes a pagevec and:
1) It clears PageMlocked and isolates all pages under lru_lock.  Zone page
stats can also be updated using the variant which assumes disabled
interrupts.  2) It finishes the munlock and lru putback on all pages under
their lock_page.  Note that previously, lock_page also covered the
PageMlocked clearing and page isolation, but it is not needed for those
operations.

Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Jörn Engel <joern@logfs.org>
Acked-by: Mel Gorman <mgorman@suse.de>
Cc: Michel Lespinasse <walken@google.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
mm/mlock.c

index b85f1e82761043e7f3e35948fca0fdd2f206ff5e..b3b4a78b7802282e94daf049f437bba5993a736c 100644 (file)
@@ -11,6 +11,7 @@
 #include <linux/swap.h>
 #include <linux/swapops.h>
 #include <linux/pagemap.h>
+#include <linux/pagevec.h>
 #include <linux/mempolicy.h>
 #include <linux/syscalls.h>
 #include <linux/sched.h>
@@ -18,6 +19,8 @@
 #include <linux/rmap.h>
 #include <linux/mmzone.h>
 #include <linux/hugetlb.h>
+#include <linux/memcontrol.h>
+#include <linux/mm_inline.h>
 
 #include "internal.h"
 
@@ -87,6 +90,47 @@ void mlock_vma_page(struct page *page)
        }
 }
 
+/*
+ * Finish munlock after successful page isolation
+ *
+ * Page must be locked. This is a wrapper for try_to_munlock()
+ * and putback_lru_page() with munlock accounting.
+ */
+static void __munlock_isolated_page(struct page *page)
+{
+       int ret = SWAP_AGAIN;
+
+       /*
+        * Optimization: if the page was mapped just once, that's our mapping
+        * and we don't need to check all the other vmas.
+        */
+       if (page_mapcount(page) > 1)
+               ret = try_to_munlock(page);
+
+       /* Did try_to_munlock() succeed or punt? */
+       if (ret != SWAP_MLOCK)
+               count_vm_event(UNEVICTABLE_PGMUNLOCKED);
+
+       putback_lru_page(page);
+}
+
+/*
+ * Accounting for page isolation failure during munlock
+ *
+ * Performs accounting when page isolation fails in munlock. There is nothing
+ * else to do because it means some other task has already removed the page
+ * from the LRU. putback_lru_page() will take care of removing the page from
+ * the unevictable list, if necessary. vmscan [page_referenced()] will move
+ * the page back to the unevictable list if some other vma has it mlocked.
+ */
+static void __munlock_isolation_failed(struct page *page)
+{
+       if (PageUnevictable(page))
+               count_vm_event(UNEVICTABLE_PGSTRANDED);
+       else
+               count_vm_event(UNEVICTABLE_PGMUNLOCKED);
+}
+
 /**
  * munlock_vma_page - munlock a vma page
  * @page - page to be unlocked
@@ -112,37 +156,10 @@ unsigned int munlock_vma_page(struct page *page)
                unsigned int nr_pages = hpage_nr_pages(page);
                mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
                page_mask = nr_pages - 1;
-               if (!isolate_lru_page(page)) {
-                       int ret = SWAP_AGAIN;
-
-                       /*
-                        * Optimization: if the page was mapped just once,
-                        * that's our mapping and we don't need to check all the
-                        * other vmas.
-                        */
-                       if (page_mapcount(page) > 1)
-                               ret = try_to_munlock(page);
-                       /*
-                        * did try_to_unlock() succeed or punt?
-                        */
-                       if (ret != SWAP_MLOCK)
-                               count_vm_event(UNEVICTABLE_PGMUNLOCKED);
-
-                       putback_lru_page(page);
-               } else {
-                       /*
-                        * Some other task has removed the page from the LRU.
-                        * putback_lru_page() will take care of removing the
-                        * page from the unevictable list, if necessary.
-                        * vmscan [page_referenced()] will move the page back
-                        * to the unevictable list if some other vma has it
-                        * mlocked.
-                        */
-                       if (PageUnevictable(page))
-                               count_vm_event(UNEVICTABLE_PGSTRANDED);
-                       else
-                               count_vm_event(UNEVICTABLE_PGMUNLOCKED);
-               }
+               if (!isolate_lru_page(page))
+                       __munlock_isolated_page(page);
+               else
+                       __munlock_isolation_failed(page);
        }
 
        return page_mask;
@@ -209,6 +226,73 @@ static int __mlock_posix_error_return(long retval)
        return retval;
 }
 
+/*
+ * Munlock a batch of pages from the same zone
+ *
+ * The work is split into two main phases. First phase clears the Mlocked flag
+ * and attempts to isolate the pages, all under a single zone lru lock.
+ * The second phase finishes the munlock only for pages where isolation
+ * succeeded.
+ *
+ * Note that pvec is modified during the process. Before returning,
+ * pagevec_reinit() is called on it.
+ */
+static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
+{
+       int i;
+       int nr = pagevec_count(pvec);
+
+       /* Phase 1: page isolation */
+       spin_lock_irq(&zone->lru_lock);
+       for (i = 0; i < nr; i++) {
+               struct page *page = pvec->pages[i];
+
+               if (TestClearPageMlocked(page)) {
+                       struct lruvec *lruvec;
+                       int lru;
+
+                       /* interrupts are disabled, so use the __ variant */
+                       __mod_zone_page_state(zone, NR_MLOCK, -1);
+
+                       if (PageLRU(page)) {
+                               lruvec = mem_cgroup_page_lruvec(page, zone);
+                               lru = page_lru(page);
+
+                               get_page(page);
+                               ClearPageLRU(page);
+                               del_page_from_lru_list(page, lruvec, lru);
+                       } else {
+                               __munlock_isolation_failed(page);
+                               goto skip_munlock;
+                       }
+
+               } else {
+skip_munlock:
+                       /*
+                        * We won't be munlocking this page in the next phase
+                        * but we still need to release the follow_page_mask()
+                        * pin.
+                        */
+                       pvec->pages[i] = NULL;
+                       put_page(page);
+               }
+       }
+       spin_unlock_irq(&zone->lru_lock);
+
+       /* Phase 2: page munlock and putback */
+       for (i = 0; i < nr; i++) {
+               struct page *page = pvec->pages[i];
+
+               if (page) {
+                       lock_page(page);
+                       __munlock_isolated_page(page);
+                       unlock_page(page);
+                       put_page(page); /* pin from follow_page_mask() */
+               }
+       }
+       pagevec_reinit(pvec);
+}
+
 /*
  * munlock_vma_pages_range() - munlock all pages in the vma range.'
  * @vma - vma containing range to be munlock()ed.
@@ -230,11 +314,16 @@ static int __mlock_posix_error_return(long retval)
 void munlock_vma_pages_range(struct vm_area_struct *vma,
                             unsigned long start, unsigned long end)
 {
+       struct pagevec pvec;
+       struct zone *zone = NULL;
+
+       pagevec_init(&pvec, 0);
        vma->vm_flags &= ~VM_LOCKED;
 
        while (start < end) {
                struct page *page;
                unsigned int page_mask, page_increm;
+               struct zone *pagezone;
 
                /*
                 * Although FOLL_DUMP is intended for get_dump_page(),
@@ -246,20 +335,47 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
                page = follow_page_mask(vma, start, FOLL_GET | FOLL_DUMP,
                                        &page_mask);
                if (page && !IS_ERR(page)) {
-                       lock_page(page);
-                       /*
-                        * Any THP page found by follow_page_mask() may have
-                        * gotten split before reaching munlock_vma_page(),
-                        * so we need to recompute the page_mask here.
-                        */
-                       page_mask = munlock_vma_page(page);
-                       unlock_page(page);
-                       put_page(page);
+                       pagezone = page_zone(page);
+                       /* The whole pagevec must be in the same zone */
+                       if (pagezone != zone) {
+                               if (pagevec_count(&pvec))
+                                       __munlock_pagevec(&pvec, zone);
+                               zone = pagezone;
+                       }
+                       if (PageTransHuge(page)) {
+                               /*
+                                * THP pages are not handled by pagevec due
+                                * to their possible split (see below).
+                                */
+                               if (pagevec_count(&pvec))
+                                       __munlock_pagevec(&pvec, zone);
+                               lock_page(page);
+                               /*
+                                * Any THP page found by follow_page_mask() may
+                                * have gotten split before reaching
+                                * munlock_vma_page(), so we need to recompute
+                                * the page_mask here.
+                                */
+                               page_mask = munlock_vma_page(page);
+                               unlock_page(page);
+                               put_page(page); /* follow_page_mask() */
+                       } else {
+                               /*
+                                * Non-huge pages are handled in batches
+                                * via pagevec. The pin from
+                                * follow_page_mask() prevents them from
+                                * collapsing by THP.
+                                */
+                               if (pagevec_add(&pvec, page) == 0)
+                                       __munlock_pagevec(&pvec, zone);
+                       }
                }
                page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);
                start += page_increm * PAGE_SIZE;
                cond_resched();
        }
+       if (pagevec_count(&pvec))
+               __munlock_pagevec(&pvec, zone);
 }
 
 /*