mm/swap: allocate swap slots in batches
authorTim Chen <tim.c.chen@linux.intel.com>
Wed, 22 Feb 2017 23:45:33 +0000 (15:45 -0800)
committerLinus Torvalds <torvalds@linux-foundation.org>
Thu, 23 Feb 2017 00:41:30 +0000 (16:41 -0800)
Currently, the swap slots are allocated one page at a time, causing
contention to the swap_info lock protecting the swap partition on every
page being swapped.

This patch adds new functions get_swap_pages and scan_swap_map_slots to
request multiple swap slots at once.  This will reduce the lock
contention on the swap_info lock.  Also scan_swap_map_slots can operate
more efficiently as swap slots often occur in clusters close to each
other on a swap device and it is quicker to allocate them together.

Link: http://lkml.kernel.org/r/9fec2845544371f62c3763d43510045e33d286a6.1484082593.git.tim.c.chen@linux.intel.com
Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Cc: Aaron Lu <aaron.lu@intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Shaohua Li <shli@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
include/linux/swap.h
mm/swapfile.c

index 3116382067cd8b8bd2f754c62d2276d3c943a415..956eae8a8edf9769571b3f6723e273d3d82572e7 100644 (file)
@@ -27,6 +27,7 @@ struct bio;
 #define SWAP_FLAGS_VALID       (SWAP_FLAG_PRIO_MASK | SWAP_FLAG_PREFER | \
                                 SWAP_FLAG_DISCARD | SWAP_FLAG_DISCARD_ONCE | \
                                 SWAP_FLAG_DISCARD_PAGES)
+#define SWAP_BATCH 64
 
 static inline int current_is_kswapd(void)
 {
@@ -386,6 +387,7 @@ static inline long get_nr_swap_pages(void)
 extern void si_swapinfo(struct sysinfo *);
 extern swp_entry_t get_swap_page(void);
 extern swp_entry_t get_swap_page_of_type(int);
+extern int get_swap_pages(int n, swp_entry_t swp_entries[]);
 extern int add_swap_count_continuation(swp_entry_t, gfp_t);
 extern void swap_shmem_alloc(swp_entry_t);
 extern int swap_duplicate(swp_entry_t);
index 7e888de35c41a62ade94ef7401d0c07b47943a32..e73b5441055b3f24b89f2e36fbf0e6171b81c940 100644 (file)
@@ -496,7 +496,7 @@ scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si,
  * Try to get a swap entry from current cpu's swap entry pool (a cluster). This
  * might involve allocating a new cluster for current CPU too.
  */
-static void scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
+static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
        unsigned long *offset, unsigned long *scan_base)
 {
        struct percpu_cluster *cluster;
@@ -520,7 +520,7 @@ new_cluster:
                        *scan_base = *offset = si->cluster_next;
                        goto new_cluster;
                } else
-                       return;
+                       return false;
        }
 
        found_free = false;
@@ -552,16 +552,22 @@ new_cluster:
        cluster->next = tmp + 1;
        *offset = tmp;
        *scan_base = tmp;
+       return found_free;
 }
 
-static unsigned long scan_swap_map(struct swap_info_struct *si,
-                                  unsigned char usage)
+static int scan_swap_map_slots(struct swap_info_struct *si,
+                              unsigned char usage, int nr,
+                              swp_entry_t slots[])
 {
        struct swap_cluster_info *ci;
        unsigned long offset;
        unsigned long scan_base;
        unsigned long last_in_cluster = 0;
        int latency_ration = LATENCY_LIMIT;
+       int n_ret = 0;
+
+       if (nr > SWAP_BATCH)
+               nr = SWAP_BATCH;
 
        /*
         * We try to cluster swap pages by allocating them sequentially
@@ -579,8 +585,10 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
 
        /* SSD algorithm */
        if (si->cluster_info) {
-               scan_swap_map_try_ssd_cluster(si, &offset, &scan_base);
-               goto checks;
+               if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
+                       goto checks;
+               else
+                       goto scan;
        }
 
        if (unlikely(!si->cluster_nr--)) {
@@ -624,8 +632,14 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
 
 checks:
        if (si->cluster_info) {
-               while (scan_swap_map_ssd_cluster_conflict(si, offset))
-                       scan_swap_map_try_ssd_cluster(si, &offset, &scan_base);
+               while (scan_swap_map_ssd_cluster_conflict(si, offset)) {
+               /* take a break if we already got some slots */
+                       if (n_ret)
+                               goto done;
+                       if (!scan_swap_map_try_ssd_cluster(si, &offset,
+                                                       &scan_base))
+                               goto scan;
+               }
        }
        if (!(si->flags & SWP_WRITEOK))
                goto no_page;
@@ -650,7 +664,10 @@ checks:
 
        if (si->swap_map[offset]) {
                unlock_cluster(ci);
-               goto scan;
+               if (!n_ret)
+                       goto scan;
+               else
+                       goto done;
        }
 
        if (offset == si->lowest_bit)
@@ -669,9 +686,43 @@ checks:
        inc_cluster_info_page(si, si->cluster_info, offset);
        unlock_cluster(ci);
        si->cluster_next = offset + 1;
-       si->flags -= SWP_SCANNING;
+       slots[n_ret++] = swp_entry(si->type, offset);
+
+       /* got enough slots or reach max slots? */
+       if ((n_ret == nr) || (offset >= si->highest_bit))
+               goto done;
+
+       /* search for next available slot */
+
+       /* time to take a break? */
+       if (unlikely(--latency_ration < 0)) {
+               if (n_ret)
+                       goto done;
+               spin_unlock(&si->lock);
+               cond_resched();
+               spin_lock(&si->lock);
+               latency_ration = LATENCY_LIMIT;
+       }
 
-       return offset;
+       /* try to get more slots in cluster */
+       if (si->cluster_info) {
+               if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
+                       goto checks;
+               else
+                       goto done;
+       }
+       /* non-ssd case */
+       ++offset;
+
+       /* non-ssd case, still more slots in cluster? */
+       if (si->cluster_nr && !si->swap_map[offset]) {
+               --si->cluster_nr;
+               goto checks;
+       }
+
+done:
+       si->flags -= SWP_SCANNING;
+       return n_ret;
 
 scan:
        spin_unlock(&si->lock);
@@ -709,17 +760,41 @@ scan:
 
 no_page:
        si->flags -= SWP_SCANNING;
-       return 0;
+       return n_ret;
 }
 
-swp_entry_t get_swap_page(void)
+static unsigned long scan_swap_map(struct swap_info_struct *si,
+                                  unsigned char usage)
+{
+       swp_entry_t entry;
+       int n_ret;
+
+       n_ret = scan_swap_map_slots(si, usage, 1, &entry);
+
+       if (n_ret)
+               return swp_offset(entry);
+       else
+               return 0;
+
+}
+
+int get_swap_pages(int n_goal, swp_entry_t swp_entries[])
 {
        struct swap_info_struct *si, *next;
-       pgoff_t offset;
+       long avail_pgs;
+       int n_ret = 0;
 
-       if (atomic_long_read(&nr_swap_pages) <= 0)
+       avail_pgs = atomic_long_read(&nr_swap_pages);
+       if (avail_pgs <= 0)
                goto noswap;
-       atomic_long_dec(&nr_swap_pages);
+
+       if (n_goal > SWAP_BATCH)
+               n_goal = SWAP_BATCH;
+
+       if (n_goal > avail_pgs)
+               n_goal = avail_pgs;
+
+       atomic_long_sub(n_goal, &nr_swap_pages);
 
        spin_lock(&swap_avail_lock);
 
@@ -745,14 +820,14 @@ start_over:
                        spin_unlock(&si->lock);
                        goto nextsi;
                }
-
-               /* This is called for allocating swap entry for cache */
-               offset = scan_swap_map(si, SWAP_HAS_CACHE);
+               n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
+                                           n_goal, swp_entries);
                spin_unlock(&si->lock);
-               if (offset)
-                       return swp_entry(si->type, offset);
+               if (n_ret)
+                       goto check_out;
                pr_debug("scan_swap_map of si %d failed to find offset\n",
-                      si->type);
+                       si->type);
+
                spin_lock(&swap_avail_lock);
 nextsi:
                /*
@@ -763,7 +838,8 @@ nextsi:
                 * up between us dropping swap_avail_lock and taking si->lock.
                 * Since we dropped the swap_avail_lock, the swap_avail_head
                 * list may have been modified; so if next is still in the
-                * swap_avail_head list then try it, otherwise start over.
+                * swap_avail_head list then try it, otherwise start over
+                * if we have not gotten any slots.
                 */
                if (plist_node_empty(&next->avail_list))
                        goto start_over;
@@ -771,9 +847,19 @@ nextsi:
 
        spin_unlock(&swap_avail_lock);
 
-       atomic_long_inc(&nr_swap_pages);
+check_out:
+       if (n_ret < n_goal)
+               atomic_long_add((long) (n_goal-n_ret), &nr_swap_pages);
 noswap:
-       return (swp_entry_t) {0};
+       return n_ret;
+}
+
+swp_entry_t get_swap_page(void)
+{
+       swp_entry_t entry;
+
+       get_swap_pages(1, &entry);
+       return entry;
 }
 
 /* The only caller of this function is now suspend routine */