mm: compaction: Abort async compaction if locks are contended or taking too long
authorMel Gorman <mgorman@suse.de>
Tue, 21 Aug 2012 23:16:17 +0000 (16:16 -0700)
committerLinus Torvalds <torvalds@linux-foundation.org>
Tue, 21 Aug 2012 23:45:03 +0000 (16:45 -0700)
Jim Schutt reported a problem that pointed at compaction contending
heavily on locks.  The workload is straight-forward and in his own words;

The systems in question have 24 SAS drives spread across 3 HBAs,
running 24 Ceph OSD instances, one per drive.  FWIW these servers
are dual-socket Intel 5675 Xeons w/48 GB memory.  I've got ~160
Ceph Linux clients doing dd simultaneously to a Ceph file system
backed by 12 of these servers.

Early in the test everything looks fine

  procs -------------------memory------------------ ---swap-- -----io---- --system-- -----cpu-------
   r  b       swpd       free       buff      cache   si   so    bi    bo   in   cs  us sy  id wa st
  31 15          0     287216        576   38606628    0    0     2  1158    2   14   1  3  95  0  0
  27 15          0     225288        576   38583384    0    0    18 2222016 203357 134876  11 56  17 15  0
  28 17          0     219256        576   38544736    0    0    11 2305932 203141 146296  11 49  23 17  0
   6 18          0     215596        576   38552872    0    0     7 2363207 215264 166502  12 45  22 20  0
  22 18          0     226984        576   38596404    0    0     3 2445741 223114 179527  12 43  23 22  0

and then it goes to pot

  procs -------------------memory------------------ ---swap-- -----io---- --system-- -----cpu-------
   r  b       swpd       free       buff      cache   si   so    bi    bo   in   cs  us sy  id wa st
  163  8          0     464308        576   36791368    0    0    11 22210  866  536   3 13  79  4  0
  207 14          0     917752        576   36181928    0    0   712 1345376 134598 47367   7 90   1  2  0
  123 12          0     685516        576   36296148    0    0   429 1386615 158494 60077   8 84   5  3  0
  123 12          0     598572        576   36333728    0    0  1107 1233281 147542 62351   7 84   5  4  0
  622  7          0     660768        576   36118264    0    0   557 1345548 151394 59353   7 85   4  3  0
  223 11          0     283960        576   36463868    0    0    46 1107160 121846 33006   6 93   1  1  0

Note that system CPU usage is very high blocks being written out has
dropped by 42%. He analysed this with perf and found

  perf record -g -a sleep 10
  perf report --sort symbol --call-graph fractal,5
    34.63%  [k] _raw_spin_lock_irqsave
            |
            |--97.30%-- isolate_freepages
            |          compaction_alloc
            |          unmap_and_move
            |          migrate_pages
            |          compact_zone
            |          compact_zone_order
            |          try_to_compact_pages
            |          __alloc_pages_direct_compact
            |          __alloc_pages_slowpath
            |          __alloc_pages_nodemask
            |          alloc_pages_vma
            |          do_huge_pmd_anonymous_page
            |          handle_mm_fault
            |          do_page_fault
            |          page_fault
            |          |
            |          |--87.39%-- skb_copy_datagram_iovec
            |          |          tcp_recvmsg
            |          |          inet_recvmsg
            |          |          sock_recvmsg
            |          |          sys_recvfrom
            |          |          system_call
            |          |          __recv
            |          |          |
            |          |           --100.00%-- (nil)
            |          |
            |           --12.61%-- memcpy
             --2.70%-- [...]

There was other data but primarily it is all showing that compaction is
contended heavily on the zone->lock and zone->lru_lock.

commit [b2eef8c0: mm: compaction: minimise the time IRQs are disabled
while isolating pages for migration] noted that it was possible for
migration to hold the lru_lock for an excessive amount of time. Very
broadly speaking this patch expands the concept.

This patch introduces compact_checklock_irqsave() to check if a lock
is contended or the process needs to be scheduled. If either condition
is true then async compaction is aborted and the caller is informed.
The page allocator will fail a THP allocation if compaction failed due
to contention. This patch also introduces compact_trylock_irqsave()
which will acquire the lock only if it is not contended and the process
does not need to schedule.

Reported-by: Jim Schutt <jaschut@sandia.gov>
Tested-by: Jim Schutt <jaschut@sandia.gov>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
include/linux/compaction.h
mm/compaction.c
mm/internal.h
mm/page_alloc.c

index 133ddcf83397036a5c20ed6431db3b789eeab37a..ef658147e4e8391eec61aa7559a1452fcffe6a79 100644 (file)
@@ -22,7 +22,7 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write,
 extern int fragmentation_index(struct zone *zone, unsigned int order);
 extern unsigned long try_to_compact_pages(struct zonelist *zonelist,
                        int order, gfp_t gfp_mask, nodemask_t *mask,
-                       bool sync);
+                       bool sync, bool *contended);
 extern int compact_pgdat(pg_data_t *pgdat, int order);
 extern unsigned long compaction_suitable(struct zone *zone, int order);
 
@@ -64,7 +64,7 @@ static inline bool compaction_deferred(struct zone *zone, int order)
 #else
 static inline unsigned long try_to_compact_pages(struct zonelist *zonelist,
                        int order, gfp_t gfp_mask, nodemask_t *nodemask,
-                       bool sync)
+                       bool sync, bool *contended)
 {
        return COMPACT_CONTINUE;
 }
index bcce7897e17adf8fe2164f8719310a6b5abb8b83..7fcd3a52e68d4b2a9bfc07c1056b7db3a2b154b1 100644 (file)
@@ -50,6 +50,47 @@ static inline bool migrate_async_suitable(int migratetype)
        return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE;
 }
 
+/*
+ * Compaction requires the taking of some coarse locks that are potentially
+ * very heavily contended. Check if the process needs to be scheduled or
+ * if the lock is contended. For async compaction, back out in the event
+ * if contention is severe. For sync compaction, schedule.
+ *
+ * Returns true if the lock is held.
+ * Returns false if the lock is released and compaction should abort
+ */
+static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
+                                     bool locked, struct compact_control *cc)
+{
+       if (need_resched() || spin_is_contended(lock)) {
+               if (locked) {
+                       spin_unlock_irqrestore(lock, *flags);
+                       locked = false;
+               }
+
+               /* async aborts if taking too long or contended */
+               if (!cc->sync) {
+                       if (cc->contended)
+                               *cc->contended = true;
+                       return false;
+               }
+
+               cond_resched();
+               if (fatal_signal_pending(current))
+                       return false;
+       }
+
+       if (!locked)
+               spin_lock_irqsave(lock, *flags);
+       return true;
+}
+
+static inline bool compact_trylock_irqsave(spinlock_t *lock,
+                       unsigned long *flags, struct compact_control *cc)
+{
+       return compact_checklock_irqsave(lock, flags, false, cc);
+}
+
 /*
  * Isolate free pages onto a private freelist. Caller must hold zone->lock.
  * If @strict is true, will abort returning 0 on any invalid PFNs or non-free
@@ -173,7 +214,7 @@ isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn)
 }
 
 /* Update the number of anon and file isolated pages in the zone */
-static void acct_isolated(struct zone *zone, struct compact_control *cc)
+static void acct_isolated(struct zone *zone, bool locked, struct compact_control *cc)
 {
        struct page *page;
        unsigned int count[2] = { 0, };
@@ -181,8 +222,14 @@ static void acct_isolated(struct zone *zone, struct compact_control *cc)
        list_for_each_entry(page, &cc->migratepages, lru)
                count[!!page_is_file_cache(page)]++;
 
-       __mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
-       __mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
+       /* If locked we can use the interrupt unsafe versions */
+       if (locked) {
+               __mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
+               __mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
+       } else {
+               mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
+               mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
+       }
 }
 
 /* Similar to reclaim, but different enough that they don't share logic */
@@ -228,6 +275,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
        struct list_head *migratelist = &cc->migratepages;
        isolate_mode_t mode = 0;
        struct lruvec *lruvec;
+       unsigned long flags;
+       bool locked;
 
        /*
         * Ensure that there are not too many pages isolated from the LRU
@@ -247,25 +296,22 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 
        /* Time to isolate some pages for migration */
        cond_resched();
-       spin_lock_irq(&zone->lru_lock);
+       spin_lock_irqsave(&zone->lru_lock, flags);
+       locked = true;
        for (; low_pfn < end_pfn; low_pfn++) {
                struct page *page;
-               bool locked = true;
 
                /* give a chance to irqs before checking need_resched() */
                if (!((low_pfn+1) % SWAP_CLUSTER_MAX)) {
-                       spin_unlock_irq(&zone->lru_lock);
+                       spin_unlock_irqrestore(&zone->lru_lock, flags);
                        locked = false;
                }
-               if (need_resched() || spin_is_contended(&zone->lru_lock)) {
-                       if (locked)
-                               spin_unlock_irq(&zone->lru_lock);
-                       cond_resched();
-                       spin_lock_irq(&zone->lru_lock);
-                       if (fatal_signal_pending(current))
-                               break;
-               } else if (!locked)
-                       spin_lock_irq(&zone->lru_lock);
+
+               /* Check if it is ok to still hold the lock */
+               locked = compact_checklock_irqsave(&zone->lru_lock, &flags,
+                                                               locked, cc);
+               if (!locked)
+                       break;
 
                /*
                 * migrate_pfn does not necessarily start aligned to a
@@ -349,9 +395,10 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
                }
        }
 
-       acct_isolated(zone, cc);
+       acct_isolated(zone, locked, cc);
 
-       spin_unlock_irq(&zone->lru_lock);
+       if (locked)
+               spin_unlock_irqrestore(&zone->lru_lock, flags);
 
        trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
 
@@ -461,7 +508,16 @@ static void isolate_freepages(struct zone *zone,
                 * are disabled
                 */
                isolated = 0;
-               spin_lock_irqsave(&zone->lock, flags);
+
+               /*
+                * The zone lock must be held to isolate freepages. This
+                * unfortunately this is a very coarse lock and can be
+                * heavily contended if there are parallel allocations
+                * or parallel compactions. For async compaction do not
+                * spin on the lock
+                */
+               if (!compact_trylock_irqsave(&zone->lock, &flags, cc))
+                       break;
                if (suitable_migration_target(page)) {
                        end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn);
                        isolated = isolate_freepages_block(pfn, end_pfn,
@@ -773,7 +829,7 @@ out:
 
 static unsigned long compact_zone_order(struct zone *zone,
                                 int order, gfp_t gfp_mask,
-                                bool sync)
+                                bool sync, bool *contended)
 {
        struct compact_control cc = {
                .nr_freepages = 0,
@@ -782,6 +838,7 @@ static unsigned long compact_zone_order(struct zone *zone,
                .migratetype = allocflags_to_migratetype(gfp_mask),
                .zone = zone,
                .sync = sync,
+               .contended = contended,
        };
        INIT_LIST_HEAD(&cc.freepages);
        INIT_LIST_HEAD(&cc.migratepages);
@@ -803,7 +860,7 @@ int sysctl_extfrag_threshold = 500;
  */
 unsigned long try_to_compact_pages(struct zonelist *zonelist,
                        int order, gfp_t gfp_mask, nodemask_t *nodemask,
-                       bool sync)
+                       bool sync, bool *contended)
 {
        enum zone_type high_zoneidx = gfp_zone(gfp_mask);
        int may_enter_fs = gfp_mask & __GFP_FS;
@@ -827,7 +884,8 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
                                                                nodemask) {
                int status;
 
-               status = compact_zone_order(zone, order, gfp_mask, sync);
+               status = compact_zone_order(zone, order, gfp_mask, sync,
+                                               contended);
                rc = max(status, rc);
 
                /* If a normal allocation would succeed, stop compacting */
index 3314f79d775a5f90c84e3250d8b93b7bd6f99933..b8c91b342e244153ec9b24b4673e6dd2575af267 100644 (file)
@@ -130,6 +130,7 @@ struct compact_control {
        int order;                      /* order a direct compactor needs */
        int migratetype;                /* MOVABLE, RECLAIMABLE etc */
        struct zone *zone;
+       bool *contended;                /* True if a lock was contended */
 };
 
 unsigned long
index 07f19248acb5fc72ca85a9c88b74920346828118..c66fb875104ab99feccb0e116b48b9d54a6011e4 100644 (file)
@@ -2102,7 +2102,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
        struct zonelist *zonelist, enum zone_type high_zoneidx,
        nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
        int migratetype, bool sync_migration,
-       bool *deferred_compaction,
+       bool *contended_compaction, bool *deferred_compaction,
        unsigned long *did_some_progress)
 {
        struct page *page;
@@ -2117,7 +2117,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 
        current->flags |= PF_MEMALLOC;
        *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
-                                               nodemask, sync_migration);
+                                               nodemask, sync_migration,
+                                               contended_compaction);
        current->flags &= ~PF_MEMALLOC;
        if (*did_some_progress != COMPACT_SKIPPED) {
 
@@ -2163,7 +2164,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
        struct zonelist *zonelist, enum zone_type high_zoneidx,
        nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
        int migratetype, bool sync_migration,
-       bool *deferred_compaction,
+       bool *contended_compaction, bool *deferred_compaction,
        unsigned long *did_some_progress)
 {
        return NULL;
@@ -2336,6 +2337,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
        unsigned long did_some_progress;
        bool sync_migration = false;
        bool deferred_compaction = false;
+       bool contended_compaction = false;
 
        /*
         * In the slowpath, we sanity check order to avoid ever trying to
@@ -2425,6 +2427,7 @@ rebalance:
                                        nodemask,
                                        alloc_flags, preferred_zone,
                                        migratetype, sync_migration,
+                                       &contended_compaction,
                                        &deferred_compaction,
                                        &did_some_progress);
        if (page)
@@ -2434,10 +2437,11 @@ rebalance:
        /*
         * If compaction is deferred for high-order allocations, it is because
         * sync compaction recently failed. In this is the case and the caller
-        * has requested the system not be heavily disrupted, fail the
-        * allocation now instead of entering direct reclaim
+        * requested a movable allocation that does not heavily disrupt the
+        * system then fail the allocation instead of entering direct reclaim.
         */
-       if (deferred_compaction && (gfp_mask & __GFP_NO_KSWAPD))
+       if ((deferred_compaction || contended_compaction) &&
+                                               (gfp_mask & __GFP_NO_KSWAPD))
                goto nopage;
 
        /* Try direct reclaim and then allocating */
@@ -2508,6 +2512,7 @@ rebalance:
                                        nodemask,
                                        alloc_flags, preferred_zone,
                                        migratetype, sync_migration,
+                                       &contended_compaction,
                                        &deferred_compaction,
                                        &did_some_progress);
                if (page)