vmscan: limit direct reclaim for higher order allocations

[GitHub/mt8127/android_kernel_alcatel_ttab.git] / mm / vmscan.c
diff --git a/mm/vmscan.c b/mm/vmscan.c

index b7719ec10dc5a998a102548cb38e9223c6e6be3c..7e0f0579738889c092d00834519cabe0b552a527 100644 (file)
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -105,7 +105,6 @@ struct scan_control {
  
         /* Which cgroup do we reclaim from */
         struct mem_cgroup *mem_cgroup;
-       struct memcg_scanrecord *memcg_record;
  
         /*
          * Nodemask of nodes allowed by the caller. If NULL, all nodes
@@ -496,15 +495,6 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
                         return PAGE_ACTIVATE;
                 }
  
-               /*
-                * Wait on writeback if requested to. This happens when
-                * direct reclaiming a large contiguous area and the
-                * first attempt to free a range of pages fails.
-                */
-               if (PageWriteback(page) &&
-                   (sc->reclaim_mode & RECLAIM_MODE_SYNC))
-                       wait_on_page_writeback(page);
-
                 if (!PageWriteback(page)) {
                         /* synchronous write or broken a_ops? */
                         ClearPageReclaim(page);
@@ -643,13 +633,14 @@ redo:
                 lru = LRU_UNEVICTABLE;
                 add_page_to_unevictable_list(page);
                 /*
-                * When racing with an mlock clearing (page is
-                * unlocked), make sure that if the other thread does
-                * not observe our setting of PG_lru and fails
-                * isolation, we see PG_mlocked cleared below and move
+                * When racing with an mlock or AS_UNEVICTABLE clearing
+                * (page is unlocked) make sure that if the other thread
+                * does not observe our setting of PG_lru and fails
+                * isolation/check_move_unevictable_page,
+                * we see PG_mlocked/AS_UNEVICTABLE cleared below and move
                  * the page back to the evictable list.
                  *
-                * The other side is TestClearPageMlocked().
+                * The other side is TestClearPageMlocked() or shmem_lock().
                  */
                 smp_mb();
         }
@@ -760,7 +751,10 @@ static noinline_for_stack void free_page_list(struct list_head *free_pages)
   */
  static unsigned long shrink_page_list(struct list_head *page_list,
                                       struct zone *zone,
-                                     struct scan_control *sc)
+                                     struct scan_control *sc,
+                                     int priority,
+                                     unsigned long *ret_nr_dirty,
+                                     unsigned long *ret_nr_writeback)
  {
         LIST_HEAD(ret_pages);
         LIST_HEAD(free_pages);
@@ -768,6 +762,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
         unsigned long nr_dirty = 0;
         unsigned long nr_congested = 0;
         unsigned long nr_reclaimed = 0;
+       unsigned long nr_writeback = 0;
  
         cond_resched();
  
@@ -804,13 +799,12 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                         (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
  
                 if (PageWriteback(page)) {
+                       nr_writeback++;
                         /*
-                        * Synchronous reclaim is performed in two passes,
-                        * first an asynchronous pass over the list to
-                        * start parallel writeback, and a second synchronous
-                        * pass to wait for the IO to complete.  Wait here
-                        * for any page for which writeback has already
-                        * started.
+                        * Synchronous reclaim cannot queue pages for
+                        * writeback due to the possibility of stack overflow
+                        * but if it encounters a page under writeback, wait
+                        * for the IO to complete.
                          */
                         if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) &&
                             may_enter_fs)
@@ -866,6 +860,25 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                 if (PageDirty(page)) {
                         nr_dirty++;
  
+                       /*
+                        * Only kswapd can writeback filesystem pages to
+                        * avoid risk of stack overflow but do not writeback
+                        * unless under significant pressure.
+                        */
+                       if (page_is_file_cache(page) &&
+                                       (!current_is_kswapd() || priority >= DEF_PRIORITY - 2)) {
+                               /*
+                                * Immediately reclaim when written back.
+                                * Similar in principle to deactivate_page()
+                                * except we already have the page isolated
+                                * and know it's dirty
+                                */
+                               inc_zone_page_state(page, NR_VMSCAN_IMMEDIATE);
+                               SetPageReclaim(page);
+
+                               goto keep_locked;
+                       }
+
                         if (references == PAGEREF_RECLAIM_CLEAN)
                                 goto keep_locked;
                         if (!may_enter_fs)
@@ -1000,6 +1013,8 @@ keep_lumpy:
  
         list_splice(&ret_pages, page_list);
         count_vm_events(PGACTIVATE, pgactivate);
+       *ret_nr_dirty += nr_dirty;
+       *ret_nr_writeback += nr_writeback;
         return nr_reclaimed;
  }
  
@@ -1013,23 +1028,27 @@ keep_lumpy:
   *
   * returns 0 on success, -ve errno on failure.
   */
-int __isolate_lru_page(struct page *page, int mode, int file)
+int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file)
  {
+       bool all_lru_mode;
         int ret = -EINVAL;
  
         /* Only take pages on the LRU. */
         if (!PageLRU(page))
                 return ret;
  
+       all_lru_mode = (mode & (ISOLATE_ACTIVE|ISOLATE_INACTIVE)) ==
+               (ISOLATE_ACTIVE|ISOLATE_INACTIVE);
+
         /*
          * When checking the active state, we need to be sure we are
          * dealing with comparible boolean values.  Take the logical not
          * of each.
          */
-       if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode))
+       if (!all_lru_mode && !PageActive(page) != !(mode & ISOLATE_ACTIVE))
                 return ret;
  
-       if (mode != ISOLATE_BOTH && page_is_file_cache(page) != file)
+       if (!all_lru_mode && !!page_is_file_cache(page) != file)
                 return ret;
  
         /*
@@ -1042,6 +1061,12 @@ int __isolate_lru_page(struct page *page, int mode, int file)
  
         ret = -EBUSY;
  
+       if ((mode & ISOLATE_CLEAN) && (PageDirty(page) || PageWriteback(page)))
+               return ret;
+
+       if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
+               return ret;
+
         if (likely(get_page_unless_zero(page))) {
                 /*
                  * Be careful not to clear PageLRU until after we're
@@ -1077,7 +1102,8 @@ int __isolate_lru_page(struct page *page, int mode, int file)
   */
  static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
                 struct list_head *src, struct list_head *dst,
-               unsigned long *scanned, int order, int mode, int file)
+               unsigned long *scanned, int order, isolate_mode_t mode,
+               int file)
  {
         unsigned long nr_taken = 0;
         unsigned long nr_lumpy_taken = 0;
@@ -1202,8 +1228,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
  static unsigned long isolate_pages_global(unsigned long nr,
                                         struct list_head *dst,
                                         unsigned long *scanned, int order,
-                                       int mode, struct zone *z,
-                                       int active, int file)
+                                       isolate_mode_t mode,
+                                       struct zone *z, int active, int file)
  {
         int lru = LRU_BASE;
         if (active)
@@ -1349,8 +1375,6 @@ putback_lru_pages(struct zone *zone, struct scan_control *sc,
                         int file = is_file_lru(lru);
                         int numpages = hpage_nr_pages(page);
                         reclaim_stat->recent_rotated[file] += numpages;
-                       if (!scanning_global_lru(sc))
-                               sc->memcg_record->nr_rotated[file] += numpages;
                 }
                 if (!pagevec_add(&pvec, page)) {
                         spin_unlock_irq(&zone->lru_lock);
@@ -1394,14 +1418,10 @@ static noinline_for_stack void update_isolated_counts(struct zone *zone,
  
         reclaim_stat->recent_scanned[0] += *nr_anon;
         reclaim_stat->recent_scanned[1] += *nr_file;
-       if (!scanning_global_lru(sc)) {
-               sc->memcg_record->nr_scanned[0] += *nr_anon;
-               sc->memcg_record->nr_scanned[1] += *nr_file;
-       }
  }
  
  /*
- * Returns true if the caller should wait to clean dirty/writeback pages.
+ * Returns true if a direct reclaim should wait on pages under writeback.
   *
   * If we are direct reclaiming for contiguous pages and we do not reclaim
   * everything in the list, try again and wait for writeback IO to complete.
@@ -1423,7 +1443,7 @@ static inline bool should_reclaim_stall(unsigned long nr_taken,
         if (sc->reclaim_mode & RECLAIM_MODE_SINGLE)
                 return false;
  
-       /* If we have relaimed everything on the isolated list, no stall */
+       /* If we have reclaimed everything on the isolated list, no stall */
         if (nr_freed == nr_taken)
                 return false;
  
@@ -1455,6 +1475,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
         unsigned long nr_taken;
         unsigned long nr_anon;
         unsigned long nr_file;
+       unsigned long nr_dirty = 0;
+       unsigned long nr_writeback = 0;
+       isolate_mode_t reclaim_mode = ISOLATE_INACTIVE;
  
         while (unlikely(too_many_isolated(zone, file, sc))) {
                 congestion_wait(BLK_RW_ASYNC, HZ/10);
@@ -1465,15 +1488,21 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
         }
  
         set_reclaim_mode(priority, sc, false);
+       if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)
+               reclaim_mode |= ISOLATE_ACTIVE;
+
         lru_add_drain();
+
+       if (!sc->may_unmap)
+               reclaim_mode |= ISOLATE_UNMAPPED;
+       if (!sc->may_writepage)
+               reclaim_mode |= ISOLATE_CLEAN;
+
         spin_lock_irq(&zone->lru_lock);
  
         if (scanning_global_lru(sc)) {
-               nr_taken = isolate_pages_global(nr_to_scan,
-                       &page_list, &nr_scanned, sc->order,
-                       sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
-                                       ISOLATE_BOTH : ISOLATE_INACTIVE,
-                       zone, 0, file);
+               nr_taken = isolate_pages_global(nr_to_scan, &page_list,
+                       &nr_scanned, sc->order, reclaim_mode, zone, 0, file);
                 zone->pages_scanned += nr_scanned;
                 if (current_is_kswapd())
                         __count_zone_vm_events(PGSCAN_KSWAPD, zone,
@@ -1482,12 +1511,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
                         __count_zone_vm_events(PGSCAN_DIRECT, zone,
                                                nr_scanned);
         } else {
-               nr_taken = mem_cgroup_isolate_pages(nr_to_scan,
-                       &page_list, &nr_scanned, sc->order,
-                       sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
-                                       ISOLATE_BOTH : ISOLATE_INACTIVE,
-                       zone, sc->mem_cgroup,
-                       0, file);
+               nr_taken = mem_cgroup_isolate_pages(nr_to_scan, &page_list,
+                       &nr_scanned, sc->order, reclaim_mode, zone,
+                       sc->mem_cgroup, 0, file);
                 /*
                  * mem_cgroup_isolate_pages() keeps track of
                  * scanned pages on its own.
@@ -1503,17 +1529,16 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
  
         spin_unlock_irq(&zone->lru_lock);
  
-       nr_reclaimed = shrink_page_list(&page_list, zone, sc);
+       nr_reclaimed = shrink_page_list(&page_list, zone, sc, priority,
+                                               &nr_dirty, &nr_writeback);
  
         /* Check if we should syncronously wait for writeback */
         if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) {
                 set_reclaim_mode(priority, sc, true);
-               nr_reclaimed += shrink_page_list(&page_list, zone, sc);
+               nr_reclaimed += shrink_page_list(&page_list, zone, sc,
+                                       priority, &nr_dirty, &nr_writeback);
         }
  
-       if (!scanning_global_lru(sc))
-               sc->memcg_record->nr_freed[file] += nr_reclaimed;
-
         local_irq_disable();
         if (current_is_kswapd())
                 __count_vm_events(KSWAPD_STEAL, nr_reclaimed);
@@ -1521,6 +1546,32 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
  
         putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list);
  
+       /*
+        * If reclaim is isolating dirty pages under writeback, it implies
+        * that the long-lived page allocation rate is exceeding the page
+        * laundering rate. Either the global limits are not being effective
+        * at throttling processes due to the page distribution throughout
+        * zones or there is heavy usage of a slow backing device. The
+        * only option is to throttle from reclaim context which is not ideal
+        * as there is no guarantee the dirtying process is throttled in the
+        * same way balance_dirty_pages() manages.
+        *
+        * This scales the number of dirty pages that must be under writeback
+        * before throttling depending on priority. It is a simple backoff
+        * function that has the most effect in the range DEF_PRIORITY to
+        * DEF_PRIORITY-2, which is the priority range in which reclaim is
+        * considered to be in trouble.
+        *
+        * DEF_PRIORITY   100% isolated pages must be PageWriteback to throttle
+        * DEF_PRIORITY-1  50% must be PageWriteback
+        * DEF_PRIORITY-2  25% must be PageWriteback, kswapd in trouble
+        * ...
+        * DEF_PRIORITY-6 For SWAP_CLUSTER_MAX isolated pages, throttle if any
+        *                     isolated page is PageWriteback
+        */
+       if (nr_writeback && nr_writeback >= (nr_taken >> (DEF_PRIORITY-priority)))
+               wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
+
         trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
                 zone_idx(zone),
                 nr_scanned, nr_reclaimed,
@@ -1592,19 +1643,26 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
         struct page *page;
         struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
         unsigned long nr_rotated = 0;
+       isolate_mode_t reclaim_mode = ISOLATE_ACTIVE;
  
         lru_add_drain();
+
+       if (!sc->may_unmap)
+               reclaim_mode |= ISOLATE_UNMAPPED;
+       if (!sc->may_writepage)
+               reclaim_mode |= ISOLATE_CLEAN;
+
         spin_lock_irq(&zone->lru_lock);
         if (scanning_global_lru(sc)) {
                 nr_taken = isolate_pages_global(nr_pages, &l_hold,
                                                 &pgscanned, sc->order,
-                                               ISOLATE_ACTIVE, zone,
+                                               reclaim_mode, zone,
                                                 1, file);
                 zone->pages_scanned += pgscanned;
         } else {
                 nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold,
                                                 &pgscanned, sc->order,
-                                               ISOLATE_ACTIVE, zone,
+                                               reclaim_mode, zone,
                                                 sc->mem_cgroup, 1, file);
                 /*
                  * mem_cgroup_isolate_pages() keeps track of
@@ -1613,8 +1671,6 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
         }
  
         reclaim_stat->recent_scanned[file] += nr_taken;
-       if (!scanning_global_lru(sc))
-               sc->memcg_record->nr_scanned[file] += nr_taken;
  
         __count_zone_vm_events(PGREFILL, zone, pgscanned);
         if (file)
@@ -1666,8 +1722,6 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
          * get_scan_ratio.
          */
         reclaim_stat->recent_rotated[file] += nr_rotated;
-       if (!scanning_global_lru(sc))
-               sc->memcg_record->nr_rotated[file] += nr_rotated;
  
         move_active_pages_to_lru(zone, &l_active,
                                                 LRU_ACTIVE + file * LRU_FILE);
@@ -1808,23 +1862,22 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
         u64 fraction[2], denominator;
         enum lru_list l;
         int noswap = 0;
-       int force_scan = 0;
-       unsigned long nr_force_scan[2];
-
+       bool force_scan = false;
  
-       anon  = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
-               zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
-       file  = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) +
-               zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
-
-       if (((anon + file) >> priority) < SWAP_CLUSTER_MAX) {
-               /* kswapd does zone balancing and need to scan this zone */
-               if (scanning_global_lru(sc) && current_is_kswapd())
-                       force_scan = 1;
-               /* memcg may have small limit and need to avoid priority drop */
-               if (!scanning_global_lru(sc))
-                       force_scan = 1;
-       }
+       /*
+        * If the zone or memcg is small, nr[l] can be 0.  This
+        * results in no scanning on this priority and a potential
+        * priority drop.  Global direct reclaim can go to the next
+        * zone and tends to have no problems. Global kswapd is for
+        * zone balancing and it needs to scan a minimum amount. When
+        * reclaiming for a memcg, a priority drop can cause high
+        * latencies, so it's better to scan a minimum amount there as
+        * well.
+        */
+       if (scanning_global_lru(sc) && current_is_kswapd())
+               force_scan = true;
+       if (!scanning_global_lru(sc))
+               force_scan = true;
  
         /* If we have no swap space, do not bother scanning anon pages. */
         if (!sc->may_swap || (nr_swap_pages <= 0)) {
@@ -1832,11 +1885,14 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
                 fraction[0] = 0;
                 fraction[1] = 1;
                 denominator = 1;
-               nr_force_scan[0] = 0;
-               nr_force_scan[1] = SWAP_CLUSTER_MAX;
                 goto out;
         }
  
+       anon  = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
+               zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
+       file  = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) +
+               zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
+
         if (scanning_global_lru(sc)) {
                 free  = zone_page_state(zone, NR_FREE_PAGES);
                 /* If we have very few page cache pages,
@@ -1845,8 +1901,6 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
                         fraction[0] = 1;
                         fraction[1] = 0;
                         denominator = 1;
-                       nr_force_scan[0] = SWAP_CLUSTER_MAX;
-                       nr_force_scan[1] = 0;
                         goto out;
                 }
         }
@@ -1895,11 +1949,6 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
         fraction[0] = ap;
         fraction[1] = fp;
         denominator = ap + fp + 1;
-       if (force_scan) {
-               unsigned long scan = SWAP_CLUSTER_MAX;
-               nr_force_scan[0] = div64_u64(scan * ap, denominator);
-               nr_force_scan[1] = div64_u64(scan * fp, denominator);
-       }
  out:
         for_each_evictable_lru(l) {
                 int file = is_file_lru(l);
@@ -1908,20 +1957,10 @@ out:
                 scan = zone_nr_lru_pages(zone, sc, l);
                 if (priority || noswap) {
                         scan >>= priority;
+                       if (!scan && force_scan)
+                               scan = SWAP_CLUSTER_MAX;
                         scan = div64_u64(scan * fraction[file], denominator);
                 }
-
-               /*
-                * If zone is small or memcg is small, nr[l] can be 0.
-                * This results no-scan on this priority and priority drop down.
-                * For global direct reclaim, it can visit next zone and tend
-                * not to have problems. For global kswapd, it's for zone
-                * balancing and it need to scan a small amounts. When using
-                * memcg, priority drop can cause big latency. So, it's better
-                * to scan small amount. See may_noscan above.
-                */
-               if (!scan && force_scan)
-                       scan = nr_force_scan[file];
                 nr[l] = scan;
         }
  }
@@ -2000,12 +2039,14 @@ static void shrink_zone(int priority, struct zone *zone,
         enum lru_list l;
         unsigned long nr_reclaimed, nr_scanned;
         unsigned long nr_to_reclaim = sc->nr_to_reclaim;
+       struct blk_plug plug;
  
  restart:
         nr_reclaimed = 0;
         nr_scanned = sc->nr_scanned;
         get_scan_count(zone, sc, nr, priority);
  
+       blk_start_plug(&plug);
         while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
                                         nr[LRU_INACTIVE_FILE]) {
                 for_each_evictable_lru(l) {
@@ -2029,6 +2070,7 @@ restart:
                 if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY)
                         break;
         }
+       blk_finish_plug(&plug);
         sc->nr_reclaimed += nr_reclaimed;
  
         /*
@@ -2083,6 +2125,22 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
                                 continue;
                         if (zone->all_unreclaimable && priority != DEF_PRIORITY)
                                 continue;       /* Let kswapd poll it */
+                       if (COMPACTION_BUILD) {
+                               /*
+                                * If we already have plenty of memory
+                                * free for compaction, don't free any
+                                * more.  Even though compaction is
+                                * invoked for any non-zero order,
+                                * only frequent costly order
+                                * reclamation is disruptive enough to
+                                * become a noticeable problem, like
+                                * transparent huge page allocations.
+                                */
+                               if (sc->order > PAGE_ALLOC_COSTLY_ORDER &&
+                                       (compaction_suitable(zone, sc->order) ||
+                                        compaction_deferred(zone)))
+                                       continue;
+                       }
                         /*
                          * This steals pages from memory cgroups over softlimit
                          * and returns the number of reclaimed pages and
@@ -2268,10 +2326,9 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
  #ifdef CONFIG_CGROUP_MEM_RES_CTLR
  
  unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
-                                       gfp_t gfp_mask, bool noswap,
-                                       struct zone *zone,
-                                       struct memcg_scanrecord *rec,
-                                       unsigned long *scanned)
+                                               gfp_t gfp_mask, bool noswap,
+                                               struct zone *zone,
+                                               unsigned long *nr_scanned)
  {
         struct scan_control sc = {
                 .nr_scanned = 0,
@@ -2281,9 +2338,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
                 .may_swap = !noswap,
                 .order = 0,
                 .mem_cgroup = mem,
-               .memcg_record = rec,
         };
-       ktime_t start, end;
  
         sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
                         (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
@@ -2292,7 +2347,6 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
                                                       sc.may_writepage,
                                                       sc.gfp_mask);
  
-       start = ktime_get();
         /*
          * NOTE: Although we can get the priority field, using it
          * here is not a good idea, since it limits the pages we can scan.
@@ -2301,25 +2355,19 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
          * the priority and make it zero.
          */
         shrink_zone(0, zone, &sc);
-       end = ktime_get();
-
-       if (rec)
-               rec->elapsed += ktime_to_ns(ktime_sub(end, start));
-       *scanned = sc.nr_scanned;
  
         trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
  
+       *nr_scanned = sc.nr_scanned;
         return sc.nr_reclaimed;
  }
  
  unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
                                            gfp_t gfp_mask,
-                                          bool noswap,
-                                          struct memcg_scanrecord *rec)
+                                          bool noswap)
  {
         struct zonelist *zonelist;
         unsigned long nr_reclaimed;
-       ktime_t start, end;
         int nid;
         struct scan_control sc = {
                 .may_writepage = !laptop_mode,
@@ -2328,7 +2376,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
                 .nr_to_reclaim = SWAP_CLUSTER_MAX,
                 .order = 0,
                 .mem_cgroup = mem_cont,
-               .memcg_record = rec,
                 .nodemask = NULL, /* we don't care the placement */
                 .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
                                 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
@@ -2337,7 +2384,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
                 .gfp_mask = sc.gfp_mask,
         };
  
-       start = ktime_get();
         /*
          * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't
          * take care of from where we get pages. So the node where we start the
@@ -2352,9 +2398,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
                                             sc.gfp_mask);
  
         nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
-       end = ktime_get();
-       if (rec)
-               rec->elapsed += ktime_to_ns(ktime_sub(end, start));
  
         trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
  
@@ -2722,6 +2765,8 @@ out:
  
                         /* If balanced, clear the congested flag */
                         zone_clear_flag(zone, ZONE_CONGESTED);
+                       if (i <= *classzone_idx)
+                               balanced += zone->present_pages;
                 }
         }
  
@@ -2795,7 +2840,9 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
  static int kswapd(void *p)
  {
         unsigned long order, new_order;
+       unsigned balanced_order;
         int classzone_idx, new_classzone_idx;
+       int balanced_classzone_idx;
         pg_data_t *pgdat = (pg_data_t*)p;
         struct task_struct *tsk = current;
  
@@ -2826,7 +2873,9 @@ static int kswapd(void *p)
         set_freezable();
  
         order = new_order = 0;
+       balanced_order = 0;
         classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
+       balanced_classzone_idx = classzone_idx;
         for ( ; ; ) {
                 int ret;
  
@@ -2835,7 +2884,8 @@ static int kswapd(void *p)
                  * new request of a similar or harder type will succeed soon
                  * so consider going to sleep on the basis we reclaimed at
                  */
-               if (classzone_idx >= new_classzone_idx && order == new_order) {
+               if (balanced_classzone_idx >= new_classzone_idx &&
+                                       balanced_order == new_order) {
                         new_order = pgdat->kswapd_max_order;
                         new_classzone_idx = pgdat->classzone_idx;
                         pgdat->kswapd_max_order =  0;
@@ -2850,9 +2900,12 @@ static int kswapd(void *p)
                         order = new_order;
                         classzone_idx = new_classzone_idx;
                 } else {
-                       kswapd_try_to_sleep(pgdat, order, classzone_idx);
+                       kswapd_try_to_sleep(pgdat, balanced_order,
+                                               balanced_classzone_idx);
                         order = pgdat->kswapd_max_order;
                         classzone_idx = pgdat->classzone_idx;
+                       new_order = order;
+                       new_classzone_idx = classzone_idx;
                         pgdat->kswapd_max_order = 0;
                         pgdat->classzone_idx = pgdat->nr_zones - 1;
                 }
@@ -2867,7 +2920,9 @@ static int kswapd(void *p)
                  */
                 if (!ret) {
                         trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
-                       order = balance_pgdat(pgdat, order, &classzone_idx);
+                       balanced_classzone_idx = classzone_idx;
+                       balanced_order = balance_pgdat(pgdat, order,
+                                               &balanced_classzone_idx);
                 }
         }
         return 0;
@@ -3379,66 +3434,12 @@ void scan_mapping_unevictable_pages(struct address_space *mapping)
  
  }
  
-/**
- * scan_zone_unevictable_pages - check unevictable list for evictable pages
- * @zone - zone of which to scan the unevictable list
- *
- * Scan @zone's unevictable LRU lists to check for pages that have become
- * evictable.  Move those that have to @zone's inactive list where they
- * become candidates for reclaim, unless shrink_inactive_zone() decides
- * to reactivate them.  Pages that are still unevictable are rotated
- * back onto @zone's unevictable list.
- */
-#define SCAN_UNEVICTABLE_BATCH_SIZE 16UL /* arbitrary lock hold batch size */
-static void scan_zone_unevictable_pages(struct zone *zone)
-{
-       struct list_head *l_unevictable = &zone->lru[LRU_UNEVICTABLE].list;
-       unsigned long scan;
-       unsigned long nr_to_scan = zone_page_state(zone, NR_UNEVICTABLE);
-
-       while (nr_to_scan > 0) {
-               unsigned long batch_size = min(nr_to_scan,
-                                               SCAN_UNEVICTABLE_BATCH_SIZE);
-
-               spin_lock_irq(&zone->lru_lock);
-               for (scan = 0;  scan < batch_size; scan++) {
-                       struct page *page = lru_to_page(l_unevictable);
-
-                       if (!trylock_page(page))
-                               continue;
-
-                       prefetchw_prev_lru_page(page, l_unevictable, flags);
-
-                       if (likely(PageLRU(page) && PageUnevictable(page)))
-                               check_move_unevictable_page(page, zone);
-
-                       unlock_page(page);
-               }
-               spin_unlock_irq(&zone->lru_lock);
-
-               nr_to_scan -= batch_size;
-       }
-}
-
-
-/**
- * scan_all_zones_unevictable_pages - scan all unevictable lists for evictable pages
- *
- * A really big hammer:  scan all zones' unevictable LRU lists to check for
- * pages that have become evictable.  Move those back to the zones'
- * inactive list where they become candidates for reclaim.
- * This occurs when, e.g., we have unswappable pages on the unevictable lists,
- * and we add swap to the system.  As such, it runs in the context of a task
- * that has possibly/probably made some previously unevictable pages
- * evictable.
- */
-static void scan_all_zones_unevictable_pages(void)
+static void warn_scan_unevictable_pages(void)
  {
-       struct zone *zone;
-
-       for_each_zone(zone) {
-               scan_zone_unevictable_pages(zone);
-       }
+       printk_once(KERN_WARNING
+                   "The scan_unevictable_pages sysctl/node-interface has been "
+                   "disabled for lack of a legitimate use case.  If you have "
+                   "one, please send an email to linux-mm@kvack.org.\n");
  }
  
  /*
@@ -3451,11 +3452,8 @@ int scan_unevictable_handler(struct ctl_table *table, int write,
                            void __user *buffer,
                            size_t *length, loff_t *ppos)
  {
+       warn_scan_unevictable_pages();
         proc_doulongvec_minmax(table, write, buffer, length, ppos);
-
-       if (write && *(unsigned long *)table->data)
-               scan_all_zones_unevictable_pages();
-
         scan_unevictable_pages = 0;
         return 0;
  }
@@ -3470,6 +3468,7 @@ static ssize_t read_scan_unevictable_node(struct sys_device *dev,
                                           struct sysdev_attribute *attr,
                                           char *buf)
  {
+       warn_scan_unevictable_pages();
         return sprintf(buf, "0\n");     /* always zero; should fit... */
  }
  
@@ -3477,19 +3476,7 @@ static ssize_t write_scan_unevictable_node(struct sys_device *dev,
                                            struct sysdev_attribute *attr,
                                         const char *buf, size_t count)
  {
-       struct zone *node_zones = NODE_DATA(dev->id)->node_zones;
-       struct zone *zone;
-       unsigned long res;
-       unsigned long req = strict_strtoul(buf, 10, &res);
-
-       if (!req)
-               return 1;       /* zero is no-op */
-
-       for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
-               if (!populated_zone(zone))
-                       continue;
-               scan_zone_unevictable_pages(zone);
-       }
+       warn_scan_unevictable_pages();
         return 1;
  }