vmscan: second chance replacement for anonymous pages
authorRik van Riel <riel@redhat.com>
Sun, 19 Oct 2008 03:26:34 +0000 (20:26 -0700)
committerLinus Torvalds <torvalds@linux-foundation.org>
Mon, 20 Oct 2008 15:50:25 +0000 (08:50 -0700)
We avoid evicting and scanning anonymous pages for the most part, but
under some workloads we can end up with most of memory filled with
anonymous pages.  At that point, we suddenly need to clear the referenced
bits on all of memory, which can take ages on very large memory systems.

We can reduce the maximum number of pages that need to be scanned by not
taking the referenced state into account when deactivating an anonymous
page.  After all, every anonymous page starts out referenced, so why
check?

If an anonymous page gets referenced again before it reaches the end of
the inactive list, we move it back to the active list.

To keep the maximum amount of necessary work reasonable, we scale the
active to inactive ratio with the size of memory, using the formula
active:inactive ratio = sqrt(memory in GB * 10).

Kswapd CPU use now seems to scale by the amount of pageout bandwidth,
instead of by the amount of memory present in the system.

[kamezawa.hiroyu@jp.fujitsu.com: fix OOM with memcg]
[kamezawa.hiroyu@jp.fujitsu.com: memcg: lru scan fix]
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
include/linux/mm_inline.h
include/linux/mmzone.h
mm/page_alloc.c
mm/vmscan.c
mm/vmstat.c

index 2eb599465d56a5f9dc14b95ccef489673f16ab0f..f451fedd1e75a2520e9eaef7a04dd693b009a7fc 100644 (file)
@@ -117,4 +117,23 @@ static inline enum lru_list page_lru(struct page *page)
        return lru;
 }
 
+/**
+ * inactive_anon_is_low - check if anonymous pages need to be deactivated
+ * @zone: zone to check
+ *
+ * Returns true if the zone does not have enough inactive anon pages,
+ * meaning some active anon pages need to be deactivated.
+ */
+static inline int inactive_anon_is_low(struct zone *zone)
+{
+       unsigned long active, inactive;
+
+       active = zone_page_state(zone, NR_ACTIVE_ANON);
+       inactive = zone_page_state(zone, NR_INACTIVE_ANON);
+
+       if (inactive * zone->inactive_ratio < active)
+               return 1;
+
+       return 0;
+}
 #endif
index 59a4c8fd6ebdeaf6090b353419c723c1fbca10dc..9c5111f49a32c7c67af347cef5b3b429923433b8 100644 (file)
@@ -323,6 +323,12 @@ struct zone {
         */
        int prev_priority;
 
+       /*
+        * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on
+        * this zone's LRU.  Maintained by the pageout code.
+        */
+       unsigned int inactive_ratio;
+
 
        ZONE_PADDING(_pad2_)
        /* Rarely used or read-mostly fields */
index 740a16a32c22652059f74a7b53218f2e409a9887..79c0981b1d32c9c11504a0caa5021c9b94be8d1e 100644 (file)
@@ -4263,6 +4263,46 @@ void setup_per_zone_pages_min(void)
        calculate_totalreserve_pages();
 }
 
+/**
+ * setup_per_zone_inactive_ratio - called when min_free_kbytes changes.
+ *
+ * The inactive anon list should be small enough that the VM never has to
+ * do too much work, but large enough that each inactive page has a chance
+ * to be referenced again before it is swapped out.
+ *
+ * The inactive_anon ratio is the target ratio of ACTIVE_ANON to
+ * INACTIVE_ANON pages on this zone's LRU, maintained by the
+ * pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of
+ * the anonymous pages are kept on the inactive list.
+ *
+ * total     target    max
+ * memory    ratio     inactive anon
+ * -------------------------------------
+ *   10MB       1         5MB
+ *  100MB       1        50MB
+ *    1GB       3       250MB
+ *   10GB      10       0.9GB
+ *  100GB      31         3GB
+ *    1TB     101        10GB
+ *   10TB     320        32GB
+ */
+void setup_per_zone_inactive_ratio(void)
+{
+       struct zone *zone;
+
+       for_each_zone(zone) {
+               unsigned int gb, ratio;
+
+               /* Zone size in gigabytes */
+               gb = zone->present_pages >> (30 - PAGE_SHIFT);
+               ratio = int_sqrt(10 * gb);
+               if (!ratio)
+                       ratio = 1;
+
+               zone->inactive_ratio = ratio;
+       }
+}
+
 /*
  * Initialise min_free_kbytes.
  *
@@ -4300,6 +4340,7 @@ static int __init init_per_zone_pages_min(void)
                min_free_kbytes = 65536;
        setup_per_zone_pages_min();
        setup_per_zone_lowmem_reserve();
+       setup_per_zone_inactive_ratio();
        return 0;
 }
 module_init(init_per_zone_pages_min)
index d10d2f9a33f39610bea63361b5c7340f2848f019..c82ee9a33cfc7ca2e2ef56b6091a3c2fff96c62b 100644 (file)
@@ -1090,6 +1090,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
                __mod_zone_page_state(zone, NR_ACTIVE_ANON, -pgmoved);
        spin_unlock_irq(&zone->lru_lock);
 
+       pgmoved = 0;
        while (!list_empty(&l_hold)) {
                cond_resched();
                page = lru_to_page(&l_hold);
@@ -1097,6 +1098,13 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
                list_add(&page->lru, &l_inactive);
        }
 
+       /*
+        * Count the referenced pages as rotated, even when they are moved
+        * to the inactive list.  This helps balance scan pressure between
+        * file and anonymous pages in get_scan_ratio.
+        */
+       zone->recent_rotated[!!file] += pgmoved;
+
        /*
         * Now put the pages back on the appropriate [file or anon] inactive
         * and active lists.
@@ -1158,7 +1166,6 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
                }
        }
        __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
-       zone->recent_rotated[!!file] += pgmoved;
 
        __count_zone_vm_events(PGREFILL, zone, pgscanned);
        __count_vm_events(PGDEACTIVATE, pgdeactivate);
@@ -1174,7 +1181,13 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
 {
        int file = is_file_lru(lru);
 
-       if (lru == LRU_ACTIVE_ANON || lru == LRU_ACTIVE_FILE) {
+       if (lru == LRU_ACTIVE_FILE) {
+               shrink_active_list(nr_to_scan, zone, sc, priority, file);
+               return 0;
+       }
+
+       if (lru == LRU_ACTIVE_ANON &&
+           (!scan_global_lru(sc) || inactive_anon_is_low(zone))) {
                shrink_active_list(nr_to_scan, zone, sc, priority, file);
                return 0;
        }
@@ -1310,8 +1323,8 @@ static unsigned long shrink_zone(int priority, struct zone *zone,
                }
        }
 
-       while (nr[LRU_ACTIVE_ANON] || nr[LRU_INACTIVE_ANON] ||
-                       nr[LRU_ACTIVE_FILE] || nr[LRU_INACTIVE_FILE]) {
+       while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
+                                       nr[LRU_INACTIVE_FILE]) {
                for_each_lru(l) {
                        if (nr[l]) {
                                nr_to_scan = min(nr[l],
@@ -1324,6 +1337,15 @@ static unsigned long shrink_zone(int priority, struct zone *zone,
                }
        }
 
+       /*
+        * Even if we did not try to evict anon pages at all, we want to
+        * rebalance the anon lru active/inactive ratio.
+        */
+       if (!scan_global_lru(sc) || inactive_anon_is_low(zone))
+               shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
+       else if (!scan_global_lru(sc))
+               shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
+
        throttle_vm_writeout(sc->gfp_mask);
        return nr_reclaimed;
 }
@@ -1617,6 +1639,14 @@ loop_again:
                            priority != DEF_PRIORITY)
                                continue;
 
+                       /*
+                        * Do some background aging of the anon list, to give
+                        * pages a chance to be referenced before reclaiming.
+                        */
+                       if (inactive_anon_is_low(zone))
+                               shrink_active_list(SWAP_CLUSTER_MAX, zone,
+                                                       &sc, priority, 0);
+
                        if (!zone_watermark_ok(zone, order, zone->pages_high,
                                               0, 0)) {
                                end_zone = i;
index 27400b7da7c422de27c595a01d86cf9b068a1750..4380b0dba6d9f7fcd26156f94d3cb8da6729fe56 100644 (file)
@@ -738,10 +738,12 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
        seq_printf(m,
                   "\n  all_unreclaimable: %u"
                   "\n  prev_priority:     %i"
-                  "\n  start_pfn:         %lu",
+                  "\n  start_pfn:         %lu"
+                  "\n  inactive_ratio:    %u",
                           zone_is_all_unreclaimable(zone),
                   zone->prev_priority,
-                  zone->zone_start_pfn);
+                  zone->zone_start_pfn,
+                  zone->inactive_ratio);
        seq_putc(m, '\n');
 }