UPSTREAM: mm: workingset: tell cache transitions from workingset thrashing
author Johannes Weiner <hannes@cmpxchg.org>
Fri, 26 Oct 2018 22:06:04 +0000 (15:06 -0700)
committer wangwang <wangwang1@lenovo.com>
Wed, 13 Nov 2019 02:34:35 +0000 (10:34 +0800)
Refaults happen during transitions between workingsets as well as in-place
thrashing.  Knowing the difference between the two has a range of
applications, including measuring the impact of memory shortage on the
system performance, as well as the ability to balance pressure more
smartly between the filesystem cache and the swap-backed workingset.

During workingset transitions, inactive cache refaults and pushes out
established active cache.  When that active cache isn't stale, however,
and also ends up refaulting, that's bona fide thrashing.

Introduce a new page flag that tells on eviction whether the page has been
active or not in its lifetime.  This bit is then stored in the shadow
entry, to classify refaults as transitioning or thrashing.

How many page->flags does this leave us with on 32-bit?

20 bits are always page flags

21 if you have an MMU

23 with the zone bits for DMA, Normal, HighMem, Movable

29 with the sparsemem section bits

30 if PAE is enabled

31 with this patch.

So on 32-bit PAE, that leaves 1 bit for distinguishing two NUMA nodes.  If
that's not enough, the system can switch to discontigmem and re-gain the 6
or 7 sparsemem section bits.

Link: http://lkml.kernel.org/r/20180828172258.3185-3-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Daniel Drake <drake@endlessm.com>
Tested-by: Suren Baghdasaryan <surenb@google.com>
Cc: Christopher Lameter <cl@linux.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Johannes Weiner <jweiner@fb.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Enderborg <peter.enderborg@sony.com>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vinayak Menon <vinmenon@codeaurora.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
(cherry picked from commit 8508cf3ffad4defa202b303e5b6379efc4cd9054)

Bug: 127712811
Test: lmkd in PSI mode
Change-Id: I71df060dce5590a3c654f9a0e8e54deeb74b64c2
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
include/linux/mmzone.h
include/linux/page-flags.h
include/linux/swap.h
include/trace/events/mmflags.h
mm/filemap.c
mm/huge_memory.c
mm/migrate.c
mm/swap_state.c
mm/vmscan.c
mm/vmstat.c
mm/workingset.c

index f679f526846756b19ebef509a2d58676d8c8f97b..71b7a8bc82ea4be7498c7d6b1d863fe531bfd1c7 100644 (file)
@@ -163,6 +163,7 @@ enum node_stat_item {
        NR_ISOLATED_FILE,       /* Temporary isolated pages from file lru */
        WORKINGSET_REFAULT,
        WORKINGSET_ACTIVATE,
+       WORKINGSET_RESTORE,
        WORKINGSET_NODERECLAIM,
        NR_ANON_MAPPED, /* Mapped anonymous pages */
        NR_FILE_MAPPED, /* pagecache pages mapped into pagetables.
index 584b14c774c181e85eac69be4f5a3ad037cb75d4..6900ad07554bd9223faf4ac0c98ae1c7eaf6dad3 100644 (file)
  */
 enum pageflags {
        PG_locked,              /* Page is locked. Don't touch. */
-       PG_error,
        PG_referenced,
        PG_uptodate,
        PG_dirty,
        PG_lru,
        PG_active,
+       PG_workingset,
        PG_waiters,             /* Page has waiters, check its waitqueue. Must be bit #7 and in the same byte as "PG_locked" */
+       PG_error,
        PG_slab,
        PG_owner_priv_1,        /* Owner use. If pagecache, fs may use*/
        PG_arch_1,
@@ -273,6 +274,8 @@ PAGEFLAG(Dirty, dirty, PF_HEAD) TESTSCFLAG(Dirty, dirty, PF_HEAD)
 PAGEFLAG(LRU, lru, PF_HEAD) __CLEARPAGEFLAG(LRU, lru, PF_HEAD)
 PAGEFLAG(Active, active, PF_HEAD) __CLEARPAGEFLAG(Active, active, PF_HEAD)
        TESTCLEARFLAG(Active, active, PF_HEAD)
+PAGEFLAG(Workingset, workingset, PF_HEAD)
+       TESTCLEARFLAG(Workingset, workingset, PF_HEAD)
 __PAGEFLAG(Slab, slab, PF_NO_TAIL)
 __PAGEFLAG(SlobFree, slob_free, PF_NO_TAIL)
 PAGEFLAG(Checked, checked, PF_NO_COMPOUND)        /* Used by some filesystems */
index e643866912b7025164231e224c89cf39d55b0b62..de8a1fb219d1072b530fdda7b29b53fdc82c5df4 100644 (file)
@@ -304,7 +304,7 @@ struct vma_swap_readahead {
 
 /* linux/mm/workingset.c */
 void *workingset_eviction(struct address_space *mapping, struct page *page);
-bool workingset_refault(void *shadow);
+void workingset_refault(struct page *page, void *shadow);
 void workingset_activation(struct page *page);
 void workingset_update_node(struct radix_tree_node *node, void *private);
 
index 72162f3a03fac22e15f670e872ced6f9008595a3..40b9cc3bfaf983fbd4e8421db727cfc2b5d86490 100644 (file)
@@ -89,6 +89,7 @@
        {1UL << PG_dirty,               "dirty"         },              \
        {1UL << PG_lru,                 "lru"           },              \
        {1UL << PG_active,              "active"        },              \
+       {1UL << PG_workingset,          "workingset"    },              \
        {1UL << PG_slab,                "slab"          },              \
        {1UL << PG_owner_priv_1,        "owner_priv_1"  },              \
        {1UL << PG_arch_1,              "arch_1"        },              \
index 51aef4e992eafe93c01f69a948b2b38d79db4975..c931dea6bdb5fa3d6255fe544fbdb40f6db93051 100644 (file)
@@ -817,12 +817,9 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
                 * data from the working set, only to cache data that will
                 * get overwritten with something else, is a waste of memory.
                 */
-               if (!(gfp_mask & __GFP_WRITE) &&
-                   shadow && workingset_refault(shadow)) {
-                       SetPageActive(page);
-                       workingset_activation(page);
-               } else
-                       ClearPageActive(page);
+               WARN_ON_ONCE(PageActive(page));
+               if (!(gfp_mask & __GFP_WRITE) && shadow)
+                       workingset_refault(page, shadow);
                lru_cache_add(page);
        }
        return ret;
index 930f2aa3bb4d39151eab9018fbb927f3e7f55ef9..4ef967ad24ec78263559a733f991374134b1a603 100644 (file)
@@ -2327,6 +2327,7 @@ static void __split_huge_page_tail(struct page *head, int tail,
                         (1L << PG_mlocked) |
                         (1L << PG_uptodate) |
                         (1L << PG_active) |
+                        (1L << PG_workingset) |
                         (1L << PG_locked) |
                         (1L << PG_unevictable) |
                         (1L << PG_dirty)));
index ee6949910a8ed26a265cd9b975cd1d7b417be445..5d295c0c3bef06f4fce676e84aaa6cbb3f1cab93 100644 (file)
@@ -671,6 +671,8 @@ void migrate_page_states(struct page *newpage, struct page *page)
                SetPageActive(newpage);
        } else if (TestClearPageUnevictable(page))
                SetPageUnevictable(newpage);
+       if (PageWorkingset(page))
+               SetPageWorkingset(newpage);
        if (PageChecked(page))
                SetPageChecked(newpage);
        if (PageMappedToDisk(page))
index 326439428daffd6c1da5e5cd5b50652ead47568d..3931379fac4d9508e636a891a565572008614099 100644 (file)
@@ -435,6 +435,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
                        /*
                         * Initiate read into locked page and return.
                         */
+                       SetPageWorkingset(new_page);
                        lru_cache_add_anon(new_page);
                        *new_page_allocated = true;
                        return new_page;
index f9df0265e8596234c072ea02868b76f5c36375ff..c0f1da5cab29a2b9fcb8770f5b0c99fcc409ff7d 100644 (file)
@@ -2056,6 +2056,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
                }
 
                ClearPageActive(page);  /* we are de-activating */
+               SetPageWorkingset(page);
                list_add(&page->lru, &l_inactive);
        }
 
index 6389e876c7a74624902f11859480825ee1fc508f..efab0fca0bb7061688eb171c0a35f449bd9a64ec 100644 (file)
@@ -1074,6 +1074,7 @@ const char * const vmstat_text[] = {
        "nr_isolated_file",
        "workingset_refault",
        "workingset_activate",
+       "workingset_restore",
        "workingset_nodereclaim",
        "nr_anon_pages",
        "nr_mapped",
index b997c9de28f6c4074f1ebc1e2f480bbae8ad1df9..808a69a4a9ef4f1d747e5652d6faec17337d9797 100644 (file)
  * the only thing eating into inactive list space is active pages.
  *
  *
- *             Activating refaulting pages
+ *             Refaulting inactive pages
  *
  * All that is known about the active list is that the pages have been
  * accessed more than once in the past.  This means that at any given
  * used less frequently than the refaulting page - or even not used at
  * all anymore.
  *
+ * That means if inactive cache is refaulting with a suitable refault
+ * distance, we assume the cache workingset is transitioning and put
+ * pressure on the current active list.
+ *
  * If this is wrong and demotion kicks in, the pages which are truly
  * used more frequently will be reactivated while the less frequently
  * used once will be evicted from memory.
  * But if this is right, the stale pages will be pushed out of memory
  * and the used pages get to stay in cache.
  *
+ *             Refaulting active pages
+ *
+ * If on the other hand the refaulting pages have recently been
+ * deactivated, it means that the active list is no longer protecting
+ * actively used cache from reclaim. The cache is NOT transitioning to
+ * a different workingset; the existing workingset is thrashing in the
+ * space allocated to the page cache.
+ *
  *
  *             Implementation
  *
  */
 
 #define EVICTION_SHIFT (RADIX_TREE_EXCEPTIONAL_ENTRY + \
-                        NODES_SHIFT +  \
-                        MEM_CGROUP_ID_SHIFT)
+                        1 + NODES_SHIFT + MEM_CGROUP_ID_SHIFT)
 #define EVICTION_MASK  (~0UL >> EVICTION_SHIFT)
 
 /*
  */
 static unsigned int bucket_order __read_mostly;
 
-static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction)
+static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction,
+                        bool workingset)
 {
        eviction >>= bucket_order;
        eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
        eviction = (eviction << NODES_SHIFT) | pgdat->node_id;
+       eviction = (eviction << 1) | workingset;
        eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT);
 
        return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY);
 }
 
 static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
-                         unsigned long *evictionp)
+                         unsigned long *evictionp, bool *workingsetp)
 {
        unsigned long entry = (unsigned long)shadow;
        int memcgid, nid;
+       bool workingset;
 
        entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT;
+       workingset = entry & 1;
+       entry >>= 1;
        nid = entry & ((1UL << NODES_SHIFT) - 1);
        entry >>= NODES_SHIFT;
        memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1);
@@ -195,6 +211,7 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
        *memcgidp = memcgid;
        *pgdat = NODE_DATA(nid);
        *evictionp = entry << bucket_order;
+       *workingsetp = workingset;
 }
 
 /**
@@ -207,8 +224,8 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
  */
 void *workingset_eviction(struct address_space *mapping, struct page *page)
 {
-       struct mem_cgroup *memcg = page_memcg(page);
        struct pglist_data *pgdat = page_pgdat(page);
+       struct mem_cgroup *memcg = page_memcg(page);
        int memcgid = mem_cgroup_id(memcg);
        unsigned long eviction;
        struct lruvec *lruvec;
@@ -220,30 +237,30 @@ void *workingset_eviction(struct address_space *mapping, struct page *page)
 
        lruvec = mem_cgroup_lruvec(pgdat, memcg);
        eviction = atomic_long_inc_return(&lruvec->inactive_age);
-       return pack_shadow(memcgid, pgdat, eviction);
+       return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page));
 }
 
 /**
  * workingset_refault - evaluate the refault of a previously evicted page
+ * @page: the freshly allocated replacement page
  * @shadow: shadow entry of the evicted page
  *
  * Calculates and evaluates the refault distance of the previously
  * evicted page in the context of the node it was allocated in.
- *
- * Returns %true if the page should be activated, %false otherwise.
  */
-bool workingset_refault(void *shadow)
+void workingset_refault(struct page *page, void *shadow)
 {
        unsigned long refault_distance;
+       struct pglist_data *pgdat;
        unsigned long active_file;
        struct mem_cgroup *memcg;
        unsigned long eviction;
        struct lruvec *lruvec;
        unsigned long refault;
-       struct pglist_data *pgdat;
+       bool workingset;
        int memcgid;
 
-       unpack_shadow(shadow, &memcgid, &pgdat, &eviction);
+       unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset);
 
        rcu_read_lock();
        /*
@@ -263,41 +280,51 @@ bool workingset_refault(void *shadow)
         * configurations instead.
         */
        memcg = mem_cgroup_from_id(memcgid);
-       if (!mem_cgroup_disabled() && !memcg) {
-               rcu_read_unlock();
-               return false;
-       }
+       if (!mem_cgroup_disabled() && !memcg)
+               goto out;
        lruvec = mem_cgroup_lruvec(pgdat, memcg);
        refault = atomic_long_read(&lruvec->inactive_age);
        active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES);
 
        /*
-        * The unsigned subtraction here gives an accurate distance
-        * across inactive_age overflows in most cases.
+        * Calculate the refault distance
         *
-        * There is a special case: usually, shadow entries have a
-        * short lifetime and are either refaulted or reclaimed along
-        * with the inode before they get too old.  But it is not
-        * impossible for the inactive_age to lap a shadow entry in
-        * the field, which can then can result in a false small
-        * refault distance, leading to a false activation should this
-        * old entry actually refault again.  However, earlier kernels
-        * used to deactivate unconditionally with *every* reclaim
-        * invocation for the longest time, so the occasional
-        * inappropriate activation leading to pressure on the active
-        * list is not a problem.
+        * The unsigned subtraction here gives an accurate distance
+        * across inactive_age overflows in most cases. There is a
+        * special case: usually, shadow entries have a short lifetime
+        * and are either refaulted or reclaimed along with the inode
+        * before they get too old.  But it is not impossible for the
+        * inactive_age to lap a shadow entry in the field, which can
+        * then result in a false small refault distance, leading to a
+        * false activation should this old entry actually refault
+        * again.  However, earlier kernels used to deactivate
+        * unconditionally with *every* reclaim invocation for the
+        * longest time, so the occasional inappropriate activation
+        * leading to pressure on the active list is not a problem.
         */
        refault_distance = (refault - eviction) & EVICTION_MASK;
 
        inc_lruvec_state(lruvec, WORKINGSET_REFAULT);
 
-       if (refault_distance <= active_file) {
-               inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE);
-               rcu_read_unlock();
-               return true;
+       /*
+        * Compare the distance to the existing workingset size. We
+        * don't act on pages that couldn't stay resident even if all
+        * the memory was available to the page cache.
+        */
+       if (refault_distance > active_file)
+               goto out;
+
+       SetPageActive(page);
+       atomic_long_inc(&lruvec->inactive_age);
+       inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE);
+
+       /* Page was active prior to eviction */
+       if (workingset) {
+               SetPageWorkingset(page);
+               inc_lruvec_state(lruvec, WORKINGSET_RESTORE);
        }
+out:
        rcu_read_unlock();
-       return false;
 }
 
 /**