mm: numa: Add pte updates, hinting and migration stats
author Mel Gorman <mgorman@suse.de>
Fri, 2 Nov 2012 14:52:48 +0000 (14:52 +0000)
committer Mel Gorman <mgorman@suse.de>
Tue, 11 Dec 2012 14:42:48 +0000 (14:42 +0000)
It is tricky to quantify the basic cost of automatic NUMA placement in a
meaningful manner. This patch adds some vmstats that can be used as part
of a basic costing model.

u    = basic unit = sizeof(void *)
Ca   = cost of struct page access = sizeof(struct page) / u
Cpte = Cost PTE access = Ca
Cupdate = Cost PTE update = (2 * Cpte) + (2 * Wlock)
where Cpte is incurred twice for a read and a write and Wlock
is a constant representing the cost of taking or releasing a
lock
Cnumahint = Cost of a minor page fault = some high constant e.g. 1000
Cpagerw = Cost to read or write a full page = Ca + PAGE_SIZE/u
Ci = Cost of page isolation = Ca + Wi
where Wi is a constant that should reflect the approximate cost
of the locking operation
Cpagecopy = Cpagerw + (Cpagerw * Wnuma) + Ci + (Ci * Wnuma)
where Wnuma is the approximate NUMA factor. 1 is local. 1.2
would imply that remote accesses are 20% more expensive

Balancing cost = Cpte * numa_pte_updates +
Cnumahint * numa_hint_faults +
Ci * numa_pages_migrated +
Cpagecopy * numa_pages_migrated

Note that numa_pages_migrated is used as a measure of how many pages
were isolated even though it would miss pages that failed to migrate. A
vmstat counter could have been added for it but the isolation cost is
pretty marginal in comparison to the overall cost so it seemed overkill.

The ideal way to measure automatic placement benefit would be to count
the number of remote accesses versus local accesses and do something like

benefit = (remote_accesses_before - remote_accesses_after) * Wnuma

but the information is not readily available. As a workload converges, the
expectation would be that the number of remote numa hints would reduce to 0.

convergence = numa_hint_faults_local / numa_hint_faults
where this is measured for the last N number of
numa hints recorded. When the workload is fully
converged the value is 1.

This can measure if the placement policy is converging and how fast it is
doing it.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Acked-by: Rik van Riel <riel@redhat.com>
include/linux/vm_event_item.h
include/linux/vmstat.h
mm/huge_memory.c
mm/memory.c
mm/mempolicy.c
mm/migrate.c
mm/vmstat.c

index a1f750b8e72a7591474ccb78b242bf72a6ff77ab..55600049e794970a7d9ac89909eba7f4a2a71b3c 100644 (file)
@@ -38,6 +38,12 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
                KSWAPD_LOW_WMARK_HIT_QUICKLY, KSWAPD_HIGH_WMARK_HIT_QUICKLY,
                KSWAPD_SKIP_CONGESTION_WAIT,
                PAGEOUTRUN, ALLOCSTALL, PGROTATED,
+#ifdef CONFIG_NUMA_BALANCING
+               NUMA_PTE_UPDATES,
+               NUMA_HINT_FAULTS,
+               NUMA_HINT_FAULTS_LOCAL,
+               NUMA_PAGE_MIGRATE,
+#endif
 #ifdef CONFIG_MIGRATION
                PGMIGRATE_SUCCESS, PGMIGRATE_FAIL,
 #endif
index 92a86b2cce33f5bda884b98c9e3ace0412ed2319..a13291f7da887af2c881f54b60e227e9653e5d72 100644 (file)
@@ -80,6 +80,14 @@ static inline void vm_events_fold_cpu(int cpu)
 
 #endif /* CONFIG_VM_EVENT_COUNTERS */
 
+#ifdef CONFIG_NUMA_BALANCING
+#define count_vm_numa_event(x)     count_vm_event(x)
+#define count_vm_numa_events(x, y) count_vm_events(x, y)
+#else
+#define count_vm_numa_event(x) do {} while (0)
+#define count_vm_numa_events(x, y) do {} while (0)
+#endif /* CONFIG_NUMA_BALANCING */
+
 #define __count_zone_vm_events(item, zone, delta) \
                __count_vm_events(item##_NORMAL - ZONE_NORMAL + \
                zone_idx(zone), delta)
index ee8133794a564ae8aaf263631339853752d99512..f3a477fffd09b54fad0eb5b3381a58960d76e9d1 100644 (file)
@@ -1026,6 +1026,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
        struct page *page = NULL;
        unsigned long haddr = addr & HPAGE_PMD_MASK;
        int target_nid;
+       int current_nid = -1;
 
        spin_lock(&mm->page_table_lock);
        if (unlikely(!pmd_same(pmd, *pmdp)))
@@ -1034,6 +1035,10 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
        page = pmd_page(pmd);
        get_page(page);
        spin_unlock(&mm->page_table_lock);
+       current_nid = page_to_nid(page);
+       count_vm_numa_event(NUMA_HINT_FAULTS);
+       if (current_nid == numa_node_id())
+               count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
 
        target_nid = mpol_misplaced(page, vma, haddr);
        if (target_nid == -1)
index 8012c1907895612008d0f58eb85c44a05b6d6bc9..8a7b4ccbe136e4474e4bb7469e4d956e5b6ca3ca 100644 (file)
@@ -3477,6 +3477,7 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
        set_pte_at(mm, addr, ptep, pte);
        update_mmu_cache(vma, addr, ptep);
 
+       count_vm_numa_event(NUMA_HINT_FAULTS);
        page = vm_normal_page(vma, addr, pte);
        if (!page) {
                pte_unmap_unlock(ptep, ptl);
@@ -3485,6 +3486,8 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
        get_page(page);
        current_nid = page_to_nid(page);
+       if (current_nid == numa_node_id())
+               count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
        target_nid = mpol_misplaced(page, vma, addr);
        pte_unmap_unlock(ptep, ptl);
        if (target_nid == -1) {
@@ -3517,6 +3520,9 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
        unsigned long offset;
        spinlock_t *ptl;
        bool numa = false;
+       int local_nid = numa_node_id();
+       unsigned long nr_faults = 0;
+       unsigned long nr_faults_local = 0;
 
        spin_lock(&mm->page_table_lock);
        pmd = *pmdp;
@@ -3565,10 +3571,16 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
                curr_nid = page_to_nid(page);
                task_numa_fault(curr_nid, 1);
 
+               nr_faults++;
+               if (curr_nid == local_nid)
+                       nr_faults_local++;
+
                pte = pte_offset_map_lock(mm, pmdp, addr, &ptl);
        }
        pte_unmap_unlock(orig_pte, ptl);
 
+       count_vm_numa_events(NUMA_HINT_FAULTS, nr_faults);
+       count_vm_numa_events(NUMA_HINT_FAULTS_LOCAL, nr_faults_local);
        return 0;
 }
 #else
index a7a62fe7c2803e395b6db3a47a51fbf30b6212e4..516491fbfaa8d9447b9e8d2ce8d3521c940e8ec6 100644 (file)
@@ -583,6 +583,8 @@ unsigned long change_prot_numa(struct vm_area_struct *vma,
        BUILD_BUG_ON(_PAGE_NUMA != _PAGE_PROTNONE);
 
        nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1);
+       if (nr_updated)
+               count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
 
        return nr_updated;
 }
index c7d550011a640b95d8e46f54a9a9185e6d502b75..23bba5d6edffc6c778a59a1fee7cc13cf19f7dec 100644 (file)
@@ -1514,7 +1514,8 @@ int migrate_misplaced_page(struct page *page, int node)
                if (nr_remaining) {
                        putback_lru_pages(&migratepages);
                        isolated = 0;
-               }
+               } else
+                       count_vm_numa_event(NUMA_PAGE_MIGRATE);
        }
        BUG_ON(!list_empty(&migratepages));
 out:
index 3a067fabe190ba9d3a824cc8746811854c9c58ec..c0f1f6db5182ef148ee23926a89cde243b46bab9 100644 (file)
@@ -774,6 +774,12 @@ const char * const vmstat_text[] = {
 
        "pgrotated",
 
+#ifdef CONFIG_NUMA_BALANCING
+       "numa_pte_updates",
+       "numa_hint_faults",
+       "numa_hint_faults_local",
+       "numa_pages_migrated",
+#endif
 #ifdef CONFIG_MIGRATION
        "pgmigrate_success",
        "pgmigrate_fail",