mm: send one IPI per CPU to TLB flush all entries after unmapping pages

author Mel Gorman <mgorman@suse.de>

Fri, 4 Sep 2015 22:47:32 +0000 (15:47 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Fri, 4 Sep 2015 23:54:41 +0000 (16:54 -0700)
author Mel Gorman <mgorman@suse.de>
Fri, 4 Sep 2015 22:47:32 +0000 (15:47 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 4 Sep 2015 23:54:41 +0000 (16:54 -0700)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig

index 48f7433dac6f5ac4e76b6a8841b3db0d45c5f19d..117e2f373e50d40cee118dcab42a0e86a617b2ee 100644 (file)
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -41,6 +41,7 @@ config X86
         select ARCH_USE_CMPXCHG_LOCKREF         if X86_64
         select ARCH_USE_QUEUED_RWLOCKS
         select ARCH_USE_QUEUED_SPINLOCKS
+       select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH if SMP
         select ARCH_WANTS_DYNAMIC_TASK_STRUCT
         select ARCH_WANT_FRAME_POINTERS
         select ARCH_WANT_IPC_PARSE_VERSION      if X86_32
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h

index cd791948b286a13a7c5cf35e71662cb8066d697a..6df2029405a3ae55df8b9718dd320b55dde5c1ad 100644 (file)
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -261,6 +261,12 @@ static inline void reset_lazy_tlbstate(void)
  
  #endif /* SMP */
  
+/* Not inlined due to inc_irq_stat not being defined yet */
+#define flush_tlb_local() {            \
+       inc_irq_stat(irq_tlb_count);    \
+       local_flush_tlb();              \
+}
+
  #ifndef CONFIG_PARAVIRT
  #define flush_tlb_others(mask, mm, start, end) \
         native_flush_tlb_others(mask, mm, start, end)
diff --git a/include/linux/rmap.h b/include/linux/rmap.h

index c89c53a113a8d59c6a427139d220b9d75d6de7d7..29446aeef36e553aa361774d39c0852517c87405 100644 (file)
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -89,6 +89,9 @@ enum ttu_flags {
         TTU_IGNORE_MLOCK = (1 << 8),    /* ignore mlock */
         TTU_IGNORE_ACCESS = (1 << 9),   /* don't age */
         TTU_IGNORE_HWPOISON = (1 << 10),/* corrupted page is recoverable */
+       TTU_BATCH_FLUSH = (1 << 11),    /* Batch TLB flushes where possible
+                                        * and caller guarantees they will
+                                        * do a final flush if necessary */
  };
  
  #ifdef CONFIG_MMU
diff --git a/include/linux/sched.h b/include/linux/sched.h

index 119823decc4631eb26842df9fd7a9a1e63709577..3c602c20c717cde86519bfe52376d69c6558ef22 100644 (file)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1344,6 +1344,18 @@ enum perf_event_task_context {
         perf_nr_task_contexts,
  };
  
+/* Track pages that require TLB flushes */
+struct tlbflush_unmap_batch {
+       /*
+        * Each bit set is a CPU that potentially has a TLB entry for one of
+        * the PFNs being flushed. See set_tlb_ubc_flush_pending().
+        */
+       struct cpumask cpumask;
+
+       /* True if any bit in cpumask is set */
+       bool flush_required;
+};
+
  struct task_struct {
         volatile long state;    /* -1 unrunnable, 0 runnable, >0 stopped */
         void *stack;
@@ -1700,6 +1712,10 @@ struct task_struct {
         unsigned long numa_pages_migrated;
  #endif /* CONFIG_NUMA_BALANCING */
  
+#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+       struct tlbflush_unmap_batch tlb_ubc;
+#endif
+
         struct rcu_head rcu;
  
         /*
diff --git a/init/Kconfig b/init/Kconfig

index 161acd8bc56fc5cc5bcf0ecd830e9ffcfa574873..cf7e4824c8d0d41c00bcb0444f74a8fdb404af4d 100644 (file)
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -882,6 +882,16 @@ config GENERIC_SCHED_CLOCK
  config ARCH_SUPPORTS_NUMA_BALANCING
         bool
  
+#
+# For architectures that prefer to flush all TLBs after a number of pages
+# are unmapped instead of sending one IPI per page to flush. The architecture
+# must provide guarantees on what happens if a clean TLB cache entry is
+# written after the unmap. Details are in mm/rmap.c near the check for
+# should_defer_flush. The architecture should also consider if the full flush
+# and the refill costs are offset by the savings of sending fewer IPIs.
+config ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+       bool
+
  #
  # For architectures that know their GCC __int128 support is sound
  #
diff --git a/mm/internal.h b/mm/internal.h

index 36b23f1e2ca62612e6e1d1b2b9d74c3cd7e87db7..bd6372ac5f7f77af416f1884c39926de1ad8ab46 100644 (file)
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -426,4 +426,15 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
  #define ALLOC_CMA              0x80 /* allow allocations from CMA areas */
  #define ALLOC_FAIR             0x100 /* fair zone allocation */
  
+enum ttu_flags;
+struct tlbflush_unmap_batch;
+
+#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+void try_to_unmap_flush(void);
+#else
+static inline void try_to_unmap_flush(void)
+{
+}
+
+#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
  #endif /* __MM_INTERNAL_H */
diff --git a/mm/rmap.c b/mm/rmap.c

index 171b68768df1478355bcddd5e30c2edd616ba05b..326d5d89e45cdadeb93131c2841208e4c135e052 100644 (file)
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -62,6 +62,8 @@
  
  #include <asm/tlbflush.h>
  
+#include <trace/events/tlb.h>
+
  #include "internal.h"
  
  static struct kmem_cache *anon_vma_cachep;
@@ -583,6 +585,89 @@ vma_address(struct page *page, struct vm_area_struct *vma)
         return address;
  }
  
+#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+static void percpu_flush_tlb_batch_pages(void *data)
+{
+       /*
+        * All TLB entries are flushed on the assumption that it is
+        * cheaper to flush all TLBs and let them be refilled than
+        * flushing individual PFNs. Note that we do not track mm's
+        * to flush as that might simply be multiple full TLB flushes
+        * for no gain.
+        */
+       count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
+       flush_tlb_local();
+}
+
+/*
+ * Flush TLB entries for recently unmapped pages from remote CPUs. If a PTE
+ * was dirty when it was unmapped, its TLB entry must be flushed before any
+ * IO is initiated on the page to prevent lost writes. Similarly, it must be
+ * flushed before freeing to prevent data leakage.
+ */
+void try_to_unmap_flush(void)
+{
+       struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
+       int cpu;
+
+       if (!tlb_ubc->flush_required)
+               return;
+
+       cpu = get_cpu();
+
+       trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, -1UL);
+
+       if (cpumask_test_cpu(cpu, &tlb_ubc->cpumask))
+               percpu_flush_tlb_batch_pages(&tlb_ubc->cpumask);
+
+       if (cpumask_any_but(&tlb_ubc->cpumask, cpu) < nr_cpu_ids) {
+               smp_call_function_many(&tlb_ubc->cpumask,
+                       percpu_flush_tlb_batch_pages, (void *)tlb_ubc, true);
+       }
+       cpumask_clear(&tlb_ubc->cpumask);
+       tlb_ubc->flush_required = false;
+       put_cpu();
+}
+
+static void set_tlb_ubc_flush_pending(struct mm_struct *mm,
+               struct page *page)
+{
+       struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
+
+       cpumask_or(&tlb_ubc->cpumask, &tlb_ubc->cpumask, mm_cpumask(mm));
+       tlb_ubc->flush_required = true;
+}
+
+/*
+ * Returns true if the TLB flush should be deferred to the end of a batch of
+ * unmap operations to reduce IPIs.
+ */
+static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
+{
+       bool should_defer = false;
+
+       if (!(flags & TTU_BATCH_FLUSH))
+               return false;
+
+       /* If remote CPUs need to be flushed then defer and batch the flush */
+       if (cpumask_any_but(mm_cpumask(mm), get_cpu()) < nr_cpu_ids)
+               should_defer = true;
+       put_cpu();
+
+       return should_defer;
+}
+#else
+static void set_tlb_ubc_flush_pending(struct mm_struct *mm,
+               struct page *page)
+{
+}
+
+static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
+{
+       return false;
+}
+#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
+
  /*
   * At what user virtual address is page expected in vma?
   * Caller should check the page is actually part of the vma.
@@ -1220,7 +1305,24 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
  
         /* Nuke the page table entry. */
         flush_cache_page(vma, address, page_to_pfn(page));
-       pteval = ptep_clear_flush(vma, address, pte);
+       if (should_defer_flush(mm, flags)) {
+               /*
+                * We clear the PTE but do not flush so potentially a remote
+                * CPU could still be writing to the page. If the entry was
+                * previously clean then the architecture must guarantee that
+                * a clean->dirty transition on a cached TLB entry is written
+                * through and traps if the PTE is unmapped.
+                */
+               pteval = ptep_get_and_clear(mm, address, pte);
+
+               /* Potentially writable TLBs must be flushed before IO */
+               if (pte_dirty(pteval))
+                       flush_tlb_page(vma, address);
+               else
+                       set_tlb_ubc_flush_pending(mm, page);
+       } else {
+               pteval = ptep_clear_flush(vma, address, pte);
+       }
  
         /* Move the dirty bit to the physical page now the pte is gone. */
         if (pte_dirty(pteval))
diff --git a/mm/vmscan.c b/mm/vmscan.c

index 8286938c70ded6b82d4268174c92669a90eeb674..99ec00d6a5dd6ed910c5935b957f301e58266830 100644 (file)
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1057,7 +1057,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                  * processes. Try to unmap it here.
                  */
                 if (page_mapped(page) && mapping) {
-                       switch (try_to_unmap(page, ttu_flags)) {
+                       switch (try_to_unmap(page,
+                                       ttu_flags|TTU_BATCH_FLUSH)) {
                         case SWAP_FAIL:
                                 goto activate_locked;
                         case SWAP_AGAIN:
@@ -1208,6 +1209,7 @@ keep:
         }
  
         mem_cgroup_uncharge_list(&free_pages);
+       try_to_unmap_flush();
         free_hot_cold_page_list(&free_pages, true);
  
         list_splice(&ret_pages, page_list);
@@ -2151,6 +2153,23 @@ out:
         }
  }
  
+#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+static void init_tlb_ubc(void)
+{
+       /*
+        * This deliberately does not clear the cpumask as it's expensive
+        * and unnecessary. If there happens to be data in there then the
+        * first SWAP_CLUSTER_MAX pages will send an unnecessary IPI, after
+        * which the cpumask is cleared.
+        */
+       current->tlb_ubc.flush_required = false;
+}
+#else
+static inline void init_tlb_ubc(void)
+{
+}
+#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
+
  /*
   * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
   */
@@ -2185,6 +2204,8 @@ static void shrink_lruvec(struct lruvec *lruvec, int swappiness,
         scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() &&
                          sc->priority == DEF_PRIORITY);
  
+       init_tlb_ubc();
+
         blk_start_plug(&plug);
         while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
                                         nr[LRU_INACTIVE_FILE]) {
author	Mel Gorman <mgorman@suse.de>
	Fri, 4 Sep 2015 22:47:32 +0000 (15:47 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Fri, 4 Sep 2015 23:54:41 +0000 (16:54 -0700)
arch/x86/Kconfig		patch \| blob \| blame \| history
arch/x86/include/asm/tlbflush.h		patch \| blob \| blame \| history
include/linux/rmap.h		patch \| blob \| blame \| history
include/linux/sched.h		patch \| blob \| blame \| history
init/Kconfig		patch \| blob \| blame \| history
mm/internal.h		patch \| blob \| blame \| history
mm/rmap.c		patch \| blob \| blame \| history
mm/vmscan.c		patch \| blob \| blame \| history