select ARCH_USE_CMPXCHG_LOCKREF if X86_64
select ARCH_USE_QUEUED_RWLOCKS
select ARCH_USE_QUEUED_SPINLOCKS
+ select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH if SMP
select ARCH_WANTS_DYNAMIC_TASK_STRUCT
select ARCH_WANT_FRAME_POINTERS
select ARCH_WANT_IPC_PARSE_VERSION if X86_32
#endif /* SMP */
+/* Not inlined due to inc_irq_stat not being defined yet */
+#define flush_tlb_local() { \
+ inc_irq_stat(irq_tlb_count); \
+ local_flush_tlb(); \
+}
+
#ifndef CONFIG_PARAVIRT
#define flush_tlb_others(mask, mm, start, end) \
native_flush_tlb_others(mask, mm, start, end)
TTU_IGNORE_MLOCK = (1 << 8), /* ignore mlock */
TTU_IGNORE_ACCESS = (1 << 9), /* don't age */
TTU_IGNORE_HWPOISON = (1 << 10),/* corrupted page is recoverable */
+ TTU_BATCH_FLUSH = (1 << 11), /* Batch TLB flushes where possible
+ * and caller guarantees they will
+ * do a final flush if necessary */
};
#ifdef CONFIG_MMU
perf_nr_task_contexts,
};
+/* Track pages that require TLB flushes */
+struct tlbflush_unmap_batch {
+ /*
+ * Each bit set is a CPU that potentially has a TLB entry for one of
+ * the PFNs being flushed. See set_tlb_ubc_flush_pending().
+ */
+ struct cpumask cpumask;
+
+ /* True if any bit in cpumask is set */
+ bool flush_required;
+};
+
struct task_struct {
volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
void *stack;
unsigned long numa_pages_migrated;
#endif /* CONFIG_NUMA_BALANCING */
+#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+ struct tlbflush_unmap_batch tlb_ubc;
+#endif
+
struct rcu_head rcu;
/*
config ARCH_SUPPORTS_NUMA_BALANCING
bool
+#
+# For architectures that prefer to flush all TLBs after a number of pages
+# are unmapped instead of sending one IPI per page to flush. The architecture
+# must provide guarantees on what happens if a clean TLB cache entry is
+# written after the unmap. Details are in mm/rmap.c near the check for
+# should_defer_flush. The architecture should also consider if the full flush
+# and the refill costs are offset by the savings of sending fewer IPIs.
+config ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+ bool
+
#
# For architectures that know their GCC __int128 support is sound
#
#define ALLOC_CMA 0x80 /* allow allocations from CMA areas */
#define ALLOC_FAIR 0x100 /* fair zone allocation */
+enum ttu_flags;
+struct tlbflush_unmap_batch;
+
+#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+void try_to_unmap_flush(void);
+#else
+static inline void try_to_unmap_flush(void)
+{
+}
+
+#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
#endif /* __MM_INTERNAL_H */
#include <asm/tlbflush.h>
+#include <trace/events/tlb.h>
+
#include "internal.h"
static struct kmem_cache *anon_vma_cachep;
return address;
}
+#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+static void percpu_flush_tlb_batch_pages(void *data)
+{
+ /*
+ * All TLB entries are flushed on the assumption that it is
+ * cheaper to flush all TLBs and let them be refilled than
+ * flushing individual PFNs. Note that we do not track mm's
+ * to flush as that might simply be multiple full TLB flushes
+ * for no gain.
+ */
+ count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
+ flush_tlb_local();
+}
+
+/*
+ * Flush TLB entries for recently unmapped pages from remote CPUs. It is
+ * important if a PTE was dirty when it was unmapped that it's flushed
+ * before any IO is initiated on the page to prevent lost writes. Similarly,
+ * it must be flushed before freeing to prevent data leakage.
+ */
+void try_to_unmap_flush(void)
+{
+ struct tlbflush_unmap_batch *tlb_ubc = ¤t->tlb_ubc;
+ int cpu;
+
+ if (!tlb_ubc->flush_required)
+ return;
+
+ cpu = get_cpu();
+
+ trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, -1UL);
+
+ if (cpumask_test_cpu(cpu, &tlb_ubc->cpumask))
+ percpu_flush_tlb_batch_pages(&tlb_ubc->cpumask);
+
+ if (cpumask_any_but(&tlb_ubc->cpumask, cpu) < nr_cpu_ids) {
+ smp_call_function_many(&tlb_ubc->cpumask,
+ percpu_flush_tlb_batch_pages, (void *)tlb_ubc, true);
+ }
+ cpumask_clear(&tlb_ubc->cpumask);
+ tlb_ubc->flush_required = false;
+ put_cpu();
+}
+
+static void set_tlb_ubc_flush_pending(struct mm_struct *mm,
+ struct page *page)
+{
+ struct tlbflush_unmap_batch *tlb_ubc = ¤t->tlb_ubc;
+
+ cpumask_or(&tlb_ubc->cpumask, &tlb_ubc->cpumask, mm_cpumask(mm));
+ tlb_ubc->flush_required = true;
+}
+
+/*
+ * Returns true if the TLB flush should be deferred to the end of a batch of
+ * unmap operations to reduce IPIs.
+ */
+static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
+{
+ bool should_defer = false;
+
+ if (!(flags & TTU_BATCH_FLUSH))
+ return false;
+
+ /* If remote CPUs need to be flushed then defer batch the flush */
+ if (cpumask_any_but(mm_cpumask(mm), get_cpu()) < nr_cpu_ids)
+ should_defer = true;
+ put_cpu();
+
+ return should_defer;
+}
+#else
+static void set_tlb_ubc_flush_pending(struct mm_struct *mm,
+ struct page *page)
+{
+}
+
+static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
+{
+ return false;
+}
+#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
+
/*
* At what user virtual address is page expected in vma?
* Caller should check the page is actually part of the vma.
/* Nuke the page table entry. */
flush_cache_page(vma, address, page_to_pfn(page));
- pteval = ptep_clear_flush(vma, address, pte);
+ if (should_defer_flush(mm, flags)) {
+ /*
+ * We clear the PTE but do not flush so potentially a remote
+ * CPU could still be writing to the page. If the entry was
+ * previously clean then the architecture must guarantee that
+ * a clear->dirty transition on a cached TLB entry is written
+ * through and traps if the PTE is unmapped.
+ */
+ pteval = ptep_get_and_clear(mm, address, pte);
+
+ /* Potentially writable TLBs must be flushed before IO */
+ if (pte_dirty(pteval))
+ flush_tlb_page(vma, address);
+ else
+ set_tlb_ubc_flush_pending(mm, page);
+ } else {
+ pteval = ptep_clear_flush(vma, address, pte);
+ }
/* Move the dirty bit to the physical page now the pte is gone. */
if (pte_dirty(pteval))
* processes. Try to unmap it here.
*/
if (page_mapped(page) && mapping) {
- switch (try_to_unmap(page, ttu_flags)) {
+ switch (try_to_unmap(page,
+ ttu_flags|TTU_BATCH_FLUSH)) {
case SWAP_FAIL:
goto activate_locked;
case SWAP_AGAIN:
}
mem_cgroup_uncharge_list(&free_pages);
+ try_to_unmap_flush();
free_hot_cold_page_list(&free_pages, true);
list_splice(&ret_pages, page_list);
}
}
+#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+static void init_tlb_ubc(void)
+{
+ /*
+ * This deliberately does not clear the cpumask as it's expensive
+ * and unnecessary. If there happens to be data in there then the
+ * first SWAP_CLUSTER_MAX pages will send an unnecessary IPI and
+ * then will be cleared.
+ */
+ current->tlb_ubc.flush_required = false;
+}
+#else
+static inline void init_tlb_ubc(void)
+{
+}
+#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
+
/*
* This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
*/
scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() &&
sc->priority == DEF_PRIORITY);
+ init_tlb_ubc();
+
blk_start_plug(&plug);
while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
nr[LRU_INACTIVE_FILE]) {