#include <linux/memblock.h>
#include <linux/compiler.h>
#include <linux/kernel.h>
-#include <linux/kmemcheck.h>
#include <linux/kasan.h>
#include <linux/module.h>
#include <linux/suspend.h>
#include <linux/ftrace.h>
#include <linux/lockdep.h>
#include <linux/nmi.h>
+#include <linux/psi.h>
#include <asm/sections.h>
#include <asm/tlbflush.h>
int percpu_pagelist_fraction;
gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
-/*
- * A cached value of the page's pageblock's migratetype, used when the page is
- * put on a pcplist. Used to avoid the pageblock migratetype lookup when
- * freeing from pcplists in most cases, at the cost of possibly becoming stale.
- * Also the migratetype set in the page does not necessarily match the pcplist
- * index, e.g. page might have MIGRATE_CMA set but be on a pcplist with any
- * other index - this ensures that it will be put on the correct CMA freelist.
- */
-static inline int get_pcppage_migratetype(struct page *page)
-{
- return page->index;
-}
-
-static inline void set_pcppage_migratetype(struct page *page, int migratetype)
-{
- page->index = migratetype;
-}
-
#ifdef CONFIG_PM_SLEEP
/*
* The following functions are used by the suspend/hibernate code to temporarily
#endif
};
+/*
+ * Try to keep at least this much lowmem free. Do not allow normal
+ * allocations below this point, only high priority ones. Automatically
+ * tuned according to the amount of memory in the system.
+ */
int min_free_kbytes = 1024;
int user_min_free_kbytes = -1;
int watermark_scale_factor = 10;
+/*
+ * Extra memory for the system to try freeing. Used to temporarily
+ * free memory, to make space for new workloads. Anyone can allocate
+ * down to the min watermarks controlled by min_free_kbytes above.
+ */
+int extra_free_kbytes = 0;
+
static unsigned long __meminitdata nr_kernel_pages;
static unsigned long __meminitdata nr_all_pages;
static unsigned long __meminitdata dma_reserve;
VM_BUG_ON_PAGE(PageTail(page), page);
trace_mm_page_free(page, order);
- kmemcheck_free_shadow(page, order);
/*
* Check tail pages before head page information is cleared to
arch_alloc_page(page, order);
kernel_map_pages(page, 1 << order, 1);
- kernel_poison_pages(page, 1 << order, 1);
kasan_alloc_pages(page, order);
+ kernel_poison_pages(page, 1 << order, 1);
set_page_owner(page, order, gfp_flags);
}
/*
* Do the hard work of removing an element from the buddy allocator.
* Call me with the zone->lock already held.
+ * If gfp mask of the page allocation has GFP_HIGHUSER_MOVABLE, @migratetype
+ * is changed from MIGRATE_MOVABLE to MIGRATE_CMA in rmqueue() to select the
+ * free list of MIGRATE_CMA. It helps depleting CMA free pages so that
+ * evaluation of watermark for unmovable page allocations is not too different
+ * from movable page allocations.
+ * If @migratetype is MIGRATE_CMA, it should be corrected to MIGRATE_MOVABLE
+ * after the free list of MIGRATE_CMA is searched to have a chance to search the
+ * free list of MIGRATE_MOVABLE. It also records correct migrate type in the
+ * trace as intended by the page allocation.
*/
static struct page *__rmqueue(struct zone *zone, unsigned int order,
int migratetype)
{
- struct page *page;
+ struct page *page = NULL;
-retry:
- page = __rmqueue_smallest(zone, order, migratetype);
- if (unlikely(!page)) {
- if (migratetype == MIGRATE_MOVABLE)
- page = __rmqueue_cma_fallback(zone, order);
+#ifdef CONFIG_CMA
+ if (migratetype == MIGRATE_CMA) {
+#else
+ if (migratetype == MIGRATE_MOVABLE) {
+#endif
+ page = __rmqueue_cma_fallback(zone, order);
+ migratetype = MIGRATE_MOVABLE;
+ }
- if (!page && __rmqueue_fallback(zone, order, migratetype))
- goto retry;
+ while (!page) {
+ page = __rmqueue_smallest(zone, order, migratetype);
+ if (unlikely(!page) &&
+ !__rmqueue_fallback(zone, order, migratetype))
+ break;
}
trace_mm_page_alloc_zone_locked(page, order, migratetype);
if (WARN_ON_ONCE(!mm_percpu_wq))
return;
- /* Workqueues cannot recurse */
- if (current->flags & PF_WQ_WORKER)
- return;
-
/*
* Do not drain if one is already in progress unless it's specific to
* a zone. Such callers are primarily CMA and memory hotplug and need
VM_BUG_ON_PAGE(PageCompound(page), page);
VM_BUG_ON_PAGE(!page_count(page), page);
-#ifdef CONFIG_KMEMCHECK
- /*
- * Split shadow pages too, because free(page[0]) would
- * otherwise free the whole shadow.
- */
- if (kmemcheck_page_is_tracked(page))
- split_page(virt_to_page(page[0].shadow), order);
-#endif
-
for (i = 1; i < (1 << order); i++)
set_page_refcounted(page + i);
split_page_owner(page, order);
* exists.
*/
watermark = min_wmark_pages(zone) + (1UL << order);
- if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
+ if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
return 0;
__mod_zone_freepage_state(zone, -(1UL << order), mt);
else
page = list_first_entry(list, struct page, lru);
+ /*
+ * If the head or the tail page in the pcp list is CMA page and
+ * the gfp flags is not GFP_HIGHUSER_MOVABLE, do not allocate a
+ * page from the pcp list. The free list of MIGRATE_CMA is a
+ * special case of the free list of MIGRATE_MOVABLE and the
+ * pages from the free list of MIGRATE_CMA are pushed to the pcp
+ * list of MIGRATE_MOVABLE. Since the pcp list of
+ * MIGRATE_MOVABLE is selected if the gfp flags has GFP_MOVABLE,
+ * we should avoid the case that a cma page in the pcp list of
+ * MIGRATE_MOVABLE is allocated to a movable allocation without
+ * GFP_HIGHUSER_MOVABLE.
+ * If this is the case, allocate a movable page from the free
+ * list of MIGRATE_MOVABLE instead of pcp list of
+ * MIGRATE_MOVABLE.
+ */
+#ifdef CONFIG_CMA
+ if (is_migrate_cma_page(page) && (migratetype != MIGRATE_CMA))
+ return NULL;
+#endif
list_del(&page->lru);
pcp->count--;
} while (check_new_pcp(page));
/* Lock and remove page from the per-cpu list */
static struct page *rmqueue_pcplist(struct zone *preferred_zone,
struct zone *zone, unsigned int order,
- gfp_t gfp_flags, int migratetype)
+ gfp_t gfp_flags, int migratetype,
+ int migratetype_rmqueue)
{
struct per_cpu_pages *pcp;
struct list_head *list;
local_irq_save(flags);
pcp = &this_cpu_ptr(zone->pageset)->pcp;
list = &pcp->lists[migratetype];
- page = __rmqueue_pcplist(zone, migratetype, cold, pcp, list);
+ page = __rmqueue_pcplist(zone, migratetype_rmqueue, cold, pcp, list);
if (page) {
__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
zone_statistics(preferred_zone, zone);
{
unsigned long flags;
struct page *page;
+ int migratetype_rmqueue = migratetype;
+#ifdef CONFIG_CMA
+ if ((migratetype_rmqueue == MIGRATE_MOVABLE) &&
+ ((gfp_flags & GFP_HIGHUSER_MOVABLE) == GFP_HIGHUSER_MOVABLE))
+ migratetype_rmqueue = MIGRATE_CMA;
+#endif
if (likely(order == 0)) {
page = rmqueue_pcplist(preferred_zone, zone, order,
- gfp_flags, migratetype);
- goto out;
+ gfp_flags, migratetype, migratetype_rmqueue);
+ /*
+ * Allocation with GFP_MOVABLE and !GFP_HIGHMEM will have
+ * another chance of page allocation from the free list.
+ * See the comment in __rmqueue_pcplist().
+ */
+#ifdef CONFIG_CMA
+ if (likely(page) || (migratetype_rmqueue != MIGRATE_MOVABLE))
+#endif
+ goto out;
}
/*
trace_mm_page_alloc_zone_locked(page, order, migratetype);
}
if (!page)
- page = __rmqueue(zone, order, migratetype);
+ page = __rmqueue(zone, order, migratetype_rmqueue);
} while (page && check_new_pages(page, order));
spin_unlock(&zone->lock);
if (!page)
if (!area->nr_free)
continue;
- if (alloc_harder)
- return true;
-
for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) {
if (!list_empty(&area->free_list[mt]))
return true;
return true;
}
#endif
+ if (alloc_harder &&
+ !list_empty(&area->free_list[MIGRATE_HIGHATOMIC]))
+ return true;
}
return false;
}
enum compact_priority prio, enum compact_result *compact_result)
{
struct page *page;
+ unsigned long pflags;
unsigned int noreclaim_flag;
if (!order)
return NULL;
+ psi_memstall_enter(&pflags);
noreclaim_flag = memalloc_noreclaim_save();
+
*compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
prio);
+
memalloc_noreclaim_restore(noreclaim_flag);
+ psi_memstall_leave(&pflags);
if (*compact_result <= COMPACT_INACTIVE)
return NULL;
return false;
/* this guy won't enter reclaim */
- if ((current->flags & PF_MEMALLOC) && !(gfp_mask & __GFP_NOMEMALLOC))
+ if (current->flags & PF_MEMALLOC)
return false;
/* We're only interested __GFP_FS allocations for now */
struct reclaim_state reclaim_state;
int progress;
unsigned int noreclaim_flag;
+ unsigned long pflags;
cond_resched();
/* We now go into synchronous reclaim */
cpuset_memory_pressure_bump();
+ psi_memstall_enter(&pflags);
noreclaim_flag = memalloc_noreclaim_save();
fs_reclaim_acquire(gfp_mask);
reclaim_state.reclaimed_slab = 0;
current->reclaim_state = NULL;
fs_reclaim_release(gfp_mask);
memalloc_noreclaim_restore(noreclaim_flag);
+ psi_memstall_leave(&pflags);
cond_resched();
alloc_flags |= ALLOC_HARDER;
#ifdef CONFIG_CMA
- if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
+ if ((gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) ||
+ ((gfp_mask & GFP_HIGHUSER_MOVABLE) == GFP_HIGHUSER_MOVABLE))
alloc_flags |= ALLOC_CMA;
#endif
return alloc_flags;
enum compact_result compact_result;
int compaction_retries;
int no_progress_loops;
- unsigned long alloc_start = jiffies;
- unsigned int stall_timeout = 10 * HZ;
unsigned int cpuset_mems_cookie;
int reserve_flags;
- /*
- * In the slowpath, we sanity check order to avoid ever trying to
- * reclaim >= MAX_ORDER areas which will never succeed. Callers may
- * be using allocators in order of preference for an area that is
- * too large.
- */
- if (order >= MAX_ORDER) {
- WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
- return NULL;
- }
-
/*
* We also sanity check to catch abuse of atomic reserves being used by
* callers that are not in atomic context.
* orientated.
*/
if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) {
- ac->zonelist = node_zonelist(numa_node_id(), gfp_mask);
ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
ac->high_zoneidx, ac->nodemask);
}
if (!can_direct_reclaim)
goto nopage;
- /* Make sure we know about allocations which stall for too long */
- if (time_after(jiffies, alloc_start + stall_timeout)) {
- warn_alloc(gfp_mask & ~__GFP_NOWARN, ac->nodemask,
- "page allocation stalls for %ums, order:%u",
- jiffies_to_msecs(jiffies-alloc_start), order);
- stall_timeout += 10 * HZ;
- }
-
/* Avoid recursion of direct reclaim */
if (current->flags & PF_MEMALLOC)
goto nopage;
gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */
struct alloc_context ac = { };
+ /*
+ * There are several places where we assume that the order value is sane
+ * so bail out early if the request is out of bound.
+ */
+ if (unlikely(order >= MAX_ORDER)) {
+ WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
+ return NULL;
+ }
+
gfp_mask &= gfp_allowed_mask;
alloc_mask = gfp_mask;
if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags))
page = NULL;
}
- if (kmemcheck_enabled && page)
- kmemcheck_pagealloc_alloc(page, order, gfp_mask);
-
trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype);
return page;
/* Even if we own the page, we do not use atomic_set().
* This would break get_page_unless_zero() users.
*/
- page_ref_add(page, size - 1);
+ page_ref_add(page, size);
/* reset page count bias and offset to start of new frag */
nc->pfmemalloc = page_is_pfmemalloc(page);
- nc->pagecnt_bias = size;
+ nc->pagecnt_bias = size + 1;
nc->offset = size;
}
size = nc->size;
#endif
/* OK, page count is 0, we can safely set it */
- set_page_count(page, size);
+ set_page_count(page, size + 1);
/* reset page count bias and offset to start of new frag */
- nc->pagecnt_bias = size;
+ nc->pagecnt_bias = size + 1;
offset = size - fragsz;
}
min(global_node_page_state(NR_SLAB_RECLAIMABLE) / 2,
wmark_low);
+ /*
+ * Part of the kernel memory, which can be released under memory
+ * pressure.
+ */
+ available += global_node_page_state(NR_INDIRECTLY_RECLAIMABLE_BYTES) >>
+ PAGE_SHIFT;
+
if (available < 0)
available = 0;
return available;
if (context != MEMMAP_EARLY)
goto not_early;
- if (!early_pfn_valid(pfn)) {
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
- /*
- * Skip to the pfn preceding the next valid one (or
- * end_pfn), such that we hit a valid pfn (or end_pfn)
- * on our next iteration of the loop.
- */
- pfn = memblock_next_valid_pfn(pfn, end_pfn) - 1;
-#endif
+ if (!early_pfn_valid(pfn))
continue;
- }
if (!early_pfn_in_nid(pfn, nid))
continue;
if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised))
unsigned long size)
{
struct pglist_data *pgdat = zone->zone_pgdat;
+ int zone_idx = zone_idx(zone) + 1;
- pgdat->nr_zones = zone_idx(zone) + 1;
+ if (zone_idx > pgdat->nr_zones)
+ pgdat->nr_zones = zone_idx;
zone->zone_start_pfn = zone_start_pfn;
static void __setup_per_zone_wmarks(void)
{
unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
+ unsigned long pages_low = extra_free_kbytes >> (PAGE_SHIFT - 10);
unsigned long lowmem_pages = 0;
struct zone *zone;
unsigned long flags;
}
for_each_zone(zone) {
- u64 tmp;
+ u64 min, low;
spin_lock_irqsave(&zone->lock, flags);
- tmp = (u64)pages_min * zone->managed_pages;
- do_div(tmp, lowmem_pages);
+ min = (u64)pages_min * zone->managed_pages;
+ do_div(min, lowmem_pages);
+ low = (u64)pages_low * zone->managed_pages;
+ do_div(low, vm_total_pages);
+
if (is_highmem(zone)) {
/*
* __GFP_HIGH and PF_MEMALLOC allocations usually don't
* If it's a lowmem zone, reserve a number of pages
* proportionate to the zone's size.
*/
- zone->watermark[WMARK_MIN] = tmp;
+ zone->watermark[WMARK_MIN] = min;
}
/*
* scale factor in proportion to available memory, but
* ensure a minimum size on small systems.
*/
- tmp = max_t(u64, tmp >> 2,
+ min = max_t(u64, min >> 2,
mult_frac(zone->managed_pages,
watermark_scale_factor, 10000));
- zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp;
- zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2;
+ zone->watermark[WMARK_LOW] = min_wmark_pages(zone) +
+ low + min;
+ zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) +
+ low + min * 2;
spin_unlock_irqrestore(&zone->lock, flags);
}
/*
* min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
* that we can call two helper functions whenever min_free_kbytes
- * changes.
+ * or extra_free_kbytes changes.
*/
int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
/* [start, end) must belong to a single zone. */
static int __alloc_contig_migrate_range(struct compact_control *cc,
- unsigned long start, unsigned long end)
+ unsigned long start, unsigned long end,
+ bool drain)
{
/* This function is based on compact_zone() from compaction.c. */
unsigned long nr_reclaimed;
unsigned int tries = 0;
int ret = 0;
- migrate_prep();
+ if (drain)
+ migrate_prep();
while (pfn < end || !list_empty(&cc->migratepages)) {
if (fatal_signal_pending(current)) {
cc->nr_migratepages -= nr_reclaimed;
ret = migrate_pages(&cc->migratepages, alloc_migrate_target,
- NULL, 0, cc->mode, MR_CMA);
+ NULL, 0, cc->mode, drain ? MR_CMA : MR_HPA);
}
if (ret < 0) {
putback_movable_pages(&cc->migratepages);
* pages which PFN is in [start, end) are allocated for the caller and
* need to be freed with free_contig_range().
*/
-int alloc_contig_range(unsigned long start, unsigned long end,
- unsigned migratetype, gfp_t gfp_mask)
+int __alloc_contig_range(unsigned long start, unsigned long end,
+ unsigned migratetype, gfp_t gfp_mask, bool drain)
{
unsigned long outer_start, outer_end;
unsigned int order;
/*
* In case of -EBUSY, we'd like to know which page causes problem.
- * So, just fall through. We will check it in test_pages_isolated().
+ * So, just fall through. test_pages_isolated() has a tracepoint
+ * which will report the busy page.
+ *
+ * It is possible that busy pages could become available before
+ * the call to test_pages_isolated, and the range will actually be
+ * allocated. So, if we fall through be sure to clear ret so that
+ * -EBUSY is not accidentally used or returned to caller.
*/
- ret = __alloc_contig_migrate_range(&cc, start, end);
+ ret = __alloc_contig_migrate_range(&cc, start, end, drain);
if (ret && ret != -EBUSY)
goto done;
+ ret =0;
/*
* Pages from [start, end) are within a MAX_ORDER_NR_PAGES
* isolated thus they won't get removed from buddy.
*/
- lru_add_drain_all();
- drain_all_pages(cc.zone);
-
order = 0;
outer_start = start;
- while (!PageBuddy(pfn_to_page(outer_start))) {
- if (++order >= MAX_ORDER) {
- outer_start = start;
- break;
+
+ if (drain) {
+ lru_add_drain_all();
+ drain_all_pages(cc.zone);
+
+ while (!PageBuddy(pfn_to_page(outer_start))) {
+ if (++order >= MAX_ORDER) {
+ outer_start = start;
+ break;
+ }
+ outer_start &= ~0UL << order;
}
- outer_start &= ~0UL << order;
- }
- if (outer_start != start) {
- order = page_order(pfn_to_page(outer_start));
+ if (outer_start != start) {
+ order = page_order(pfn_to_page(outer_start));
- /*
- * outer_start page could be small order buddy page and
- * it doesn't include start page. Adjust outer_start
- * in this case to report failed page properly
- * on tracepoint in test_pages_isolated()
- */
- if (outer_start + (1UL << order) <= start)
- outer_start = start;
- }
+ /*
+ * outer_start page could be small order buddy page and
+ * it doesn't include start page. Adjust outer_start
+ * in this case to report failed page properly
+ * on tracepoint in test_pages_isolated()
+ */
+ if (outer_start + (1UL << order) <= start)
+ outer_start = start;
+ }
- /* Make sure the range is really isolated. */
- if (test_pages_isolated(outer_start, end, false)) {
- pr_info_ratelimited("%s: [%lx, %lx) PFNs busy\n",
- __func__, outer_start, end);
- ret = -EBUSY;
- goto done;
+ /* Make sure the range is really isolated. */
+ if (test_pages_isolated(outer_start, end, false)) {
+ pr_info_ratelimited("%s: [%lx, %lx) PFNs busy\n",
+ __func__, outer_start, end);
+ ret = -EBUSY;
+ goto done;
+ }
}
/* Grab isolated pages from freelists. */
return ret;
}
+int alloc_contig_range(unsigned long start, unsigned long end,
+ unsigned migratetype, gfp_t gfp_mask)
+{
+ return __alloc_contig_range(start, end, migratetype, gfp_mask, true);
+}
+
+int alloc_contig_range_fast(unsigned long start, unsigned long end,
+ unsigned migratetype)
+{
+ return __alloc_contig_range(start, end, migratetype, GFP_KERNEL, false);
+}
+
void free_contig_range(unsigned long pfn, unsigned nr_pages)
{
unsigned int count = 0;