extern void cpuset_print_task_mems_allowed(struct task_struct *p);
+/*
+ * reading current mems_allowed and mempolicy in the fastpath must protected
+ * by get_mems_allowed()
+ */
+static inline void get_mems_allowed(void)
+{
+ current->mems_allowed_change_disable++;
+
+ /*
+ * ensure that reading mems_allowed and mempolicy happens after the
+ * update of ->mems_allowed_change_disable.
+ *
+ * the write-side task finds ->mems_allowed_change_disable is not 0,
+ * and knows the read-side task is reading mems_allowed or mempolicy,
+ * so it will clear old bits lazily.
+ */
+ smp_mb();
+}
+
+static inline void put_mems_allowed(void)
+{
+ /*
+ * ensure that reading mems_allowed and mempolicy before reducing
+ * mems_allowed_change_disable.
+ *
+ * the write-side task will know that the read-side task is still
+ * reading mems_allowed or mempolicy, don't clears old bits in the
+ * nodemask.
+ */
+ smp_mb();
+ --ACCESS_ONCE(current->mems_allowed_change_disable);
+}
+
static inline void set_mems_allowed(nodemask_t nodemask)
{
+ task_lock(current);
current->mems_allowed = nodemask;
+ task_unlock(current);
}
#else /* !CONFIG_CPUSETS */
{
}
+static inline void get_mems_allowed(void)
+{
+}
+
+static inline void put_mems_allowed(void)
+{
+}
+
#endif /* !CONFIG_CPUSETS */
#endif /* _LINUX_CPUSET_H */
#endif
#ifdef CONFIG_CPUSETS
nodemask_t mems_allowed; /* Protected by alloc_lock */
+ int mems_allowed_change_disable;
int cpuset_mem_spread_rotor;
#endif
#ifdef CONFIG_CGROUPS
* In order to avoid seeing no nodes if the old and new nodes are disjoint,
* we structure updates as setting all new allowed nodes, then clearing newly
* disallowed ones.
- *
- * Called with task's alloc_lock held
*/
static void cpuset_change_task_nodemask(struct task_struct *tsk,
nodemask_t *newmems)
{
+repeat:
+ /*
+ * Allow tasks that have access to memory reserves because they have
+ * been OOM killed to get memory anywhere.
+ */
+ if (unlikely(test_thread_flag(TIF_MEMDIE)))
+ return;
+ if (current->flags & PF_EXITING) /* Let dying task have memory */
+ return;
+
+ task_lock(tsk);
nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
- mpol_rebind_task(tsk, &tsk->mems_allowed, MPOL_REBIND_ONCE);
- mpol_rebind_task(tsk, newmems, MPOL_REBIND_ONCE);
+ mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
+
+
+ /*
+ * ensure checking ->mems_allowed_change_disable after setting all new
+ * allowed nodes.
+ *
+ * the read-side task can see an nodemask with new allowed nodes and
+ * old allowed nodes. and if it allocates page when cpuset clears newly
+ * disallowed ones continuous, it can see the new allowed bits.
+ *
+ * And if setting all new allowed nodes is after the checking, setting
+ * all new allowed nodes and clearing newly disallowed ones will be done
+ * continuous, and the read-side task may find no node to alloc page.
+ */
+ smp_mb();
+
+ /*
+ * Allocation of memory is very fast, we needn't sleep when waiting
+ * for the read-side.
+ */
+ while (ACCESS_ONCE(tsk->mems_allowed_change_disable)) {
+ task_unlock(tsk);
+ if (!task_curr(tsk))
+ yield();
+ goto repeat;
+ }
+
+ /*
+ * ensure checking ->mems_allowed_change_disable before clearing all new
+ * disallowed nodes.
+ *
+ * if clearing newly disallowed bits before the checking, the read-side
+ * task may find no node to alloc page.
+ */
+ smp_mb();
+
+ mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2);
tsk->mems_allowed = *newmems;
+ task_unlock(tsk);
}
/*
cs = cgroup_cs(scan->cg);
guarantee_online_mems(cs, newmems);
- task_lock(p);
cpuset_change_task_nodemask(p, newmems);
- task_unlock(p);
NODEMASK_FREE(newmems);
err = set_cpus_allowed_ptr(tsk, cpus_attach);
WARN_ON_ONCE(err);
- task_lock(tsk);
cpuset_change_task_nodemask(tsk, to);
- task_unlock(tsk);
cpuset_update_task_spread_flag(cs, tsk);
}
exit_notify(tsk, group_dead);
#ifdef CONFIG_NUMA
+ task_lock(tsk);
mpol_put(tsk->mempolicy);
tsk->mempolicy = NULL;
+ task_unlock(tsk);
#endif
#ifdef CONFIG_FUTEX
if (unlikely(current->pi_state_cache))
#ifdef CONFIG_NUMA
struct page *__page_cache_alloc(gfp_t gfp)
{
+ int n;
+ struct page *page;
+
if (cpuset_do_page_mem_spread()) {
- int n = cpuset_mem_spread_node();
- return alloc_pages_exact_node(n, gfp, 0);
+ get_mems_allowed();
+ n = cpuset_mem_spread_node();
+ page = alloc_pages_exact_node(n, gfp, 0);
+ put_mems_allowed();
+ return page;
}
return alloc_pages(gfp, 0);
}
struct page *page = NULL;
struct mempolicy *mpol;
nodemask_t *nodemask;
- struct zonelist *zonelist = huge_zonelist(vma, address,
- htlb_alloc_mask, &mpol, &nodemask);
+ struct zonelist *zonelist;
struct zone *zone;
struct zoneref *z;
+ get_mems_allowed();
+ zonelist = huge_zonelist(vma, address,
+ htlb_alloc_mask, &mpol, &nodemask);
/*
* A child process with MAP_PRIVATE mappings created by their parent
* have no page reserves. This check ensures that reservations are
*/
if (!vma_has_reserves(vma) &&
h->free_huge_pages - h->resv_huge_pages == 0)
- return NULL;
+ goto err;
/* If reserves cannot be used, ensure enough pages are in the pool */
if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
- return NULL;
+ goto err;;
for_each_zone_zonelist_nodemask(zone, z, zonelist,
MAX_NR_ZONES - 1, nodemask) {
break;
}
}
+err:
mpol_cond_put(mpol);
+ put_mems_allowed();
return page;
}
* to the struct mempolicy for conditional unref after allocation.
* If the effective policy is 'BIND, returns a pointer to the mempolicy's
* @nodemask for filtering the zonelist.
+ *
+ * Must be protected by get_mems_allowed()
*/
struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
gfp_t gfp_flags, struct mempolicy **mpol,
if (!(mask && current->mempolicy))
return false;
+ task_lock(current);
mempolicy = current->mempolicy;
switch (mempolicy->mode) {
case MPOL_PREFERRED:
default:
BUG();
}
+ task_unlock(current);
return true;
}
{
struct mempolicy *pol = get_vma_policy(current, vma, addr);
struct zonelist *zl;
+ struct page *page;
+ get_mems_allowed();
if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
unsigned nid;
nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
mpol_cond_put(pol);
- return alloc_page_interleave(gfp, 0, nid);
+ page = alloc_page_interleave(gfp, 0, nid);
+ put_mems_allowed();
+ return page;
}
zl = policy_zonelist(gfp, pol);
if (unlikely(mpol_needs_cond_ref(pol))) {
struct page *page = __alloc_pages_nodemask(gfp, 0,
zl, policy_nodemask(gfp, pol));
__mpol_put(pol);
+ put_mems_allowed();
return page;
}
/*
* fast path: default or task policy
*/
- return __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol));
+ page = __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol));
+ put_mems_allowed();
+ return page;
}
/**
struct page *alloc_pages_current(gfp_t gfp, unsigned order)
{
struct mempolicy *pol = current->mempolicy;
+ struct page *page;
if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
pol = &default_policy;
+ get_mems_allowed();
/*
* No reference counting needed for current->mempolicy
* nor system default_policy
*/
if (pol->mode == MPOL_INTERLEAVE)
- return alloc_page_interleave(gfp, order, interleave_nodes(pol));
- return __alloc_pages_nodemask(gfp, order,
+ page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
+ else
+ page = __alloc_pages_nodemask(gfp, order,
policy_zonelist(gfp, pol), policy_nodemask(gfp, pol));
+ put_mems_allowed();
+ return page;
}
EXPORT_SYMBOL(alloc_pages_current);
if (unlikely(!zonelist->_zonerefs->zone))
return NULL;
+ get_mems_allowed();
/* The preferred zone is used for statistics later */
first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone);
- if (!preferred_zone)
+ if (!preferred_zone) {
+ put_mems_allowed();
return NULL;
+ }
/* First allocation attempt */
page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
page = __alloc_pages_slowpath(gfp_mask, order,
zonelist, high_zoneidx, nodemask,
preferred_zone, migratetype);
+ put_mems_allowed();
trace_mm_page_alloc(page, order, gfp_mask, migratetype);
return page;
if (in_interrupt() || (flags & __GFP_THISNODE))
return NULL;
nid_alloc = nid_here = numa_node_id();
+ get_mems_allowed();
if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
nid_alloc = cpuset_mem_spread_node();
else if (current->mempolicy)
nid_alloc = slab_node(current->mempolicy);
+ put_mems_allowed();
if (nid_alloc != nid_here)
return ____cache_alloc_node(cachep, flags, nid_alloc);
return NULL;
if (flags & __GFP_THISNODE)
return NULL;
+ get_mems_allowed();
zonelist = node_zonelist(slab_node(current->mempolicy), flags);
local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
}
}
}
+ put_mems_allowed();
return obj;
}
get_cycles() % 1024 > s->remote_node_defrag_ratio)
return NULL;
+ get_mems_allowed();
zonelist = node_zonelist(slab_node(current->mempolicy), flags);
for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
struct kmem_cache_node *n;
if (n && cpuset_zone_allowed_hardwall(zone, flags) &&
n->nr_partial > s->min_partial) {
page = get_partial_node(n);
- if (page)
+ if (page) {
+ put_mems_allowed();
return page;
+ }
}
}
+ put_mems_allowed();
#endif
return NULL;
}
enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
unsigned long writeback_threshold;
+ get_mems_allowed();
delayacct_freepages_start();
if (scanning_global_lru(sc))
mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority);
delayacct_freepages_end();
+ put_mems_allowed();
return ret;
}