Linux memory policy supports the following 4 behavioral modes:
- Default Mode--MPOL_DEFAULT: The behavior specified by this mode is
- context or scope dependent.
-
- As mentioned in the Policy Scope section above, during normal
- system operation, the System Default Policy is hard coded to
- contain the Default mode.
-
- In this context, default mode means "local" allocation--that is
- attempt to allocate the page from the node associated with the cpu
- where the fault occurs. If the "local" node has no memory, or the
- node's memory can be exhausted [no free pages available], local
- allocation will "fallback to"--attempt to allocate pages from--
- "nearby" nodes, in order of increasing "distance".
-
- Implementation detail -- subject to change: "Fallback" uses
- a per node list of sibling nodes--called zonelists--built at
- boot time, or when nodes or memory are added or removed from
- the system [memory hotplug]. These per node zonelist are
- constructed with nodes in order of increasing distance based
- on information provided by the platform firmware.
-
- When a task/process policy or a shared policy contains the Default
- mode, this also means "local allocation", as described above.
-
- In the context of a VMA, Default mode means "fall back to task
- policy"--which may or may not specify Default mode. Thus, Default
- mode can not be counted on to mean local allocation when used
- on a non-shared region of the address space. However, see
- MPOL_PREFERRED below.
+ Default Mode--MPOL_DEFAULT: This mode is only used in the memory
+ policy APIs. Internally, MPOL_DEFAULT is converted to the NULL
+ memory policy in all policy scopes. Any existing non-default policy
+ will simply be removed when MPOL_DEFAULT is specified. As a result,
+ MPOL_DEFAULT means "fall back to the next most specific policy scope."
+
+ For example, a NULL or default task policy will fall back to the
+ system default policy. A NULL or default vma policy will fall
+ back to the task policy.
+
+ When specified in one of the memory policy APIs, the Default mode
+ does not use the optional set of nodes.
It is an error for the set of nodes specified for this policy to
be non-empty.
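+
+ For illustration only, a task could revert to default behavior with
+ a call like the following minimal sketch (using the set_mempolicy()
+ wrapper declared in libnuma's <numaif.h>, linked with -lnuma; the
+ error handling is illustrative, not part of the interface):
+
+     #include <numaif.h>
+     #include <stdio.h>
+
+     /* MPOL_DEFAULT takes a NULL/empty nodemask; any existing
+        task policy is simply removed */
+     if (set_mempolicy(MPOL_DEFAULT, NULL, 0) != 0)
+             perror("set_mempolicy");
+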
MPOL_PREFERRED: This mode specifies that the allocation should be
attempted from the single node specified in the policy. If that
- allocation fails, the kernel will search other nodes, exactly as
- it would for a local allocation that started at the preferred node
- in increasing distance from the preferred node. "Local" allocation
- policy can be viewed as a Preferred policy that starts at the node
+ allocation fails, the kernel will search other nodes, in order of
+ increasing distance from the preferred node based on information
+ provided by the platform firmware.
- containing the cpu where the allocation takes place.
Internally, the Preferred policy uses a single node--the
preferred_node member of struct mempolicy. A "distinguished
value" of this preferred_node, currently '-1', is interpreted
as "the node containing the cpu where the allocation takes
- place"--local allocation. This is the way to specify
- local allocation for a specific range of addresses--i.e. for
- VMA policies.
+ place"--local allocation. "Local" allocation policy can be
+ viewed as a Preferred policy that starts at the node containing
+ the cpu where the allocation takes place.
It is possible for the user to specify that local allocation is
always preferred by passing an empty nodemask with this mode.
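+
+ For example, the following minimal sketch (again using the
+ set_mempolicy() wrapper from libnuma's <numaif.h>; the choice of
+ node 0 is purely illustrative) first prefers a single node for the
+ task and then switches to local allocation by passing a NULL/empty
+ nodemask:
+
+     #include <numaif.h>
+
+     unsigned long prefer_node0 = 1UL << 0; /* nodemask with only node 0 set */
+
+     /* prefer node 0; fall back to "nearby" nodes if it is exhausted */
+     set_mempolicy(MPOL_PREFERRED, &prefer_node0, 8 * sizeof(prefer_node0));
+
+     /* NULL/empty nodemask => always prefer the local node */
+     set_mempolicy(MPOL_PREFERRED, NULL, 0);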
policied. */
enum zone_type policy_zone = 0;
+/*
+ * run-time system-wide default policy => local allocation
+ */
struct mempolicy default_policy = {
.refcnt = ATOMIC_INIT(1), /* never free it */
- .mode = MPOL_DEFAULT,
+ .mode = MPOL_PREFERRED,
+ .v = { .preferred_node = -1 },
};
static const struct mempolicy_operations {
if (mode == MPOL_DEFAULT) {
if (nodes && !nodes_empty(*nodes))
return ERR_PTR(-EINVAL);
- return NULL;
+ return NULL; /* simply delete any existing policy */
}
VM_BUG_ON(!nodes);
{
if (!atomic_dec_and_test(&p->refcnt))
return;
- p->mode = MPOL_DEFAULT;
kmem_cache_free(policy_cache, p);
}
return 0;
}
-/* Fill a zone bitmap for a policy */
-static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
+/*
+ * Return nodemask for policy for get_mempolicy() query
+ */
+static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
{
nodes_clear(*nodes);
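+ /* the default policy [local allocation] has no nodes to report */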
+ if (p == &default_policy)
+ return;
+
switch (p->mode) {
- case MPOL_DEFAULT:
- break;
case MPOL_BIND:
/* Fall through */
case MPOL_INTERLEAVE:
}
if (flags & MPOL_F_ADDR) {
+ /*
+ * Do NOT fall back to task policy if the
+ * vma/shared policy at addr is NULL. We
+ * want to return MPOL_DEFAULT in this case.
+ */
down_read(&mm->mmap_sem);
vma = find_vma_intersection(mm, addr, addr+1);
if (!vma) {
return -EINVAL;
if (!pol)
- pol = &default_policy;
+ pol = &default_policy; /* indicates default behavior */
if (flags & MPOL_F_NODE) {
if (flags & MPOL_F_ADDR) {
err = -EINVAL;
goto out;
}
- } else
- *policy = pol->mode | pol->flags;
+ } else {
+ *policy = pol == &default_policy ? MPOL_DEFAULT :
+ pol->mode;
+ *policy |= pol->flags;
+ }
if (vma) {
up_read(&current->mm->mmap_sem);
err = 0;
if (nmask)
- get_zonemask(pol, nmask);
+ get_policy_nodemask(pol, nmask);
out:
mpol_cond_put(pol);
addr);
if (vpol)
pol = vpol;
- } else if (vma->vm_policy &&
- vma->vm_policy->mode != MPOL_DEFAULT)
+ } else if (vma->vm_policy)
pol = vma->vm_policy;
}
if (!pol)
nd = first_node(policy->v.nodes);
break;
case MPOL_INTERLEAVE: /* should not happen */
- case MPOL_DEFAULT:
nd = numa_node_id();
break;
default:
*/
unsigned slab_node(struct mempolicy *policy)
{
- unsigned short pol = policy ? policy->mode : MPOL_DEFAULT;
+ if (!policy)
+ return numa_node_id();
+
+ switch (policy->mode) {
+ case MPOL_PREFERRED:
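+ /* a preferred_node of -1 means local allocation */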
+ if (unlikely(policy->v.preferred_node >= 0))
+ return policy->v.preferred_node;
+ return numa_node_id();
- switch (pol) {
case MPOL_INTERLEAVE:
return interleave_nodes(policy);
return zone->node;
}
- case MPOL_PREFERRED:
- if (policy->v.preferred_node >= 0)
- return policy->v.preferred_node;
- /* Fall through */
-
default:
- return numa_node_id();
+ BUG();
}
}
if (a->mode != MPOL_DEFAULT && !mpol_match_intent(a, b))
return 0;
switch (a->mode) {
- case MPOL_DEFAULT:
- return 1;
case MPOL_BIND:
/* Fall through */
case MPOL_INTERLEAVE:
if (policy != MPOL_DEFAULT) {
struct mempolicy *newpol;
- /* Falls back to MPOL_DEFAULT on any error */
+ /* Falls back to NULL policy [MPOL_DEFAULT] on any error */
newpol = mpol_new(policy, flags, policy_nodes);
if (!IS_ERR(newpol)) {
/* Create pseudo-vma that contains just the policy */
char *p = buffer;
int l;
nodemask_t nodes;
- unsigned short mode = pol ? pol->mode : MPOL_DEFAULT;
+ unsigned short mode;
unsigned short flags = pol ? pol->flags : 0;
+ if (!pol || pol == &default_policy)
+ mode = MPOL_DEFAULT;
+ else
+ mode = pol->mode;
+
switch (mode) {
case MPOL_DEFAULT:
nodes_clear(nodes);