mm/mempolicy.c
1 /*
2 * Simple NUMA memory policy for the Linux kernel.
3 *
4 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
6 * Subject to the GNU Public License, version 2.
7 *
8 * NUMA policy allows the user to give hints in which node(s) memory should
9 * be allocated.
10 *
11 * Support four policies per VMA and per process:
12 *
13 * The VMA policy has priority over the process policy for a page fault.
14 *
15 * interleave Allocate memory interleaved over a set of nodes,
16 * with normal fallback if it fails.
17 * For VMA based allocations this interleaves based on the
18 * offset into the backing object or offset into the mapping
 19  * for anonymous memory. For process policy a process counter
20 * is used.
21 *
22 * bind Only allocate memory on a specific set of nodes,
23 * no fallback.
24 * FIXME: memory is allocated starting with the first node
25 * to the last. It would be better if bind would truly restrict
26 * the allocation to memory nodes instead
27 *
28 * preferred Try a specific node first before normal fallback.
29 * As a special case NUMA_NO_NODE here means do the allocation
30 * on the local CPU. This is normally identical to default,
31 * but useful to set in a VMA when you have a non default
32 * process policy.
33 *
34 * default Allocate on the local node first, or when on a VMA
35 * use the process policy. This is what Linux always did
36 * in a NUMA aware kernel and still does by, ahem, default.
37 *
38 * The process policy is applied for most non interrupt memory allocations
39 * in that process' context. Interrupts ignore the policies and always
40 * try to allocate on the local CPU. The VMA policy is only applied for memory
41 * allocations for a VMA in the VM.
42 *
43 * Currently there are a few corner cases in swapping where the policy
44 * is not applied, but the majority should be handled. When process policy
45 * is used it is not remembered over swap outs/swap ins.
46 *
47 * Only the highest zone in the zone hierarchy gets policied. Allocations
48 * requesting a lower zone just use default policy. This implies that
 49  * on systems with highmem, kernel lowmem allocations don't get policied.
50 * Same with GFP_DMA allocations.
51 *
52 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
53 * all users and remembered even when nobody has memory mapped.
54 */
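
/*
 * For example, a task could request the interleave policy over nodes 0 and 1
 * from user space with set_mempolicy(2); a minimal usage sketch:
 *
 *	unsigned long nodes = (1UL << 0) | (1UL << 1);
 *	set_mempolicy(MPOL_INTERLEAVE, &nodes, sizeof(nodes) * 8);
 */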
55
56 /* Notebook:
57 fix mmap readahead to honour policy and enable policy for any page cache
58 object
59 statistics for bigpages
60 global policy for page cache? currently it uses process policy. Requires
61 first item above.
62 handle mremap for shared memory (currently ignored for the policy)
63 grows down?
64 make bind policy root only? It can trigger oom much faster and the
 65     kernel is not always graceful about that.
66 */
67
68 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
69
70 #include <linux/mempolicy.h>
71 #include <linux/mm.h>
72 #include <linux/highmem.h>
73 #include <linux/hugetlb.h>
74 #include <linux/kernel.h>
75 #include <linux/sched.h>
76 #include <linux/sched/mm.h>
77 #include <linux/sched/numa_balancing.h>
78 #include <linux/sched/task.h>
79 #include <linux/nodemask.h>
80 #include <linux/cpuset.h>
81 #include <linux/slab.h>
82 #include <linux/string.h>
83 #include <linux/export.h>
84 #include <linux/nsproxy.h>
85 #include <linux/interrupt.h>
86 #include <linux/init.h>
87 #include <linux/compat.h>
88 #include <linux/swap.h>
89 #include <linux/seq_file.h>
90 #include <linux/proc_fs.h>
91 #include <linux/migrate.h>
92 #include <linux/ksm.h>
93 #include <linux/rmap.h>
94 #include <linux/security.h>
95 #include <linux/syscalls.h>
96 #include <linux/ctype.h>
97 #include <linux/mm_inline.h>
98 #include <linux/mmu_notifier.h>
99 #include <linux/printk.h>
100 #include <linux/swapops.h>
101
102 #include <asm/tlbflush.h>
103 #include <linux/uaccess.h>
104
105 #include "internal.h"
106
107 /* Internal flags */
108 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */
109 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */
110
111 static struct kmem_cache *policy_cache;
112 static struct kmem_cache *sn_cache;
113
114 /* Highest zone. A specific allocation for a zone below that is not
115 policied. */
116 enum zone_type policy_zone = 0;
117
118 /*
119 * run-time system-wide default policy => local allocation
120 */
121 static struct mempolicy default_policy = {
122 .refcnt = ATOMIC_INIT(1), /* never free it */
123 .mode = MPOL_PREFERRED,
124 .flags = MPOL_F_LOCAL,
125 };
126
127 static struct mempolicy preferred_node_policy[MAX_NUMNODES];
128
129 struct mempolicy *get_task_policy(struct task_struct *p)
130 {
131 struct mempolicy *pol = p->mempolicy;
132 int node;
133
134 if (pol)
135 return pol;
136
137 node = numa_node_id();
138 if (node != NUMA_NO_NODE) {
139 pol = &preferred_node_policy[node];
140 /* preferred_node_policy is not initialised early in boot */
141 if (pol->mode)
142 return pol;
143 }
144
145 return &default_policy;
146 }
147
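/*
 * Per-mode operations: ->create() validates and installs the nodemask for a
 * new policy, ->rebind() remaps an existing policy when the set of allowed
 * nodes changes. The table is filled in below, next to the mpol_rebind_*()
 * helpers.
 */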
148 static const struct mempolicy_operations {
149 int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
150 void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
151 } mpol_ops[MPOL_MAX];
152
153 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
154 {
155 return pol->flags & MPOL_MODE_FLAGS;
156 }
157
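/*
 * Map a relative nodemask onto the allowed nodes: fold @orig down to the
 * weight of @rel and then remap each set bit onto the corresponding node of
 * @rel. Used for MPOL_F_RELATIVE_NODES policies.
 */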
158 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
159 const nodemask_t *rel)
160 {
161 nodemask_t tmp;
162 nodes_fold(tmp, *orig, nodes_weight(*rel));
163 nodes_onto(*ret, tmp, *rel);
164 }
165
166 static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
167 {
168 if (nodes_empty(*nodes))
169 return -EINVAL;
170 pol->v.nodes = *nodes;
171 return 0;
172 }
173
174 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
175 {
176 if (!nodes)
177 pol->flags |= MPOL_F_LOCAL; /* local allocation */
178 else if (nodes_empty(*nodes))
179 return -EINVAL; /* no allowed nodes */
180 else
181 pol->v.preferred_node = first_node(*nodes);
182 return 0;
183 }
184
185 static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
186 {
187 if (nodes_empty(*nodes))
188 return -EINVAL;
189 pol->v.nodes = *nodes;
190 return 0;
191 }
192
193 /*
194 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
195 * any, for the new policy. mpol_new() has already validated the nodes
196 * parameter with respect to the policy mode and flags. But, we need to
197 * handle an empty nodemask with MPOL_PREFERRED here.
198 *
199 * Must be called holding task's alloc_lock to protect task's mems_allowed
200 * and mempolicy. May also be called holding the mmap_semaphore for write.
201 */
202 static int mpol_set_nodemask(struct mempolicy *pol,
203 const nodemask_t *nodes, struct nodemask_scratch *nsc)
204 {
205 int ret;
206
207 /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
208 if (pol == NULL)
209 return 0;
210 /* Check N_MEMORY */
211 nodes_and(nsc->mask1,
212 cpuset_current_mems_allowed, node_states[N_MEMORY]);
213
214 VM_BUG_ON(!nodes);
215 if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
216 nodes = NULL; /* explicit local allocation */
217 else {
218 if (pol->flags & MPOL_F_RELATIVE_NODES)
219 mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
220 else
221 nodes_and(nsc->mask2, *nodes, nsc->mask1);
222
223 if (mpol_store_user_nodemask(pol))
224 pol->w.user_nodemask = *nodes;
225 else
226 pol->w.cpuset_mems_allowed =
227 cpuset_current_mems_allowed;
228 }
229
230 if (nodes)
231 ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
232 else
233 ret = mpol_ops[pol->mode].create(pol, NULL);
234 return ret;
235 }
236
237 /*
238  * This function just creates a new policy, does some checks and simple
239 * initialization. You must invoke mpol_set_nodemask() to set nodes.
240 */
241 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
242 nodemask_t *nodes)
243 {
244 struct mempolicy *policy;
245
246 pr_debug("setting mode %d flags %d nodes[0] %lx\n",
247 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
248
249 if (mode == MPOL_DEFAULT) {
250 if (nodes && !nodes_empty(*nodes))
251 return ERR_PTR(-EINVAL);
252 return NULL;
253 }
254 VM_BUG_ON(!nodes);
255
256 /*
257 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
258 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
259 * All other modes require a valid pointer to a non-empty nodemask.
260 */
261 if (mode == MPOL_PREFERRED) {
262 if (nodes_empty(*nodes)) {
263 if (((flags & MPOL_F_STATIC_NODES) ||
264 (flags & MPOL_F_RELATIVE_NODES)))
265 return ERR_PTR(-EINVAL);
266 }
267 } else if (mode == MPOL_LOCAL) {
268 if (!nodes_empty(*nodes) ||
269 (flags & MPOL_F_STATIC_NODES) ||
270 (flags & MPOL_F_RELATIVE_NODES))
271 return ERR_PTR(-EINVAL);
272 mode = MPOL_PREFERRED;
273 } else if (nodes_empty(*nodes))
274 return ERR_PTR(-EINVAL);
275 policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
276 if (!policy)
277 return ERR_PTR(-ENOMEM);
278 atomic_set(&policy->refcnt, 1);
279 policy->mode = mode;
280 policy->flags = flags;
281
282 return policy;
283 }
284
285 /* Slow path of a mpol destructor. */
286 void __mpol_put(struct mempolicy *p)
287 {
288 if (!atomic_dec_and_test(&p->refcnt))
289 return;
290 kmem_cache_free(policy_cache, p);
291 }
292
293 static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
294 {
295 }
296
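/*
 * Rebind an MPOL_INTERLEAVE or MPOL_BIND policy to a new set of allowed
 * nodes: static nodemasks are re-intersected with the new set, relative
 * nodemasks are re-folded onto it, and plain ones are remapped from the old
 * cpuset_mems_allowed to the new one.
 */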
297 static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
298 {
299 nodemask_t tmp;
300
301 if (pol->flags & MPOL_F_STATIC_NODES)
302 nodes_and(tmp, pol->w.user_nodemask, *nodes);
303 else if (pol->flags & MPOL_F_RELATIVE_NODES)
304 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
305 else {
306                 nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed,
307 *nodes);
308 pol->w.cpuset_mems_allowed = tmp;
309 }
310
311 if (nodes_empty(tmp))
312 tmp = *nodes;
313
314 pol->v.nodes = tmp;
315 }
316
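/*
 * Rebind an MPOL_PREFERRED policy: keep a static preferred node if it is
 * still allowed (otherwise fall back to local allocation), re-fold a
 * relative node, or remap the preferred node into the new set of nodes.
 */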
317 static void mpol_rebind_preferred(struct mempolicy *pol,
318 const nodemask_t *nodes)
319 {
320 nodemask_t tmp;
321
322 if (pol->flags & MPOL_F_STATIC_NODES) {
323 int node = first_node(pol->w.user_nodemask);
324
325 if (node_isset(node, *nodes)) {
326 pol->v.preferred_node = node;
327 pol->flags &= ~MPOL_F_LOCAL;
328 } else
329 pol->flags |= MPOL_F_LOCAL;
330 } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
331 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
332 pol->v.preferred_node = first_node(tmp);
333 } else if (!(pol->flags & MPOL_F_LOCAL)) {
334 pol->v.preferred_node = node_remap(pol->v.preferred_node,
335 pol->w.cpuset_mems_allowed,
336 *nodes);
337 pol->w.cpuset_mems_allowed = *nodes;
338 }
339 }
340
341 /*
342 * mpol_rebind_policy - Migrate a policy to a different set of nodes
343 *
344 * Per-vma policies are protected by mmap_sem. Allocations using per-task
345 * policies are protected by task->mems_allowed_seq to prevent a premature
346 * OOM/allocation failure due to parallel nodemask modification.
347 */
348 static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
349 {
350 if (!pol)
351 return;
352 if (!mpol_store_user_nodemask(pol) && !(pol->flags & MPOL_F_LOCAL) &&
353 nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
354 return;
355
356 mpol_ops[pol->mode].rebind(pol, newmask);
357 }
358
359 /*
360 * Wrapper for mpol_rebind_policy() that just requires task
361 * pointer, and updates task mempolicy.
362 *
363 * Called with task's alloc_lock held.
364 */
365
366 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
367 {
368 mpol_rebind_policy(tsk->mempolicy, new);
369 }
370
371 /*
372 * Rebind each vma in mm to new nodemask.
373 *
374 * Call holding a reference to mm. Takes mm->mmap_sem during call.
375 */
376
377 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
378 {
379 struct vm_area_struct *vma;
380
381 down_write(&mm->mmap_sem);
382 for (vma = mm->mmap; vma; vma = vma->vm_next)
383 mpol_rebind_policy(vma->vm_policy, new);
384 up_write(&mm->mmap_sem);
385 }
386
387 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
388 [MPOL_DEFAULT] = {
389 .rebind = mpol_rebind_default,
390 },
391 [MPOL_INTERLEAVE] = {
392 .create = mpol_new_interleave,
393 .rebind = mpol_rebind_nodemask,
394 },
395 [MPOL_PREFERRED] = {
396 .create = mpol_new_preferred,
397 .rebind = mpol_rebind_preferred,
398 },
399 [MPOL_BIND] = {
400 .create = mpol_new_bind,
401 .rebind = mpol_rebind_nodemask,
402 },
403 };
404
405 static void migrate_page_add(struct page *page, struct list_head *pagelist,
406 unsigned long flags);
407
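/* Private state shared by the queue_pages_*() page table walk callbacks. */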
408 struct queue_pages {
409 struct list_head *pagelist;
410 unsigned long flags;
411 nodemask_t *nmask;
412 struct vm_area_struct *prev;
413 };
414
415 /*
416 * Check if the page's nid is in qp->nmask.
417 *
418 * If MPOL_MF_INVERT is set in qp->flags, check if the nid is
419  * in the complement of qp->nmask instead.
420 */
421 static inline bool queue_pages_required(struct page *page,
422 struct queue_pages *qp)
423 {
424 int nid = page_to_nid(page);
425 unsigned long flags = qp->flags;
426
427 return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
428 }
429
430 /*
431 * queue_pages_pmd() has three possible return values:
432 * 1 - pages are placed on the right node or queued successfully.
433 * 0 - THP was split.
434  * -EIO - a migration entry was found, or MPOL_MF_STRICT was specified and
435  *        an existing page was already on a node that does not follow the policy.
436 */
437 static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
438 unsigned long end, struct mm_walk *walk)
439 {
440 int ret = 0;
441 struct page *page;
442 struct queue_pages *qp = walk->private;
443 unsigned long flags;
444
445 if (unlikely(is_pmd_migration_entry(*pmd))) {
446 ret = -EIO;
447 goto unlock;
448 }
449 page = pmd_page(*pmd);
450 if (is_huge_zero_page(page)) {
451 spin_unlock(ptl);
452 __split_huge_pmd(walk->vma, pmd, addr, false, NULL);
453 goto out;
454 }
455 if (!thp_migration_supported()) {
456 get_page(page);
457 spin_unlock(ptl);
458 lock_page(page);
459 ret = split_huge_page(page);
460 unlock_page(page);
461 put_page(page);
462 goto out;
463 }
464 if (!queue_pages_required(page, qp)) {
465 ret = 1;
466 goto unlock;
467 }
468
469 ret = 1;
470 flags = qp->flags;
471 /* go to thp migration */
472 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
473 if (!vma_migratable(walk->vma)) {
474 ret = -EIO;
475 goto unlock;
476 }
477
478 migrate_page_add(page, qp->pagelist, flags);
479 } else
480 ret = -EIO;
481 unlock:
482 spin_unlock(ptl);
483 out:
484 return ret;
485 }
486
487 /*
488 * Scan through pages checking if pages follow certain conditions,
489 * and move them to the pagelist if they do.
490 */
491 static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
492 unsigned long end, struct mm_walk *walk)
493 {
494 struct vm_area_struct *vma = walk->vma;
495 struct page *page;
496 struct queue_pages *qp = walk->private;
497 unsigned long flags = qp->flags;
498 int ret;
499 pte_t *pte;
500 spinlock_t *ptl;
501
502 ptl = pmd_trans_huge_lock(pmd, vma);
503 if (ptl) {
504 ret = queue_pages_pmd(pmd, ptl, addr, end, walk);
505 if (ret > 0)
506 return 0;
507 else if (ret < 0)
508 return ret;
509 }
510
511 if (pmd_trans_unstable(pmd))
512 return 0;
513 retry:
514 pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
515 for (; addr != end; pte++, addr += PAGE_SIZE) {
516 if (!pte_present(*pte))
517 continue;
518 page = vm_normal_page(vma, addr, *pte);
519 if (!page)
520 continue;
521 /*
522 * vm_normal_page() filters out zero pages, but there might
523 * still be PageReserved pages to skip, perhaps in a VDSO.
524 */
525 if (PageReserved(page))
526 continue;
527 if (!queue_pages_required(page, qp))
528 continue;
529 if (PageTransCompound(page) && !thp_migration_supported()) {
530 get_page(page);
531 pte_unmap_unlock(pte, ptl);
532 lock_page(page);
533 ret = split_huge_page(page);
534 unlock_page(page);
535 put_page(page);
536 /* Failed to split -- skip. */
537 if (ret) {
538 pte = pte_offset_map_lock(walk->mm, pmd,
539 addr, &ptl);
540 continue;
541 }
542 goto retry;
543 }
544
545 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
546 if (!vma_migratable(vma))
547 break;
548 migrate_page_add(page, qp->pagelist, flags);
549 } else
550 break;
551 }
552 pte_unmap_unlock(pte - 1, ptl);
553 cond_resched();
554 return addr != end ? -EIO : 0;
555 }
556
557 static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
558 unsigned long addr, unsigned long end,
559 struct mm_walk *walk)
560 {
561 #ifdef CONFIG_HUGETLB_PAGE
562 struct queue_pages *qp = walk->private;
563 unsigned long flags = qp->flags;
564 struct page *page;
565 spinlock_t *ptl;
566 pte_t entry;
567
568 ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
569 entry = huge_ptep_get(pte);
570 if (!pte_present(entry))
571 goto unlock;
572 page = pte_page(entry);
573 if (!queue_pages_required(page, qp))
574 goto unlock;
575 /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
576 if (flags & (MPOL_MF_MOVE_ALL) ||
577 (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
578 isolate_huge_page(page, qp->pagelist);
579 unlock:
580 spin_unlock(ptl);
581 #else
582 BUG();
583 #endif
584 return 0;
585 }
586
587 #ifdef CONFIG_NUMA_BALANCING
588 /*
589 * This is used to mark a range of virtual addresses to be inaccessible.
590 * These are later cleared by a NUMA hinting fault. Depending on these
591 * faults, pages may be migrated for better NUMA placement.
592 *
593 * This is assuming that NUMA faults are handled using PROT_NONE. If
594 * an architecture makes a different choice, it will need further
595 * changes to the core.
596 */
597 unsigned long change_prot_numa(struct vm_area_struct *vma,
598 unsigned long addr, unsigned long end)
599 {
600 int nr_updated;
601
602 nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1);
603 if (nr_updated)
604 count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
605
606 return nr_updated;
607 }
608 #else
609 static unsigned long change_prot_numa(struct vm_area_struct *vma,
610 unsigned long addr, unsigned long end)
611 {
612 return 0;
613 }
614 #endif /* CONFIG_NUMA_BALANCING */
615
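/*
 * ->test_walk() callback for queue_pages_range(): decide whether this vma
 * needs to be walked at all, enforce range contiguity unless
 * MPOL_MF_DISCONTIG_OK, and handle MPOL_MF_LAZY via change_prot_numa().
 */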
616 static int queue_pages_test_walk(unsigned long start, unsigned long end,
617 struct mm_walk *walk)
618 {
619 struct vm_area_struct *vma = walk->vma;
620 struct queue_pages *qp = walk->private;
621 unsigned long endvma = vma->vm_end;
622 unsigned long flags = qp->flags;
623
624 /*
625          * We need to check MPOL_MF_STRICT so that -EIO can still be returned,
626          * regardless of vma_migratable()
627 */
628 if (!vma_migratable(vma) &&
629 !(flags & MPOL_MF_STRICT))
630 return 1;
631
632 if (endvma > end)
633 endvma = end;
634 if (vma->vm_start > start)
635 start = vma->vm_start;
636
637 if (!(flags & MPOL_MF_DISCONTIG_OK)) {
638 if (!vma->vm_next && vma->vm_end < end)
639 return -EFAULT;
640 if (qp->prev && qp->prev->vm_end < vma->vm_start)
641 return -EFAULT;
642 }
643
644 qp->prev = vma;
645
646 if (flags & MPOL_MF_LAZY) {
647 /* Similar to task_numa_work, skip inaccessible VMAs */
648 if (!is_vm_hugetlb_page(vma) &&
649 (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) &&
650 !(vma->vm_flags & VM_MIXEDMAP))
651 change_prot_numa(vma, start, endvma);
652 return 1;
653 }
654
655 /* queue pages from current vma */
656 if (flags & MPOL_MF_VALID)
657 return 0;
658 return 1;
659 }
660
661 /*
662 * Walk through page tables and collect pages to be migrated.
663 *
664  * If pages found in a given range are on a set of nodes (determined by
665  * @nodes and @flags), they are isolated and queued to the pagelist,
666  * which is passed via @pagelist.
667 */
668 static int
669 queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
670 nodemask_t *nodes, unsigned long flags,
671 struct list_head *pagelist)
672 {
673 struct queue_pages qp = {
674 .pagelist = pagelist,
675 .flags = flags,
676 .nmask = nodes,
677 .prev = NULL,
678 };
679 struct mm_walk queue_pages_walk = {
680 .hugetlb_entry = queue_pages_hugetlb,
681 .pmd_entry = queue_pages_pte_range,
682 .test_walk = queue_pages_test_walk,
683 .mm = mm,
684 .private = &qp,
685 };
686
687 return walk_page_range(start, end, &queue_pages_walk);
688 }
689
690 /*
691 * Apply policy to a single VMA
692 * This must be called with the mmap_sem held for writing.
693 */
694 static int vma_replace_policy(struct vm_area_struct *vma,
695 struct mempolicy *pol)
696 {
697 int err;
698 struct mempolicy *old;
699 struct mempolicy *new;
700
701 pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
702 vma->vm_start, vma->vm_end, vma->vm_pgoff,
703 vma->vm_ops, vma->vm_file,
704 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
705
706 new = mpol_dup(pol);
707 if (IS_ERR(new))
708 return PTR_ERR(new);
709
710 if (vma->vm_ops && vma->vm_ops->set_policy) {
711 err = vma->vm_ops->set_policy(vma, new);
712 if (err)
713 goto err_out;
714 }
715
716 old = vma->vm_policy;
717 vma->vm_policy = new; /* protected by mmap_sem */
718 mpol_put(old);
719
720 return 0;
721 err_out:
722 mpol_put(new);
723 return err;
724 }
725
726 /* Step 2: apply policy to a range and do splits. */
727 static int mbind_range(struct mm_struct *mm, unsigned long start,
728 unsigned long end, struct mempolicy *new_pol)
729 {
730 struct vm_area_struct *next;
731 struct vm_area_struct *prev;
732 struct vm_area_struct *vma;
733 int err = 0;
734 pgoff_t pgoff;
735 unsigned long vmstart;
736 unsigned long vmend;
737
738 vma = find_vma(mm, start);
739 if (!vma || vma->vm_start > start)
740 return -EFAULT;
741
742 prev = vma->vm_prev;
743 if (start > vma->vm_start)
744 prev = vma;
745
746 for (; vma && vma->vm_start < end; prev = vma, vma = next) {
747 next = vma->vm_next;
748 vmstart = max(start, vma->vm_start);
749 vmend = min(end, vma->vm_end);
750
751 if (mpol_equal(vma_policy(vma), new_pol))
752 continue;
753
754 pgoff = vma->vm_pgoff +
755 ((vmstart - vma->vm_start) >> PAGE_SHIFT);
756 prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
757 vma->anon_vma, vma->vm_file, pgoff,
758 new_pol, vma->vm_userfaultfd_ctx,
759 vma_get_anon_name(vma));
760 if (prev) {
761 vma = prev;
762 next = vma->vm_next;
763 if (mpol_equal(vma_policy(vma), new_pol))
764 continue;
765 /* vma_merge() joined vma && vma->next, case 8 */
766 goto replace;
767 }
768 if (vma->vm_start != vmstart) {
769 err = split_vma(vma->vm_mm, vma, vmstart, 1);
770 if (err)
771 goto out;
772 }
773 if (vma->vm_end != vmend) {
774 err = split_vma(vma->vm_mm, vma, vmend, 0);
775 if (err)
776 goto out;
777 }
778 replace:
779 err = vma_replace_policy(vma, new_pol);
780 if (err)
781 goto out;
782 }
783
784 out:
785 return err;
786 }
787
788 /* Set the process memory policy */
789 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
790 nodemask_t *nodes)
791 {
792 struct mempolicy *new, *old;
793 NODEMASK_SCRATCH(scratch);
794 int ret;
795
796 if (!scratch)
797 return -ENOMEM;
798
799 new = mpol_new(mode, flags, nodes);
800 if (IS_ERR(new)) {
801 ret = PTR_ERR(new);
802 goto out;
803 }
804
805 task_lock(current);
806 ret = mpol_set_nodemask(new, nodes, scratch);
807 if (ret) {
808 task_unlock(current);
809 mpol_put(new);
810 goto out;
811 }
812 old = current->mempolicy;
813 current->mempolicy = new;
814 if (new && new->mode == MPOL_INTERLEAVE)
815 current->il_prev = MAX_NUMNODES-1;
816 task_unlock(current);
817 mpol_put(old);
818 ret = 0;
819 out:
820 NODEMASK_SCRATCH_FREE(scratch);
821 return ret;
822 }
823
824 /*
825 * Return nodemask for policy for get_mempolicy() query
826 *
827 * Called with task's alloc_lock held
828 */
829 static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
830 {
831 nodes_clear(*nodes);
832 if (p == &default_policy)
833 return;
834
835 switch (p->mode) {
836 case MPOL_BIND:
837 /* Fall through */
838 case MPOL_INTERLEAVE:
839 *nodes = p->v.nodes;
840 break;
841 case MPOL_PREFERRED:
842 if (!(p->flags & MPOL_F_LOCAL))
843 node_set(p->v.preferred_node, *nodes);
844 /* else return empty node mask for local allocation */
845 break;
846 default:
847 BUG();
848 }
849 }
850
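/* Return the node currently backing the page at @addr, or a negative errno. */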
851 static int lookup_node(unsigned long addr)
852 {
853 struct page *p;
854 int err;
855
856 err = get_user_pages(addr & PAGE_MASK, 1, 0, &p, NULL);
857 if (err >= 0) {
858 err = page_to_nid(p);
859 put_page(p);
860 }
861 return err;
862 }
863
864 /* Retrieve NUMA policy */
865 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
866 unsigned long addr, unsigned long flags)
867 {
868 int err;
869 struct mm_struct *mm = current->mm;
870 struct vm_area_struct *vma = NULL;
871 struct mempolicy *pol = current->mempolicy;
872
873 if (flags &
874 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
875 return -EINVAL;
876
877 if (flags & MPOL_F_MEMS_ALLOWED) {
878 if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
879 return -EINVAL;
880 *policy = 0; /* just so it's initialized */
881 task_lock(current);
882 *nmask = cpuset_current_mems_allowed;
883 task_unlock(current);
884 return 0;
885 }
886
887 if (flags & MPOL_F_ADDR) {
888 /*
889 * Do NOT fall back to task policy if the
890 * vma/shared policy at addr is NULL. We
891 * want to return MPOL_DEFAULT in this case.
892 */
893 down_read(&mm->mmap_sem);
894 vma = find_vma_intersection(mm, addr, addr+1);
895 if (!vma) {
896 up_read(&mm->mmap_sem);
897 return -EFAULT;
898 }
899 if (vma->vm_ops && vma->vm_ops->get_policy)
900 pol = vma->vm_ops->get_policy(vma, addr);
901 else
902 pol = vma->vm_policy;
903 } else if (addr)
904 return -EINVAL;
905
906 if (!pol)
907 pol = &default_policy; /* indicates default behavior */
908
909 if (flags & MPOL_F_NODE) {
910 if (flags & MPOL_F_ADDR) {
911 err = lookup_node(addr);
912 if (err < 0)
913 goto out;
914 *policy = err;
915 } else if (pol == current->mempolicy &&
916 pol->mode == MPOL_INTERLEAVE) {
917 *policy = next_node_in(current->il_prev, pol->v.nodes);
918 } else {
919 err = -EINVAL;
920 goto out;
921 }
922 } else {
923 *policy = pol == &default_policy ? MPOL_DEFAULT :
924 pol->mode;
925 /*
926 * Internal mempolicy flags must be masked off before exposing
927 * the policy to userspace.
928 */
929 *policy |= (pol->flags & MPOL_MODE_FLAGS);
930 }
931
932 err = 0;
933 if (nmask) {
934 if (mpol_store_user_nodemask(pol)) {
935 *nmask = pol->w.user_nodemask;
936 } else {
937 task_lock(current);
938 get_policy_nodemask(pol, nmask);
939 task_unlock(current);
940 }
941 }
942
943 out:
944 mpol_cond_put(pol);
945 if (vma)
946 up_read(&current->mm->mmap_sem);
947 return err;
948 }
949
950 #ifdef CONFIG_MIGRATION
951 /*
952 * page migration, thp tail pages can be passed.
953 */
954 static void migrate_page_add(struct page *page, struct list_head *pagelist,
955 unsigned long flags)
956 {
957 struct page *head = compound_head(page);
958 /*
959 * Avoid migrating a page that is shared with others.
960 */
961 if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(head) == 1) {
962 if (!isolate_lru_page(head)) {
963 list_add_tail(&head->lru, pagelist);
964 mod_node_page_state(page_pgdat(head),
965 NR_ISOLATED_ANON + page_is_file_cache(head),
966 hpage_nr_pages(head));
967 }
968 }
969 }
970
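/*
 * Allocation callback for migrate_pages(): allocate a replacement page
 * (hugetlb page, THP or base page, as appropriate) on the target @node.
 */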
971 static struct page *new_node_page(struct page *page, unsigned long node, int **x)
972 {
973 if (PageHuge(page))
974 return alloc_huge_page_node(page_hstate(compound_head(page)),
975 node);
976 else if (thp_migration_supported() && PageTransHuge(page)) {
977 struct page *thp;
978
979 thp = alloc_pages_node(node,
980 (GFP_TRANSHUGE | __GFP_THISNODE),
981 HPAGE_PMD_ORDER);
982 if (!thp)
983 return NULL;
984 prep_transhuge_page(thp);
985 return thp;
986 } else
987 return __alloc_pages_node(node, GFP_HIGHUSER_MOVABLE |
988 __GFP_THISNODE, 0);
989 }
990
991 /*
992 * Migrate pages from one node to a target node.
993 * Returns error or the number of pages not migrated.
994 */
995 static int migrate_to_node(struct mm_struct *mm, int source, int dest,
996 int flags)
997 {
998 nodemask_t nmask;
999 LIST_HEAD(pagelist);
1000 int err = 0;
1001
1002 nodes_clear(nmask);
1003 node_set(source, nmask);
1004
1005 /*
1006 * This does not "check" the range but isolates all pages that
1007 * need migration. Between passing in the full user address
1008 * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
1009 */
1010 VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1011 queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
1012 flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1013
1014 if (!list_empty(&pagelist)) {
1015 err = migrate_pages(&pagelist, new_node_page, NULL, dest,
1016 MIGRATE_SYNC, MR_SYSCALL);
1017 if (err)
1018 putback_movable_pages(&pagelist);
1019 }
1020
1021 return err;
1022 }
1023
1024 /*
1025 * Move pages between the two nodesets so as to preserve the physical
1026 * layout as much as possible.
1027 *
1028  * Returns the number of pages that could not be moved.
1029 */
1030 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1031 const nodemask_t *to, int flags)
1032 {
1033 int busy = 0;
1034 int err;
1035 nodemask_t tmp;
1036
1037 err = migrate_prep();
1038 if (err)
1039 return err;
1040
1041 down_read(&mm->mmap_sem);
1042
1043 /*
1044 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1045 * bit in 'to' is not also set in 'tmp'. Clear the found 'source'
1046 * bit in 'tmp', and return that <source, dest> pair for migration.
1047 * The pair of nodemasks 'to' and 'from' define the map.
1048 *
1049 * If no pair of bits is found that way, fallback to picking some
1050 * pair of 'source' and 'dest' bits that are not the same. If the
1051 * 'source' and 'dest' bits are the same, this represents a node
1052 * that will be migrating to itself, so no pages need move.
1053 *
1054 * If no bits are left in 'tmp', or if all remaining bits left
1055 * in 'tmp' correspond to the same bit in 'to', return false
1056 * (nothing left to migrate).
1057 *
1058 * This lets us pick a pair of nodes to migrate between, such that
1059 * if possible the dest node is not already occupied by some other
1060 * source node, minimizing the risk of overloading the memory on a
1061 * node that would happen if we migrated incoming memory to a node
1062          * before migrating outgoing memory from that same node.
1063 *
1064 * A single scan of tmp is sufficient. As we go, we remember the
1065 * most recent <s, d> pair that moved (s != d). If we find a pair
1066 * that not only moved, but what's better, moved to an empty slot
1067 * (d is not set in tmp), then we break out then, with that pair.
1068          * Otherwise when we finish scanning tmp, we at least have the
1069 * most recent <s, d> pair that moved. If we get all the way through
1070 * the scan of tmp without finding any node that moved, much less
1071 * moved to an empty node, then there is nothing left worth migrating.
1072 */
1073
1074 tmp = *from;
1075 while (!nodes_empty(tmp)) {
1076 int s,d;
1077 int source = NUMA_NO_NODE;
1078 int dest = 0;
1079
1080 for_each_node_mask(s, tmp) {
1081
1082 /*
1083 * do_migrate_pages() tries to maintain the relative
1084 * node relationship of the pages established between
1085 * threads and memory areas.
1086 *
1087 * However if the number of source nodes is not equal to
1088 * the number of destination nodes we can not preserve
1089 * this node relative relationship. In that case, skip
1090 * copying memory from a node that is in the destination
1091 * mask.
1092 *
1093 * Example: [2,3,4] -> [3,4,5] moves everything.
1094                          * [0-7] -> [3,4,5] moves only 0,1,2,6,7.
1095 */
1096
1097 if ((nodes_weight(*from) != nodes_weight(*to)) &&
1098 (node_isset(s, *to)))
1099 continue;
1100
1101 d = node_remap(s, *from, *to);
1102 if (s == d)
1103 continue;
1104
1105 source = s; /* Node moved. Memorize */
1106 dest = d;
1107
1108 /* dest not in remaining from nodes? */
1109 if (!node_isset(dest, tmp))
1110 break;
1111 }
1112 if (source == NUMA_NO_NODE)
1113 break;
1114
1115 node_clear(source, tmp);
1116 err = migrate_to_node(mm, source, dest, flags);
1117 if (err > 0)
1118 busy += err;
1119 if (err < 0)
1120 break;
1121 }
1122 up_read(&mm->mmap_sem);
1123 if (err < 0)
1124 return err;
1125 return busy;
1126
1127 }
1128
1129 /*
1130 * Allocate a new page for page migration based on vma policy.
1131  * Start by assuming the page is mapped by the same vma that contains @start.
1132 * Search forward from there, if not. N.B., this assumes that the
1133 * list of pages handed to migrate_pages()--which is how we get here--
1134 * is in virtual address order.
1135 */
1136 static struct page *new_page(struct page *page, unsigned long start, int **x)
1137 {
1138 struct vm_area_struct *vma;
1139 unsigned long uninitialized_var(address);
1140
1141 vma = find_vma(current->mm, start);
1142 while (vma) {
1143 address = page_address_in_vma(page, vma);
1144 if (address != -EFAULT)
1145 break;
1146 vma = vma->vm_next;
1147 }
1148
1149 if (PageHuge(page)) {
1150 BUG_ON(!vma);
1151 return alloc_huge_page_noerr(vma, address, 1);
1152 } else if (thp_migration_supported() && PageTransHuge(page)) {
1153 struct page *thp;
1154
1155 thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address,
1156 HPAGE_PMD_ORDER);
1157 if (!thp)
1158 return NULL;
1159 prep_transhuge_page(thp);
1160 return thp;
1161 }
1162 /*
1163 * if !vma, alloc_page_vma() will use task or system default policy
1164 */
1165 return alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL,
1166 vma, address);
1167 }
1168 #else
1169
1170 static void migrate_page_add(struct page *page, struct list_head *pagelist,
1171 unsigned long flags)
1172 {
1173 }
1174
1175 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1176 const nodemask_t *to, int flags)
1177 {
1178 return -ENOSYS;
1179 }
1180
1181 static struct page *new_page(struct page *page, unsigned long start, int **x)
1182 {
1183 return NULL;
1184 }
1185 #endif
1186
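/*
 * Worker for mbind(2): build the new policy, apply it to the VMAs in
 * [start, end) and, when MPOL_MF_MOVE/MPOL_MF_MOVE_ALL is set, migrate the
 * pages that do not conform to it.
 */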
1187 static long do_mbind(unsigned long start, unsigned long len,
1188 unsigned short mode, unsigned short mode_flags,
1189 nodemask_t *nmask, unsigned long flags)
1190 {
1191 struct mm_struct *mm = current->mm;
1192 struct mempolicy *new;
1193 unsigned long end;
1194 int err;
1195 LIST_HEAD(pagelist);
1196
1197 if (flags & ~(unsigned long)MPOL_MF_VALID)
1198 return -EINVAL;
1199 if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1200 return -EPERM;
1201
1202 if (start & ~PAGE_MASK)
1203 return -EINVAL;
1204
1205 if (mode == MPOL_DEFAULT)
1206 flags &= ~MPOL_MF_STRICT;
1207
1208 len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1209 end = start + len;
1210
1211 if (end < start)
1212 return -EINVAL;
1213 if (end == start)
1214 return 0;
1215
1216 new = mpol_new(mode, mode_flags, nmask);
1217 if (IS_ERR(new))
1218 return PTR_ERR(new);
1219
1220 if (flags & MPOL_MF_LAZY)
1221 new->flags |= MPOL_F_MOF;
1222
1223 /*
1224 * If we are using the default policy then operation
1225 * on discontinuous address spaces is okay after all
1226 */
1227 if (!new)
1228 flags |= MPOL_MF_DISCONTIG_OK;
1229
1230 pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1231 start, start + len, mode, mode_flags,
1232 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
1233
1234 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1235
1236 err = migrate_prep();
1237 if (err)
1238 goto mpol_out;
1239 }
1240 {
1241 NODEMASK_SCRATCH(scratch);
1242 if (scratch) {
1243 down_write(&mm->mmap_sem);
1244 task_lock(current);
1245 err = mpol_set_nodemask(new, nmask, scratch);
1246 task_unlock(current);
1247 if (err)
1248 up_write(&mm->mmap_sem);
1249 } else
1250 err = -ENOMEM;
1251 NODEMASK_SCRATCH_FREE(scratch);
1252 }
1253 if (err)
1254 goto mpol_out;
1255
1256 err = queue_pages_range(mm, start, end, nmask,
1257 flags | MPOL_MF_INVERT, &pagelist);
1258 if (!err)
1259 err = mbind_range(mm, start, end, new);
1260
1261 if (!err) {
1262 int nr_failed = 0;
1263
1264 if (!list_empty(&pagelist)) {
1265 WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1266 nr_failed = migrate_pages(&pagelist, new_page, NULL,
1267 start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
1268 if (nr_failed)
1269 putback_movable_pages(&pagelist);
1270 }
1271
1272 if (nr_failed && (flags & MPOL_MF_STRICT))
1273 err = -EIO;
1274 } else
1275 putback_movable_pages(&pagelist);
1276
1277 up_write(&mm->mmap_sem);
1278 mpol_out:
1279 mpol_put(new);
1280 return err;
1281 }
1282
1283 /*
1284 * User space interface with variable sized bitmaps for nodelists.
1285 */
1286
1287 /* Copy a node mask from user space. */
1288 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1289 unsigned long maxnode)
1290 {
1291 unsigned long k;
1292 unsigned long t;
1293 unsigned long nlongs;
1294 unsigned long endmask;
1295
1296 --maxnode;
1297 nodes_clear(*nodes);
1298 if (maxnode == 0 || !nmask)
1299 return 0;
1300 if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1301 return -EINVAL;
1302
1303 nlongs = BITS_TO_LONGS(maxnode);
1304 if ((maxnode % BITS_PER_LONG) == 0)
1305 endmask = ~0UL;
1306 else
1307 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1308
1309 /*
1310          * When the user specified more nodes than supported, just check
1311          * if the unsupported part is all zero.
1312          *
1313          * If maxnode has more longs than MAX_NUMNODES, check the bits in
1314          * that area first, and then go through the remaining bits, which
1315          * are equal to or bigger than MAX_NUMNODES.
1316 * Otherwise, just check bits [MAX_NUMNODES, maxnode).
1317 */
1318 if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1319 if (nlongs > PAGE_SIZE/sizeof(long))
1320 return -EINVAL;
1321 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1322 if (get_user(t, nmask + k))
1323 return -EFAULT;
1324 if (k == nlongs - 1) {
1325 if (t & endmask)
1326 return -EINVAL;
1327 } else if (t)
1328 return -EINVAL;
1329 }
1330 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1331 endmask = ~0UL;
1332 }
1333
1334 if (maxnode > MAX_NUMNODES && MAX_NUMNODES % BITS_PER_LONG != 0) {
1335 unsigned long valid_mask = endmask;
1336
1337 valid_mask &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
1338 if (get_user(t, nmask + nlongs - 1))
1339 return -EFAULT;
1340 if (t & valid_mask)
1341 return -EINVAL;
1342 }
1343
1344 if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1345 return -EFAULT;
1346 nodes_addr(*nodes)[nlongs-1] &= endmask;
1347 return 0;
1348 }
1349
1350 /* Copy a kernel node mask to user space */
1351 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1352 nodemask_t *nodes)
1353 {
1354 unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1355 unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
1356
1357 if (copy > nbytes) {
1358 if (copy > PAGE_SIZE)
1359 return -EINVAL;
1360 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1361 return -EFAULT;
1362 copy = nbytes;
1363 }
1364 return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1365 }
1366
1367 SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1368 unsigned long, mode, const unsigned long __user *, nmask,
1369 unsigned long, maxnode, unsigned, flags)
1370 {
1371 nodemask_t nodes;
1372 int err;
1373 unsigned short mode_flags;
1374
1375 mode_flags = mode & MPOL_MODE_FLAGS;
1376 mode &= ~MPOL_MODE_FLAGS;
1377 if (mode >= MPOL_MAX)
1378 return -EINVAL;
1379 if ((mode_flags & MPOL_F_STATIC_NODES) &&
1380 (mode_flags & MPOL_F_RELATIVE_NODES))
1381 return -EINVAL;
1382 err = get_nodes(&nodes, nmask, maxnode);
1383 if (err)
1384 return err;
1385 return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1386 }
1387
1388 /* Set the process memory policy */
1389 SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
1390 unsigned long, maxnode)
1391 {
1392 int err;
1393 nodemask_t nodes;
1394 unsigned short flags;
1395
1396 flags = mode & MPOL_MODE_FLAGS;
1397 mode &= ~MPOL_MODE_FLAGS;
1398 if ((unsigned int)mode >= MPOL_MAX)
1399 return -EINVAL;
1400 if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1401 return -EINVAL;
1402 err = get_nodes(&nodes, nmask, maxnode);
1403 if (err)
1404 return err;
1405 return do_set_mempolicy(mode, flags, &nodes);
1406 }
1407
1408 SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1409 const unsigned long __user *, old_nodes,
1410 const unsigned long __user *, new_nodes)
1411 {
1412 const struct cred *cred = current_cred(), *tcred;
1413 struct mm_struct *mm = NULL;
1414 struct task_struct *task;
1415 nodemask_t task_nodes;
1416 int err;
1417 nodemask_t *old;
1418 nodemask_t *new;
1419 NODEMASK_SCRATCH(scratch);
1420
1421 if (!scratch)
1422 return -ENOMEM;
1423
1424 old = &scratch->mask1;
1425 new = &scratch->mask2;
1426
1427 err = get_nodes(old, old_nodes, maxnode);
1428 if (err)
1429 goto out;
1430
1431 err = get_nodes(new, new_nodes, maxnode);
1432 if (err)
1433 goto out;
1434
1435 /* Find the mm_struct */
1436 rcu_read_lock();
1437 task = pid ? find_task_by_vpid(pid) : current;
1438 if (!task) {
1439 rcu_read_unlock();
1440 err = -ESRCH;
1441 goto out;
1442 }
1443 get_task_struct(task);
1444
1445 err = -EINVAL;
1446
1447 /*
1448 * Check if this process has the right to modify the specified
1449 * process. The right exists if the process has administrative
1450 * capabilities, superuser privileges or the same
1451 * userid as the target process.
1452 */
1453 tcred = __task_cred(task);
1454 if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
1455 !uid_eq(cred->uid, tcred->suid) && !uid_eq(cred->uid, tcred->uid) &&
1456 !capable(CAP_SYS_NICE)) {
1457 rcu_read_unlock();
1458 err = -EPERM;
1459 goto out_put;
1460 }
1461 rcu_read_unlock();
1462
1463 task_nodes = cpuset_mems_allowed(task);
1464 /* Is the user allowed to access the target nodes? */
1465 if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1466 err = -EPERM;
1467 goto out_put;
1468 }
1469
1470 task_nodes = cpuset_mems_allowed(current);
1471 nodes_and(*new, *new, task_nodes);
1472 if (nodes_empty(*new))
1473 goto out_put;
1474
1475 nodes_and(*new, *new, node_states[N_MEMORY]);
1476 if (nodes_empty(*new))
1477 goto out_put;
1478
1479 err = security_task_movememory(task);
1480 if (err)
1481 goto out_put;
1482
1483 mm = get_task_mm(task);
1484 put_task_struct(task);
1485
1486 if (!mm) {
1487 err = -EINVAL;
1488 goto out;
1489 }
1490
1491 err = do_migrate_pages(mm, old, new,
1492 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1493
1494 mmput(mm);
1495 out:
1496 NODEMASK_SCRATCH_FREE(scratch);
1497
1498 return err;
1499
1500 out_put:
1501 put_task_struct(task);
1502 goto out;
1503
1504 }
1505
1506
1507 /* Retrieve NUMA policy */
1508 SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1509 unsigned long __user *, nmask, unsigned long, maxnode,
1510 unsigned long, addr, unsigned long, flags)
1511 {
1512 int err;
1513 int uninitialized_var(pval);
1514 nodemask_t nodes;
1515
1516 if (nmask != NULL && maxnode < nr_node_ids)
1517 return -EINVAL;
1518
1519 err = do_get_mempolicy(&pval, &nodes, addr, flags);
1520
1521 if (err)
1522 return err;
1523
1524 if (policy && put_user(pval, policy))
1525 return -EFAULT;
1526
1527 if (nmask)
1528 err = copy_nodes_to_user(nmask, maxnode, &nodes);
1529
1530 return err;
1531 }
1532
1533 #ifdef CONFIG_COMPAT
1534
1535 COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1536 compat_ulong_t __user *, nmask,
1537 compat_ulong_t, maxnode,
1538 compat_ulong_t, addr, compat_ulong_t, flags)
1539 {
1540 long err;
1541 unsigned long __user *nm = NULL;
1542 unsigned long nr_bits, alloc_size;
1543 DECLARE_BITMAP(bm, MAX_NUMNODES);
1544
1545 nr_bits = min_t(unsigned long, maxnode-1, nr_node_ids);
1546 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1547
1548 if (nmask)
1549 nm = compat_alloc_user_space(alloc_size);
1550
1551 err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1552
1553 if (!err && nmask) {
1554 unsigned long copy_size;
1555 copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1556 err = copy_from_user(bm, nm, copy_size);
1557 /* ensure entire bitmap is zeroed */
1558 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1559 err |= compat_put_bitmap(nmask, bm, nr_bits);
1560 }
1561
1562 return err;
1563 }
1564
1565 COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
1566 compat_ulong_t, maxnode)
1567 {
1568 unsigned long __user *nm = NULL;
1569 unsigned long nr_bits, alloc_size;
1570 DECLARE_BITMAP(bm, MAX_NUMNODES);
1571
1572 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1573 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1574
1575 if (nmask) {
1576 if (compat_get_bitmap(bm, nmask, nr_bits))
1577 return -EFAULT;
1578 nm = compat_alloc_user_space(alloc_size);
1579 if (copy_to_user(nm, bm, alloc_size))
1580 return -EFAULT;
1581 }
1582
1583 return sys_set_mempolicy(mode, nm, nr_bits+1);
1584 }
1585
1586 COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
1587 compat_ulong_t, mode, compat_ulong_t __user *, nmask,
1588 compat_ulong_t, maxnode, compat_ulong_t, flags)
1589 {
1590 unsigned long __user *nm = NULL;
1591 unsigned long nr_bits, alloc_size;
1592 nodemask_t bm;
1593
1594 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1595 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1596
1597 if (nmask) {
1598 if (compat_get_bitmap(nodes_addr(bm), nmask, nr_bits))
1599 return -EFAULT;
1600 nm = compat_alloc_user_space(alloc_size);
1601 if (copy_to_user(nm, nodes_addr(bm), alloc_size))
1602 return -EFAULT;
1603 }
1604
1605 return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1606 }
1607
1608 #endif
1609
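/*
 * Return the policy attached to @vma at @addr, if any, without falling back
 * to the task or system default policy. An extra reference is taken only
 * for shared (MPOL_F_SHARED) policies; drop it with mpol_cond_put().
 */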
1610 struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
1611 unsigned long addr)
1612 {
1613 struct mempolicy *pol = NULL;
1614
1615 if (vma) {
1616 if (vma->vm_ops && vma->vm_ops->get_policy) {
1617 pol = vma->vm_ops->get_policy(vma, addr);
1618 } else if (vma->vm_policy) {
1619 pol = vma->vm_policy;
1620
1621 /*
1622 * shmem_alloc_page() passes MPOL_F_SHARED policy with
1623 * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1624 * count on these policies which will be dropped by
1625 * mpol_cond_put() later
1626 */
1627 if (mpol_needs_cond_ref(pol))
1628 mpol_get(pol);
1629 }
1630 }
1631
1632 return pol;
1633 }
1634
1635 /*
1636 * get_vma_policy(@vma, @addr)
1637 * @vma: virtual memory area whose policy is sought
1638 * @addr: address in @vma for shared policy lookup
1639 *
1640 * Returns effective policy for a VMA at specified address.
1641 * Falls back to current->mempolicy or system default policy, as necessary.
1642 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1643 * count--added by the get_policy() vm_op, as appropriate--to protect against
1644 * freeing by another task. It is the caller's responsibility to free the
1645 * extra reference for shared policies.
1646 */
1647 static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
1648 unsigned long addr)
1649 {
1650 struct mempolicy *pol = __get_vma_policy(vma, addr);
1651
1652 if (!pol)
1653 pol = get_task_policy(current);
1654
1655 return pol;
1656 }
1657
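/* Does the effective policy of @vma request migrate-on-fault (MPOL_F_MOF)? */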
1658 bool vma_policy_mof(struct vm_area_struct *vma)
1659 {
1660 struct mempolicy *pol;
1661
1662 if (vma->vm_ops && vma->vm_ops->get_policy) {
1663 bool ret = false;
1664
1665 pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1666 if (pol && (pol->flags & MPOL_F_MOF))
1667 ret = true;
1668 mpol_cond_put(pol);
1669
1670 return ret;
1671 }
1672
1673 pol = vma->vm_policy;
1674 if (!pol)
1675 pol = get_task_policy(current);
1676
1677 return pol->flags & MPOL_F_MOF;
1678 }
1679
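/*
 * Should the policy nodemask be applied to an allocation for @zone?
 * Only zones at or above the (possibly adjusted) policy zone are policied.
 */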
1680 static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1681 {
1682 enum zone_type dynamic_policy_zone = policy_zone;
1683
1684 BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1685
1686 /*
1687 * if policy->v.nodes has movable memory only,
1688 * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
1689 *
1690          * policy->v.nodes is intersected with node_states[N_MEMORY],
1691          * so if the following test fails, it implies
1692 * policy->v.nodes has movable memory only.
1693 */
1694 if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
1695 dynamic_policy_zone = ZONE_MOVABLE;
1696
1697 return zone >= dynamic_policy_zone;
1698 }
1699
1700 /*
1701 * Return a nodemask representing a mempolicy for filtering nodes for
1702 * page allocation
1703 */
1704 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1705 {
1706 /* Lower zones don't get a nodemask applied for MPOL_BIND */
1707 if (unlikely(policy->mode == MPOL_BIND) &&
1708 apply_policy_zone(policy, gfp_zone(gfp)) &&
1709 cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1710 return &policy->v.nodes;
1711
1712 return NULL;
1713 }
1714
1715 /* Return the node id preferred by the given mempolicy, or the given id */
1716 static int policy_node(gfp_t gfp, struct mempolicy *policy,
1717 int nd)
1718 {
1719 if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL))
1720 nd = policy->v.preferred_node;
1721 else {
1722 /*
1723 * __GFP_THISNODE shouldn't even be used with the bind policy
1724 * because we might easily break the expectation to stay on the
1725 * requested node and not break the policy.
1726 */
1727 WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));
1728 }
1729
1730 return nd;
1731 }
1732
1733 /* Do dynamic interleaving for a process */
1734 static unsigned interleave_nodes(struct mempolicy *policy)
1735 {
1736 unsigned next;
1737 struct task_struct *me = current;
1738
1739 next = next_node_in(me->il_prev, policy->v.nodes);
1740 if (next < MAX_NUMNODES)
1741 me->il_prev = next;
1742 return next;
1743 }
1744
1745 /*
1746 * Depending on the memory policy provide a node from which to allocate the
1747 * next slab entry.
1748 */
1749 unsigned int mempolicy_slab_node(void)
1750 {
1751 struct mempolicy *policy;
1752 int node = numa_mem_id();
1753
1754 if (in_interrupt())
1755 return node;
1756
1757 policy = current->mempolicy;
1758 if (!policy || policy->flags & MPOL_F_LOCAL)
1759 return node;
1760
1761 switch (policy->mode) {
1762 case MPOL_PREFERRED:
1763 /*
1764 * handled MPOL_F_LOCAL above
1765 */
1766 return policy->v.preferred_node;
1767
1768 case MPOL_INTERLEAVE:
1769 return interleave_nodes(policy);
1770
1771 case MPOL_BIND: {
1772 struct zoneref *z;
1773
1774 /*
1775 * Follow bind policy behavior and start allocation at the
1776 * first node.
1777 */
1778 struct zonelist *zonelist;
1779 enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1780 zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
1781 z = first_zones_zonelist(zonelist, highest_zoneidx,
1782 &policy->v.nodes);
1783 return z->zone ? z->zone->node : node;
1784 }
1785
1786 default:
1787 BUG();
1788 }
1789 }
1790
1791 /*
1792 * Do static interleaving for a VMA with known offset @n. Returns the n'th
1793 * node in pol->v.nodes (starting from n=0), wrapping around if n exceeds the
1794 * number of present nodes.
1795 */
1796 static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
1797 {
1798 unsigned nnodes = nodes_weight(pol->v.nodes);
1799 unsigned target;
1800 int i;
1801 int nid;
1802
1803 if (!nnodes)
1804 return numa_node_id();
1805 target = (unsigned int)n % nnodes;
1806 nid = first_node(pol->v.nodes);
1807 for (i = 0; i < target; i++)
1808 nid = next_node(nid, pol->v.nodes);
1809 return nid;
1810 }
1811
1812 /* Determine a node number for interleave */
1813 static inline unsigned interleave_nid(struct mempolicy *pol,
1814 struct vm_area_struct *vma, unsigned long addr, int shift)
1815 {
1816 if (vma) {
1817 unsigned long off;
1818
1819 /*
1820 * for small pages, there is no difference between
1821 * shift and PAGE_SHIFT, so the bit-shift is safe.
1822 * for huge pages, since vm_pgoff is in units of small
1823 * pages, we need to shift off the always 0 bits to get
1824 * a useful offset.
1825 */
1826 BUG_ON(shift < PAGE_SHIFT);
1827 off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1828 off += (addr - vma->vm_start) >> shift;
1829 return offset_il_node(pol, off);
1830 } else
1831 return interleave_nodes(pol);
1832 }
1833
1834 #ifdef CONFIG_HUGETLBFS
1835 /*
1836 * huge_node(@vma, @addr, @gfp_flags, @mpol)
1837 * @vma: virtual memory area whose policy is sought
1838 * @addr: address in @vma for shared policy lookup and interleave policy
1839 * @gfp_flags: for requested zone
1840 * @mpol: pointer to mempolicy pointer for reference counted mempolicy
1841 * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
1842 *
1843 * Returns a nid suitable for a huge page allocation and a pointer
1844 * to the struct mempolicy for conditional unref after allocation.
1845  * If the effective policy is 'BIND', returns a pointer to the mempolicy's
1846 * @nodemask for filtering the zonelist.
1847 *
1848 * Must be protected by read_mems_allowed_begin()
1849 */
1850 int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
1851 struct mempolicy **mpol, nodemask_t **nodemask)
1852 {
1853 int nid;
1854
1855 *mpol = get_vma_policy(vma, addr);
1856 *nodemask = NULL; /* assume !MPOL_BIND */
1857
1858 if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1859 nid = interleave_nid(*mpol, vma, addr,
1860 huge_page_shift(hstate_vma(vma)));
1861 } else {
1862 nid = policy_node(gfp_flags, *mpol, numa_node_id());
1863 if ((*mpol)->mode == MPOL_BIND)
1864 *nodemask = &(*mpol)->v.nodes;
1865 }
1866 return nid;
1867 }
1868
1869 /*
1870 * init_nodemask_of_mempolicy
1871 *
1872 * If the current task's mempolicy is "default" [NULL], return 'false'
1873 * to indicate default policy. Otherwise, extract the policy nodemask
1874 * for 'bind' or 'interleave' policy into the argument nodemask, or
1875 * initialize the argument nodemask to contain the single node for
1876 * 'preferred' or 'local' policy and return 'true' to indicate presence
1877 * of non-default mempolicy.
1878 *
1879 * We don't bother with reference counting the mempolicy [mpol_get/put]
1880  * because the current task is examining its own mempolicy and a task's
1881 * mempolicy is only ever changed by the task itself.
1882 *
1883 * N.B., it is the caller's responsibility to free a returned nodemask.
1884 */
1885 bool init_nodemask_of_mempolicy(nodemask_t *mask)
1886 {
1887 struct mempolicy *mempolicy;
1888 int nid;
1889
1890 if (!(mask && current->mempolicy))
1891 return false;
1892
1893 task_lock(current);
1894 mempolicy = current->mempolicy;
1895 switch (mempolicy->mode) {
1896 case MPOL_PREFERRED:
1897 if (mempolicy->flags & MPOL_F_LOCAL)
1898 nid = numa_node_id();
1899 else
1900 nid = mempolicy->v.preferred_node;
1901 init_nodemask_of_node(mask, nid);
1902 break;
1903
1904 case MPOL_BIND:
1905 /* Fall through */
1906 case MPOL_INTERLEAVE:
1907 *mask = mempolicy->v.nodes;
1908 break;
1909
1910 default:
1911 BUG();
1912 }
1913 task_unlock(current);
1914
1915 return true;
1916 }
1917 #endif
1918
1919 /*
1920 * mempolicy_nodemask_intersects
1921 *
1922 * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
1923 * policy. Otherwise, check for intersection between mask and the policy
1924  * nodemask for 'bind' or 'interleave' policy. For 'preferred' or 'local'
1925 * policy, always return true since it may allocate elsewhere on fallback.
1926 *
1927 * Takes task_lock(tsk) to prevent freeing of its mempolicy.
1928 */
1929 bool mempolicy_nodemask_intersects(struct task_struct *tsk,
1930 const nodemask_t *mask)
1931 {
1932 struct mempolicy *mempolicy;
1933 bool ret = true;
1934
1935 if (!mask)
1936 return ret;
1937 task_lock(tsk);
1938 mempolicy = tsk->mempolicy;
1939 if (!mempolicy)
1940 goto out;
1941
1942 switch (mempolicy->mode) {
1943 case MPOL_PREFERRED:
1944 /*
1945 * MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes to
1946                  * allocate from; they may fall back to other nodes when oom.
1947 * Thus, it's possible for tsk to have allocated memory from
1948 * nodes in mask.
1949 */
1950 break;
1951 case MPOL_BIND:
1952 case MPOL_INTERLEAVE:
1953 ret = nodes_intersects(mempolicy->v.nodes, *mask);
1954 break;
1955 default:
1956 BUG();
1957 }
1958 out:
1959 task_unlock(tsk);
1960 return ret;
1961 }
1962
1963 /* Allocate a page in interleaved policy.
1964 Own path because it needs to do special accounting. */
1965 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1966 unsigned nid)
1967 {
1968 struct page *page;
1969
1970 page = __alloc_pages(gfp, order, nid);
1971 if (page && page_to_nid(page) == nid) {
1972 preempt_disable();
1973 __inc_numa_state(page_zone(page), NUMA_INTERLEAVE_HIT);
1974 preempt_enable();
1975 }
1976 return page;
1977 }
1978
1979 /**
1980 * alloc_pages_vma - Allocate a page for a VMA.
1981 *
1982 * @gfp:
1983 * %GFP_USER user allocation.
1984 * %GFP_KERNEL kernel allocations,
1985 * %GFP_HIGHMEM highmem/user allocations,
1986 * %GFP_FS allocation should not call back into a file system.
1987 * %GFP_ATOMIC don't sleep.
1988 *
1989 * @order:Order of the GFP allocation.
1990 * @vma: Pointer to VMA or NULL if not available.
1991 * @addr: Virtual Address of the allocation. Must be inside the VMA.
1992 * @node: Which node to prefer for allocation (modulo policy).
1993 * @hugepage: for hugepages try only the preferred node if possible
1994 *
1995 * This function allocates a page from the kernel page pool and applies
1996 * a NUMA policy associated with the VMA or the current process.
1997 * When VMA is not NULL, the caller must hold down_read on the mmap_sem of the
1998 * mm_struct of the VMA to prevent it from going away. Should be used for
1999 * all allocations for pages that will be mapped into user space. Returns
2000 * NULL when no page can be allocated.
2001 */
2002 struct page *
2003 alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
2004 unsigned long addr, int node, bool hugepage)
2005 {
2006 struct mempolicy *pol;
2007 struct page *page;
2008 int preferred_nid;
2009 nodemask_t *nmask;
2010
2011 pol = get_vma_policy(vma, addr);
2012
2013 if (pol->mode == MPOL_INTERLEAVE) {
2014 unsigned nid;
2015
2016 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
2017 mpol_cond_put(pol);
2018 page = alloc_page_interleave(gfp, order, nid);
2019 goto out;
2020 }
2021
2022 if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
2023 int hpage_node = node;
2024
2025 /*
2026 * For hugepage allocation and non-interleave policy which
2027 * allows the current node (or other explicitly preferred
2028 * node), we only try to allocate from the current/preferred
2029 * node and don't fall back to other nodes, as the cost of
2030 * remote accesses would likely offset THP benefits.
2031 *
2032 * If the policy is interleave, or does not allow the current
2033 * node in its nodemask, we allocate the standard way.
2034 */
2035 if (pol->mode == MPOL_PREFERRED &&
2036 !(pol->flags & MPOL_F_LOCAL))
2037 hpage_node = pol->v.preferred_node;
2038
2039 nmask = policy_nodemask(gfp, pol);
2040 if (!nmask || node_isset(hpage_node, *nmask)) {
2041 mpol_cond_put(pol);
2042 /*
2043 * We cannot invoke reclaim if __GFP_THISNODE
2044 * is set. Invoking reclaim with
2045 * __GFP_THISNODE set would cause THP
2046 * allocations to trigger heavy swapping
2047 * even though there may be tons of free memory
2048 * (including potentially plenty of THP
2049 * already available in the buddy) on all the
2050 * other NUMA nodes.
2051 *
2052 * At most we could invoke compaction when
2053 * __GFP_THISNODE is set (but we would need to
2054 * refrain from invoking reclaim even if
2055 * compaction returned COMPACT_SKIPPED because
2056 * there wasn't enough memory for compaction
2057 * to succeed). For now just avoid
2058 * __GFP_THISNODE instead of limiting the
2059 * allocation path to a strict and single
2060 * compaction invocation.
2061 *
2062 * Presumably, if direct reclaim was enabled by
2063 * the caller, the app prefers THP regardless
2064 * of the node it comes from, so this is
2065 * more desirable behavior than only
2066 * providing THP originating from the local
2067 * node in such a case.
2068 */
2069 if (!(gfp & __GFP_DIRECT_RECLAIM))
2070 gfp |= __GFP_THISNODE;
2071 page = __alloc_pages_node(hpage_node, gfp, order);
2072 goto out;
2073 }
2074 }
2075
2076 nmask = policy_nodemask(gfp, pol);
2077 preferred_nid = policy_node(gfp, pol, node);
2078 page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask);
2079 mpol_cond_put(pol);
2080 out:
2081 return page;
2082 }
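/*
 * Hedged usage sketch: callers normally reach alloc_pages_vma() through the
 * alloc_page_vma()/alloc_hugepage_vma() wrappers in gfp.h, e.g. when
 * populating an anonymous page during a fault, with mmap_sem already held
 * for read by the fault path (simplified):
 *
 *	struct page *page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, addr);
 *	if (!page)
 *		return VM_FAULT_OOM;
 */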
2083
2084 /**
2085 * alloc_pages_current - Allocate pages.
2086 *
2087 * @gfp:
2088 * %GFP_USER user allocation.
2089 * %GFP_KERNEL kernel allocation.
2090 * %GFP_HIGHMEM highmem allocation.
2091 * %GFP_FS don't call back into a file system.
2092 * %GFP_ATOMIC don't sleep.
2093 * @order: Power of two of allocation size in pages. 0 is a single page.
2094 *
2095 * Allocate a page from the kernel page pool. When not in
2096 * interrupt context, apply the current process' NUMA policy.
2097 * Returns NULL when no page can be allocated.
2098 */
2099 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
2100 {
2101 struct mempolicy *pol = &default_policy;
2102 struct page *page;
2103
2104 if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2105 pol = get_task_policy(current);
2106
2107 /*
2108 * No reference counting needed for current->mempolicy
2109 * nor system default_policy
2110 */
2111 if (pol->mode == MPOL_INTERLEAVE)
2112 page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2113 else
2114 page = __alloc_pages_nodemask(gfp, order,
2115 policy_node(gfp, pol, numa_node_id()),
2116 policy_nodemask(gfp, pol));
2117
2118 return page;
2119 }
2120 EXPORT_SYMBOL(alloc_pages_current);
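/*
 * Hedged usage note: on CONFIG_NUMA kernels the generic alloc_pages() and
 * alloc_page() helpers in gfp.h expand to alloc_pages_current(), so a plain
 * allocation such as the sketch below (order 2, i.e. four contiguous pages)
 * is already policy-aware:
 *
 *	struct page *page = alloc_pages(GFP_KERNEL, 2);
 *	if (page)
 *		__free_pages(page, 2);
 */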
2121
2122 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2123 {
2124 struct mempolicy *pol = mpol_dup(vma_policy(src));
2125
2126 if (IS_ERR(pol))
2127 return PTR_ERR(pol);
2128 dst->vm_policy = pol;
2129 return 0;
2130 }
2131
2132 /*
2133 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
2134 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
2135 * with the mems_allowed returned by cpuset_mems_allowed(). This
2136 * keeps mempolicies cpuset-relative after its cpuset moves. See
2137 * also update_nodemask() in kernel/cpuset.c.
2138 *
2139 * current's mempolicy may be rebound by the other task (the task that changes
2140 * the cpuset's mems), so we needn't do the rebind work for the current task.
2141 */
2142
2143 /* Slow path of a mempolicy duplicate */
2144 struct mempolicy *__mpol_dup(struct mempolicy *old)
2145 {
2146 struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2147
2148 if (!new)
2149 return ERR_PTR(-ENOMEM);
2150
2151 /* task's mempolicy is protected by alloc_lock */
2152 if (old == current->mempolicy) {
2153 task_lock(current);
2154 *new = *old;
2155 task_unlock(current);
2156 } else
2157 *new = *old;
2158
2159 if (current_cpuset_is_being_rebound()) {
2160 nodemask_t mems = cpuset_mems_allowed(current);
2161 mpol_rebind_policy(new, &mems);
2162 }
2163 atomic_set(&new->refcnt, 1);
2164 return new;
2165 }
2166
2167 /* Slow path of a mempolicy comparison */
2168 bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2169 {
2170 if (!a || !b)
2171 return false;
2172 if (a->mode != b->mode)
2173 return false;
2174 if (a->flags != b->flags)
2175 return false;
2176 if (mpol_store_user_nodemask(a))
2177 if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2178 return false;
2179
2180 switch (a->mode) {
2181 case MPOL_BIND:
2182 /* Fall through */
2183 case MPOL_INTERLEAVE:
2184 return !!nodes_equal(a->v.nodes, b->v.nodes);
2185 case MPOL_PREFERRED:
2186 /* a's ->flags is the same as b's */
2187 if (a->flags & MPOL_F_LOCAL)
2188 return true;
2189 return a->v.preferred_node == b->v.preferred_node;
2190 default:
2191 BUG();
2192 return false;
2193 }
2194 }
2195
2196 /*
2197 * Shared memory backing store policy support.
2198 *
2199 * Remember policies even when nobody has shared memory mapped.
2200 * The policies are kept in Red-Black tree linked from the inode.
2201 * They are protected by the sp->lock rwlock, which should be held
2202 * for any accesses to the tree.
2203 */
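/*
 * Illustrative note: the tree below is keyed by page offsets into the
 * backing object, so an sp_node covering [0, 4) applies its mempolicy to
 * the first four pages of the file, independent of where (or whether) the
 * object is currently mapped.
 */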
2204
2205 /*
2206 * Look up the first element intersecting start-end. Caller holds sp->lock for
2207 * reading or for writing
2208 */
2209 static struct sp_node *
2210 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2211 {
2212 struct rb_node *n = sp->root.rb_node;
2213
2214 while (n) {
2215 struct sp_node *p = rb_entry(n, struct sp_node, nd);
2216
2217 if (start >= p->end)
2218 n = n->rb_right;
2219 else if (end <= p->start)
2220 n = n->rb_left;
2221 else
2222 break;
2223 }
2224 if (!n)
2225 return NULL;
2226 for (;;) {
2227 struct sp_node *w = NULL;
2228 struct rb_node *prev = rb_prev(n);
2229 if (!prev)
2230 break;
2231 w = rb_entry(prev, struct sp_node, nd);
2232 if (w->end <= start)
2233 break;
2234 n = prev;
2235 }
2236 return rb_entry(n, struct sp_node, nd);
2237 }
2238
2239 /*
2240 * Insert a new shared policy into the list. Caller holds sp->lock for
2241 * writing.
2242 */
2243 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2244 {
2245 struct rb_node **p = &sp->root.rb_node;
2246 struct rb_node *parent = NULL;
2247 struct sp_node *nd;
2248
2249 while (*p) {
2250 parent = *p;
2251 nd = rb_entry(parent, struct sp_node, nd);
2252 if (new->start < nd->start)
2253 p = &(*p)->rb_left;
2254 else if (new->end > nd->end)
2255 p = &(*p)->rb_right;
2256 else
2257 BUG();
2258 }
2259 rb_link_node(&new->nd, parent, p);
2260 rb_insert_color(&new->nd, &sp->root);
2261 pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2262 new->policy ? new->policy->mode : 0);
2263 }
2264
2265 /* Find shared policy intersecting idx */
2266 struct mempolicy *
2267 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2268 {
2269 struct mempolicy *pol = NULL;
2270 struct sp_node *sn;
2271
2272 if (!sp->root.rb_node)
2273 return NULL;
2274 read_lock(&sp->lock);
2275 sn = sp_lookup(sp, idx, idx+1);
2276 if (sn) {
2277 mpol_get(sn->policy);
2278 pol = sn->policy;
2279 }
2280 read_unlock(&sp->lock);
2281 return pol;
2282 }
2283
2284 static void sp_free(struct sp_node *n)
2285 {
2286 mpol_put(n->policy);
2287 kmem_cache_free(sn_cache, n);
2288 }
2289
2290 /**
2291 * mpol_misplaced - check whether current page node is valid in policy
2292 *
2293 * @page: page to be checked
2294 * @vma: vm area where page mapped
2295 * @addr: virtual address where page mapped
2296 *
2297 * Look up the current policy node id for vma, addr and "compare to" the page's
2298 * node id.
2299 *
2300 * Returns:
2301 * -1 - not misplaced, page is in the right node
2302 * node - node id where the page should be
2303 *
2304 * Policy determination "mimics" alloc_page_vma().
2305 * Called from fault path where we know the vma and faulting address.
2306 */
2307 int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2308 {
2309 struct mempolicy *pol;
2310 struct zoneref *z;
2311 int curnid = page_to_nid(page);
2312 unsigned long pgoff;
2313 int thiscpu = raw_smp_processor_id();
2314 int thisnid = cpu_to_node(thiscpu);
2315 int polnid = -1;
2316 int ret = -1;
2317
2318 pol = get_vma_policy(vma, addr);
2319 if (!(pol->flags & MPOL_F_MOF))
2320 goto out;
2321
2322 switch (pol->mode) {
2323 case MPOL_INTERLEAVE:
2324 pgoff = vma->vm_pgoff;
2325 pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2326 polnid = offset_il_node(pol, pgoff);
2327 break;
2328
2329 case MPOL_PREFERRED:
2330 if (pol->flags & MPOL_F_LOCAL)
2331 polnid = numa_node_id();
2332 else
2333 polnid = pol->v.preferred_node;
2334 break;
2335
2336 case MPOL_BIND:
2337
2338 /*
2339 * MPOL_BIND allows binding to multiple nodes.
2340 * Use the current node if it is in the policy nodemask,
2341 * else select the nearest allowed node, if any.
2342 * If there are no allowed nodes, use the current one [!misplaced].
2343 */
2344 if (node_isset(curnid, pol->v.nodes))
2345 goto out;
2346 z = first_zones_zonelist(
2347 node_zonelist(numa_node_id(), GFP_HIGHUSER),
2348 gfp_zone(GFP_HIGHUSER),
2349 &pol->v.nodes);
2350 polnid = z->zone->node;
2351 break;
2352
2353 default:
2354 BUG();
2355 }
2356
2357 /* Migrate the page towards the node whose CPU is referencing it */
2358 if (pol->flags & MPOL_F_MORON) {
2359 polnid = thisnid;
2360
2361 if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
2362 goto out;
2363 }
2364
2365 if (curnid != polnid)
2366 ret = polnid;
2367 out:
2368 mpol_cond_put(pol);
2369
2370 return ret;
2371 }
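/*
 * Hedged usage sketch: the NUMA hinting fault path is the expected caller of
 * mpol_misplaced(); it treats -1 as "leave the page where it is" and any
 * other value as a migration target (simplified):
 *
 *	int target_nid = mpol_misplaced(page, vma, addr);
 *	if (target_nid == -1)
 *		put_page(page);
 *	else
 *		migrate_misplaced_page(page, vma, target_nid);
 */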
2372
2373 /*
2374 * Drop the (possibly final) reference to task->mempolicy. It needs to be
2375 * dropped after task->mempolicy is set to NULL so that any allocation done as
2376 * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
2377 * policy.
2378 */
2379 void mpol_put_task_policy(struct task_struct *task)
2380 {
2381 struct mempolicy *pol;
2382
2383 task_lock(task);
2384 pol = task->mempolicy;
2385 task->mempolicy = NULL;
2386 task_unlock(task);
2387 mpol_put(pol);
2388 }
2389
2390 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2391 {
2392 pr_debug("deleting %lx-%lx\n", n->start, n->end);
2393 rb_erase(&n->nd, &sp->root);
2394 sp_free(n);
2395 }
2396
2397 static void sp_node_init(struct sp_node *node, unsigned long start,
2398 unsigned long end, struct mempolicy *pol)
2399 {
2400 node->start = start;
2401 node->end = end;
2402 node->policy = pol;
2403 }
2404
2405 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2406 struct mempolicy *pol)
2407 {
2408 struct sp_node *n;
2409 struct mempolicy *newpol;
2410
2411 n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2412 if (!n)
2413 return NULL;
2414
2415 newpol = mpol_dup(pol);
2416 if (IS_ERR(newpol)) {
2417 kmem_cache_free(sn_cache, n);
2418 return NULL;
2419 }
2420 newpol->flags |= MPOL_F_SHARED;
2421 sp_node_init(n, start, end, newpol);
2422
2423 return n;
2424 }
2425
2426 /* Replace a policy range. */
2427 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2428 unsigned long end, struct sp_node *new)
2429 {
2430 struct sp_node *n;
2431 struct sp_node *n_new = NULL;
2432 struct mempolicy *mpol_new = NULL;
2433 int ret = 0;
2434
2435 restart:
2436 write_lock(&sp->lock);
2437 n = sp_lookup(sp, start, end);
2438 /* Take care of old policies in the same range. */
2439 while (n && n->start < end) {
2440 struct rb_node *next = rb_next(&n->nd);
2441 if (n->start >= start) {
2442 if (n->end <= end)
2443 sp_delete(sp, n);
2444 else
2445 n->start = end;
2446 } else {
2447 /* Old policy spanning whole new range. */
2448 if (n->end > end) {
2449 if (!n_new)
2450 goto alloc_new;
2451
2452 *mpol_new = *n->policy;
2453 atomic_set(&mpol_new->refcnt, 1);
2454 sp_node_init(n_new, end, n->end, mpol_new);
2455 n->end = start;
2456 sp_insert(sp, n_new);
2457 n_new = NULL;
2458 mpol_new = NULL;
2459 break;
2460 } else
2461 n->end = start;
2462 }
2463 if (!next)
2464 break;
2465 n = rb_entry(next, struct sp_node, nd);
2466 }
2467 if (new)
2468 sp_insert(sp, new);
2469 write_unlock(&sp->lock);
2470 ret = 0;
2471
2472 err_out:
2473 if (mpol_new)
2474 mpol_put(mpol_new);
2475 if (n_new)
2476 kmem_cache_free(sn_cache, n_new);
2477
2478 return ret;
2479
2480 alloc_new:
2481 write_unlock(&sp->lock);
2482 ret = -ENOMEM;
2483 n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2484 if (!n_new)
2485 goto err_out;
2486 mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2487 if (!mpol_new)
2488 goto err_out;
2489 goto restart;
2490 }
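/*
 * Worked example for shared_policy_replace(): replacing [2, 4) in a tree
 * holding a single node [0, 8) shrinks the old node to [0, 2), inserts a
 * copy covering [4, 8) via the preallocated n_new/mpol_new pair, and then
 * inserts the new [2, 4) node.
 */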
2491
2492 /**
2493 * mpol_shared_policy_init - initialize shared policy for inode
2494 * @sp: pointer to inode shared policy
2495 * @mpol: struct mempolicy to install
2496 *
2497 * Install non-NULL @mpol in inode's shared policy rb-tree.
2498 * On entry, the current task has a reference on a non-NULL @mpol.
2499 * This must be released on exit.
2500 * This is called from get_inode() callers, so GFP_KERNEL allocations are safe.
2501 */
2502 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2503 {
2504 int ret;
2505
2506 sp->root = RB_ROOT; /* empty tree == default mempolicy */
2507 rwlock_init(&sp->lock);
2508
2509 if (mpol) {
2510 struct vm_area_struct pvma;
2511 struct mempolicy *new;
2512 NODEMASK_SCRATCH(scratch);
2513
2514 if (!scratch)
2515 goto put_mpol;
2516 /* contextualize the tmpfs mount point mempolicy */
2517 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2518 if (IS_ERR(new))
2519 goto free_scratch; /* no valid nodemask intersection */
2520
2521 task_lock(current);
2522 ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2523 task_unlock(current);
2524 if (ret)
2525 goto put_new;
2526
2527 /* Create pseudo-vma that contains just the policy */
2528 memset(&pvma, 0, sizeof(struct vm_area_struct));
2529 pvma.vm_end = TASK_SIZE; /* policy covers entire file */
2530 mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2531
2532 put_new:
2533 mpol_put(new); /* drop initial ref */
2534 free_scratch:
2535 NODEMASK_SCRATCH_FREE(scratch);
2536 put_mpol:
2537 mpol_put(mpol); /* drop our incoming ref on sb mpol */
2538 }
2539 }
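/*
 * Hedged usage sketch: tmpfs initializes the per-inode tree from the mount
 * option policy when creating an inode, roughly (the shmem names are quoted
 * from memory and should be treated as an assumption):
 *
 *	mpol_shared_policy_init(&info->policy, shmem_get_sbmpol(sbinfo));
 *
 * where the second argument carries the reference that is dropped via
 * put_mpol above.
 */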
2540
2541 int mpol_set_shared_policy(struct shared_policy *info,
2542 struct vm_area_struct *vma, struct mempolicy *npol)
2543 {
2544 int err;
2545 struct sp_node *new = NULL;
2546 unsigned long sz = vma_pages(vma);
2547
2548 pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2549 vma->vm_pgoff,
2550 sz, npol ? npol->mode : -1,
2551 npol ? npol->flags : -1,
2552 npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
2553
2554 if (npol) {
2555 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2556 if (!new)
2557 return -ENOMEM;
2558 }
2559 err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2560 if (err && new)
2561 sp_free(new);
2562 return err;
2563 }
2564
2565 /* Free a backing policy store on inode delete. */
2566 void mpol_free_shared_policy(struct shared_policy *p)
2567 {
2568 struct sp_node *n;
2569 struct rb_node *next;
2570
2571 if (!p->root.rb_node)
2572 return;
2573 write_lock(&p->lock);
2574 next = rb_first(&p->root);
2575 while (next) {
2576 n = rb_entry(next, struct sp_node, nd);
2577 next = rb_next(&n->nd);
2578 sp_delete(p, n);
2579 }
2580 write_unlock(&p->lock);
2581 }
2582
2583 #ifdef CONFIG_NUMA_BALANCING
2584 static int __initdata numabalancing_override;
2585
2586 static void __init check_numabalancing_enable(void)
2587 {
2588 bool numabalancing_default = false;
2589
2590 if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2591 numabalancing_default = true;
2592
2593 /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
2594 if (numabalancing_override)
2595 set_numabalancing_state(numabalancing_override == 1);
2596
2597 if (num_online_nodes() > 1 && !numabalancing_override) {
2598 pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
2599 numabalancing_default ? "Enabling" : "Disabling");
2600 set_numabalancing_state(numabalancing_default);
2601 }
2602 }
2603
2604 static int __init setup_numabalancing(char *str)
2605 {
2606 int ret = 0;
2607 if (!str)
2608 goto out;
2609
2610 if (!strcmp(str, "enable")) {
2611 numabalancing_override = 1;
2612 ret = 1;
2613 } else if (!strcmp(str, "disable")) {
2614 numabalancing_override = -1;
2615 ret = 1;
2616 }
2617 out:
2618 if (!ret)
2619 pr_warn("Unable to parse numa_balancing=\n");
2620
2621 return ret;
2622 }
2623 __setup("numa_balancing=", setup_numabalancing);
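/*
 * Example: booting with "numa_balancing=disable" sets numabalancing_override
 * to -1, so check_numabalancing_enable() switches the feature off regardless
 * of CONFIG_NUMA_BALANCING_DEFAULT_ENABLED; "numa_balancing=enable" forces it
 * on; any other value triggers the "Unable to parse" warning above.
 */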
2624 #else
2625 static inline void __init check_numabalancing_enable(void)
2626 {
2627 }
2628 #endif /* CONFIG_NUMA_BALANCING */
2629
2630 /* assumes fs == KERNEL_DS */
2631 void __init numa_policy_init(void)
2632 {
2633 nodemask_t interleave_nodes;
2634 unsigned long largest = 0;
2635 int nid, prefer = 0;
2636
2637 policy_cache = kmem_cache_create("numa_policy",
2638 sizeof(struct mempolicy),
2639 0, SLAB_PANIC, NULL);
2640
2641 sn_cache = kmem_cache_create("shared_policy_node",
2642 sizeof(struct sp_node),
2643 0, SLAB_PANIC, NULL);
2644
2645 for_each_node(nid) {
2646 preferred_node_policy[nid] = (struct mempolicy) {
2647 .refcnt = ATOMIC_INIT(1),
2648 .mode = MPOL_PREFERRED,
2649 .flags = MPOL_F_MOF | MPOL_F_MORON,
2650 .v = { .preferred_node = nid, },
2651 };
2652 }
2653
2654 /*
2655 * Set interleaving policy for system init. Interleaving is only
2656 * enabled across suitably sized nodes (default is >= 16MB), or
2657 * fall back to the largest node if they're all smaller.
2658 */
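	/*
	 * Worked example of the threshold used below: with 4KiB pages
	 * (PAGE_SHIFT == 12), a node needs at least
	 * (16 << 20) >> 12 == 4096 present pages to join the boot-time
	 * interleave set.
	 */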
2659 nodes_clear(interleave_nodes);
2660 for_each_node_state(nid, N_MEMORY) {
2661 unsigned long total_pages = node_present_pages(nid);
2662
2663 /* Preserve the largest node */
2664 if (largest < total_pages) {
2665 largest = total_pages;
2666 prefer = nid;
2667 }
2668
2669 /* Interleave this node? */
2670 if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2671 node_set(nid, interleave_nodes);
2672 }
2673
2674 /* All too small, use the largest */
2675 if (unlikely(nodes_empty(interleave_nodes)))
2676 node_set(prefer, interleave_nodes);
2677
2678 if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2679 pr_err("%s: interleaving failed\n", __func__);
2680
2681 check_numabalancing_enable();
2682 }
2683
2684 /* Reset policy of current process to default */
2685 void numa_default_policy(void)
2686 {
2687 do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2688 }
2689
2690 /*
2691 * Parse and format mempolicy from/to strings
2692 */
2693
2694 /*
2695 * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
2696 */
2697 static const char * const policy_modes[] =
2698 {
2699 [MPOL_DEFAULT] = "default",
2700 [MPOL_PREFERRED] = "prefer",
2701 [MPOL_BIND] = "bind",
2702 [MPOL_INTERLEAVE] = "interleave",
2703 [MPOL_LOCAL] = "local",
2704 };
2705
2706
2707 #ifdef CONFIG_TMPFS
2708 /**
2709 * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
2710 * @str: string containing mempolicy to parse
2711 * @mpol: pointer to struct mempolicy pointer, returned on success.
2712 *
2713 * Format of input:
2714 * <mode>[=<flags>][:<nodelist>]
2715 *
2716 * On success, returns 0, else 1
2717 */
2718 int mpol_parse_str(char *str, struct mempolicy **mpol)
2719 {
2720 struct mempolicy *new = NULL;
2721 unsigned short mode;
2722 unsigned short mode_flags;
2723 nodemask_t nodes;
2724 char *nodelist = strchr(str, ':');
2725 char *flags = strchr(str, '=');
2726 int err = 1;
2727
2728 if (nodelist) {
2729 /* NUL-terminate mode or flags string */
2730 *nodelist++ = '\0';
2731 if (nodelist_parse(nodelist, nodes))
2732 goto out;
2733 if (!nodes_subset(nodes, node_states[N_MEMORY]))
2734 goto out;
2735 } else
2736 nodes_clear(nodes);
2737
2738 if (flags)
2739 *flags++ = '\0'; /* terminate mode string */
2740
2741 for (mode = 0; mode < MPOL_MAX; mode++) {
2742 if (!strcmp(str, policy_modes[mode])) {
2743 break;
2744 }
2745 }
2746 if (mode >= MPOL_MAX)
2747 goto out;
2748
2749 switch (mode) {
2750 case MPOL_PREFERRED:
2751 /*
2752 * Insist on a nodelist of one node only
2753 */
2754 if (nodelist) {
2755 char *rest = nodelist;
2756 while (isdigit(*rest))
2757 rest++;
2758 if (*rest)
2759 goto out;
2760 }
2761 break;
2762 case MPOL_INTERLEAVE:
2763 /*
2764 * Default to online nodes with memory if no nodelist
2765 */
2766 if (!nodelist)
2767 nodes = node_states[N_MEMORY];
2768 break;
2769 case MPOL_LOCAL:
2770 /*
2771 * Don't allow a nodelist; mpol_new() checks flags
2772 */
2773 if (nodelist)
2774 goto out;
2775 mode = MPOL_PREFERRED;
2776 break;
2777 case MPOL_DEFAULT:
2778 /*
2779 * Insist on an empty nodelist
2780 */
2781 if (!nodelist)
2782 err = 0;
2783 goto out;
2784 case MPOL_BIND:
2785 /*
2786 * Insist on a nodelist
2787 */
2788 if (!nodelist)
2789 goto out;
2790 }
2791
2792 mode_flags = 0;
2793 if (flags) {
2794 /*
2795 * Currently, we only support two mutually exclusive
2796 * mode flags.
2797 */
2798 if (!strcmp(flags, "static"))
2799 mode_flags |= MPOL_F_STATIC_NODES;
2800 else if (!strcmp(flags, "relative"))
2801 mode_flags |= MPOL_F_RELATIVE_NODES;
2802 else
2803 goto out;
2804 }
2805
2806 new = mpol_new(mode, mode_flags, &nodes);
2807 if (IS_ERR(new))
2808 goto out;
2809
2810 /*
2811 * Save nodes for mpol_to_str() to show the tmpfs mount options
2812 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
2813 */
2814 if (mode != MPOL_PREFERRED)
2815 new->v.nodes = nodes;
2816 else if (nodelist)
2817 new->v.preferred_node = first_node(nodes);
2818 else
2819 new->flags |= MPOL_F_LOCAL;
2820
2821 /*
2822 * Save nodes for contextualization: this will be used to "clone"
2823 * the mempolicy in a specific context [cpuset] at a later time.
2824 */
2825 new->w.user_nodemask = nodes;
2826
2827 err = 0;
2828
2829 out:
2830 /* Restore string for error message */
2831 if (nodelist)
2832 *--nodelist = ':';
2833 if (flags)
2834 *--flags = '=';
2835 if (!err)
2836 *mpol = new;
2837 return err;
2838 }
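/*
 * Example strings accepted by mpol_parse_str(), as passed via the tmpfs
 * "mpol=" mount option (all listed nodes must be in N_MEMORY); the node
 * numbers are illustrative:
 *
 *	"local"				MPOL_PREFERRED with MPOL_F_LOCAL
 *	"prefer:1"			prefer node 1
 *	"bind=static:0,2"		bind to nodes 0 and 2, static nodes
 *	"interleave=relative:0-3"	interleave over nodes 0-3, relative
 *	"default"			default policy (NULL *mpol)
 */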
2839 #endif /* CONFIG_TMPFS */
2840
2841 /**
2842 * mpol_to_str - format a mempolicy structure for printing
2843 * @buffer: to contain formatted mempolicy string
2844 * @maxlen: length of @buffer
2845 * @pol: pointer to mempolicy to be formatted
2846 *
2847 * Convert @pol into a string. If @buffer is too short, truncate the string.
2848 * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
2849 * longest flag, "relative", and to display at least a few node ids.
2850 */
2851 void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
2852 {
2853 char *p = buffer;
2854 nodemask_t nodes = NODE_MASK_NONE;
2855 unsigned short mode = MPOL_DEFAULT;
2856 unsigned short flags = 0;
2857
2858 if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
2859 mode = pol->mode;
2860 flags = pol->flags;
2861 }
2862
2863 switch (mode) {
2864 case MPOL_DEFAULT:
2865 break;
2866 case MPOL_PREFERRED:
2867 if (flags & MPOL_F_LOCAL)
2868 mode = MPOL_LOCAL;
2869 else
2870 node_set(pol->v.preferred_node, nodes);
2871 break;
2872 case MPOL_BIND:
2873 case MPOL_INTERLEAVE:
2874 nodes = pol->v.nodes;
2875 break;
2876 default:
2877 WARN_ON_ONCE(1);
2878 snprintf(p, maxlen, "unknown");
2879 return;
2880 }
2881
2882 p += snprintf(p, maxlen, "%s", policy_modes[mode]);
2883
2884 if (flags & MPOL_MODE_FLAGS) {
2885 p += snprintf(p, buffer + maxlen - p, "=");
2886
2887 /*
2888 * Currently, the only defined flags are mutually exclusive
2889 */
2890 if (flags & MPOL_F_STATIC_NODES)
2891 p += snprintf(p, buffer + maxlen - p, "static");
2892 else if (flags & MPOL_F_RELATIVE_NODES)
2893 p += snprintf(p, buffer + maxlen - p, "relative");
2894 }
2895
2896 if (!nodes_empty(nodes))
2897 p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
2898 nodemask_pr_args(&nodes));
2899 }