mm: rework virtual memory accounting
author Konstantin Khlebnikov <koct9i@gmail.com>
Thu, 14 Jan 2016 23:22:07 +0000 (15:22 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 15 Jan 2016 00:00:49 +0000 (16:00 -0800)
While inspecting some vague code inside the prctl(PR_SET_MM_MEM) call (which
tests the RLIMIT_DATA value to figure out whether we're allowed to assign
new @start_brk, @brk, @start_data and @end_data in mm_struct), it was
noticed that RLIMIT_DATA, in the form it is implemented now, doesn't do
anything useful, because most user-space libraries use the mmap() syscall
for dynamic memory allocation.

Linus suggested to convert RLIMIT_DATA rlimit into something suitable
for anonymous memory accounting.  But in this patch we go further, and
the changes are bundled together as:

 * keep vma counting if CONFIG_PROC_FS=n, will be used for limits
 * replace mm->shared_vm with better defined mm->data_vm
 * account anonymous executable areas as executable
 * account file-backed growsdown/up areas as stack
 * drop struct file* argument from vm_stat_account
 * enforce RLIMIT_DATA for size of data areas

This way the code looks cleaner: the code/stack/data classification now
depends only on the vm_flags state:

 VM_EXEC & ~VM_WRITE            -> code  (VmExe + VmLib in proc)
 VM_GROWSUP | VM_GROWSDOWN      -> stack (VmStk)
 VM_WRITE & ~VM_SHARED & !stack -> data  (VmData)

The rest (VmSize - VmData - VmStk - VmExe - VmLib) could be called
"shared", but it may also contain strange beasts such as read-only private
or VM_IO areas.

 - RLIMIT_AS            limits whole address space "VmSize"
 - RLIMIT_STACK         limits stack "VmStk" (but each vma individually)
 - RLIMIT_DATA          now limits "VmData"

Signed-off-by: Konstantin Khlebnikov <koct9i@gmail.com>
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Quentin Casasnovas <quentin.casasnovas@oracle.com>
Cc: Vegard Nossum <vegard.nossum@oracle.com>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Willy Tarreau <w@1wt.eu>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Kees Cook <keescook@google.com>
Cc: Vladimir Davydov <vdavydov@virtuozzo.com>
Cc: Pavel Emelyanov <xemul@virtuozzo.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
arch/ia64/kernel/perfmon.c
fs/proc/task_mmu.c
include/linux/mm.h
include/linux/mm_types.h
kernel/fork.c
mm/debug.c
mm/mmap.c
mm/mprotect.c
mm/mremap.c

index 60e02f7747ff054e4edcce16daa454df8224ed84..9cd607b06964522859028d65fd920aacb1e61b2c 100644 (file)
@@ -2332,8 +2332,7 @@ pfm_smpl_buffer_alloc(struct task_struct *task, struct file *filp, pfm_context_t
         */
        insert_vm_struct(mm, vma);
 
-       vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file,
-                                                       vma_pages(vma));
+       vm_stat_account(vma->vm_mm, vma->vm_flags, vma_pages(vma));
        up_write(&task->mm->mmap_sem);
 
        /*
index 46d9619d0aea3b8a65480331366d3788702c865e..a353b4c6e86e5d24007ac4f9ead1d80d577037d6 100644 (file)
@@ -23,7 +23,7 @@
 
 void task_mem(struct seq_file *m, struct mm_struct *mm)
 {
-       unsigned long data, text, lib, swap, ptes, pmds, anon, file, shmem;
+       unsigned long text, lib, swap, ptes, pmds, anon, file, shmem;
        unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
 
        anon = get_mm_counter(mm, MM_ANONPAGES);
@@ -44,7 +44,6 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
        if (hiwater_rss < mm->hiwater_rss)
                hiwater_rss = mm->hiwater_rss;
 
-       data = mm->total_vm - mm->shared_vm - mm->stack_vm;
        text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10;
        lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text;
        swap = get_mm_counter(mm, MM_SWAPENTS);
@@ -76,7 +75,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
                anon << (PAGE_SHIFT-10),
                file << (PAGE_SHIFT-10),
                shmem << (PAGE_SHIFT-10),
-               data << (PAGE_SHIFT-10),
+               mm->data_vm << (PAGE_SHIFT-10),
                mm->stack_vm << (PAGE_SHIFT-10), text, lib,
                ptes >> 10,
                pmds >> 10,
@@ -97,7 +96,7 @@ unsigned long task_statm(struct mm_struct *mm,
                        get_mm_counter(mm, MM_SHMEMPAGES);
        *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
                                                                >> PAGE_SHIFT;
-       *data = mm->total_vm - mm->shared_vm;
+       *data = mm->data_vm + mm->stack_vm;
        *resident = *shared + get_mm_counter(mm, MM_ANONPAGES);
        return mm->total_vm;
 }
index ec9d4559514d6ac765f1e74d91c60ef04055c64f..839d9e9a1c38618c9a8a2c5f18682c75d22f1348 100644 (file)
@@ -1929,7 +1929,9 @@ extern void mm_drop_all_locks(struct mm_struct *mm);
 extern void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file);
 extern struct file *get_mm_exe_file(struct mm_struct *mm);
 
-extern int may_expand_vm(struct mm_struct *mm, unsigned long npages);
+extern bool may_expand_vm(struct mm_struct *, vm_flags_t, unsigned long npages);
+extern void vm_stat_account(struct mm_struct *, vm_flags_t, long npages);
+
 extern struct vm_area_struct *_install_special_mapping(struct mm_struct *mm,
                                   unsigned long addr, unsigned long len,
                                   unsigned long flags,
@@ -2147,15 +2149,6 @@ typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
 extern int apply_to_page_range(struct mm_struct *mm, unsigned long address,
                               unsigned long size, pte_fn_t fn, void *data);
 
-#ifdef CONFIG_PROC_FS
-void vm_stat_account(struct mm_struct *, unsigned long, struct file *, long);
-#else
-static inline void vm_stat_account(struct mm_struct *mm,
-                       unsigned long flags, struct file *file, long pages)
-{
-       mm->total_vm += pages;
-}
-#endif /* CONFIG_PROC_FS */
 
 #ifdef CONFIG_DEBUG_PAGEALLOC
 extern bool _debug_pagealloc_enabled;
index 207890be93c8c7e6eadd59ab21815f1059465c87..6bc9a0ce22530cd260c5d50736875aedabd70f76 100644 (file)
@@ -427,7 +427,7 @@ struct mm_struct {
        unsigned long total_vm;         /* Total pages mapped */
        unsigned long locked_vm;        /* Pages that have PG_mlocked set */
        unsigned long pinned_vm;        /* Refcount permanently increased */
-       unsigned long shared_vm;        /* Shared pages (files) */
+       unsigned long data_vm;          /* VM_WRITE & ~VM_SHARED/GROWSDOWN */
        unsigned long exec_vm;          /* VM_EXEC & ~VM_WRITE */
        unsigned long stack_vm;         /* VM_GROWSUP/DOWN */
        unsigned long def_flags;
index 51915842f1c0644b8abbf29b6261f4e64357a104..2e391c754ae730bd2d8520c2ab497c403220c6e3 100644 (file)
@@ -414,7 +414,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
        RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
 
        mm->total_vm = oldmm->total_vm;
-       mm->shared_vm = oldmm->shared_vm;
+       mm->data_vm = oldmm->data_vm;
        mm->exec_vm = oldmm->exec_vm;
        mm->stack_vm = oldmm->stack_vm;
 
@@ -433,8 +433,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
                struct file *file;
 
                if (mpnt->vm_flags & VM_DONTCOPY) {
-                       vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file,
-                                                       -vma_pages(mpnt));
+                       vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt));
                        continue;
                }
                charge = 0;
index 668aa35191ca1243d9be055b5e03ed9b17e17976..5d2072ed8d5e7c925877b0759163035a9a2a0fe0 100644 (file)
@@ -175,7 +175,7 @@ void dump_mm(const struct mm_struct *mm)
                "mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n"
                "pgd %p mm_users %d mm_count %d nr_ptes %lu nr_pmds %lu map_count %d\n"
                "hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n"
-               "pinned_vm %lx shared_vm %lx exec_vm %lx stack_vm %lx\n"
+               "pinned_vm %lx data_vm %lx exec_vm %lx stack_vm %lx\n"
                "start_code %lx end_code %lx start_data %lx end_data %lx\n"
                "start_brk %lx brk %lx start_stack %lx\n"
                "arg_start %lx arg_end %lx env_start %lx env_end %lx\n"
@@ -209,7 +209,7 @@ void dump_mm(const struct mm_struct *mm)
                mm_nr_pmds((struct mm_struct *)mm),
                mm->map_count,
                mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm,
-               mm->pinned_vm, mm->shared_vm, mm->exec_vm, mm->stack_vm,
+               mm->pinned_vm, mm->data_vm, mm->exec_vm, mm->stack_vm,
                mm->start_code, mm->end_code, mm->start_data, mm->end_data,
                mm->start_brk, mm->brk, mm->start_stack,
                mm->arg_start, mm->arg_end, mm->env_start, mm->env_end,
index f32b84ad621a292ac2a83276ea8ec3494a37395b..b3f00b616b810e4362effddd437b320acdcedb7e 100644 (file)
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1220,24 +1220,6 @@ none:
        return NULL;
 }
 
-#ifdef CONFIG_PROC_FS
-void vm_stat_account(struct mm_struct *mm, unsigned long flags,
-                                               struct file *file, long pages)
-{
-       const unsigned long stack_flags
-               = VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN);
-
-       mm->total_vm += pages;
-
-       if (file) {
-               mm->shared_vm += pages;
-               if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC)
-                       mm->exec_vm += pages;
-       } else if (flags & stack_flags)
-               mm->stack_vm += pages;
-}
-#endif /* CONFIG_PROC_FS */
-
 /*
  * If a hint addr is less than mmap_min_addr change hint to be as
  * low as possible but still greater than mmap_min_addr
@@ -1556,7 +1538,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
        unsigned long charged = 0;
 
        /* Check against address space limit. */
-       if (!may_expand_vm(mm, len >> PAGE_SHIFT)) {
+       if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) {
                unsigned long nr_pages;
 
                /*
@@ -1565,7 +1547,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
                 */
                nr_pages = count_vma_pages_range(mm, addr, addr + len);
 
-               if (!may_expand_vm(mm, (len >> PAGE_SHIFT) - nr_pages))
+               if (!may_expand_vm(mm, vm_flags,
+                                       (len >> PAGE_SHIFT) - nr_pages))
                        return -ENOMEM;
        }
 
@@ -1664,7 +1647,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
 out:
        perf_event_mmap(vma);
 
-       vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
+       vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT);
        if (vm_flags & VM_LOCKED) {
                if (!((vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) ||
                                        vma == get_gate_vma(current->mm)))
@@ -2111,7 +2094,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
        unsigned long new_start, actual_size;
 
        /* address space limit tests */
-       if (!may_expand_vm(mm, grow))
+       if (!may_expand_vm(mm, vma->vm_flags, grow))
                return -ENOMEM;
 
        /* Stack limit test */
@@ -2208,8 +2191,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
                                spin_lock(&mm->page_table_lock);
                                if (vma->vm_flags & VM_LOCKED)
                                        mm->locked_vm += grow;
-                               vm_stat_account(mm, vma->vm_flags,
-                                               vma->vm_file, grow);
+                               vm_stat_account(mm, vma->vm_flags, grow);
                                anon_vma_interval_tree_pre_update_vma(vma);
                                vma->vm_end = address;
                                anon_vma_interval_tree_post_update_vma(vma);
@@ -2284,8 +2266,7 @@ int expand_downwards(struct vm_area_struct *vma,
                                spin_lock(&mm->page_table_lock);
                                if (vma->vm_flags & VM_LOCKED)
                                        mm->locked_vm += grow;
-                               vm_stat_account(mm, vma->vm_flags,
-                                               vma->vm_file, grow);
+                               vm_stat_account(mm, vma->vm_flags, grow);
                                anon_vma_interval_tree_pre_update_vma(vma);
                                vma->vm_start = address;
                                vma->vm_pgoff -= grow;
@@ -2399,7 +2380,7 @@ static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
 
                if (vma->vm_flags & VM_ACCOUNT)
                        nr_accounted += nrpages;
-               vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages);
+               vm_stat_account(mm, vma->vm_flags, -nrpages);
                vma = remove_vma(vma);
        } while (vma);
        vm_unacct_memory(nr_accounted);
@@ -2769,7 +2750,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
        }
 
        /* Check against address space limits *after* clearing old maps... */
-       if (!may_expand_vm(mm, len >> PAGE_SHIFT))
+       if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT))
                return -ENOMEM;
 
        if (mm->map_count > sysctl_max_map_count)
@@ -2804,6 +2785,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
 out:
        perf_event_mmap(vma);
        mm->total_vm += len >> PAGE_SHIFT;
+       mm->data_vm += len >> PAGE_SHIFT;
        if (flags & VM_LOCKED)
                mm->locked_vm += (len >> PAGE_SHIFT);
        vma->vm_flags |= VM_SOFTDIRTY;
@@ -2995,9 +2977,42 @@ out:
  * Return true if the calling process may expand its vm space by the passed
  * number of pages
  */
-int may_expand_vm(struct mm_struct *mm, unsigned long npages)
+bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, unsigned long npages)
 {
-       return mm->total_vm + npages <= rlimit(RLIMIT_AS) >> PAGE_SHIFT;
+       /* rlimit() is in bytes, the mm counters are in pages: convert. */
+       if (mm->total_vm + npages > rlimit(RLIMIT_AS) >> PAGE_SHIFT)
+               return false;
+
+       /*
+        * Private writable non-stack areas are what vm_stat_account() counts
+        * in data_vm, so only those are checked against RLIMIT_DATA.
+        */
+       if ((flags & (VM_WRITE | VM_SHARED | (VM_STACK_FLAGS &
+                               (VM_GROWSUP | VM_GROWSDOWN)))) == VM_WRITE)
+               return mm->data_vm + npages <=
+                               rlimit(RLIMIT_DATA) >> PAGE_SHIFT;
+
+       return true;
+}
+
+/*
+ * Add @npages pages (negative to subtract) to mm->total_vm and to at most one
+ * of the per-type counters, chosen from @flags alone:
+ *
+ *  VM_EXEC & ~VM_WRITE                  -> exec_vm
+ *  else, stack growth bit set           -> stack_vm
+ *  else, VM_WRITE & ~VM_SHARED          -> data_vm
+ */
+void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, long npages)
+{
+       mm->total_vm += npages;
+
+       if ((flags & (VM_EXEC | VM_WRITE)) == VM_EXEC)
+               mm->exec_vm += npages;
+       else if (flags & (VM_STACK_FLAGS & (VM_GROWSUP | VM_GROWSDOWN)))
+               mm->stack_vm += npages;
+       else if ((flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
+               mm->data_vm += npages;
 }
 
 static int special_mapping_fault(struct vm_area_struct *vma,
@@ -3079,7 +3094,7 @@ static struct vm_area_struct *__install_special_mapping(
        if (ret)
                goto out;
 
-       mm->total_vm += len >> PAGE_SHIFT;
+       vm_stat_account(mm, vma->vm_flags, len >> PAGE_SHIFT);
 
        perf_event_mmap(vma);
 
index ef5be8eaab001792b469fac1bd5b43cb139d1b0b..c764402c464f10471d4b267fa26911f46e87e149 100644 (file)
@@ -278,6 +278,10 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
         * even if read-only so there is no need to account for them here
         */
        if (newflags & VM_WRITE) {
+               /* Check space limits when area turns into data. */
+               if (!may_expand_vm(mm, newflags, nrpages) &&
+                               may_expand_vm(mm, oldflags, nrpages))
+                       return -ENOMEM;
                if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_HUGETLB|
                                                VM_SHARED|VM_NORESERVE))) {
                        charged = nrpages;
@@ -334,8 +338,8 @@ success:
                populate_vma_page_range(vma, start, end, NULL);
        }
 
-       vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
-       vm_stat_account(mm, newflags, vma->vm_file, nrpages);
+       vm_stat_account(mm, oldflags, -nrpages);
+       vm_stat_account(mm, newflags, nrpages);
        perf_event_mmap(vma);
        return 0;
 
index de824e72c3e89a915c429455501c644d1ae22d6e..e55b157865d5cfc437c4eacd0496dbad69d5d0ab 100644 (file)
@@ -317,7 +317,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
         * If this were a serious issue, we'd add a flag to do_munmap().
         */
        hiwater_vm = mm->hiwater_vm;
-       vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT);
+       vm_stat_account(mm, vma->vm_flags, new_len >> PAGE_SHIFT);
 
        /* Tell pfnmap has moved from this vma */
        if (unlikely(vma->vm_flags & VM_PFNMAP))
@@ -383,7 +383,8 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
                        return ERR_PTR(-EAGAIN);
        }
 
-       if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT))
+       if (!may_expand_vm(mm, vma->vm_flags,
+                               (new_len - old_len) >> PAGE_SHIFT))
                return ERR_PTR(-ENOMEM);
 
        if (vma->vm_flags & VM_ACCOUNT) {
@@ -545,7 +546,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
                                goto out;
                        }
 
-                       vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages);
+                       vm_stat_account(mm, vma->vm_flags, pages);
                        if (vma->vm_flags & VM_LOCKED) {
                                mm->locked_vm += pages;
                                locked = true;