mm: track vma changes with VM_SOFTDIRTY bit
authorCyrill Gorcunov <gorcunov@gmail.com>
Wed, 11 Sep 2013 21:22:24 +0000 (14:22 -0700)
committerLinus Torvalds <torvalds@linux-foundation.org>
Wed, 11 Sep 2013 22:57:56 +0000 (15:57 -0700)
Pavel reported that in case if vma area get unmapped and then mapped (or
expanded) in-place, the soft dirty tracker won't be able to recognize this
situation since it works on pte level and ptes are get zapped on unmap,
loosing soft dirty bit of course.

So to resolve this situation we need to track actions on vma level, there
VM_SOFTDIRTY flag comes in.  When new vma area created (or old expanded)
we set this bit, and keep it here until application calls for clearing
soft dirty bit.

Thus when user space application track memory changes now it can detect if
vma area is renewed.

Reported-by: Pavel Emelyanov <xemul@parallels.com>
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Matt Mackall <mpm@selenic.com>
Cc: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@gmail.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Cc: Rob Landley <rob@landley.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Documentation/vm/soft-dirty.txt
fs/exec.c
fs/proc/task_mmu.c
include/linux/mm.h
mm/mmap.c

index 9a12a5956bc05cae45367e78e759655c924df30c..55684d11a1e806c9ee998649f9d75f88662a08df 100644 (file)
@@ -28,6 +28,13 @@ This is so, since the pages are still mapped to physical memory, and thus all
 the kernel does is finds this fact out and puts both writable and soft-dirty
 bits on the PTE.
 
+  While in most cases tracking memory changes by #PF-s is more than enough
+there is still a scenario when we can lose soft dirty bits -- a task
+unmaps a previously mapped memory region and then maps a new one at exactly
+the same place. When unmap is called, the kernel internally clears PTE values
+including soft dirty bits. To notify user space application about such
+memory region renewal the kernel always marks new memory regions (and
+expanded regions) as soft dirty.
 
   This feature is actively used by the checkpoint-restore project. You
 can find more details about it on http://criu.org
index fd774c7cb4831be8817799ed6cab355a368fa19f..2d1e52a58fe91c76aecc2988d789adcbe8bcc32c 100644 (file)
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -266,7 +266,7 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
        BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP);
        vma->vm_end = STACK_TOP_MAX;
        vma->vm_start = vma->vm_end - PAGE_SIZE;
-       vma->vm_flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP;
+       vma->vm_flags = VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP;
        vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
        INIT_LIST_HEAD(&vma->anon_vma_chain);
 
index 107d026f5d6e0cbebc068fce65f201f167c425c5..09228639b83dd7970154eba13c69eeec73422e4d 100644 (file)
@@ -740,6 +740,9 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma,
                ptent = pte_file_clear_soft_dirty(ptent);
        }
 
+       if (vma->vm_flags & VM_SOFTDIRTY)
+               vma->vm_flags &= ~VM_SOFTDIRTY;
+
        set_pte_at(vma->vm_mm, addr, pte, ptent);
 #endif
 }
@@ -949,13 +952,15 @@ static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
                if (is_migration_entry(entry))
                        page = migration_entry_to_page(entry);
        } else {
-               *pme = make_pme(PM_NOT_PRESENT(pm->v2));
+               if (vma->vm_flags & VM_SOFTDIRTY)
+                       flags2 |= __PM_SOFT_DIRTY;
+               *pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, flags2));
                return;
        }
 
        if (page && !PageAnon(page))
                flags |= PM_FILE;
-       if (pte_soft_dirty(pte))
+       if ((vma->vm_flags & VM_SOFTDIRTY) || pte_soft_dirty(pte))
                flags2 |= __PM_SOFT_DIRTY;
 
        *pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, flags2) | flags);
@@ -974,7 +979,7 @@ static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *p
                *pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset)
                                | PM_STATUS2(pm->v2, pmd_flags2) | PM_PRESENT);
        else
-               *pme = make_pme(PM_NOT_PRESENT(pm->v2));
+               *pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, pmd_flags2));
 }
 #else
 static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
@@ -997,7 +1002,11 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
        if (vma && pmd_trans_huge_lock(pmd, vma) == 1) {
                int pmd_flags2;
 
-               pmd_flags2 = (pmd_soft_dirty(*pmd) ? __PM_SOFT_DIRTY : 0);
+               if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(*pmd))
+                       pmd_flags2 = __PM_SOFT_DIRTY;
+               else
+                       pmd_flags2 = 0;
+
                for (; addr != end; addr += PAGE_SIZE) {
                        unsigned long offset;
 
@@ -1015,12 +1024,17 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
        if (pmd_trans_unstable(pmd))
                return 0;
        for (; addr != end; addr += PAGE_SIZE) {
+               int flags2;
 
                /* check to see if we've left 'vma' behind
                 * and need a new, higher one */
                if (vma && (addr >= vma->vm_end)) {
                        vma = find_vma(walk->mm, addr);
-                       pme = make_pme(PM_NOT_PRESENT(pm->v2));
+                       if (vma && (vma->vm_flags & VM_SOFTDIRTY))
+                               flags2 = __PM_SOFT_DIRTY;
+                       else
+                               flags2 = 0;
+                       pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, flags2));
                }
 
                /* check that 'vma' actually covers this address,
@@ -1044,13 +1058,15 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 
 #ifdef CONFIG_HUGETLB_PAGE
 static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
-                                       pte_t pte, int offset)
+                                       pte_t pte, int offset, int flags2)
 {
        if (pte_present(pte))
-               *pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset)
-                               | PM_STATUS2(pm->v2, 0) | PM_PRESENT);
+               *pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset)        |
+                               PM_STATUS2(pm->v2, flags2)              |
+                               PM_PRESENT);
        else
-               *pme = make_pme(PM_NOT_PRESENT(pm->v2));
+               *pme = make_pme(PM_NOT_PRESENT(pm->v2)                  |
+                               PM_STATUS2(pm->v2, flags2));
 }
 
 /* This function walks within one hugetlb entry in the single call */
@@ -1059,12 +1075,22 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
                                 struct mm_walk *walk)
 {
        struct pagemapread *pm = walk->private;
+       struct vm_area_struct *vma;
        int err = 0;
+       int flags2;
        pagemap_entry_t pme;
 
+       vma = find_vma(walk->mm, addr);
+       WARN_ON_ONCE(!vma);
+
+       if (vma && (vma->vm_flags & VM_SOFTDIRTY))
+               flags2 = __PM_SOFT_DIRTY;
+       else
+               flags2 = 0;
+
        for (; addr != end; addr += PAGE_SIZE) {
                int offset = (addr & ~hmask) >> PAGE_SHIFT;
-               huge_pte_to_pagemap_entry(&pme, pm, *pte, offset);
+               huge_pte_to_pagemap_entry(&pme, pm, *pte, offset, flags2);
                err = add_to_pagemap(addr, &pme, pm);
                if (err)
                        return err;
index d2d59b4149d06536d552d402cc872186f6f5ac74..dce24569f8fcb28cc10792db5c9f3b58b4f8e7f6 100644 (file)
@@ -115,6 +115,12 @@ extern unsigned int kobjsize(const void *objp);
 #define VM_ARCH_1      0x01000000      /* Architecture-specific flag */
 #define VM_DONTDUMP    0x04000000      /* Do not include in the core dump */
 
+#ifdef CONFIG_MEM_SOFT_DIRTY
+# define VM_SOFTDIRTY  0x08000000      /* Not soft dirty clean area */
+#else
+# define VM_SOFTDIRTY  0
+#endif
+
 #define VM_MIXEDMAP    0x10000000      /* Can contain "struct page" and pure PFN pages */
 #define VM_HUGEPAGE    0x20000000      /* MADV_HUGEPAGE marked this vma */
 #define VM_NOHUGEPAGE  0x40000000      /* MADV_NOHUGEPAGE marked this vma */
index 13926a5a6901c94a61c376c365bfa341c9bcebdc..51958d192a48ba9ec0636eb86f5c1dfc995f5433 100644 (file)
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1609,6 +1609,15 @@ out:
        if (file)
                uprobe_mmap(vma);
 
+       /*
+        * New (or expanded) vma always get soft dirty status.
+        * Otherwise user-space soft-dirty page tracker won't
+        * be able to distinguish situation when vma area unmapped,
+        * then new mapped in-place (which must be aimed as
+        * a completely new data area).
+        */
+       vma->vm_flags |= VM_SOFTDIRTY;
+
        return addr;
 
 unmap_and_free_vma:
@@ -2652,6 +2661,7 @@ out:
        mm->total_vm += len >> PAGE_SHIFT;
        if (flags & VM_LOCKED)
                mm->locked_vm += (len >> PAGE_SHIFT);
+       vma->vm_flags |= VM_SOFTDIRTY;
        return addr;
 }
 
@@ -2916,7 +2926,7 @@ int install_special_mapping(struct mm_struct *mm,
        vma->vm_start = addr;
        vma->vm_end = addr + len;
 
-       vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND;
+       vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND | VM_SOFTDIRTY;
        vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
 
        vma->vm_ops = &special_mapping_vmops;