mm,fork: introduce MADV_WIPEONFORK
author: Rik van Riel <riel@redhat.com>
Wed, 6 Sep 2017 23:25:15 +0000 (16:25 -0700)
committer: Linus Torvalds <torvalds@linux-foundation.org>
Thu, 7 Sep 2017 00:27:30 +0000 (17:27 -0700)
Introduce MADV_WIPEONFORK semantics, which result in a VMA being empty
in the child process after fork.  This differs from MADV_DONTFORK in one
important way.

If a child process accesses memory that was MADV_WIPEONFORK, it will get
zeroes.  The address ranges are still valid, they are just empty.

If a child process accesses memory that was MADV_DONTFORK, it will get a
segmentation fault, since those address ranges are no longer valid in
the child after fork.

Since MADV_DONTFORK also seems to be used to allow very large programs
to fork in systems with strict memory overcommit restrictions, changing
the semantics of MADV_DONTFORK might break existing programs.

MADV_WIPEONFORK only works on private, anonymous VMAs.

The use case is libraries that store or cache information, and want to
know that they need to regenerate it in the child process after fork.

Examples of this would be:
 - systemd/pulseaudio API checks (fail after fork) (replacing a getpid
   check, which is too slow without a PID cache)
 - PKCS#11 API reinitialization check (mandated by specification)
 - glibc's upcoming PRNG (reseed after fork)
 - OpenSSL PRNG (reseed after fork)

The security benefits of a forking server having a re-initialized PRNG in
every child process are pretty obvious.  However, due to libraries
having all kinds of internal state, and programs getting compiled with
many different versions of each library, it is unreasonable to expect
calling programs to re-initialize everything manually after fork.

A further complication is the proliferation of clone flags, programs
bypassing glibc's functions to call clone directly, and programs calling
unshare, causing the glibc pthread_atfork hook to not get called.

It would be better to have the kernel take care of this automatically.

The patch also adds MADV_KEEPONFORK, to undo the effects of a prior
MADV_WIPEONFORK.

This is similar to the OpenBSD minherit syscall with MAP_INHERIT_ZERO:

    https://man.openbsd.org/minherit.2

[akpm@linux-foundation.org: numerically order arch/parisc/include/uapi/asm/mman.h #defines]
Link: http://lkml.kernel.org/r/20170811212829.29186-3-riel@redhat.com
Signed-off-by: Rik van Riel <riel@redhat.com>
Reported-by: Florian Weimer <fweimer@redhat.com>
Reported-by: Colm MacCártaigh <colm@allcosts.net>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: "Kirill A. Shutemov" <kirill@shutemov.name>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Helge Deller <deller@gmx.de>
Cc: Kees Cook <keescook@chromium.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Drewry <wad@chromium.org>
Cc: <linux-api@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
arch/alpha/include/uapi/asm/mman.h
arch/mips/include/uapi/asm/mman.h
arch/parisc/include/uapi/asm/mman.h
arch/xtensa/include/uapi/asm/mman.h
fs/proc/task_mmu.c
include/linux/mm.h
include/trace/events/mmflags.h
include/uapi/asm-generic/mman-common.h
kernel/fork.c
mm/madvise.c

index 13b52aad3c43c42491c65cc71b021bc3a7a83762..3b26cc62dadb08d64b2d91780a92fd5458da76b6 100644 (file)
@@ -64,6 +64,9 @@
                                           overrides the coredump filter bits */
 #define MADV_DODUMP    17              /* Clear the MADV_NODUMP flag */
 
+#define MADV_WIPEONFORK 18             /* Zero memory on fork, child only */
+#define MADV_KEEPONFORK 19             /* Undo MADV_WIPEONFORK */
+
 /* compatibility flags */
 #define MAP_FILE       0
 
index 398eebcc3541473bad680ee587dbca450dfcc3ba..da3216007fe0ccac2e0695c10b4e1cac1ada8b9d 100644 (file)
@@ -91,6 +91,9 @@
                                           overrides the coredump filter bits */
 #define MADV_DODUMP    17              /* Clear the MADV_NODUMP flag */
 
+#define MADV_WIPEONFORK 18             /* Zero memory on fork, child only */
+#define MADV_KEEPONFORK 19             /* Undo MADV_WIPEONFORK */
+
 /* compatibility flags */
 #define MAP_FILE       0
 
index b87fbe3f338adf3950bc6c67feeda77ee7bacbe5..775b5d5e41a1db26358b1457290a499d442bee71 100644 (file)
@@ -57,6 +57,9 @@
                                           overrides the coredump filter bits */
 #define MADV_DODUMP    70              /* Clear the MADV_NODUMP flag */
 
+#define MADV_WIPEONFORK 71             /* Zero memory on fork, child only */
+#define MADV_KEEPONFORK 72             /* Undo MADV_WIPEONFORK */
+
 #define MADV_HWPOISON     100          /* poison a page for testing */
 #define MADV_SOFT_OFFLINE 101          /* soft offline page for testing */
 
index 8ce77a2e9babffb5d171b78b5aacb2712b5dc458..b15b278aa314b4923adfb856fad1e5925eedc83b 100644 (file)
                                           overrides the coredump filter bits */
 #define MADV_DODUMP    17              /* Clear the MADV_NODUMP flag */
 
+#define MADV_WIPEONFORK 18             /* Zero memory on fork, child only */
+#define MADV_KEEPONFORK 19             /* Undo MADV_WIPEONFORK */
+
 /* compatibility flags */
 #define MAP_FILE       0
 
index b2330aedc63f01ef36988295a655899be74ffb2a..a290966f91eccf57dd82d22941026490b99e3d7c 100644 (file)
@@ -663,6 +663,7 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
                [ilog2(VM_NORESERVE)]   = "nr",
                [ilog2(VM_HUGETLB)]     = "ht",
                [ilog2(VM_ARCH_1)]      = "ar",
+               [ilog2(VM_WIPEONFORK)]  = "wf",
                [ilog2(VM_DONTDUMP)]    = "dd",
 #ifdef CONFIG_MEM_SOFT_DIRTY
                [ilog2(VM_SOFTDIRTY)]   = "sd",
index 9efe620320947726e10008c8ef2aaa62854aac3a..39db8e54c5d50a98ee9cf49eb5cf16e5271095a4 100644 (file)
@@ -189,7 +189,7 @@ extern unsigned int kobjsize(const void *objp);
 #define VM_NORESERVE   0x00200000      /* should the VM suppress accounting */
 #define VM_HUGETLB     0x00400000      /* Huge TLB Page VM */
 #define VM_ARCH_1      0x01000000      /* Architecture-specific flag */
-#define VM_ARCH_2      0x02000000
+#define VM_WIPEONFORK  0x02000000      /* Wipe VMA contents in child. */
 #define VM_DONTDUMP    0x04000000      /* Do not include in the core dump */
 
 #ifdef CONFIG_MEM_SOFT_DIRTY
index 8e50d01c645fc7522dc0d6f74e80098c3c36f3dc..4c2e4737d7bcc50a988e9312cc1fe7344ca491d9 100644 (file)
@@ -125,12 +125,6 @@ IF_HAVE_PG_IDLE(PG_idle,           "idle"          )
 #define __VM_ARCH_SPECIFIC_1 {VM_ARCH_1,       "arch_1"        }
 #endif
 
-#if defined(CONFIG_X86)
-#define __VM_ARCH_SPECIFIC_2 {VM_MPX,          "mpx"           }
-#else
-#define __VM_ARCH_SPECIFIC_2 {VM_ARCH_2,       "arch_2"        }
-#endif
-
 #ifdef CONFIG_MEM_SOFT_DIRTY
 #define IF_HAVE_VM_SOFTDIRTY(flag,name) {flag, name },
 #else
@@ -162,7 +156,7 @@ IF_HAVE_PG_IDLE(PG_idle,            "idle"          )
        {VM_NORESERVE,                  "noreserve"     },              \
        {VM_HUGETLB,                    "hugetlb"       },              \
        __VM_ARCH_SPECIFIC_1                            ,               \
-       __VM_ARCH_SPECIFIC_2                            ,               \
+       {VM_WIPEONFORK,                 "wipeonfork"    },              \
        {VM_DONTDUMP,                   "dontdump"      },              \
 IF_HAVE_VM_SOFTDIRTY(VM_SOFTDIRTY,     "softdirty"     )               \
        {VM_MIXEDMAP,                   "mixedmap"      },              \
index d248f3c335b58850f20020cf6c2c54eae50e4f2e..203268f9231e155d72307995989feab4857defe7 100644 (file)
@@ -58,6 +58,9 @@
                                           overrides the coredump filter bits */
 #define MADV_DODUMP    17              /* Clear the MADV_DONTDUMP flag */
 
+#define MADV_WIPEONFORK 18             /* Zero memory on fork, child only */
+#define MADV_KEEPONFORK 19             /* Undo MADV_WIPEONFORK */
+
 /* compatibility flags */
 #define MAP_FILE       0
 
index 7ed64600da6c2206fb018968e48a07984d94b903..24a4c0be80d5abb30d5ba65f3b13eb05ff2d7690 100644 (file)
@@ -657,7 +657,12 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
                retval = dup_userfaultfd(tmp, &uf);
                if (retval)
                        goto fail_nomem_anon_vma_fork;
-               if (anon_vma_fork(tmp, mpnt))
+               if (tmp->vm_flags & VM_WIPEONFORK) {
+                       /* VM_WIPEONFORK gets a clean slate in the child. */
+                       tmp->anon_vma = NULL;
+                       if (anon_vma_prepare(tmp))
+                               goto fail_nomem_anon_vma_fork;
+               } else if (anon_vma_fork(tmp, mpnt))
                        goto fail_nomem_anon_vma_fork;
                tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT);
                tmp->vm_next = tmp->vm_prev = NULL;
@@ -701,7 +706,8 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
                rb_parent = &tmp->vm_rb;
 
                mm->map_count++;
-               retval = copy_page_range(mm, oldmm, mpnt);
+               if (!(tmp->vm_flags & VM_WIPEONFORK))
+                       retval = copy_page_range(mm, oldmm, mpnt);
 
                if (tmp->vm_ops && tmp->vm_ops->open)
                        tmp->vm_ops->open(tmp);
index 4d7d1e5ddba9d9b26583b6ca9f05774247d9b1d8..eea1c733286fba68fa7ed540aa77a1ea7f0c8f59 100644 (file)
@@ -80,6 +80,17 @@ static long madvise_behavior(struct vm_area_struct *vma,
                }
                new_flags &= ~VM_DONTCOPY;
                break;
+       case MADV_WIPEONFORK:
+               /* MADV_WIPEONFORK is only supported on anonymous memory. */
+               if (vma->vm_file || vma->vm_flags & VM_SHARED) {
+                       error = -EINVAL;
+                       goto out;
+               }
+               new_flags |= VM_WIPEONFORK;
+               break;
+       case MADV_KEEPONFORK:
+               new_flags &= ~VM_WIPEONFORK;
+               break;
        case MADV_DONTDUMP:
                new_flags |= VM_DONTDUMP;
                break;
@@ -696,6 +707,8 @@ madvise_behavior_valid(int behavior)
 #endif
        case MADV_DONTDUMP:
        case MADV_DODUMP:
+       case MADV_WIPEONFORK:
+       case MADV_KEEPONFORK:
 #ifdef CONFIG_MEMORY_FAILURE
        case MADV_SOFT_OFFLINE:
        case MADV_HWPOISON: