mm: fix potential data race in SyS_swapon
author: Hugh Dickins <hughd@google.com>
Tue, 18 Aug 2015 00:34:27 +0000 (17:34 -0700)
committer: Al Viro <viro@zeniv.linux.org.uk>
Fri, 21 Aug 2015 06:33:07 +0000 (02:33 -0400)
While running KernelThreadSanitizer (ktsan) on upstream kernel with
trinity, we got a few reports from SyS_swapon, here is one of them:

Read of size 8 by thread T307 (K7621):
 [<     inlined    >] SyS_swapon+0x3c0/0x1850 SYSC_swapon mm/swapfile.c:2395
 [<ffffffff812242c0>] SyS_swapon+0x3c0/0x1850 mm/swapfile.c:2345
 [<ffffffff81e97c8a>] ia32_do_call+0x1b/0x25

Looks like the swap_lock should be taken when iterating through the
swap_info array on lines 2392 - 2401: q->swap_file may be reset to
NULL by another thread before it is dereferenced for f_mapping.

But why is that iteration needed at all?  Doesn't the claim_swapfile()
which follows do all that is needed to check for a duplicate entry -
FMODE_EXCL on a bdev, testing IS_SWAPFILE under i_mutex on a regfile?

Well, not quite: bd_may_claim() allows the same "holder" to claim the
bdev again, so we do need to use a different holder than "sys_swapon";
and we should not replace appropriate -EBUSY by inappropriate -EINVAL.

Index i was reused in a cpu loop further down: renamed it to cpu there.

Reported-by: Andrey Konovalov <andreyknvl@google.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
mm/swapfile.c

index 41e4581af7c512fe49ad8a1677a501f290e14807..aebc2dd6e64975e2589429a0e22929f96c4d7532 100644 (file)
@@ -2143,11 +2143,10 @@ static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
        if (S_ISBLK(inode->i_mode)) {
                p->bdev = bdgrab(I_BDEV(inode));
                error = blkdev_get(p->bdev,
-                                  FMODE_READ | FMODE_WRITE | FMODE_EXCL,
-                                  sys_swapon);
+                                  FMODE_READ | FMODE_WRITE | FMODE_EXCL, p);
                if (error < 0) {
                        p->bdev = NULL;
-                       return -EINVAL;
+                       return error;
                }
                p->old_block_size = block_size(p->bdev);
                error = set_blocksize(p->bdev, PAGE_SIZE);
@@ -2348,7 +2347,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
        struct filename *name;
        struct file *swap_file = NULL;
        struct address_space *mapping;
-       int i;
        int prio;
        int error;
        union swap_header *swap_header;
@@ -2388,19 +2386,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 
        p->swap_file = swap_file;
        mapping = swap_file->f_mapping;
-
-       for (i = 0; i < nr_swapfiles; i++) {
-               struct swap_info_struct *q = swap_info[i];
-
-               if (q == p || !q->swap_file)
-                       continue;
-               if (mapping == q->swap_file->f_mapping) {
-                       error = -EBUSY;
-                       goto bad_swap;
-               }
-       }
-
        inode = mapping->host;
+
        /* If S_ISREG(inode->i_mode) will do mutex_lock(&inode->i_mutex); */
        error = claim_swapfile(p, inode);
        if (unlikely(error))
@@ -2433,6 +2420,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
                goto bad_swap;
        }
        if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) {
+               int cpu;
+
                p->flags |= SWP_SOLIDSTATE;
                /*
                 * select a random position to start with to help wear leveling
@@ -2451,9 +2440,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
                        error = -ENOMEM;
                        goto bad_swap;
                }
-               for_each_possible_cpu(i) {
+               for_each_possible_cpu(cpu) {
                        struct percpu_cluster *cluster;
-                       cluster = per_cpu_ptr(p->percpu_cluster, i);
+                       cluster = per_cpu_ptr(p->percpu_cluster, cpu);
                        cluster_set_null(&cluster->index);
                }
        }