md: set MD_CHANGE_PENDING in an atomic region
authorGuoqing Jiang <gqjiang@suse.com>
Wed, 4 May 2016 02:22:13 +0000 (22:22 -0400)
committerShaohua Li <shli@fb.com>
Mon, 9 May 2016 16:24:02 +0000 (09:24 -0700)
Some code waits for a metadata update by:

1. flagging that it is needed (MD_CHANGE_DEVS or MD_CHANGE_CLEAN)
2. setting MD_CHANGE_PENDING and waking the management thread
3. waiting for MD_CHANGE_PENDING to be cleared

If the first two are done without locking, the code in md_update_sb()
which checks if it needs to repeat might test if an update is needed
before step 1, then clear MD_CHANGE_PENDING after step 2, resulting
in the wait returning early.

So make sure all places that set MD_CHANGE_PENDING do so atomically;
bit_clear_unless (suggested by Neil) is introduced for this purpose.

Cc: Martin Kepplinger <martink@posteo.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Sasha Levin <sasha.levin@oracle.com>
Cc: <linux-kernel@vger.kernel.org>
Reviewed-by: NeilBrown <neilb@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>
drivers/md/md.c
drivers/md/raid1.c
drivers/md/raid10.c
drivers/md/raid5-cache.c
drivers/md/raid5.c
include/linux/bitops.h

index 23c6d732a374a43b009fb46a20bf9784a87f2ae9..a79462dcd5e1a7dc53fb21034f456f0f1dcae802 100644 (file)
@@ -2295,12 +2295,16 @@ repeat:
        if (mddev_is_clustered(mddev)) {
                if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags))
                        force_change = 1;
+               if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags))
+                       nospares = 1;
                ret = md_cluster_ops->metadata_update_start(mddev);
                /* Has someone else has updated the sb */
                if (!does_sb_need_changing(mddev)) {
                        if (ret == 0)
                                md_cluster_ops->metadata_update_cancel(mddev);
-                       clear_bit(MD_CHANGE_PENDING, &mddev->flags);
+                       bit_clear_unless(&mddev->flags, BIT(MD_CHANGE_PENDING),
+                                                        BIT(MD_CHANGE_DEVS) |
+                                                        BIT(MD_CHANGE_CLEAN));
                        return;
                }
        }
@@ -2434,15 +2438,11 @@ repeat:
        if (mddev_is_clustered(mddev) && ret == 0)
                md_cluster_ops->metadata_update_finish(mddev);
 
-       spin_lock(&mddev->lock);
        if (mddev->in_sync != sync_req ||
-           test_bit(MD_CHANGE_DEVS, &mddev->flags)) {
+           !bit_clear_unless(&mddev->flags, BIT(MD_CHANGE_PENDING),
+                              BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_CLEAN)))
                /* have to write it out again */
-               spin_unlock(&mddev->lock);
                goto repeat;
-       }
-       clear_bit(MD_CHANGE_PENDING, &mddev->flags);
-       spin_unlock(&mddev->lock);
        wake_up(&mddev->sb_wait);
        if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
                sysfs_notify(&mddev->kobj, NULL, "sync_completed");
@@ -8147,18 +8147,18 @@ void md_do_sync(struct md_thread *thread)
                }
        }
  skip:
-       set_bit(MD_CHANGE_DEVS, &mddev->flags);
-
        if (mddev_is_clustered(mddev) &&
            ret == 0) {
                /* set CHANGE_PENDING here since maybe another
                 * update is needed, so other nodes are informed */
-               set_bit(MD_CHANGE_PENDING, &mddev->flags);
+               set_mask_bits(&mddev->flags, 0,
+                             BIT(MD_CHANGE_PENDING) | BIT(MD_CHANGE_DEVS));
                md_wakeup_thread(mddev->thread);
                wait_event(mddev->sb_wait,
                           !test_bit(MD_CHANGE_PENDING, &mddev->flags));
                md_cluster_ops->resync_finish(mddev);
-       }
+       } else
+               set_bit(MD_CHANGE_DEVS, &mddev->flags);
 
        spin_lock(&mddev->lock);
        if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
@@ -8550,6 +8550,7 @@ EXPORT_SYMBOL(md_finish_reshape);
 int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
                       int is_new)
 {
+       struct mddev *mddev = rdev->mddev;
        int rv;
        if (is_new)
                s += rdev->new_data_offset;
@@ -8559,8 +8560,8 @@ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
        if (rv == 0) {
                /* Make sure they get written out promptly */
                sysfs_notify_dirent_safe(rdev->sysfs_state);
-               set_bit(MD_CHANGE_CLEAN, &rdev->mddev->flags);
-               set_bit(MD_CHANGE_PENDING, &rdev->mddev->flags);
+               set_mask_bits(&mddev->flags, 0,
+                             BIT(MD_CHANGE_CLEAN) | BIT(MD_CHANGE_PENDING));
                md_wakeup_thread(rdev->mddev->thread);
                return 1;
        } else
index a7f2b9c9f8a06fa84aa5f46d6c44c5760894502b..c7c8cde0ab21128527cd74a1d7df4a0e388dc718 100644 (file)
@@ -1474,8 +1474,8 @@ static void raid1_error(struct mddev *mddev, struct md_rdev *rdev)
         * if recovery is running, make sure it aborts.
         */
        set_bit(MD_RECOVERY_INTR, &mddev->recovery);
-       set_bit(MD_CHANGE_DEVS, &mddev->flags);
-       set_bit(MD_CHANGE_PENDING, &mddev->flags);
+       set_mask_bits(&mddev->flags, 0,
+                     BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING));
        printk(KERN_ALERT
               "md/raid1:%s: Disk failure on %s, disabling device.\n"
               "md/raid1:%s: Operation continuing on %d devices.\n",
index 84e24e648165e0008f09fe81c38752ce9b244420..c7de2a53e6259499dc18f64d7783d3cacad5c029 100644 (file)
@@ -1102,8 +1102,8 @@ static void __make_request(struct mddev *mddev, struct bio *bio)
                bio->bi_iter.bi_sector < conf->reshape_progress))) {
                /* Need to update reshape_position in metadata */
                mddev->reshape_position = conf->reshape_progress;
-               set_bit(MD_CHANGE_DEVS, &mddev->flags);
-               set_bit(MD_CHANGE_PENDING, &mddev->flags);
+               set_mask_bits(&mddev->flags, 0,
+                             BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING));
                md_wakeup_thread(mddev->thread);
                wait_event(mddev->sb_wait,
                           !test_bit(MD_CHANGE_PENDING, &mddev->flags));
@@ -1591,8 +1591,8 @@ static void raid10_error(struct mddev *mddev, struct md_rdev *rdev)
        set_bit(MD_RECOVERY_INTR, &mddev->recovery);
        set_bit(Blocked, &rdev->flags);
        set_bit(Faulty, &rdev->flags);
-       set_bit(MD_CHANGE_DEVS, &mddev->flags);
-       set_bit(MD_CHANGE_PENDING, &mddev->flags);
+       set_mask_bits(&mddev->flags, 0,
+                     BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING));
        spin_unlock_irqrestore(&conf->device_lock, flags);
        printk(KERN_ALERT
               "md/raid10:%s: Disk failure on %s, disabling device.\n"
index 9531f5f05b93df22d0c1d52381cf0a841173403a..ac51bc5ecb16539aa7ea40382cc804a0778ba45f 100644 (file)
@@ -712,8 +712,8 @@ static void r5l_write_super_and_discard_space(struct r5l_log *log,
         * in_teardown check workaround this issue.
         */
        if (!log->in_teardown) {
-               set_bit(MD_CHANGE_DEVS, &mddev->flags);
-               set_bit(MD_CHANGE_PENDING, &mddev->flags);
+               set_mask_bits(&mddev->flags, 0,
+                             BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING));
                md_wakeup_thread(mddev->thread);
                wait_event(mddev->sb_wait,
                        !test_bit(MD_CHANGE_PENDING, &mddev->flags) ||
index 4d31b235a888b5dc979cf585bc74cda78bbaeb06..8959e6dd31dd1c056f8ef6e89d466738a1ade881 100644 (file)
@@ -2514,8 +2514,8 @@ static void raid5_error(struct mddev *mddev, struct md_rdev *rdev)
 
        set_bit(Blocked, &rdev->flags);
        set_bit(Faulty, &rdev->flags);
-       set_bit(MD_CHANGE_DEVS, &mddev->flags);
-       set_bit(MD_CHANGE_PENDING, &mddev->flags);
+       set_mask_bits(&mddev->flags, 0,
+                     BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING));
        printk(KERN_ALERT
               "md/raid:%s: Disk failure on %s, disabling device.\n"
               "md/raid:%s: Operation continuing on %d devices.\n",
index defeaac0745f1b26340d7a13f5b73810de743193..299e76b59fe9b0330254f608863f7237ed97c802 100644 (file)
@@ -227,6 +227,22 @@ static inline unsigned long __ffs64(u64 word)
 })
 #endif
 
+#ifndef bit_clear_unless /* only defined here if not already defined */
+#define bit_clear_unless(ptr, _clear, _test)   \
+({ /* clear the _clear bits in *ptr unless a _test bit is set */ \
+       const typeof(*ptr) clear = (_clear), test = (_test); /* masks evaluated once */ \
+       typeof(*ptr) old, new;                                  \
+       /* re-read and retry while no _test bit is seen and the swap fails */ \
+       do {                                                    \
+               old = ACCESS_ONCE(*ptr);                        \
+               new = old & ~clear;                             \
+       } while (!(old & test) && /* a _test bit set: give up without writing */ \
+                cmpxchg(ptr, old, new) != old); /* presumably an atomic compare-and-exchange; confirm */ \
+       /* result: nonzero when no _test bit was set in the value last read */ \
+       !(old & test);                                          \
+})
+#endif
+
 #ifndef find_last_bit
 /**
  * find_last_bit - find the last set bit in a memory region