raid5-cache: add journal hot add/remove support
author Shaohua Li <shli@fb.com>
Sun, 20 Dec 2015 23:51:02 +0000 (10:51 +1100)
committer NeilBrown <neilb@suse.com>
Wed, 6 Jan 2016 00:39:57 +0000 (11:39 +1100)
Add support for journal disk hot add/remove. The md part is mostly trivial
checks. The raid5 part is a little tricky. For hot-remove, we can't wait
for pending writes, as it's called from raid5d; the wait would cause a
deadlock. We simply fail the hot-remove. A hot-remove retry can succeed
eventually, since if the journal disk is faulty all pending writes will be
failed and finish. For hot-add, since an array supporting a journal but
without a journal disk will be marked read-only, we are safe to hot add a
journal without stopping IO (it should be read IO, while the journal only
handles write IO).

Signed-off-by: Shaohua Li <shli@fb.com>
Signed-off-by: NeilBrown <neilb@suse.com>
drivers/md/md.c
drivers/md/raid5-cache.c
drivers/md/raid5.c

index d0f0621bf9b0a8a81599098a208a5724719689ac..c0c3e6dec2484e0a15736a7c494e9fd95bbe310a 100644 (file)
@@ -2055,8 +2055,9 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
                return -EEXIST;
 
        /* make sure rdev->sectors exceeds mddev->dev_sectors */
-       if (rdev->sectors && (mddev->dev_sectors == 0 ||
-                       rdev->sectors < mddev->dev_sectors)) {
+       if (!test_bit(Journal, &rdev->flags) &&
+           rdev->sectors &&
+           (mddev->dev_sectors == 0 || rdev->sectors < mddev->dev_sectors)) {
                if (mddev->pers) {
                        /* Cannot change size, so fail
                         * If mddev->level <= 0, then we don't care
@@ -2087,7 +2088,8 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
                }
        }
        rcu_read_unlock();
-       if (mddev->max_disks && rdev->desc_nr >= mddev->max_disks) {
+       if (!test_bit(Journal, &rdev->flags) &&
+           mddev->max_disks && rdev->desc_nr >= mddev->max_disks) {
                printk(KERN_WARNING "md: %s: array is limited to %d devices\n",
                       mdname(mddev), mddev->max_disks);
                return -EBUSY;
@@ -6044,8 +6046,23 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
                else
                        clear_bit(WriteMostly, &rdev->flags);
 
-               if (info->state & (1<<MD_DISK_JOURNAL))
+               if (info->state & (1<<MD_DISK_JOURNAL)) {
+                       struct md_rdev *rdev2;
+                       bool has_journal = false;
+
+                       /* make sure no existing journal disk */
+                       rdev_for_each(rdev2, mddev) {
+                               if (test_bit(Journal, &rdev2->flags)) {
+                                       has_journal = true;
+                                       break;
+                               }
+                       }
+                       if (has_journal) {
+                               export_rdev(rdev);
+                               return -EBUSY;
+                       }
                        set_bit(Journal, &rdev->flags);
+               }
                /*
                 * check whether the device shows up in other nodes
                 */
@@ -8181,19 +8198,20 @@ static int remove_and_add_spares(struct mddev *mddev,
                        continue;
                if (test_bit(Faulty, &rdev->flags))
                        continue;
-               if (test_bit(Journal, &rdev->flags))
-                       continue;
-               if (mddev->ro &&
-                   ! (rdev->saved_raid_disk >= 0 &&
-                      !test_bit(Bitmap_sync, &rdev->flags)))
-                       continue;
+               if (!test_bit(Journal, &rdev->flags)) {
+                       if (mddev->ro &&
+                           ! (rdev->saved_raid_disk >= 0 &&
+                              !test_bit(Bitmap_sync, &rdev->flags)))
+                               continue;
 
-               rdev->recovery_offset = 0;
+                       rdev->recovery_offset = 0;
+               }
                if (mddev->pers->
                    hot_add_disk(mddev, rdev) == 0) {
                        if (sysfs_link_rdev(mddev, rdev))
                                /* failure here is OK */;
-                       spares++;
+                       if (!test_bit(Journal, &rdev->flags))
+                               spares++;
                        md_new_event(mddev);
                        set_bit(MD_CHANGE_DEVS, &mddev->flags);
                }
index 668e973f07e66302303d687204c4cdebec8b94d9..c1c4d213a2c254ad349f9c9b1302dffe90d1e18b 100644 (file)
@@ -799,10 +799,18 @@ void r5l_quiesce(struct r5l_log *log, int state)
 
 bool r5l_log_disk_error(struct r5conf *conf)
 {
+       struct r5l_log *log;
+       bool ret;
        /* don't allow write if journal disk is missing */
-       if (!conf->log)
-               return test_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
-       return test_bit(Faulty, &conf->log->rdev->flags);
+       rcu_read_lock();
+       log = rcu_dereference(conf->log);
+
+       if (!log)
+               ret = test_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
+       else
+               ret = test_bit(Faulty, &log->rdev->flags);
+       rcu_read_unlock();
+       return ret;
 }
 
 struct r5l_recovery_ctx {
@@ -1165,7 +1173,7 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
        if (r5l_load_log(log))
                goto error;
 
-       conf->log = log;
+       rcu_assign_pointer(conf->log, log);
        return 0;
 error:
        md_unregister_thread(&log->reclaim_thread);
index 22362505f810fcd9825ea4683bcf1d40285a8208..a086014dcd49915d5dad8b2c57b9bbfa30b39e5c 100644 (file)
@@ -7139,14 +7139,19 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
        struct disk_info *p = conf->disks + number;
 
        print_raid5_conf(conf);
-       if (test_bit(Journal, &rdev->flags)) {
+       if (test_bit(Journal, &rdev->flags) && conf->log) {
+               struct r5l_log *log;
                /*
-                * journal disk is not removable, but we need give a chance to
-                * update superblock of other disks. Otherwise journal disk
-                * will be considered as 'fresh'
+                * we can't wait pending write here, as this is called in
+                * raid5d, wait will deadlock.
                 */
-               set_bit(MD_CHANGE_DEVS, &mddev->flags);
-               return -EINVAL;
+               if (atomic_read(&mddev->writes_pending))
+                       return -EBUSY;
+               log = conf->log;
+               conf->log = NULL;
+               synchronize_rcu();
+               r5l_exit_log(log);
+               return 0;
        }
        if (rdev == p->rdev)
                rdevp = &p->rdev;
@@ -7210,8 +7215,21 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
        int first = 0;
        int last = conf->raid_disks - 1;
 
-       if (test_bit(Journal, &rdev->flags))
-               return -EINVAL;
+       if (test_bit(Journal, &rdev->flags)) {
+               char b[BDEVNAME_SIZE];
+               if (conf->log)
+                       return -EBUSY;
+
+               rdev->raid_disk = 0;
+               /*
+                * The array is in readonly mode if journal is missing, so no
+                * write requests running. We should be safe
+                */
+               r5l_init_log(conf, rdev);
+               printk(KERN_INFO"md/raid:%s: using device %s as journal\n",
+                      mdname(mddev), bdevname(rdev->bdev, b));
+               return 0;
+       }
        if (mddev->recovery_disabled == conf->recovery_disabled)
                return -EBUSY;