dm raid: add support for the MD RAID0 personality
authorHeinz Mauelshagen <heinzm@redhat.com>
Wed, 29 Apr 2015 12:03:04 +0000 (14:03 +0200)
committerMike Snitzer <snitzer@redhat.com>
Fri, 29 May 2015 18:19:00 +0000 (14:19 -0400)
Add dm-raid access to the MD RAID0 personality to enable single zone
striping.

The following changes enable that access:
- add type definition to raid_types array
- make bitmap creation conditonal in super_validate(), because
  bitmaps are not allowed in raid0
- set rdev->sectors to the data image size in super_validate()
  to allow the raid0 personality to calculate the MD array
  size properly
- use mdddev(un)lock() functions instead of direct mutex_(un)lock()
  (wrapped in here because it's a trivial change)
- enhance raid_status() to always report full sync for raid0
  so that userspace checks for 100% sync will succeed and allow
  for resize (and takeover/reshape once added in future paches)
- enhance raid_resume() to not load bitmap in case of raid0
- add merge function to avoid data corruption (seen with readahead)
  that resulted from bio payloads that grew too large.  This problem
  did not occur with the other raid levels because it either did not
  apply without striping (raid1) or was avoided via stripe caching.
- raise version to 1.7.0 because of the raid0 API change

Signed-off-by: Heinz Mauelshagen <heinzm@redhat.com>
Reviewed-by: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Documentation/device-mapper/dm-raid.txt
drivers/md/dm-raid.c

index 85b74b15fa9c47375e4fe05ee72c0196e806e9b9..cb12af3b51c21b3b11674fb9b15312c9cee827cb 100644 (file)
@@ -225,3 +225,4 @@ Version History
 1.5.1   Add ability to restore transiently failed devices on resume.
 1.5.2   'mismatch_cnt' is zero unless [last_]sync_action is "check".
 1.6.0   Add discard support (and devices_handle_discard_safely module param).
+1.7.0   Add support for MD RAID0 mappings.
index af49ddebaa62508267b5a64a4535b1b2e60e01f4..2daa6779351179974f0a706f52de57dd722369ef 100644 (file)
@@ -1,6 +1,6 @@
 /*
  * Copyright (C) 2010-2011 Neil Brown
- * Copyright (C) 2010-2014 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2010-2015 Red Hat, Inc. All rights reserved.
  *
  * This file is released under the GPL.
  */
@@ -82,6 +82,7 @@ static struct raid_type {
        const unsigned level;           /* RAID level. */
        const unsigned algorithm;       /* RAID algorithm. */
 } raid_types[] = {
+       {"raid0",    "RAID0 (striping)",                0, 2, 0, 0 /* NONE */},
        {"raid1",    "RAID1 (mirroring)",               0, 2, 1, 0 /* NONE */},
        {"raid10",   "RAID10 (striped mirrors)",        0, 2, 10, UINT_MAX /* Varies */},
        {"raid4",    "RAID4 (dedicated parity disk)",   1, 2, 5, ALGORITHM_PARITY_0},
@@ -719,7 +720,7 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
                rs->md.layout = raid10_format_to_md_layout(raid10_format,
                                                           raid10_copies);
                rs->md.new_layout = rs->md.layout;
-       } else if ((rs->raid_type->level > 1) &&
+       } else if ((!rs->raid_type->level || rs->raid_type->level > 1) &&
                   sector_div(sectors_per_dev,
                              (rs->md.raid_disks - rs->raid_type->parity_devs))) {
                rs->ti->error = "Target length not divisible by number of data devices";
@@ -1025,8 +1026,9 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev)
        return 0;
 }
 
-static int super_validate(struct mddev *mddev, struct md_rdev *rdev)
+static int super_validate(struct raid_set *rs, struct md_rdev *rdev)
 {
+       struct mddev *mddev = &rs->md;
        struct dm_raid_superblock *sb = page_address(rdev->sb_page);
 
        /*
@@ -1036,8 +1038,10 @@ static int super_validate(struct mddev *mddev, struct md_rdev *rdev)
        if (!mddev->events && super_init_validation(mddev, rdev))
                return -EINVAL;
 
-       mddev->bitmap_info.offset = 4096 >> 9; /* Enable bitmap creation */
-       rdev->mddev->bitmap_info.default_offset = 4096 >> 9;
+       /* Enable bitmap creation for RAID levels != 0 */
+       mddev->bitmap_info.offset = (rs->raid_type->level) ? to_sector(4096) : 0;
+       rdev->mddev->bitmap_info.default_offset = mddev->bitmap_info.offset;
+
        if (!test_bit(FirstUse, &rdev->flags)) {
                rdev->recovery_offset = le64_to_cpu(sb->disk_recovery_offset);
                if (rdev->recovery_offset != MaxSector)
@@ -1081,6 +1085,8 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
                 * that the "sync" directive is disallowed during the
                 * reshape.
                 */
+               rdev->sectors = to_sector(i_size_read(rdev->bdev->bd_inode));
+
                if (rs->ctr_flags & CTR_FLAG_SYNC)
                        continue;
 
@@ -1139,11 +1145,11 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
         * validation for the remaining devices.
         */
        ti->error = "Unable to assemble array: Invalid superblocks";
-       if (super_validate(mddev, freshest))
+       if (super_validate(rs, freshest))
                return -EINVAL;
 
        rdev_for_each(rdev, mddev)
-               if ((rdev != freshest) && super_validate(mddev, rdev))
+               if ((rdev != freshest) && super_validate(rs, rdev))
                        return -EINVAL;
 
        return 0;
@@ -1281,10 +1287,11 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
         */
        configure_discard_support(ti, rs);
 
-       mutex_lock(&rs->md.reconfig_mutex);
+       /* Has to be held on running the array */
+       mddev_lock_nointr(&rs->md);
        ret = md_run(&rs->md);
        rs->md.in_sync = 0; /* Assume already marked dirty */
-       mutex_unlock(&rs->md.reconfig_mutex);
+       mddev_unlock(&rs->md);
 
        if (ret) {
                ti->error = "Fail to run raid array";
@@ -1367,34 +1374,40 @@ static void raid_status(struct dm_target *ti, status_type_t type,
        case STATUSTYPE_INFO:
                DMEMIT("%s %d ", rs->raid_type->name, rs->md.raid_disks);
 
-               if (test_bit(MD_RECOVERY_RUNNING, &rs->md.recovery))
-                       sync = rs->md.curr_resync_completed;
-               else
-                       sync = rs->md.recovery_cp;
-
-               if (sync >= rs->md.resync_max_sectors) {
-                       /*
-                        * Sync complete.
-                        */
+               if (rs->raid_type->level) {
+                       if (test_bit(MD_RECOVERY_RUNNING, &rs->md.recovery))
+                               sync = rs->md.curr_resync_completed;
+                       else
+                               sync = rs->md.recovery_cp;
+
+                       if (sync >= rs->md.resync_max_sectors) {
+                               /*
+                                * Sync complete.
+                                */
+                               array_in_sync = 1;
+                               sync = rs->md.resync_max_sectors;
+                       } else if (test_bit(MD_RECOVERY_REQUESTED, &rs->md.recovery)) {
+                               /*
+                                * If "check" or "repair" is occurring, the array has
+                                * undergone and initial sync and the health characters
+                                * should not be 'a' anymore.
+                                */
+                               array_in_sync = 1;
+                       } else {
+                               /*
+                                * The array may be doing an initial sync, or it may
+                                * be rebuilding individual components.  If all the
+                                * devices are In_sync, then it is the array that is
+                                * being initialized.
+                                */
+                               for (i = 0; i < rs->md.raid_disks; i++)
+                                       if (!test_bit(In_sync, &rs->dev[i].rdev.flags))
+                                               array_in_sync = 1;
+                       }
+               } else {
+                       /* RAID0 */
                        array_in_sync = 1;
                        sync = rs->md.resync_max_sectors;
-               } else if (test_bit(MD_RECOVERY_REQUESTED, &rs->md.recovery)) {
-                       /*
-                        * If "check" or "repair" is occurring, the array has
-                        * undergone and initial sync and the health characters
-                        * should not be 'a' anymore.
-                        */
-                       array_in_sync = 1;
-               } else {
-                       /*
-                        * The array may be doing an initial sync, or it may
-                        * be rebuilding individual components.  If all the
-                        * devices are In_sync, then it is the array that is
-                        * being initialized.
-                        */
-                       for (i = 0; i < rs->md.raid_disks; i++)
-                               if (!test_bit(In_sync, &rs->dev[i].rdev.flags))
-                                       array_in_sync = 1;
                }
 
                /*
@@ -1683,26 +1696,48 @@ static void raid_resume(struct dm_target *ti)
 {
        struct raid_set *rs = ti->private;
 
-       set_bit(MD_CHANGE_DEVS, &rs->md.flags);
-       if (!rs->bitmap_loaded) {
-               bitmap_load(&rs->md);
-               rs->bitmap_loaded = 1;
-       } else {
-               /*
-                * A secondary resume while the device is active.
-                * Take this opportunity to check whether any failed
-                * devices are reachable again.
-                */
-               attempt_restore_of_faulty_devices(rs);
+       if (rs->raid_type->level) {
+               set_bit(MD_CHANGE_DEVS, &rs->md.flags);
+
+               if (!rs->bitmap_loaded) {
+                       bitmap_load(&rs->md);
+                       rs->bitmap_loaded = 1;
+               } else {
+                       /*
+                        * A secondary resume while the device is active.
+                        * Take this opportunity to check whether any failed
+                        * devices are reachable again.
+                        */
+                       attempt_restore_of_faulty_devices(rs);
+               }
+
+               clear_bit(MD_RECOVERY_FROZEN, &rs->md.recovery);
        }
 
-       clear_bit(MD_RECOVERY_FROZEN, &rs->md.recovery);
        mddev_resume(&rs->md);
 }
 
+static int raid_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
+                     struct bio_vec *biovec, int max_size)
+{
+       struct raid_set *rs = ti->private;
+       struct md_personality *pers = rs->md.pers;
+
+       if (pers && pers->mergeable_bvec)
+               return min(max_size, pers->mergeable_bvec(&rs->md, bvm, biovec));
+
+       /*
+        * In case we can't request the personality because
+        * the raid set is not running yet
+        *
+        * -> return safe minimum
+        */
+       return rs->md.chunk_sectors;
+}
+
 static struct target_type raid_target = {
        .name = "raid",
-       .version = {1, 6, 0},
+       .version = {1, 7, 0},
        .module = THIS_MODULE,
        .ctr = raid_ctr,
        .dtr = raid_dtr,
@@ -1714,6 +1749,7 @@ static struct target_type raid_target = {
        .presuspend = raid_presuspend,
        .postsuspend = raid_postsuspend,
        .resume = raid_resume,
+       .merge = raid_merge,
 };
 
 static int __init dm_raid_init(void)