[PATCH] md: Set/get state of array via sysfs
authorNeilBrown <neilb@suse.de>
Mon, 26 Jun 2006 07:27:58 +0000 (00:27 -0700)
committerLinus Torvalds <torvalds@g5.osdl.org>
Mon, 26 Jun 2006 16:58:39 +0000 (09:58 -0700)
This allows the state of an md/array to be directly controlled via sysfs and
adds the ability to stop and array without tearing it down.

Array states/settings:

 clear
     No devices, no size, no level
     Equivalent to STOP_ARRAY ioctl
 inactive
     May have some settings, but array is not active
        all IO results in error
     When written, doesn't tear down array, but just stops it
 suspended (not supported yet)
     All IO requests will block. The array can be reconfigured.
     Writing this, if accepted, will block until array is quiescent
 readonly
     no resync can happen.  no superblocks get written.
     write requests fail
 read-auto
     like readonly, but behaves like 'clean' on a write request.

 clean - no pending writes, but otherwise active.
     When written to inactive array, starts without resync
     If a write request arrives then
       if metadata is known, mark 'dirty' and switch to 'active'.
       if not known, block and switch to write-pending
     If written to an active array that has pending writes, then fails.
 active
     fully active: IO and resync can be happening.
     When written to inactive array, starts with resync

 write-pending (not supported yet)
     clean, but writes are blocked waiting for 'active' to be written.

 active-idle
     like active, but no writes have been seen for a while (100msec).

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Documentation/md.txt
drivers/md/md.c

index b19978e035fca1f457ab8a39bca2fbb6191309a8..df0b455157750f91b3b0c8c6e9aa089848f4af87 100644 (file)
@@ -216,6 +216,45 @@ All md devices contain:
      period as a number of seconds.  The default is 200msec (0.200).
      Writing a value of 0 disables safemode.
 
+   array_state
+     This file contains a single word which describes the current
+     state of the array.  In many cases, the state can be set by
+     writing the word for the desired state, however some states
+     cannot be explicitly set, and some transitions are not allowed.
+
+     clear
+         No devices, no size, no level
+         Writing is equivalent to STOP_ARRAY ioctl
+     inactive
+         May have some settings, but array is not active
+            all IO results in error
+         When written, doesn't tear down array, but just stops it
+     suspended (not supported yet)
+         All IO requests will block. The array can be reconfigured.
+         Writing this, if accepted, will block until array is quiessent
+     readonly
+         no resync can happen.  no superblocks get written.
+         write requests fail
+     read-auto
+         like readonly, but behaves like 'clean' on a write request.
+
+     clean - no pending writes, but otherwise active.
+         When written to inactive array, starts without resync
+         If a write request arrives then
+           if metadata is known, mark 'dirty' and switch to 'active'.
+           if not known, block and switch to write-pending
+         If written to an active array that has pending writes, then fails.
+     active
+         fully active: IO and resync can be happening.
+         When written to inactive array, starts with resync
+
+     write-pending
+         clean, but writes are blocked waiting for 'active' to be written.
+
+     active-idle
+         like active, but no writes have been seen for a while (safe_mode_delay).
+
+
    sync_speed_min
    sync_speed_max
      This are similar to /proc/sys/dev/raid/speed_limit_{min,max}
index 34b6902cda467ae88f63618203c2d91af8a0fe8a..f6562ee4c6fc0e2495468546920f945f74c52f0f 100644 (file)
@@ -2185,6 +2185,176 @@ chunk_size_store(mddev_t *mddev, const char *buf, size_t len)
 static struct md_sysfs_entry md_chunk_size =
 __ATTR(chunk_size, 0644, chunk_size_show, chunk_size_store);
 
+/*
+ * The array state can be:
+ *
+ * clear
+ *     No devices, no size, no level
+ *     Equivalent to STOP_ARRAY ioctl
+ * inactive
+ *     May have some settings, but array is not active
+ *        all IO results in error
+ *     When written, doesn't tear down array, but just stops it
+ * suspended (not supported yet)
+ *     All IO requests will block. The array can be reconfigured.
+ *     Writing this, if accepted, will block until array is quiessent
+ * readonly
+ *     no resync can happen.  no superblocks get written.
+ *     write requests fail
+ * read-auto
+ *     like readonly, but behaves like 'clean' on a write request.
+ *
+ * clean - no pending writes, but otherwise active.
+ *     When written to inactive array, starts without resync
+ *     If a write request arrives then
+ *       if metadata is known, mark 'dirty' and switch to 'active'.
+ *       if not known, block and switch to write-pending
+ *     If written to an active array that has pending writes, then fails.
+ * active
+ *     fully active: IO and resync can be happening.
+ *     When written to inactive array, starts with resync
+ *
+ * write-pending
+ *     clean, but writes are blocked waiting for 'active' to be written.
+ *
+ * active-idle
+ *     like active, but no writes have been seen for a while (100msec).
+ *
+ */
+enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active,
+                  write_pending, active_idle, bad_word};
+char *array_states[] = {
+       "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active",
+       "write-pending", "active-idle", NULL };
+
+static int match_word(const char *word, char **list)
+{
+       int n;
+       for (n=0; list[n]; n++)
+               if (cmd_match(word, list[n]))
+                       break;
+       return n;
+}
+
+static ssize_t
+array_state_show(mddev_t *mddev, char *page)
+{
+       enum array_state st = inactive;
+
+       if (mddev->pers)
+               switch(mddev->ro) {
+               case 1:
+                       st = readonly;
+                       break;
+               case 2:
+                       st = read_auto;
+                       break;
+               case 0:
+                       if (mddev->in_sync)
+                               st = clean;
+                       else if (mddev->safemode)
+                               st = active_idle;
+                       else
+                               st = active;
+               }
+       else {
+               if (list_empty(&mddev->disks) &&
+                   mddev->raid_disks == 0 &&
+                   mddev->size == 0)
+                       st = clear;
+               else
+                       st = inactive;
+       }
+       return sprintf(page, "%s\n", array_states[st]);
+}
+
+static int do_md_stop(mddev_t * mddev, int ro);
+static int do_md_run(mddev_t * mddev);
+static int restart_array(mddev_t *mddev);
+
+static ssize_t
+array_state_store(mddev_t *mddev, const char *buf, size_t len)
+{
+       int err = -EINVAL;
+       enum array_state st = match_word(buf, array_states);
+       switch(st) {
+       case bad_word:
+               break;
+       case clear:
+               /* stopping an active array */
+               if (mddev->pers) {
+                       if (atomic_read(&mddev->active) > 1)
+                               return -EBUSY;
+                       err = do_md_stop(mddev, 0);
+               }
+               break;
+       case inactive:
+               /* stopping an active array */
+               if (mddev->pers) {
+                       if (atomic_read(&mddev->active) > 1)
+                               return -EBUSY;
+                       err = do_md_stop(mddev, 2);
+               }
+               break;
+       case suspended:
+               break; /* not supported yet */
+       case readonly:
+               if (mddev->pers)
+                       err = do_md_stop(mddev, 1);
+               else {
+                       mddev->ro = 1;
+                       err = do_md_run(mddev);
+               }
+               break;
+       case read_auto:
+               /* stopping an active array */
+               if (mddev->pers) {
+                       err = do_md_stop(mddev, 1);
+                       if (err == 0)
+                               mddev->ro = 2; /* FIXME mark devices writable */
+               } else {
+                       mddev->ro = 2;
+                       err = do_md_run(mddev);
+               }
+               break;
+       case clean:
+               if (mddev->pers) {
+                       restart_array(mddev);
+                       spin_lock_irq(&mddev->write_lock);
+                       if (atomic_read(&mddev->writes_pending) == 0) {
+                               mddev->in_sync = 1;
+                               mddev->sb_dirty = 1;
+                       }
+                       spin_unlock_irq(&mddev->write_lock);
+               } else {
+                       mddev->ro = 0;
+                       mddev->recovery_cp = MaxSector;
+                       err = do_md_run(mddev);
+               }
+               break;
+       case active:
+               if (mddev->pers) {
+                       restart_array(mddev);
+                       mddev->sb_dirty = 0;
+                       wake_up(&mddev->sb_wait);
+                       err = 0;
+               } else {
+                       mddev->ro = 0;
+                       err = do_md_run(mddev);
+               }
+               break;
+       case write_pending:
+       case active_idle:
+               /* these cannot be set */
+               break;
+       }
+       if (err)
+               return err;
+       else
+               return len;
+}
+static struct md_sysfs_entry md_array_state = __ATTR(array_state, 0644, array_state_show, array_state_store);
+
 static ssize_t
 null_show(mddev_t *mddev, char *page)
 {
@@ -2553,6 +2723,7 @@ static struct attribute *md_default_attrs[] = {
        &md_metadata.attr,
        &md_new_device.attr,
        &md_safe_delay.attr,
+       &md_array_state.attr,
        NULL,
 };
 
@@ -2919,11 +3090,8 @@ static int restart_array(mddev_t *mddev)
                md_wakeup_thread(mddev->thread);
                md_wakeup_thread(mddev->sync_thread);
                err = 0;
-       } else {
-               printk(KERN_ERR "md: %s has no personality assigned.\n",
-                       mdname(mddev));
+       } else
                err = -EINVAL;
-       }
 
 out:
        return err;
@@ -2955,7 +3123,12 @@ static void restore_bitmap_write_access(struct file *file)
        spin_unlock(&inode->i_lock);
 }
 
-static int do_md_stop(mddev_t * mddev, int ro)
+/* mode:
+ *   0 - completely stop and dis-assemble array
+ *   1 - switch to readonly
+ *   2 - stop but do not disassemble array
+ */
+static int do_md_stop(mddev_t * mddev, int mode)
 {
        int err = 0;
        struct gendisk *disk = mddev->gendisk;
@@ -2977,12 +3150,15 @@ static int do_md_stop(mddev_t * mddev, int ro)
 
                invalidate_partition(disk, 0);
 
-               if (ro) {
+               switch(mode) {
+               case 1: /* readonly */
                        err  = -ENXIO;
                        if (mddev->ro==1)
                                goto out;
                        mddev->ro = 1;
-               } else {
+                       break;
+               case 0: /* disassemble */
+               case 2: /* stop */
                        bitmap_flush(mddev);
                        md_super_wait(mddev);
                        if (mddev->ro)
@@ -3002,7 +3178,7 @@ static int do_md_stop(mddev_t * mddev, int ro)
                        mddev->in_sync = 1;
                        md_update_sb(mddev);
                }
-               if (ro)
+               if (mode == 1)
                        set_disk_ro(disk, 1);
                clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
        }
@@ -3010,7 +3186,7 @@ static int do_md_stop(mddev_t * mddev, int ro)
        /*
         * Free resources if final stop
         */
-       if (!ro) {
+       if (mode == 0) {
                mdk_rdev_t *rdev;
                struct list_head *tmp;
                struct gendisk *disk;
@@ -3034,6 +3210,9 @@ static int do_md_stop(mddev_t * mddev, int ro)
                export_array(mddev);
 
                mddev->array_size = 0;
+               mddev->size = 0;
+               mddev->raid_disks = 0;
+
                disk = mddev->gendisk;
                if (disk)
                        set_capacity(disk, 0);