[PATCH] md: improve locking on 'safemode' and move superblock writes
author NeilBrown <neilb@cse.unsw.edu.au>
Wed, 22 Jun 2005 00:17:12 +0000 (17:17 -0700)
committer Linus Torvalds <torvalds@ppc970.osdl.org>
Wed, 22 Jun 2005 02:07:43 +0000 (19:07 -0700)
When md marks the superblock dirty before a write, it calls
generic_make_request (to write the superblock) from within
generic_make_request (to write the first dirty block), which could cause
problems later.

With this patch, the superblock write is always done by the helper thread, and
write requests are delayed until that write completes.

Also, the locking around marking the array dirty and writing the superblock is
improved to avoid possible races.

Signed-off-by: Neil Brown <neilb@cse.unsw.edu.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
drivers/md/md.c
drivers/md/raid1.c
drivers/md/raid10.c
drivers/md/raid5.c
drivers/md/raid6main.c
include/linux/raid/md.h
include/linux/raid/md_k.h

index c842e34d850eea6a89a9f54e9ed53b1fa7b6087c..177d2a7d7cea21bf57cb1afdad76f8fa30ff3aa6 100644 (file)
@@ -218,6 +218,8 @@ static mddev_t * mddev_find(dev_t unit)
        INIT_LIST_HEAD(&new->all_mddevs);
        init_timer(&new->safemode_timer);
        atomic_set(&new->active, 1);
+       bio_list_init(&new->write_list);
+       spin_lock_init(&new->write_lock);
 
        new->queue = blk_alloc_queue(GFP_KERNEL);
        if (!new->queue) {
@@ -1251,9 +1253,11 @@ static void md_update_sb(mddev_t * mddev)
        int err, count = 100;
        struct list_head *tmp;
        mdk_rdev_t *rdev;
+       int sync_req;
 
-       mddev->sb_dirty = 0;
 repeat:
+       spin_lock(&mddev->write_lock);
+       sync_req = mddev->in_sync;
        mddev->utime = get_seconds();
        mddev->events ++;
 
@@ -1272,8 +1276,12 @@ repeat:
         * do not write anything to disk if using
         * nonpersistent superblocks
         */
-       if (!mddev->persistent)
+       if (!mddev->persistent) {
+               mddev->sb_dirty = 0;
+               spin_unlock(&mddev->write_lock);
                return;
+       }
+       spin_unlock(&mddev->write_lock);
 
        dprintk(KERN_INFO 
                "md: updating %s RAID superblock on device (in sync %d)\n",
@@ -1304,6 +1312,15 @@ repeat:
                printk(KERN_ERR \
                        "md: excessive errors occurred during superblock update, exiting\n");
        }
+       spin_lock(&mddev->write_lock);
+       if (mddev->in_sync != sync_req) {
+               /* have to write it out again */
+               spin_unlock(&mddev->write_lock);
+               goto repeat;
+       }
+       mddev->sb_dirty = 0;
+       spin_unlock(&mddev->write_lock);
+
 }
 
 /*
@@ -3178,19 +3195,31 @@ void md_done_sync(mddev_t *mddev, int blocks, int ok)
 }
 
 
-void md_write_start(mddev_t *mddev)
+/* md_write_start(mddev, bi)
+ * If we need to update some array metadata (e.g. 'active' flag
+ * in superblock) before writing, queue bi for later writing
+ * and return 0, else return 1 and it will be written now
+ */
+int md_write_start(mddev_t *mddev, struct bio *bi)
 {
-       if (!atomic_read(&mddev->writes_pending)) {
-               mddev_lock_uninterruptible(mddev);
-               if (mddev->in_sync) {
-                       mddev->in_sync = 0;
-                       del_timer(&mddev->safemode_timer);
-                       md_update_sb(mddev);
-               }
-               atomic_inc(&mddev->writes_pending);
-               mddev_unlock(mddev);
-       } else
-               atomic_inc(&mddev->writes_pending);
+       if (bio_data_dir(bi) != WRITE)
+               return 1;
+
+       atomic_inc(&mddev->writes_pending);
+       spin_lock(&mddev->write_lock);
+       if (mddev->in_sync == 0 && mddev->sb_dirty == 0) {
+               spin_unlock(&mddev->write_lock);
+               return 1;
+       }
+       bio_list_add(&mddev->write_list, bi);
+
+       if (mddev->in_sync) {
+               mddev->in_sync = 0;
+               mddev->sb_dirty = 1;
+       }
+       spin_unlock(&mddev->write_lock);
+       md_wakeup_thread(mddev->thread);
+       return 0;
 }
 
 void md_write_end(mddev_t *mddev)
@@ -3472,6 +3501,7 @@ void md_check_recovery(mddev_t *mddev)
                mddev->sb_dirty ||
                test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
                test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
+               mddev->write_list.head ||
                (mddev->safemode == 1) ||
                (mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending)
                 && !mddev->in_sync && mddev->recovery_cp == MaxSector)
@@ -3480,7 +3510,9 @@ void md_check_recovery(mddev_t *mddev)
 
        if (mddev_trylock(mddev)==0) {
                int spares =0;
+               struct bio *blist;
 
+               spin_lock(&mddev->write_lock);
                if (mddev->safemode && !atomic_read(&mddev->writes_pending) &&
                    !mddev->in_sync && mddev->recovery_cp == MaxSector) {
                        mddev->in_sync = 1;
@@ -3488,9 +3520,22 @@ void md_check_recovery(mddev_t *mddev)
                }
                if (mddev->safemode == 1)
                        mddev->safemode = 0;
+               blist = bio_list_get(&mddev->write_list);
+               spin_unlock(&mddev->write_lock);
 
                if (mddev->sb_dirty)
                        md_update_sb(mddev);
+
+               while (blist) {
+                       struct bio *b = blist;
+                       blist = blist->bi_next;
+                       b->bi_next = NULL;
+                       generic_make_request(b);
+                       /* we already counted this, so need to un-count */
+                       md_write_end(mddev);
+               }
+
+
                if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
                    !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
                        /* resync/recovery still happening */
index b34ad56362dffe26b41014d70c336ea1df1b649f..3f1280bbaf39e220cf674bd838dbc90e8f2f7403 100644 (file)
@@ -530,6 +530,8 @@ static int make_request(request_queue_t *q, struct bio * bio)
         * thread has put up a bar for new requests.
         * Continue immediately if no resync is active currently.
         */
+       if (md_write_start(mddev, bio)==0)
+               return 0;
        spin_lock_irq(&conf->resync_lock);
        wait_event_lock_irq(conf->wait_resume, !conf->barrier, conf->resync_lock, );
        conf->nr_pending++;
@@ -611,7 +613,7 @@ static int make_request(request_queue_t *q, struct bio * bio)
        rcu_read_unlock();
 
        atomic_set(&r1_bio->remaining, 1);
-       md_write_start(mddev);
+
        for (i = 0; i < disks; i++) {
                struct bio *mbio;
                if (!r1_bio->bios[i])
index 9ae21504db8a4c6df6ab25c50666392898d14a27..bfc9f52f0ecf1fad6ab6f6718596de8ce9f83a17 100644 (file)
@@ -700,6 +700,9 @@ static int make_request(request_queue_t *q, struct bio * bio)
                return 0;
        }
 
+       if (md_write_start(mddev, bio) == 0)
+               return 0;
+
        /*
         * Register the new request and wait if the reconstruction
         * thread has put up a bar for new requests.
@@ -774,7 +777,7 @@ static int make_request(request_queue_t *q, struct bio * bio)
        rcu_read_unlock();
 
        atomic_set(&r10_bio->remaining, 1);
-       md_write_start(mddev);
+
        for (i = 0; i < conf->copies; i++) {
                struct bio *mbio;
                int d = r10_bio->devs[i].devnum;
index 63b1c59d36ff9f5c950e3c6a5e4e4135a9360bf2..677ce49078daf587123b5e04c2dca9013552f15e 100644 (file)
@@ -1411,6 +1411,9 @@ static int make_request (request_queue_t *q, struct bio * bi)
        sector_t logical_sector, last_sector;
        struct stripe_head *sh;
 
+       if (md_write_start(mddev, bi)==0)
+               return 0;
+
        if (bio_data_dir(bi)==WRITE) {
                disk_stat_inc(mddev->gendisk, writes);
                disk_stat_add(mddev->gendisk, write_sectors, bio_sectors(bi));
@@ -1423,8 +1426,7 @@ static int make_request (request_queue_t *q, struct bio * bi)
        last_sector = bi->bi_sector + (bi->bi_size>>9);
        bi->bi_next = NULL;
        bi->bi_phys_segments = 1;       /* over-loaded to count active stripes */
-       if ( bio_data_dir(bi) == WRITE )
-               md_write_start(mddev);
+
        for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
                DEFINE_WAIT(w);
                
index 9d0e0e42a3be205a5e6028ebac76f857f2bdaed3..fede16c4e8f36d287117f2c702f0494790561df3 100644 (file)
@@ -1570,6 +1570,9 @@ static int make_request (request_queue_t *q, struct bio * bi)
        sector_t logical_sector, last_sector;
        struct stripe_head *sh;
 
+       if (md_write_start(mddev, bi)==0)
+               return 0;
+
        if (bio_data_dir(bi)==WRITE) {
                disk_stat_inc(mddev->gendisk, writes);
                disk_stat_add(mddev->gendisk, write_sectors, bio_sectors(bi));
@@ -1583,8 +1586,7 @@ static int make_request (request_queue_t *q, struct bio * bi)
 
        bi->bi_next = NULL;
        bi->bi_phys_segments = 1;       /* over-loaded to count active stripes */
-       if ( bio_data_dir(bi) == WRITE )
-               md_write_start(mddev);
+
        for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
                DEFINE_WAIT(w);
 
index a6a67d102bfa67e1a49c4d875889f40bf2a167bf..cfde8f497d6d094c3662533466445edb15b473c3 100644 (file)
@@ -69,7 +69,7 @@ extern mdk_thread_t * md_register_thread (void (*run) (mddev_t *mddev),
 extern void md_unregister_thread (mdk_thread_t *thread);
 extern void md_wakeup_thread(mdk_thread_t *thread);
 extern void md_check_recovery(mddev_t *mddev);
-extern void md_write_start(mddev_t *mddev);
+extern int md_write_start(mddev_t *mddev, struct bio *bi);
 extern void md_write_end(mddev_t *mddev);
 extern void md_handle_safemode(mddev_t *mddev);
 extern void md_done_sync(mddev_t *mddev, int blocks, int ok);
index c9a0d4013be7c0273a17424b0820cdd16e945fc8..d92db54255a356bcd610ce074ea2e4a5e5747fac 100644 (file)
@@ -15,6 +15,9 @@
 #ifndef _MD_K_H
 #define _MD_K_H
 
+/* and dm-bio-list.h is not under include/linux because.... ??? */
+#include "../../../drivers/md/dm-bio-list.h"
+
 #define MD_RESERVED       0UL
 #define LINEAR            1UL
 #define RAID0             2UL
@@ -252,6 +255,10 @@ struct mddev_s
        atomic_t                        recovery_active; /* blocks scheduled, but not written */
        wait_queue_head_t               recovery_wait;
        sector_t                        recovery_cp;
+
+       spinlock_t                      write_lock;
+       struct bio_list                 write_list;
+
        unsigned int                    safemode;       /* if set, update "clean" superblock
                                                         * when no writes pending.
                                                         */