[PATCH] md: fix deadlock due to md thread processing delayed requests.
author: NeilBrown <neilb@cse.unsw.edu.au>
Wed, 22 Jun 2005 00:17:26 +0000 (17:17 -0700)
committer: Linus Torvalds <torvalds@ppc970.osdl.org>
Wed, 22 Jun 2005 02:07:46 +0000 (19:07 -0700)
Before completing a 'write' the md superblock might need to be updated.
This is best done by the md_thread.

The current code schedules this up and queues the write request for later
handling by the md_thread.

However, some personalities (raid5/raid6) will deadlock if the md_thread
tries to submit requests to its own array.

So this patch changes things so that the process submitting the request waits
for the superblock to be written, and then submits the request itself.

This fixes a recently-created deadlock in raid5/raid6.

Signed-off-by: Neil Brown <neilb@cse.unsw.edu.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
drivers/md/md.c
drivers/md/raid1.c
drivers/md/raid10.c
drivers/md/raid5.c
drivers/md/raid6main.c
include/linux/raid/md.h
include/linux/raid/md_k.h

index 789b114f860aed3e8d9c172020b4327f8176a140..7075bebb7f37015ce09ecc37d7aabd557ebfe82b 100644 (file)
@@ -224,8 +224,8 @@ static mddev_t * mddev_find(dev_t unit)
        INIT_LIST_HEAD(&new->all_mddevs);
        init_timer(&new->safemode_timer);
        atomic_set(&new->active, 1);
-       bio_list_init(&new->write_list);
        spin_lock_init(&new->write_lock);
+       init_waitqueue_head(&new->sb_wait);
 
        new->queue = blk_alloc_queue(GFP_KERNEL);
        if (!new->queue) {
@@ -1307,6 +1307,7 @@ repeat:
        if (!mddev->persistent) {
                mddev->sb_dirty = 0;
                spin_unlock(&mddev->write_lock);
+               wake_up(&mddev->sb_wait);
                return;
        }
        spin_unlock(&mddev->write_lock);
@@ -1348,6 +1349,7 @@ repeat:
        }
        mddev->sb_dirty = 0;
        spin_unlock(&mddev->write_lock);
+       wake_up(&mddev->sb_wait);
 
 }
 
@@ -3368,29 +3370,26 @@ void md_done_sync(mddev_t *mddev, int blocks, int ok)
 
 /* md_write_start(mddev, bi)
  * If we need to update some array metadata (e.g. 'active' flag
- * in superblock) before writing, queue bi for later writing
- * and return 0, else return 1 and it will be written now
+ * in superblock) before writing, schedule a superblock update
+ * and wait for it to complete.
  */
-int md_write_start(mddev_t *mddev, struct bio *bi)
+void md_write_start(mddev_t *mddev, struct bio *bi)
 {
+       DEFINE_WAIT(w);
        if (bio_data_dir(bi) != WRITE)
-               return 1;
+               return;
 
        atomic_inc(&mddev->writes_pending);
-       spin_lock(&mddev->write_lock);
-       if (mddev->in_sync == 0 && mddev->sb_dirty == 0) {
-               spin_unlock(&mddev->write_lock);
-               return 1;
-       }
-       bio_list_add(&mddev->write_list, bi);
-
        if (mddev->in_sync) {
-               mddev->in_sync = 0;
-               mddev->sb_dirty = 1;
+               spin_lock(&mddev->write_lock);
+               if (mddev->in_sync) {
+                       mddev->in_sync = 0;
+                       mddev->sb_dirty = 1;
+                       md_wakeup_thread(mddev->thread);
+               }
+               spin_unlock(&mddev->write_lock);
        }
-       spin_unlock(&mddev->write_lock);
-       md_wakeup_thread(mddev->thread);
-       return 0;
+       wait_event(mddev->sb_wait, mddev->sb_dirty==0);
 }
 
 void md_write_end(mddev_t *mddev)
@@ -3685,7 +3684,6 @@ void md_check_recovery(mddev_t *mddev)
                mddev->sb_dirty ||
                test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
                test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
-               mddev->write_list.head ||
                (mddev->safemode == 1) ||
                (mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending)
                 && !mddev->in_sync && mddev->recovery_cp == MaxSector)
@@ -3694,7 +3692,6 @@ void md_check_recovery(mddev_t *mddev)
 
        if (mddev_trylock(mddev)==0) {
                int spares =0;
-               struct bio *blist;
 
                spin_lock(&mddev->write_lock);
                if (mddev->safemode && !atomic_read(&mddev->writes_pending) &&
@@ -3704,21 +3701,11 @@ void md_check_recovery(mddev_t *mddev)
                }
                if (mddev->safemode == 1)
                        mddev->safemode = 0;
-               blist = bio_list_get(&mddev->write_list);
                spin_unlock(&mddev->write_lock);
 
                if (mddev->sb_dirty)
                        md_update_sb(mddev);
 
-               while (blist) {
-                       struct bio *b = blist;
-                       blist = blist->bi_next;
-                       b->bi_next = NULL;
-                       generic_make_request(b);
-                       /* we already counted this, so need to un-count */
-                       md_write_end(mddev);
-               }
-
 
                if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
                    !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
index 3f5234fe359330237e6998d1a0d49faaf99c7db3..98b09773e79ec2a1e636f5ee419f0d39cb9cae00 100644 (file)
@@ -561,8 +561,8 @@ static int make_request(request_queue_t *q, struct bio * bio)
         * thread has put up a bar for new requests.
         * Continue immediately if no resync is active currently.
         */
-       if (md_write_start(mddev, bio)==0)
-               return 0;
+       md_write_start(mddev, bio); /* wait on superblock update early */
+
        spin_lock_irq(&conf->resync_lock);
        wait_event_lock_irq(conf->wait_resume, !conf->barrier, conf->resync_lock, );
        conf->nr_pending++;
index 8476515bfdc79e293ee079a6fea11e15884dcc39..fd7324a86d1344e86952759f23a0c874e5bd37fc 100644 (file)
@@ -700,8 +700,7 @@ static int make_request(request_queue_t *q, struct bio * bio)
                return 0;
        }
 
-       if (md_write_start(mddev, bio) == 0)
-               return 0;
+       md_write_start(mddev, bio);
 
        /*
         * Register the new request and wait if the reconstruction
index 1ce3f5aaa984b57f4b20f28fe809ec2e1b23bd75..93a9726cc2d6793ed237bfa8d3f2a955da501667 100644 (file)
@@ -1411,8 +1411,7 @@ static int make_request (request_queue_t *q, struct bio * bi)
        sector_t logical_sector, last_sector;
        struct stripe_head *sh;
 
-       if (md_write_start(mddev, bi)==0)
-               return 0;
+       md_write_start(mddev, bi);
 
        if (bio_data_dir(bi)==WRITE) {
                disk_stat_inc(mddev->gendisk, writes);
index d9c385496dc5f2fcf572f99fabdb834a3e9a44dd..f62ea1a73d0d9d1b87f5f10d7b91a6f6d7f48b0d 100644 (file)
@@ -1570,8 +1570,7 @@ static int make_request (request_queue_t *q, struct bio * bi)
        sector_t logical_sector, last_sector;
        struct stripe_head *sh;
 
-       if (md_write_start(mddev, bi)==0)
-               return 0;
+       md_write_start(mddev, bi);
 
        if (bio_data_dir(bi)==WRITE) {
                disk_stat_inc(mddev->gendisk, writes);
index cfde8f497d6d094c3662533466445edb15b473c3..75f41d8faed2c5bafb9dd347b535e22356d00581 100644 (file)
@@ -69,7 +69,7 @@ extern mdk_thread_t * md_register_thread (void (*run) (mddev_t *mddev),
 extern void md_unregister_thread (mdk_thread_t *thread);
 extern void md_wakeup_thread(mdk_thread_t *thread);
 extern void md_check_recovery(mddev_t *mddev);
-extern int md_write_start(mddev_t *mddev, struct bio *bi);
+extern void md_write_start(mddev_t *mddev, struct bio *bi);
 extern void md_write_end(mddev_t *mddev);
 extern void md_handle_safemode(mddev_t *mddev);
 extern void md_done_sync(mddev_t *mddev, int blocks, int ok);
index 6cdcb4434c6ca7759e74e51ab4acccbd6949679b..3e977025cf434a93b5044970c73cff149e2ffc7e 100644 (file)
@@ -261,7 +261,7 @@ struct mddev_s
        sector_t                        recovery_cp;
 
        spinlock_t                      write_lock;
-       struct bio_list                 write_list;
+       wait_queue_head_t               sb_wait;        /* for waiting on superblock updates */
 
        unsigned int                    safemode;       /* if set, update "clean" superblock
                                                         * when no writes pending.