[PATCH] md: Allow stripes to be expanded in preparation for expanding an array
author NeilBrown <neilb@suse.de>
Mon, 27 Mar 2006 09:18:07 +0000 (01:18 -0800)
committer Linus Torvalds <torvalds@g5.osdl.org>
Mon, 27 Mar 2006 16:45:01 +0000 (08:45 -0800)
Before a RAID-5 can be expanded, we need to be able to expand the stripe-cache
data structure.

This requires allocating new stripes in a new kmem_cache.  If this succeeds,
we copy cache pages over and release the old stripes and kmem_cache.

We then allocate new pages.  If that fails, we leave the stripe cache at its
new size.  It isn't worth the effort to shrink it back again.

Unfortunately this means we need two kmem_cache names because, for a short
period of time, we have two kmem_caches.  So they are raid5/%s and
raid5/%s-alt

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
drivers/md/md.c
drivers/md/raid5.c
drivers/md/raid6main.c
include/linux/raid/raid5.h

index a3ecaf8ed30a5ee6207acac842148de8503a290d..c7b7656f9aa5cd4e98ad9999b32d8ddb44199838 100644 (file)
@@ -2775,7 +2775,6 @@ static void autorun_array(mddev_t *mddev)
  */
 static void autorun_devices(int part)
 {
-       struct list_head candidates;
        struct list_head *tmp;
        mdk_rdev_t *rdev0, *rdev;
        mddev_t *mddev;
@@ -2784,6 +2783,7 @@ static void autorun_devices(int part)
        printk(KERN_INFO "md: autorun ...\n");
        while (!list_empty(&pending_raid_disks)) {
                dev_t dev;
+               LIST_HEAD(candidates);
                rdev0 = list_entry(pending_raid_disks.next,
                                         mdk_rdev_t, same_set);
 
index 03f31379cebb0cb965ffce503467e1fae9234157..6c20b44509d875bb1fd5557d4b5670c829ac52a3 100644 (file)
@@ -313,20 +313,143 @@ static int grow_stripes(raid5_conf_t *conf, int num)
        kmem_cache_t *sc;
        int devs = conf->raid_disks;
 
-       sprintf(conf->cache_name, "raid5/%s", mdname(conf->mddev));
-
-       sc = kmem_cache_create(conf->cache_name, 
+       sprintf(conf->cache_name[0], "raid5/%s", mdname(conf->mddev));
+       sprintf(conf->cache_name[1], "raid5/%s-alt", mdname(conf->mddev));
+       conf->active_name = 0;
+       sc = kmem_cache_create(conf->cache_name[conf->active_name],
                               sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
                               0, 0, NULL, NULL);
        if (!sc)
                return 1;
        conf->slab_cache = sc;
+       conf->pool_size = devs;
        while (num--) {
                if (!grow_one_stripe(conf))
                        return 1;
        }
        return 0;
 }
+static int resize_stripes(raid5_conf_t *conf, int newsize)
+{
+       /* Make all the stripes able to hold 'newsize' devices.
+        * New slots in each stripe get 'page' set to a new page.
+        *
+        * This happens in stages:
+        * 1/ create a new kmem_cache and allocate the required number of
+        *    stripe_heads.
+        * 2/ gather all the old stripe_heads and transfer the pages across
+        *    to the new stripe_heads.  This will have the side effect of
+        *    freezing the array as once all stripe_heads have been collected,
+        *    no IO will be possible.  Old stripe heads are freed once their
+        *    pages have been transferred over, and the old kmem_cache is
+        *    freed when all stripes are done.
+        * 3/ reallocate conf->disks to be suitably bigger.  If this fails,
+        *    we simply return a failure status - no need to clean anything up.
+        * 4/ allocate new pages for the new slots in the new stripe_heads.
+        *    If this fails, we don't bother trying to shrink the
+        *    stripe_heads down again, we just leave them as they are.
+        *    As each stripe_head is processed the new one is released into
+        *    active service.
+        *
+        * Once step2 is started, we cannot afford to wait for a write,
+        * so we use GFP_NOIO allocations.
+        */
+       struct stripe_head *osh, *nsh;
+       LIST_HEAD(newstripes);
+       struct disk_info *ndisks;
+       int err = 0;
+       kmem_cache_t *sc;
+       int i;
+
+       if (newsize <= conf->pool_size)
+               return 0; /* never bother to shrink */
+
+       /* Step 1 */
+       sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
+                              sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev),
+                              0, 0, NULL, NULL);
+       if (!sc)
+               return -ENOMEM;
+
+       for (i = conf->max_nr_stripes; i; i--) {
+               nsh = kmem_cache_alloc(sc, GFP_KERNEL);
+               if (!nsh)
+                       break;
+
+               memset(nsh, 0, sizeof(*nsh) + (newsize-1)*sizeof(struct r5dev));
+
+               nsh->raid_conf = conf;
+               spin_lock_init(&nsh->lock);
+
+               list_add(&nsh->lru, &newstripes);
+       }
+       if (i) {
+               /* didn't get enough, give up */
+               while (!list_empty(&newstripes)) {
+                       nsh = list_entry(newstripes.next, struct stripe_head, lru);
+                       list_del(&nsh->lru);
+                       kmem_cache_free(sc, nsh);
+               }
+               kmem_cache_destroy(sc);
+               return -ENOMEM;
+       }
+       /* Step 2 - Must use GFP_NOIO now.
+        * OK, we have enough stripes, start collecting inactive
+        * stripes and copying them over
+        */
+       list_for_each_entry(nsh, &newstripes, lru) {
+               spin_lock_irq(&conf->device_lock);
+               wait_event_lock_irq(conf->wait_for_stripe,
+                                   !list_empty(&conf->inactive_list),
+                                   conf->device_lock,
+                                   unplug_slaves(conf->mddev);
+                       );
+               osh = get_free_stripe(conf);
+               spin_unlock_irq(&conf->device_lock);
+               atomic_set(&nsh->count, 1);
+               for(i=0; i<conf->pool_size; i++)
+                       nsh->dev[i].page = osh->dev[i].page;
+               for( ; i<newsize; i++)
+                       nsh->dev[i].page = NULL;
+               kmem_cache_free(conf->slab_cache, osh);
+       }
+       kmem_cache_destroy(conf->slab_cache);
+
+       /* Step 3.
+        * At this point, we are holding all the stripes so the array
+        * is completely stalled, so now is a good time to resize
+        * conf->disks.
+        */
+       ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO);
+       if (ndisks) {
+               for (i=0; i<conf->raid_disks; i++)
+                       ndisks[i] = conf->disks[i];
+               kfree(conf->disks);
+               conf->disks = ndisks;
+       } else
+               err = -ENOMEM;
+
+       /* Step 4, return new stripes to service */
+       while(!list_empty(&newstripes)) {
+               nsh = list_entry(newstripes.next, struct stripe_head, lru);
+               list_del_init(&nsh->lru);
+               for (i=conf->raid_disks; i < newsize; i++)
+                       if (nsh->dev[i].page == NULL) {
+                               struct page *p = alloc_page(GFP_NOIO);
+                               nsh->dev[i].page = p;
+                               if (!p)
+                                       err = -ENOMEM;
+                       }
+               release_stripe(nsh);
+       }
+       /* critical section passed, GFP_NOIO no longer needed */
+
+       conf->slab_cache = sc;
+       conf->active_name = 1-conf->active_name;
+       conf->pool_size = newsize;
+       return err;
+}
+
 
 static int drop_one_stripe(raid5_conf_t *conf)
 {
@@ -339,7 +462,7 @@ static int drop_one_stripe(raid5_conf_t *conf)
                return 0;
        if (atomic_read(&sh->count))
                BUG();
-       shrink_buffers(sh, conf->raid_disks);
+       shrink_buffers(sh, conf->pool_size);
        kmem_cache_free(conf->slab_cache, sh);
        atomic_dec(&conf->active_stripes);
        return 1;
index c7632f6cc48718ceeea0f17d65325f4ca241fb73..6df4930fddecae952d7e5e07c165a97877d12825 100644 (file)
@@ -331,9 +331,9 @@ static int grow_stripes(raid6_conf_t *conf, int num)
        kmem_cache_t *sc;
        int devs = conf->raid_disks;
 
-       sprintf(conf->cache_name, "raid6/%s", mdname(conf->mddev));
+       sprintf(conf->cache_name[0], "raid6/%s", mdname(conf->mddev));
 
-       sc = kmem_cache_create(conf->cache_name,
+       sc = kmem_cache_create(conf->cache_name[0],
                               sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
                               0, 0, NULL, NULL);
        if (!sc)
index 94dbdd406f1210e75cd4544f276a81d2490c1c63..b7b2653af7bb06e898a728266810b7c216ee836b 100644 (file)
@@ -216,7 +216,11 @@ struct raid5_private_data {
        struct list_head        bitmap_list; /* stripes delaying awaiting bitmap update */
        atomic_t                preread_active_stripes; /* stripes with scheduled io */
 
-       char                    cache_name[20];
+       /* unfortunately we need two cache names as we temporarily have
+        * two caches.
+        */
+       int                     active_name;
+       char                    cache_name[2][20];
        kmem_cache_t            *slab_cache; /* for allocating stripes */
 
        int                     seq_flush, seq_write;
@@ -238,7 +242,8 @@ struct raid5_private_data {
        wait_queue_head_t       wait_for_overlap;
        int                     inactive_blocked;       /* release of inactive stripes blocked,
                                                         * waiting for 25% to be free
-                                                        */        
+                                                        */
+       int                     pool_size; /* number of disks in stripeheads in pool */
        spinlock_t              device_lock;
        struct disk_info        *disks;
 };