md/r5cache: handle alloc_page failure
author Song Liu <songliubraving@fb.com>
Thu, 24 Nov 2016 06:50:39 +0000 (22:50 -0800)
committer Shaohua Li <shli@fb.com>
Mon, 28 Nov 2016 05:35:38 +0000 (21:35 -0800)
RMW of r5c write back cache uses an extra page to store old data for
prexor. handle_stripe_dirtying() allocates this page by calling
alloc_page(). However, alloc_page() may fail.

To handle alloc_page() failures, this patch adds an extra page to
disk_info. When alloc_page() fails, handle_stripe() tries to use these
pages. When these pages are used by another stripe (R5C_EXTRA_PAGE_IN_USE),
the stripe is added to delayed_list.

Signed-off-by: Song Liu <songliubraving@fb.com>
Reviewed-by: NeilBrown <neilb@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>
drivers/md/raid5-cache.c
drivers/md/raid5.c
drivers/md/raid5.h

index 5f817bdaceb96df624a996d283c09882d4b38c50..5d3d238921e8c3cc3debc4ae925fd78fef80ec37 100644 (file)
@@ -2326,15 +2326,40 @@ int r5c_try_caching_write(struct r5conf *conf,
  */
 void r5c_release_extra_page(struct stripe_head *sh)
 {
+       struct r5conf *conf = sh->raid_conf;
        int i;
+       bool using_disk_info_extra_page;
+
+       using_disk_info_extra_page =
+               sh->dev[0].orig_page == conf->disks[0].extra_page;
 
        for (i = sh->disks; i--; )
                if (sh->dev[i].page != sh->dev[i].orig_page) {
                        struct page *p = sh->dev[i].orig_page;
 
                        sh->dev[i].orig_page = sh->dev[i].page;
-                       put_page(p);
+                       if (!using_disk_info_extra_page)
+                               put_page(p);
                }
+
+       if (using_disk_info_extra_page) {
+               clear_bit(R5C_EXTRA_PAGE_IN_USE, &conf->cache_state);
+               md_wakeup_thread(conf->mddev->thread);
+       }
+}
+
+void r5c_use_extra_page(struct stripe_head *sh)
+{
+       struct r5conf *conf = sh->raid_conf;
+       int i;
+       struct r5dev *dev;
+
+       for (i = sh->disks; i--; ) {
+               dev = &sh->dev[i];
+               if (dev->orig_page != dev->page)
+                       put_page(dev->orig_page);
+               dev->orig_page = conf->disks[i].extra_page;
+       }
 }
 
 /*
index dbab8c7eccb00e2b710c59cac105d699c95e3834..db909b9e37dff4056edf21d059273875129c8400 100644 (file)
@@ -876,6 +876,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 
        if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
                /* writing out phase */
+               if (s->waiting_extra_page)
+                       return;
                if (r5l_write_stripe(conf->log, sh) == 0)
                        return;
        } else {  /* caching phase */
@@ -2007,6 +2009,7 @@ static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp,
                INIT_LIST_HEAD(&sh->batch_list);
                INIT_LIST_HEAD(&sh->lru);
                INIT_LIST_HEAD(&sh->r5c);
+               INIT_LIST_HEAD(&sh->log_list);
                atomic_set(&sh->count, 1);
                sh->log_start = MaxSector;
                for (i = 0; i < disks; i++) {
@@ -2253,10 +2256,24 @@ static int resize_stripes(struct r5conf *conf, int newsize)
         */
        ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO);
        if (ndisks) {
-               for (i=0; i<conf->raid_disks; i++)
+               for (i = 0; i < conf->pool_size; i++)
                        ndisks[i] = conf->disks[i];
-               kfree(conf->disks);
-               conf->disks = ndisks;
+
+               for (i = conf->pool_size; i < newsize; i++) {
+                       ndisks[i].extra_page = alloc_page(GFP_NOIO);
+                       if (!ndisks[i].extra_page)
+                               err = -ENOMEM;
+               }
+
+               if (err) {
+                       for (i = conf->pool_size; i < newsize; i++)
+                               if (ndisks[i].extra_page)
+                                       put_page(ndisks[i].extra_page);
+                       kfree(ndisks);
+               } else {
+                       kfree(conf->disks);
+                       conf->disks = ndisks;
+               }
        } else
                err = -ENOMEM;
 
@@ -3580,10 +3597,10 @@ unhash:
                break_stripe_batch_list(head_sh, STRIPE_EXPAND_SYNC_FLAGS);
 }
 
-static void handle_stripe_dirtying(struct r5conf *conf,
-                                  struct stripe_head *sh,
-                                  struct stripe_head_state *s,
-                                  int disks)
+static int handle_stripe_dirtying(struct r5conf *conf,
+                                 struct stripe_head *sh,
+                                 struct stripe_head_state *s,
+                                 int disks)
 {
        int rmw = 0, rcw = 0, i;
        sector_t recovery_cp = conf->mddev->recovery_cp;
@@ -3649,12 +3666,32 @@ static void handle_stripe_dirtying(struct r5conf *conf,
                            dev->page == dev->orig_page &&
                            !test_bit(R5_LOCKED, &sh->dev[sh->pd_idx].flags)) {
                                /* alloc page for prexor */
-                               dev->orig_page = alloc_page(GFP_NOIO);
+                               struct page *p = alloc_page(GFP_NOIO);
+
+                               if (p) {
+                                       dev->orig_page = p;
+                                       continue;
+                               }
 
-                               /* will handle failure in a later patch*/
-                               BUG_ON(!dev->orig_page);
+                               /*
+                                * alloc_page() failed, try use
+                                * disk_info->extra_page
+                                */
+                               if (!test_and_set_bit(R5C_EXTRA_PAGE_IN_USE,
+                                                     &conf->cache_state)) {
+                                       r5c_use_extra_page(sh);
+                                       break;
+                               }
+
+                               /* extra_page in use, add to delayed_list */
+                               set_bit(STRIPE_DELAYED, &sh->state);
+                               s->waiting_extra_page = 1;
+                               return -EAGAIN;
                        }
+               }
 
+               for (i = disks; i--; ) {
+                       struct r5dev *dev = &sh->dev[i];
                        if ((dev->towrite ||
                             i == sh->pd_idx || i == sh->qd_idx ||
                             test_bit(R5_InJournal, &dev->flags)) &&
@@ -3730,6 +3767,7 @@ static void handle_stripe_dirtying(struct r5conf *conf,
            (s->locked == 0 && (rcw == 0 || rmw == 0) &&
             !test_bit(STRIPE_BIT_DELAY, &sh->state)))
                schedule_reconstruction(sh, s, rcw == 0, 0);
+       return 0;
 }
 
 static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh,
@@ -4545,8 +4583,12 @@ static void handle_stripe(struct stripe_head *sh)
                        if (ret == -EAGAIN ||
                            /* stripe under reclaim: !caching && injournal */
                            (!test_bit(STRIPE_R5C_CACHING, &sh->state) &&
-                            s.injournal > 0))
-                               handle_stripe_dirtying(conf, sh, &s, disks);
+                            s.injournal > 0)) {
+                               ret = handle_stripe_dirtying(conf, sh, &s,
+                                                            disks);
+                               if (ret == -EAGAIN)
+                                       goto finish;
+                       }
                }
        }
 
@@ -6458,6 +6500,8 @@ static void raid5_free_percpu(struct r5conf *conf)
 
 static void free_conf(struct r5conf *conf)
 {
+       int i;
+
        if (conf->log)
                r5l_exit_log(conf->log);
        if (conf->shrinker.nr_deferred)
@@ -6466,6 +6510,9 @@ static void free_conf(struct r5conf *conf)
        free_thread_groups(conf);
        shrink_stripes(conf);
        raid5_free_percpu(conf);
+       for (i = 0; i < conf->pool_size; i++)
+               if (conf->disks[i].extra_page)
+                       put_page(conf->disks[i].extra_page);
        kfree(conf->disks);
        kfree(conf->stripe_hashtbl);
        kfree(conf);
@@ -6612,9 +6659,16 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 
        conf->disks = kzalloc(max_disks * sizeof(struct disk_info),
                              GFP_KERNEL);
+
        if (!conf->disks)
                goto abort;
 
+       for (i = 0; i < max_disks; i++) {
+               conf->disks[i].extra_page = alloc_page(GFP_KERNEL);
+               if (!conf->disks[i].extra_page)
+                       goto abort;
+       }
+
        conf->mddev = mddev;
 
        if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
index d13fe45d696095852e92bfa421bc9cf1f35bbda0..ed8e1362ab3698e6608aceee90614501bb1a69b2 100644 (file)
@@ -276,6 +276,7 @@ struct stripe_head_state {
        struct md_rdev *blocked_rdev;
        int handle_bad_blocks;
        int log_failed;
+       int waiting_extra_page;
 };
 
 /* Flags for struct r5dev.flags */
@@ -439,6 +440,7 @@ enum {
 
 struct disk_info {
        struct md_rdev  *rdev, *replacement;
+       struct page     *extra_page; /* extra page to use in prexor */
 };
 
 /*
@@ -559,6 +561,9 @@ enum r5_cache_state {
                                 * only process stripes that are already
                                 * occupying the log
                                 */
+       R5C_EXTRA_PAGE_IN_USE,  /* a stripe is using disk_info.extra_page
+                                * for prexor
+                                */
 };
 
 struct r5conf {
@@ -765,6 +770,7 @@ extern void
 r5c_finish_stripe_write_out(struct r5conf *conf, struct stripe_head *sh,
                            struct stripe_head_state *s);
 extern void r5c_release_extra_page(struct stripe_head *sh);
+extern void r5c_use_extra_page(struct stripe_head *sh);
 extern void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
 extern void r5c_handle_cached_data_endio(struct r5conf *conf,
        struct stripe_head *sh, int disks, struct bio_list *return_bi);