From a8c34f915976e3de044cc31b8bcb46f816f5a52e Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 2 Sep 2015 13:49:46 -0700 Subject: [PATCH] raid5-cache: switching to state machine for log disk cache flush Before we write stripe data to the raid disks, we must guarantee that the stripe data is settled on the log disk. To do this, we flush the log disk cache and wait for the flush to finish. That wait introduces sleep time in the raid5d thread and impacts performance. This patch moves the log disk cache flush process to the stripe handling state machine, which removes the wait in raid5d. Signed-off-by: Shaohua Li Signed-off-by: NeilBrown --- drivers/md/raid5-cache.c | 116 ++++++++++++++++++++++----------------- drivers/md/raid5.c | 7 ++- 2 files changed, 71 insertions(+), 52 deletions(-) diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 270ee3aaba23..41542ebd813b 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -61,6 +61,10 @@ struct r5l_log { struct list_head io_end_ios; /* io_units which have been completely * written to the log but not yet written * to the RAID */ + struct list_head flushing_ios; /* io_units which are waiting for log + * cache flush */ + struct list_head flushed_ios; /* io_units which settle down in log disk */ + struct bio flush_bio; struct list_head stripe_end_ios;/* io_units which have been completely * written to the RAID but have not yet * been considered for updating super */ @@ -114,8 +118,7 @@ enum r5l_io_unit_state { IO_UNIT_IO_START = 1, /* io_unit bio start writing to log, * don't accepting new bio */ IO_UNIT_IO_END = 2, /* io_unit bio finish writing to log */ - IO_UNIT_STRIPE_START = 3, /* stripes of io_unit are flushing to raid */ - IO_UNIT_STRIPE_END = 4, /* stripes data finished writing to raid */ + IO_UNIT_STRIPE_END = 3, /* stripes data finished writing to raid */ }; static sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc) @@ -229,7 +232,7 @@ static void __r5l_set_io_unit_state(struct
r5l_io_unit *io, struct r5l_io_unit *last; sector_t reclaimable_space; - r5l_move_io_unit_list(&log->io_end_ios, &log->stripe_end_ios, + r5l_move_io_unit_list(&log->flushed_ios, &log->stripe_end_ios, IO_UNIT_STRIPE_END); last = list_last_entry(&log->stripe_end_ios, @@ -559,6 +562,28 @@ void r5l_stripe_write_finished(struct stripe_head *sh) r5l_set_io_unit_state(io, IO_UNIT_STRIPE_END); } +static void r5l_log_flush_endio(struct bio *bio) +{ + struct r5l_log *log = container_of(bio, struct r5l_log, + flush_bio); + unsigned long flags; + struct r5l_io_unit *io; + struct stripe_head *sh; + + spin_lock_irqsave(&log->io_list_lock, flags); + list_for_each_entry(io, &log->flushing_ios, log_sibling) { + while (!list_empty(&io->stripe_list)) { + sh = list_first_entry(&io->stripe_list, + struct stripe_head, log_list); + list_del_init(&sh->log_list); + set_bit(STRIPE_HANDLE, &sh->state); + raid5_release_stripe(sh); + } + } + list_splice_tail_init(&log->flushing_ios, &log->flushed_ios); + spin_unlock_irqrestore(&log->io_list_lock, flags); +} + /* * Starting dispatch IO to raid. * io_unit(meta) consists of a log. There is one situation we want to avoid. 
A @@ -575,44 +600,31 @@ void r5l_stripe_write_finished(struct stripe_head *sh) */ void r5l_flush_stripe_to_raid(struct r5l_log *log) { - struct r5l_io_unit *io; - struct stripe_head *sh; - bool run_stripe; - + bool do_flush; if (!log) return; - spin_lock_irq(&log->io_list_lock); - run_stripe = !list_empty(&log->io_end_ios); - spin_unlock_irq(&log->io_list_lock); - - if (!run_stripe) - return; - - blkdev_issue_flush(log->rdev->bdev, GFP_NOIO, NULL); spin_lock_irq(&log->io_list_lock); - list_for_each_entry(io, &log->io_end_ios, log_sibling) { - if (io->state >= IO_UNIT_STRIPE_START) - continue; - __r5l_set_io_unit_state(io, IO_UNIT_STRIPE_START); - - while (!list_empty(&io->stripe_list)) { - sh = list_first_entry(&io->stripe_list, - struct stripe_head, log_list); - list_del_init(&sh->log_list); - set_bit(STRIPE_HANDLE, &sh->state); - raid5_release_stripe(sh); - } + /* flush bio is running */ + if (!list_empty(&log->flushing_ios)) { + spin_unlock_irq(&log->io_list_lock); + return; } + list_splice_tail_init(&log->io_end_ios, &log->flushing_ios); + do_flush = !list_empty(&log->flushing_ios); spin_unlock_irq(&log->io_list_lock); + + if (!do_flush) + return; + bio_reset(&log->flush_bio); + log->flush_bio.bi_bdev = log->rdev->bdev; + log->flush_bio.bi_end_io = r5l_log_flush_endio; + submit_bio(WRITE_FLUSH, &log->flush_bio); } static void r5l_kick_io_unit(struct r5l_log *log, struct r5l_io_unit *io) { - /* the log thread will write the io unit */ - wait_event(io->wait_state, io->state >= IO_UNIT_IO_END); - if (io->state < IO_UNIT_STRIPE_START) - r5l_flush_stripe_to_raid(log); + md_wakeup_thread(log->rdev->mddev->thread); wait_event(io->wait_state, io->state >= IO_UNIT_STRIPE_END); } @@ -631,6 +643,8 @@ static void r5l_do_reclaim(struct r5l_log *log) * shouldn't reuse space of an unreclaimable io_unit */ while (1) { + struct list_head *target_list = NULL; + while (!list_empty(&log->stripe_end_ios)) { io = list_first_entry(&log->stripe_end_ios, struct r5l_io_unit, 
log_sibling); @@ -642,29 +656,26 @@ static void r5l_do_reclaim(struct r5l_log *log) if (free >= reclaim_target || (list_empty(&log->running_ios) && list_empty(&log->io_end_ios) && - list_empty(&log->stripe_end_ios))) + list_empty(&log->flushing_ios) && + list_empty(&log->flushed_ios))) break; /* Below waiting mostly happens when we shutdown the raid */ - if (!list_empty(&log->io_end_ios)) { - io = list_first_entry(&log->io_end_ios, - struct r5l_io_unit, log_sibling); - spin_unlock_irq(&log->io_list_lock); - /* nobody else can delete the io, we are safe */ - r5l_kick_io_unit(log, io); - spin_lock_irq(&log->io_list_lock); - continue; - } - - if (!list_empty(&log->running_ios)) { - io = list_first_entry(&log->running_ios, - struct r5l_io_unit, log_sibling); - spin_unlock_irq(&log->io_list_lock); - /* nobody else can delete the io, we are safe */ - r5l_kick_io_unit(log, io); - spin_lock_irq(&log->io_list_lock); - continue; - } + if (!list_empty(&log->flushed_ios)) + target_list = &log->flushed_ios; + else if (!list_empty(&log->flushing_ios)) + target_list = &log->flushing_ios; + else if (!list_empty(&log->io_end_ios)) + target_list = &log->io_end_ios; + else if (!list_empty(&log->running_ios)) + target_list = &log->running_ios; + + io = list_first_entry(target_list, + struct r5l_io_unit, log_sibling); + spin_unlock_irq(&log->io_list_lock); + /* nobody else can delete the io, we are safe */ + r5l_kick_io_unit(log, io); + spin_lock_irq(&log->io_list_lock); } spin_unlock_irq(&log->io_list_lock); @@ -1056,6 +1067,9 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) INIT_LIST_HEAD(&log->running_ios); INIT_LIST_HEAD(&log->io_end_ios); INIT_LIST_HEAD(&log->stripe_end_ios); + INIT_LIST_HEAD(&log->flushing_ios); + INIT_LIST_HEAD(&log->flushed_ios); + bio_init(&log->flush_bio); log->io_kc = KMEM_CACHE(r5l_io_unit, 0); if (!log->io_kc) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index b0bf81d084fd..46042c7c25a5 100644 --- a/drivers/md/raid5.c +++ 
b/drivers/md/raid5.c @@ -5740,8 +5740,12 @@ static int handle_active_stripes(struct r5conf *conf, int group, for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) if (!list_empty(temp_inactive_list + i)) break; - if (i == NR_STRIPE_HASH_LOCKS) + if (i == NR_STRIPE_HASH_LOCKS) { + spin_unlock_irq(&conf->device_lock); + r5l_flush_stripe_to_raid(conf->log); + spin_lock_irq(&conf->device_lock); return batch_size; + } release_inactive = true; } spin_unlock_irq(&conf->device_lock); @@ -5749,6 +5753,7 @@ static int handle_active_stripes(struct r5conf *conf, int group, release_inactive_stripe_list(conf, temp_inactive_list, NR_STRIPE_HASH_LOCKS); + r5l_flush_stripe_to_raid(conf->log); if (release_inactive) { spin_lock_irq(&conf->device_lock); return 0; -- 2.20.1