From 824e47daddbfc6ebe1006b8659f080620472a136 Mon Sep 17 00:00:00 2001 From: "colyli@suse.de" Date: Sat, 18 Feb 2017 03:05:57 +0800 Subject: [PATCH] RAID1: avoid unnecessary spin locks in I/O barrier code When I run a parallel reading performan testing on a md raid1 device with two NVMe SSDs, I observe very bad throughput in supprise: by fio with 64KB block size, 40 seq read I/O jobs, 128 iodepth, overall throughput is only 2.7GB/s, this is around 50% of the idea performance number. The perf reports locking contention happens at allow_barrier() and wait_barrier() code, - 41.41% fio [kernel.kallsyms] [k] _raw_spin_lock_irqsave - _raw_spin_lock_irqsave + 89.92% allow_barrier + 9.34% __wake_up - 37.30% fio [kernel.kallsyms] [k] _raw_spin_lock_irq - _raw_spin_lock_irq - 100.00% wait_barrier The reason is, in these I/O barrier related functions, - raise_barrier() - lower_barrier() - wait_barrier() - allow_barrier() They always hold conf->resync_lock firstly, even there are only regular reading I/Os and no resync I/O at all. This is a huge performance penalty. The solution is a lockless-like algorithm in I/O barrier code, and only holding conf->resync_lock when it has to. The original idea is from Hannes Reinecke, and Neil Brown provides comments to improve it. I continue to work on it, and make the patch into current form. In the new simpler raid1 I/O barrier implementation, there are two wait barrier functions, - wait_barrier() Which calls _wait_barrier(), is used for regular write I/O. If there is resync I/O happening on the same I/O barrier bucket, or the whole array is frozen, task will wait until no barrier on same barrier bucket, or the whold array is unfreezed. - wait_read_barrier() Since regular read I/O won't interfere with resync I/O (read_balance() will make sure only uptodate data will be read out), it is unnecessary to wait for barrier in regular read I/Os, waiting in only necessary when the whole array is frozen. The operations on conf->nr_pending[idx], conf->nr_waiting[idx], conf-> barrier[idx] are very carefully designed in raise_barrier(), lower_barrier(), _wait_barrier() and wait_read_barrier(), in order to avoid unnecessary spin locks in these functions. Once conf-> nr_pengding[idx] is increased, a resync I/O with same barrier bucket index has to wait in raise_barrier(). Then in _wait_barrier() if no barrier raised in same barrier bucket index and array is not frozen, the regular I/O doesn't need to hold conf->resync_lock, it can just increase conf->nr_pending[idx], and return to its caller. wait_read_barrier() is very similar to _wait_barrier(), the only difference is it only waits when array is frozen. For heavy parallel reading I/Os, the lockless I/O barrier code almostly gets rid of all spin lock cost. This patch significantly improves raid1 reading peroformance. From my testing, a raid1 device built by two NVMe SSD, runs fio with 64KB blocksize, 40 seq read I/O jobs, 128 iodepth, overall throughput increases from 2.7GB/s to 4.6GB/s (+70%). Changelog V4: - Change conf->nr_queued[] to atomic_t. - Define BARRIER_BUCKETS_NR_BITS by (PAGE_SHIFT - ilog2(sizeof(atomic_t))) V3: - Add smp_mb__after_atomic() as Shaohua and Neil suggested. - Change conf->nr_queued[] from atomic_t to int. - Change conf->array_frozen from atomic_t back to int, and use READ_ONCE(conf->array_frozen) to check value of conf->array_frozen in _wait_barrier() and wait_read_barrier(). - In _wait_barrier() and wait_read_barrier(), add a call to wake_up(&conf->wait_barrier) after atomic_dec(&conf->nr_pending[idx]), to fix a deadlock between _wait_barrier()/wait_read_barrier and freeze_array(). V2: - Remove a spin_lock/unlock pair in raid1d(). - Add more code comments to explain why there is no racy when checking two atomic_t variables at same time. V1: - Original RFC patch for comments. Signed-off-by: Coly Li Cc: Shaohua Li Cc: Hannes Reinecke Cc: Johannes Thumshirn Cc: Guoqing Jiang Reviewed-by: Neil Brown Signed-off-by: Shaohua Li --- drivers/md/raid1.c | 165 +++++++++++++++++++++++++++++++-------------- drivers/md/raid1.h | 31 ++++----- 2 files changed, 130 insertions(+), 66 deletions(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 40297fd17f7e..fefbbfdb440b 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -226,7 +226,7 @@ static void reschedule_retry(struct r1bio *r1_bio) idx = sector_to_idx(r1_bio->sector); spin_lock_irqsave(&conf->device_lock, flags); list_add(&r1_bio->retry_list, &conf->retry_list); - conf->nr_queued[idx]++; + atomic_inc(&conf->nr_queued[idx]); spin_unlock_irqrestore(&conf->device_lock, flags); wake_up(&conf->wait_barrier); @@ -836,11 +836,21 @@ static void raise_barrier(struct r1conf *conf, sector_t sector_nr) spin_lock_irq(&conf->resync_lock); /* Wait until no block IO is waiting */ - wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting[idx], + wait_event_lock_irq(conf->wait_barrier, + !atomic_read(&conf->nr_waiting[idx]), conf->resync_lock); /* block any new IO from starting */ - conf->barrier[idx]++; + atomic_inc(&conf->barrier[idx]); + /* + * In raise_barrier() we firstly increase conf->barrier[idx] then + * check conf->nr_pending[idx]. In _wait_barrier() we firstly + * increase conf->nr_pending[idx] then check conf->barrier[idx]. + * A memory barrier here to make sure conf->nr_pending[idx] won't + * be fetched before conf->barrier[idx] is increased. Otherwise + * there will be a race between raise_barrier() and _wait_barrier(). + */ + smp_mb__after_atomic(); /* For these conditions we must wait: * A: while the array is in frozen state @@ -851,42 +861,81 @@ static void raise_barrier(struct r1conf *conf, sector_t sector_nr) */ wait_event_lock_irq(conf->wait_barrier, !conf->array_frozen && - !conf->nr_pending[idx] && - conf->barrier[idx] < RESYNC_DEPTH, + !atomic_read(&conf->nr_pending[idx]) && + atomic_read(&conf->barrier[idx]) < RESYNC_DEPTH, conf->resync_lock); - conf->nr_pending[idx]++; + atomic_inc(&conf->nr_pending[idx]); spin_unlock_irq(&conf->resync_lock); } static void lower_barrier(struct r1conf *conf, sector_t sector_nr) { - unsigned long flags; int idx = sector_to_idx(sector_nr); - BUG_ON(conf->barrier[idx] <= 0); + BUG_ON(atomic_read(&conf->barrier[idx]) <= 0); - spin_lock_irqsave(&conf->resync_lock, flags); - conf->barrier[idx]--; - conf->nr_pending[idx]--; - spin_unlock_irqrestore(&conf->resync_lock, flags); + atomic_dec(&conf->barrier[idx]); + atomic_dec(&conf->nr_pending[idx]); wake_up(&conf->wait_barrier); } static void _wait_barrier(struct r1conf *conf, int idx) { - spin_lock_irq(&conf->resync_lock); - if (conf->array_frozen || conf->barrier[idx]) { - conf->nr_waiting[idx]++; - /* Wait for the barrier to drop. */ - wait_event_lock_irq( - conf->wait_barrier, - !conf->array_frozen && !conf->barrier[idx], - conf->resync_lock); - conf->nr_waiting[idx]--; - } + /* + * We need to increase conf->nr_pending[idx] very early here, + * then raise_barrier() can be blocked when it waits for + * conf->nr_pending[idx] to be 0. Then we can avoid holding + * conf->resync_lock when there is no barrier raised in same + * barrier unit bucket. Also if the array is frozen, I/O + * should be blocked until array is unfrozen. + */ + atomic_inc(&conf->nr_pending[idx]); + /* + * In _wait_barrier() we firstly increase conf->nr_pending[idx], then + * check conf->barrier[idx]. In raise_barrier() we firstly increase + * conf->barrier[idx], then check conf->nr_pending[idx]. A memory + * barrier is necessary here to make sure conf->barrier[idx] won't be + * fetched before conf->nr_pending[idx] is increased. Otherwise there + * will be a race between _wait_barrier() and raise_barrier(). + */ + smp_mb__after_atomic(); + + /* + * Don't worry about checking two atomic_t variables at same time + * here. If during we check conf->barrier[idx], the array is + * frozen (conf->array_frozen is 1), and chonf->barrier[idx] is + * 0, it is safe to return and make the I/O continue. Because the + * array is frozen, all I/O returned here will eventually complete + * or be queued, no race will happen. See code comment in + * frozen_array(). + */ + if (!READ_ONCE(conf->array_frozen) && + !atomic_read(&conf->barrier[idx])) + return; - conf->nr_pending[idx]++; + /* + * After holding conf->resync_lock, conf->nr_pending[idx] + * should be decreased before waiting for barrier to drop. + * Otherwise, we may encounter a race condition because + * raise_barrer() might be waiting for conf->nr_pending[idx] + * to be 0 at same time. + */ + spin_lock_irq(&conf->resync_lock); + atomic_inc(&conf->nr_waiting[idx]); + atomic_dec(&conf->nr_pending[idx]); + /* + * In case freeze_array() is waiting for + * get_unqueued_pending() == extra + */ + wake_up(&conf->wait_barrier); + /* Wait for the barrier in same barrier unit bucket to drop. */ + wait_event_lock_irq(conf->wait_barrier, + !conf->array_frozen && + !atomic_read(&conf->barrier[idx]), + conf->resync_lock); + atomic_inc(&conf->nr_pending[idx]); + atomic_dec(&conf->nr_waiting[idx]); spin_unlock_irq(&conf->resync_lock); } @@ -894,18 +943,32 @@ static void wait_read_barrier(struct r1conf *conf, sector_t sector_nr) { int idx = sector_to_idx(sector_nr); - spin_lock_irq(&conf->resync_lock); - if (conf->array_frozen) { - conf->nr_waiting[idx]++; - /* Wait for array to unfreeze */ - wait_event_lock_irq( - conf->wait_barrier, - !conf->array_frozen, - conf->resync_lock); - conf->nr_waiting[idx]--; - } + /* + * Very similar to _wait_barrier(). The difference is, for read + * I/O we don't need wait for sync I/O, but if the whole array + * is frozen, the read I/O still has to wait until the array is + * unfrozen. Since there is no ordering requirement with + * conf->barrier[idx] here, memory barrier is unnecessary as well. + */ + atomic_inc(&conf->nr_pending[idx]); - conf->nr_pending[idx]++; + if (!READ_ONCE(conf->array_frozen)) + return; + + spin_lock_irq(&conf->resync_lock); + atomic_inc(&conf->nr_waiting[idx]); + atomic_dec(&conf->nr_pending[idx]); + /* + * In case freeze_array() is waiting for + * get_unqueued_pending() == extra + */ + wake_up(&conf->wait_barrier); + /* Wait for array to be unfrozen */ + wait_event_lock_irq(conf->wait_barrier, + !conf->array_frozen, + conf->resync_lock); + atomic_inc(&conf->nr_pending[idx]); + atomic_dec(&conf->nr_waiting[idx]); spin_unlock_irq(&conf->resync_lock); } @@ -926,11 +989,7 @@ static void wait_all_barriers(struct r1conf *conf) static void _allow_barrier(struct r1conf *conf, int idx) { - unsigned long flags; - - spin_lock_irqsave(&conf->resync_lock, flags); - conf->nr_pending[idx]--; - spin_unlock_irqrestore(&conf->resync_lock, flags); + atomic_dec(&conf->nr_pending[idx]); wake_up(&conf->wait_barrier); } @@ -955,7 +1014,8 @@ static int get_unqueued_pending(struct r1conf *conf) int idx, ret; for (ret = 0, idx = 0; idx < BARRIER_BUCKETS_NR; idx++) - ret += conf->nr_pending[idx] - conf->nr_queued[idx]; + ret += atomic_read(&conf->nr_pending[idx]) - + atomic_read(&conf->nr_queued[idx]); return ret; } @@ -1000,8 +1060,8 @@ static void unfreeze_array(struct r1conf *conf) /* reverse the effect of the freeze */ spin_lock_irq(&conf->resync_lock); conf->array_frozen = 0; - wake_up(&conf->wait_barrier); spin_unlock_irq(&conf->resync_lock); + wake_up(&conf->wait_barrier); } /* duplicate the data pages for behind I/O @@ -2391,8 +2451,13 @@ static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio) spin_lock_irq(&conf->device_lock); list_add(&r1_bio->retry_list, &conf->bio_end_io_list); idx = sector_to_idx(r1_bio->sector); - conf->nr_queued[idx]++; + atomic_inc(&conf->nr_queued[idx]); spin_unlock_irq(&conf->device_lock); + /* + * In case freeze_array() is waiting for condition + * get_unqueued_pending() == extra to be true. + */ + wake_up(&conf->wait_barrier); md_wakeup_thread(conf->mddev->thread); } else { if (test_bit(R1BIO_WriteError, &r1_bio->state)) @@ -2523,9 +2588,7 @@ static void raid1d(struct md_thread *thread) retry_list); list_del(&r1_bio->retry_list); idx = sector_to_idx(r1_bio->sector); - spin_lock_irqsave(&conf->device_lock, flags); - conf->nr_queued[idx]--; - spin_unlock_irqrestore(&conf->device_lock, flags); + atomic_dec(&conf->nr_queued[idx]); if (mddev->degraded) set_bit(R1BIO_Degraded, &r1_bio->state); if (test_bit(R1BIO_WriteError, &r1_bio->state)) @@ -2547,7 +2610,7 @@ static void raid1d(struct md_thread *thread) r1_bio = list_entry(head->prev, struct r1bio, retry_list); list_del(head->prev); idx = sector_to_idx(r1_bio->sector); - conf->nr_queued[idx]--; + atomic_dec(&conf->nr_queued[idx]); spin_unlock_irqrestore(&conf->device_lock, flags); mddev = r1_bio->mddev; @@ -2664,7 +2727,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, * If there is non-resync activity waiting for a turn, then let it * though before starting on this new sync request. */ - if (conf->nr_waiting[idx]) + if (atomic_read(&conf->nr_waiting[idx])) schedule_timeout_uninterruptible(1); /* we are incrementing sector_nr below. To be safe, we check against @@ -2924,22 +2987,22 @@ static struct r1conf *setup_conf(struct mddev *mddev) goto abort; conf->nr_pending = kcalloc(BARRIER_BUCKETS_NR, - sizeof(int), GFP_KERNEL); + sizeof(atomic_t), GFP_KERNEL); if (!conf->nr_pending) goto abort; conf->nr_waiting = kcalloc(BARRIER_BUCKETS_NR, - sizeof(int), GFP_KERNEL); + sizeof(atomic_t), GFP_KERNEL); if (!conf->nr_waiting) goto abort; conf->nr_queued = kcalloc(BARRIER_BUCKETS_NR, - sizeof(int), GFP_KERNEL); + sizeof(atomic_t), GFP_KERNEL); if (!conf->nr_queued) goto abort; conf->barrier = kcalloc(BARRIER_BUCKETS_NR, - sizeof(int), GFP_KERNEL); + sizeof(atomic_t), GFP_KERNEL); if (!conf->barrier) goto abort; diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index 3442e8fe3fcd..dd22a37d0d83 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h @@ -10,18 +10,19 @@ /* * In struct r1conf, the following members are related to I/O barrier * buckets, - * int *nr_pending; - * int *nr_waiting; - * int *nr_queued; - * int *barrier; - * Each of them points to array of integers, each array is designed to - * have BARRIER_BUCKETS_NR elements and occupy a single memory page. The - * data width of integer variables is 4, equal to 1<<(ilog2(sizeof(int))), - * BARRIER_BUCKETS_NR_BITS is defined as (PAGE_SHIFT - ilog2(sizeof(int))) - * to make sure an array of integers with BARRIER_BUCKETS_NR elements just - * exactly occupies a single memory page. + * atomic_t *nr_pending; + * atomic_t *nr_waiting; + * atomic_t *nr_queued; + * atomic_t *barrier; + * Each of them points to array of atomic_t variables, each array is + * designed to have BARRIER_BUCKETS_NR elements and occupy a single + * memory page. The data width of atomic_t variables is 4 bytes, equal + * to 1<<(ilog2(sizeof(atomic_t))), BARRIER_BUCKETS_NR_BITS is defined + * as (PAGE_SHIFT - ilog2(sizeof(int))) to make sure an array of + * atomic_t variables with BARRIER_BUCKETS_NR elements just exactly + * occupies a single memory page. */ -#define BARRIER_BUCKETS_NR_BITS (PAGE_SHIFT - ilog2(sizeof(int))) +#define BARRIER_BUCKETS_NR_BITS (PAGE_SHIFT - ilog2(sizeof(atomic_t))) #define BARRIER_BUCKETS_NR (1<