bcache: Fix a writeback performance regression
author     Kent Overstreet <kmo@daterainc.com>
           Tue, 24 Sep 2013 06:17:31 +0000 (23:17 -0700)
committer  Greg Kroah-Hartman <gregkh@linuxfoundation.org>
           Sat, 5 Oct 2013 14:13:09 +0000 (07:13 -0700)
commit c2a4f3183a1248f615a695fbd8905da55ad11bba upstream.

Background writeback works by scanning the btree for dirty data and
adding those keys to a fixed-size buffer (the keybuf), then writing each
dirty key in the keybuf out to the backing device.

When read_dirty() finishes and it's time to scan for more dirty data, we
need to wait for the outstanding writeback IO to finish - those writes
still take up slots in the keybuf (so that foreground writes can check
for them to avoid races).  Without that wait, we continually rescan when
we can add at most a key or two to the keybuf, and that rescanning takes
locks that starve foreground IO.  Doh.
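
The patch below bounds in-flight writeback with a semaphore and only
refills the keybuf once the outstanding IOs have completed.  As a rough
illustration of that throttling scheme - a minimal user-space analogue,
not the kernel code, using POSIX semaphores and threads in place of the
kernel semaphore and closure machinery, with KEYBUF_SIZE and the
simulated IO purely hypothetical - consider:

#include <pthread.h>
#include <semaphore.h>
#include <stdio.h>
#include <unistd.h>

#define MAX_IN_FLIGHT	64	/* mirrors sema_init(&dc->in_flight, 64) */
#define KEYBUF_SIZE	500	/* hypothetical number of dirty keys per scan */

static sem_t in_flight;			/* free writeback slots */
static int outstanding;			/* IOs issued but not yet completed */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t all_done = PTHREAD_COND_INITIALIZER;

static void *write_one_key(void *arg)
{
	(void)arg;
	usleep(1000);			/* stand-in for the backing device write */

	pthread_mutex_lock(&lock);
	if (--outstanding == 0)
		pthread_cond_signal(&all_done);
	pthread_mutex_unlock(&lock);

	sem_post(&in_flight);		/* like up(&dc->in_flight) in write_dirty_finish() */
	return NULL;
}

int main(void)
{
	sem_init(&in_flight, 0, MAX_IN_FLIGHT);

	for (int key = 0; key < KEYBUF_SIZE; key++) {
		pthread_t t;

		sem_wait(&in_flight);	/* like down(&dc->in_flight) in read_dirty() */

		pthread_mutex_lock(&lock);
		outstanding++;
		pthread_mutex_unlock(&lock);

		pthread_create(&t, NULL, write_one_key, NULL);
		pthread_detach(t);
	}

	/* Refill (rescan) only after every outstanding IO has finished. */
	pthread_mutex_lock(&lock);
	while (outstanding)
		pthread_cond_wait(&all_done, &lock);
	pthread_mutex_unlock(&lock);

	printf("keybuf drained; safe to rescan the btree for dirty keys\n");
	return 0;
}

With the in-flight count capped at 64, the scanner cannot outrun the
backing device, and it no longer takes btree locks just to add one or
two keys per pass.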

Signed-off-by: Kent Overstreet <kmo@daterainc.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
drivers/md/bcache/bcache.h
drivers/md/bcache/util.c
drivers/md/bcache/util.h
drivers/md/bcache/writeback.c

diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index c42b14b2304cceb0c5dde84660f63de6b57daca2..6bc016eabdc96f4a2dc091412b1e0d765bf8558a 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -499,7 +499,7 @@ struct cached_dev {
         */
        atomic_t                has_dirty;
 
-       struct ratelimit        writeback_rate;
+       struct bch_ratelimit    writeback_rate;
        struct delayed_work     writeback_rate_update;
 
        /*
@@ -508,10 +508,9 @@ struct cached_dev {
         */
        sector_t                last_read;
 
-       /* Number of writeback bios in flight */
-       atomic_t                in_flight;
+       /* Limit number of writeback bios in flight */
+       struct semaphore        in_flight;
        struct closure_with_timer writeback;
-       struct closure_waitlist writeback_wait;
 
        struct keybuf           writeback_keys;
 
diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
index da3a99e85b1e0db79a35a2735769b7595f2be159..38a43f8d36eca663b9d706a96ef43ddc8e2d698c 100644
--- a/drivers/md/bcache/util.c
+++ b/drivers/md/bcache/util.c
@@ -190,7 +190,16 @@ void bch_time_stats_update(struct time_stats *stats, uint64_t start_time)
        stats->last = now ?: 1;
 }
 
-unsigned bch_next_delay(struct ratelimit *d, uint64_t done)
+/**
+ * bch_next_delay() - increment @d by the amount of work done, and return how
+ * long to delay until the next time to do some work.
+ *
+ * @d - the struct bch_ratelimit to update
+ * @done - the amount of work done, in arbitrary units
+ *
+ * Returns the amount of time to delay by, in jiffies
+ */
+uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done)
 {
        uint64_t now = local_clock();
 
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index 577393e38c3ac3c05076d7a24ce0707a946a0402..43fd78affcea55eac050c36dcd7e8ca6675f6eef 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -452,17 +452,23 @@ read_attribute(name ## _last_ ## frequency_units)
        (ewma) >> factor;                                               \
 })
 
-struct ratelimit {
+struct bch_ratelimit {
+       /* Next time we want to do some work, in nanoseconds */
        uint64_t                next;
+
+       /*
+        * Rate at which we want to do work, in units per nanosecond
+        * The units here correspond to the units passed to bch_next_delay()
+        */
        unsigned                rate;
 };
 
-static inline void ratelimit_reset(struct ratelimit *d)
+static inline void bch_ratelimit_reset(struct bch_ratelimit *d)
 {
        d->next = local_clock();
 }
 
-unsigned bch_next_delay(struct ratelimit *d, uint64_t done);
+uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done);
 
 #define __DIV_SAFE(n, d, zero)                                         \
 ({                                                                     \
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 2714ed3991d1b747518aeb22d70222d1653d5e40..12c0cd135ef850c93a0a51a89f325842195ec434 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -91,11 +91,15 @@ static void update_writeback_rate(struct work_struct *work)
 
 static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors)
 {
+       uint64_t ret;
+
        if (atomic_read(&dc->disk.detaching) ||
            !dc->writeback_percent)
                return 0;
 
-       return bch_next_delay(&dc->writeback_rate, sectors * 10000000ULL);
+       ret = bch_next_delay(&dc->writeback_rate, sectors * 10000000ULL);
+
+       return min_t(uint64_t, ret, HZ);
 }
 
 /* Background writeback */
@@ -165,7 +169,7 @@ static void refill_dirty(struct closure *cl)
 
        up_write(&dc->writeback_lock);
 
-       ratelimit_reset(&dc->writeback_rate);
+       bch_ratelimit_reset(&dc->writeback_rate);
 
        /* Punt to workqueue only so we don't recurse and blow the stack */
        continue_at(cl, read_dirty, dirty_wq);
@@ -246,9 +250,7 @@ static void write_dirty_finish(struct closure *cl)
        }
 
        bch_keybuf_del(&dc->writeback_keys, w);
-       atomic_dec_bug(&dc->in_flight);
-
-       closure_wake_up(&dc->writeback_wait);
+       up(&dc->in_flight);
 
        closure_return_with_destructor(cl, dirty_io_destructor);
 }
@@ -278,7 +280,7 @@ static void write_dirty(struct closure *cl)
        trace_bcache_write_dirty(&io->bio);
        closure_bio_submit(&io->bio, cl, &io->dc->disk);
 
-       continue_at(cl, write_dirty_finish, dirty_wq);
+       continue_at(cl, write_dirty_finish, system_wq);
 }
 
 static void read_dirty_endio(struct bio *bio, int error)
@@ -299,7 +301,7 @@ static void read_dirty_submit(struct closure *cl)
        trace_bcache_read_dirty(&io->bio);
        closure_bio_submit(&io->bio, cl, &io->dc->disk);
 
-       continue_at(cl, write_dirty, dirty_wq);
+       continue_at(cl, write_dirty, system_wq);
 }
 
 static void read_dirty(struct closure *cl)
@@ -324,12 +326,9 @@ static void read_dirty(struct closure *cl)
 
                if (delay > 0 &&
                    (KEY_START(&w->key) != dc->last_read ||
-                    jiffies_to_msecs(delay) > 50)) {
-                       w->private = NULL;
-
-                       closure_delay(&dc->writeback, delay);
-                       continue_at(cl, read_dirty, dirty_wq);
-               }
+                    jiffies_to_msecs(delay) > 50))
+                       while (delay)
+                               delay = schedule_timeout(delay);
 
                dc->last_read   = KEY_OFFSET(&w->key);
 
@@ -354,15 +353,10 @@ static void read_dirty(struct closure *cl)
 
                pr_debug("%s", pkey(&w->key));
 
-               closure_call(&io->cl, read_dirty_submit, NULL, &dc->disk.cl);
+               down(&dc->in_flight);
+               closure_call(&io->cl, read_dirty_submit, NULL, cl);
 
                delay = writeback_delay(dc, KEY_SIZE(&w->key));
-
-               atomic_inc(&dc->in_flight);
-
-               if (!closure_wait_event(&dc->writeback_wait, cl,
-                                       atomic_read(&dc->in_flight) < 64))
-                       continue_at(cl, read_dirty, dirty_wq);
        }
 
        if (0) {
@@ -372,11 +366,16 @@ err:
                bch_keybuf_del(&dc->writeback_keys, w);
        }
 
-       refill_dirty(cl);
+       /*
+        * Wait for outstanding writeback IOs to finish (and keybuf slots to be
+        * freed) before refilling again
+        */
+       continue_at(cl, refill_dirty, dirty_wq);
 }
 
 void bch_cached_dev_writeback_init(struct cached_dev *dc)
 {
+       sema_init(&dc->in_flight, 64);
        closure_init_unlocked(&dc->writeback);
        init_rwsem(&dc->writeback_lock);
 
@@ -406,7 +405,7 @@ void bch_writeback_exit(void)
 
 int __init bch_writeback_init(void)
 {
-       dirty_wq = create_singlethread_workqueue("bcache_writeback");
+       dirty_wq = create_workqueue("bcache_writeback");
        if (!dirty_wq)
                return -ENOMEM;